In [115]:
import csv
import json
import pandas as pd
import numpy as np
import random
from collections import defaultdict

In [116]:
train_set = pd.read_csv('../data/final/train_reconstructed.csv')

X_train = train_set.iloc[:,:-1].values
y_train = train_set['edge'].values

print('Training set X: {}'.format(X_train[:2]))
print('Training set Y: {}'.format(y_train[:2]))

Training set X: [[22.          2.          0.          3.         12.          0.46153846
   3.          0.375       0.          0.        ]
 [14.          0.          1.          5.         13.          0.56521739
   2.          0.33333333  0.          0.        ]]
Training set Y: [1 1]


In [117]:
test_set = pd.read_csv('../data/final/dev-test.csv')

X_test = test_set.iloc[:,:-1].values
y_test = test_set['edge'].values

print('Test set X: {}'.format(X_test[:2]))
print('Test set Y: {}'.format(y_test[:2]))

Test set X: [[ 9.          0.          1.          2.          6.          0.3
   0.          0.          0.          0.        ]
 [12.          0.          0.          2.          8.          0.34782609
   1.          0.14285714  0.          0.        ]]
Test set Y: [0 1]


In [118]:
from sklearn.dummy import DummyClassifier

ds_clf = DummyClassifier(strategy="most_frequent") # Define our model, set parameter strategy to 'most_frequent'
ds_clf.fit(X_train, y_train) # Use model.fit to train with our dataset 
Y_predict = ds_clf.predict(X_test) # Use model.predict to make prediction
print("Prediction :", Y_predict[:10])
print("Accuracy for train set:", ds_clf.score(X_train,y_train))
print("Accuracy for dev set:", ds_clf.score(X_test, y_test)) # Use model.score to evaluate our model.

Prediction : [1 1 1 1 1 1 1 1 1 1]
Accuracy for train set: 0.5011504396885933
Accuracy for dev set: 0.5


In [119]:
from sklearn.naive_bayes import BernoulliNB
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
bnb_train_results = []
bnb_test_results = []

for a in alpha:
    bnb = BernoulliNB(alpha=a)
    bnb.fit(X_train, y_train)
    
    train_acc = bnb.score(X_train, y_train)
    test_acc = bnb.score(X_test, y_test)
    
    print("alpha:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}".format(a, train_acc, test_acc))
    bnb_train_results.append(train_acc)
    bnb_test_results.append(test_acc)
    
    # Prob of being one
    Y_train_proba = bnb.predict_proba(X_test)
    #print("Probabilities :", Y_proba[:10,1])

alpha:0.10, acc_train:0.9599, acc_test:0.7641
alpha:0.20, acc_train:0.9599, acc_test:0.7641
alpha:0.30, acc_train:0.9599, acc_test:0.7641
alpha:0.40, acc_train:0.9599, acc_test:0.7641
alpha:0.50, acc_train:0.9599, acc_test:0.7641
alpha:0.60, acc_train:0.9599, acc_test:0.7641
alpha:0.70, acc_train:0.9599, acc_test:0.7641
alpha:0.80, acc_train:0.9599, acc_test:0.7641
alpha:0.90, acc_train:0.9599, acc_test:0.7641
alpha:1.00, acc_train:0.9599, acc_test:0.7641


In [156]:
from sklearn.naive_bayes import MultinomialNB
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
mnb_train_results_acc = []
mnb_test_results_acc = []
mnb_train_results_auc = []
mnb_test_results_auc = []

for a in alpha:
    mnb = MultinomialNB(alpha=a)
    mnb.fit(X_train, y_train)

    train_acc = mnb.score(X_train, y_train)
    test_acc = mnb.score(X_test, y_test)
    
    # Prob of edge=1
    y_train_proba = mnb.predict_proba(X_train)
    y_test_proba = mnb.predict_proba(X_test)
    
    # get roc fpr and tpr
    fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_proba[:,1])
    fpr1, tpr1, thresholds = metrics.roc_curve(y_test, y_test_proba[:,1])
    auc_train = metrics.auc(fpr, tpr)
    auc_test = metrics.auc(fpr1, tpr1)
    print("RF : depth:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}, AUC_train:{:.4f}, AUC_test:{:.4f}".format(a, 
                                                                                                    train_acc, 
                                                                                                    test_acc,
                                                                                                    auc_train,
                                                                                                    auc_test))                 

    mnb_train_results_acc.append(train_acc)
    mnb_test_results_acc.append(test_acc)
    mnb_train_results_auc.append(auc_train)
    mnb_test_results_auc.append(auc_test)

RF : depth:0.10, acc_train:0.9071, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647
RF : depth:0.20, acc_train:0.9071, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647
RF : depth:0.30, acc_train:0.9071, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647
RF : depth:0.40, acc_train:0.9071, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647
RF : depth:0.50, acc_train:0.9071, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647
RF : depth:0.60, acc_train:0.9071, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647
RF : depth:0.70, acc_train:0.9070, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647
RF : depth:0.80, acc_train:0.9070, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647
RF : depth:0.90, acc_train:0.9070, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647
RF : depth:1.00, acc_train:0.9070, acc_test:0.6745, AUC_train:0.9632, AUC_test:0.7647


In [160]:
from sklearn.naive_bayes import GaussianNB

gnb_train_results_acc = []
gnb_test_results_acc = []
gnb_train_results_auc = []
gnb_test_results_auc = []

gnb = GaussianNB()
gnb.fit(X_train, y_train)

train_acc = gnb.score(X_train, y_train)
test_acc = gnb.score(X_test, y_test)

# Prob of edge=1
y_train_proba = gnb.predict_proba(X_train)
y_test_proba = gnb.predict_proba(X_test)

# get roc fpr and tpr
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_proba[:,1])
fpr1, tpr1, thresholds = metrics.roc_curve(y_test, y_test_proba[:,1])
auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fpr1, tpr1)
print("GNB: acc_train:{:.4f}, acc_test:{:.4f}, AUC_train:{:.4f}, AUC_test:{:.4f}".format(train_acc, 
                                                                                         test_acc,
                                                                                         auc_train,
                                                                                         auc_test))                 

gnb_train_results_acc.append(train_acc)
gnb_test_results_acc.append(test_acc)
gnb_train_results_auc.append(auc_train)
gnb_test_results_auc.append(auc_test)

GNB: acc_train:0.9515, acc_test:0.7877, AUC_train:0.9851, AUC_test:0.8111


In [165]:
from sklearn.ensemble import RandomForestClassifier

depth = range(1,10+1)
rf_train_results_acc = []
rf_test_results_acc = []
rf_train_results_auc = []
rf_test_results_auc = []

for d in depth:
    rf = RandomForestClassifier(max_depth=d, n_estimators=100)
    rf.fit(X_train, y_train)
    train_acc = clf.score(X_train,y_train)
    # Prob of edge=1
    y_train_proba = rf.predict_proba(X_train)
    y_test_proba = rf.predict_proba(X_test)

    # get roc fpr and tpr
    fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_proba[:,1])
    fpr1, tpr1, thresholds = metrics.roc_curve(y_test, y_test_proba[:,1])
    auc_train = metrics.auc(fpr, tpr)
    auc_test = metrics.auc(fpr1, tpr1)
    print("RF : depth:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}, AUC_train:{:.4f}, AUC_test:{:.4f}".format(d, 
                                                                                                    train_acc, 
                                                                                                    test_acc,
                                                                                                    auc_train,
                                                                                                    auc_test))                 

    rf_train_results_acc.append(train_acc)
    rf_test_results_acc.append(test_acc)
    rf_train_results_auc.append(auc_train)
    rf_test_results_auc.append(auc_test)

RF : depth:1.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9801, AUC_test:0.8254
RF : depth:2.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9864, AUC_test:0.8305
RF : depth:3.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9892, AUC_test:0.8329
RF : depth:4.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9900, AUC_test:0.8291
RF : depth:5.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9909, AUC_test:0.8368
RF : depth:6.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9919, AUC_test:0.8371
RF : depth:7.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9925, AUC_test:0.8388
RF : depth:8.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9934, AUC_test:0.8376
RF : depth:9.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9944, AUC_test:0.8379
RF : depth:10.00, acc_train:0.9719, acc_test:0.7877, AUC_train:0.9956, AUC_test:0.8363


In [164]:
from sklearn.linear_model import LogisticRegression
lr_train_results_acc = []
lr_test_results_acc = []
lr_train_results_auc = []
lr_test_results_auc = []

lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)

# Prob of edge=1
y_train_proba = lr.predict_proba(X_train)
y_test_proba = lr.predict_proba(X_test)

# get roc fpr and tpr
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_proba[:,1])
fpr1, tpr1, thresholds = metrics.roc_curve(y_test, y_test_proba[:,1])
auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fpr1, tpr1)
print("LR : acc_train:{:.4f}, acc_test:{:.4f}, AUC_train:{:.4f}, AUC_test:{:.4f}".format(train_acc, 
                                                                                         test_acc,
                                                                                         auc_train,
                                                                                         auc_test))                 

lr_train_results_acc.append(train_acc)
lr_test_results_acc.append(test_acc)
lr_train_results_auc.append(auc_train)
lr_test_results_auc.append(auc_test)

LR : acc_train:0.9719, acc_test:0.7877, AUC_train:0.9904, AUC_test:0.8408


In [167]:
# use selected model and hyperparameter to calculate evaluation metrics: AUC, Accuracy
from sklearn import metrics

# GNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
train_acc = gnb.score(X_train, y_train)
test_acc = gnb.score(X_test, y_test)
# Prob of edge=1
y_test_proba = gnb.predict_proba(X_test)
# get roc fpr and tpr
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_proba[:,1])
# results
print("GNB: alpha:----, acc_train:{:.4f}, acc_test:{:.4f}, AUC_test:{:.4f}".format(
                                                                                train_acc, 
                                                                                test_acc,
                                                                                metrics.auc(fpr, tpr)))
# MNB
mnb = MultinomialNB(alpha=0.5)
mnb.fit(X_train, y_train)
train_acc = mnb.score(X_train, y_train)
test_acc = mnb.score(X_test, y_test)
# Prob of edge=1
y_test_proba = mnb.predict_proba(X_test)
# get roc fpr and tpr
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_proba[:,1])
# results
print("MNB: alpha:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}, AUC_test:{:.4f}".format(a, 
                                                                            train_acc, 
                                                                            test_acc,
                                                                            metrics.auc(fpr, tpr)))

# random forest
rf = RandomForestClassifier(max_depth=7, n_estimators=100)
rf.fit(X_train, y_train)
train_acc = rf.score(X_train,y_train)
test_acc = rf.score(X_test, y_test)
# Prob of edge=1
y_test_proba = rf.predict_proba(X_test)
# get roc fpr and tpr
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_proba[:,1])
print("RF : depth:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}, AUC_test:{:.4f}".format(a, 
                                                                                     train_acc, 
                                                                                     test_acc,
                                                                                     metrics.auc(fpr, tpr)))

GNB: alpha:----, acc_train:0.9515, acc_test:0.7877, AUC_test:0.8111
MNB: alpha:1.00, acc_train:0.9071, acc_test:0.6745, AUC_test:0.7647
RF : depth:1.00, acc_train:0.9672, acc_test:0.7842, AUC_test:0.8380


In [None]:
# Training full model
X_big = np.concatenate((X_train,X_test),0)
y_big = np.concatenate((y_train,y_test),0)

test_final = pd.read_csv('../data/final/test-final.csv')
test_final = test_final.values

In [None]:
clf = RandomForestClassifier(n_estimators=100,max_depth=2)
clf.fit(X_big, y_big)
print("Probabilities :",  clf.predict_proba(test_final)[:,1])
pred = clf.predict_proba(test_final)[:,1]
# print("Accuracy for train set:", clf.score(X_train,y_train))
# print("Accuracy for dev set: ", clf.score(X_test, y_test)) # Use model.score to evaluate our model.

submission = {
    'Id': range(1,len(pred)+1),
    'Predicted': pred
}

submission_df = pd.DataFrame(data=submission)
submission_df.to_csv('../data/final/sub.csv', index=False)