In [1]:
import csv
import json
import pandas as pd
import numpy as np
import random
from collections import defaultdict

In [2]:
train_set = pd.read_csv('../data/final/train_reconstructed.csv')

X_train = train_set.iloc[:,:-1].values
y_train = train_set['edge'].values

print('Training set X: {}'.format(X_train[:2]))
print('Training set Y: {}'.format(y_train[:2]))

Training set X: [[22.          2.          0.          3.         12.          0.46153846
   3.          0.375       0.          0.        ]
 [14.          0.          1.          5.         13.          0.56521739
   2.          0.33333333  0.          0.        ]]
Training set Y: [1 1]


In [3]:
test_set = pd.read_csv('../data/final/dev-test.csv')

X_test = test_set.iloc[:,:-1].values
y_test = test_set['edge'].values

print('Test set X: {}'.format(X_test[:2]))
print('Test set Y: {}'.format(y_test[:2]))

Test set X: [[ 9.          0.          1.          2.          6.          0.3
   0.          0.          0.          0.        ]
 [12.          0.          0.          2.          8.          0.34782609
   1.          0.14285714  0.          0.        ]]
Test set Y: [0 1]


In [29]:
from sklearn.dummy import DummyClassifier

ds_clf = DummyClassifier(strategy="most_frequent") # Define our model, set parameter strategy to 'most_frequent'
ds_clf.fit(X_train, y_train) # Use model.fit to train with our dataset 
Y_predict = ds_clf.predict(X_test) # Use model.predict to make prediction
print("Prediction :", Y_predict[:10])
print("Accuracy for train set:", ds_clf.score(X_train,y_train))
print("Accuracy for dev set:", ds_clf.score(X_test, y_test)) # Use model.score to evaluate our model.

Prediction : [1 1 1 1 1 1 1 1 1 1]
Accuracy for train set: 0.5012873732331459
Accuracy for dev set: 0.5


In [61]:
from sklearn.naive_bayes import BernoulliNB
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
bnb_train_results = []
bnb_test_results = []

for a in alpha:
    bnb = BernoulliNB(alpha=a)
    bnb.fit(X_train, y_train)
    
    train_acc = bnb.score(X_train, y_train)
    test_acc = bnb.score(X_test, y_test)
    
    print("alpha:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}".format(a, train_acc, test_acc))
    bnb_train_results.append(train_acc)
    bnb_test_results.append(test_acc)
    
    # Prob of being one
    Y_train_proba = bnb.predict_proba(X_test)
    #print("Probabilities :", Y_proba[:10,1])

alpha:0.10, acc_train:0.9577, acc_test:0.7628
alpha:0.20, acc_train:0.9577, acc_test:0.7628
alpha:0.30, acc_train:0.9577, acc_test:0.7628
alpha:0.40, acc_train:0.9577, acc_test:0.7628
alpha:0.50, acc_train:0.9577, acc_test:0.7628
alpha:0.60, acc_train:0.9577, acc_test:0.7628
alpha:0.70, acc_train:0.9577, acc_test:0.7628
alpha:0.80, acc_train:0.9577, acc_test:0.7628
alpha:0.90, acc_train:0.9577, acc_test:0.7628
alpha:1.00, acc_train:0.9577, acc_test:0.7628


In [62]:
from sklearn.naive_bayes import MultinomialNB
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
mnb_train_results = []
mnb_test_results = []

for a in alpha:
    mnb = MultinomialNB(alpha=a)
    mnb.fit(X_train, y_train)

    train_acc = mnb.score(X_train, y_train)
    test_acc = mnb.score(X_test, y_test)
    
    print("alpha:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}".format(a, train_acc, test_acc))
    mnb_train_results.append(train_acc)
    mnb_test_results.append(test_acc)
    
    # Prob of being one
    Y_train_proba = mnb.predict_proba(X_test)
    #print("Probabilities :", Y_proba[:10,1])

alpha:0.10, acc_train:0.9061, acc_test:0.6724
alpha:0.20, acc_train:0.9061, acc_test:0.6724
alpha:0.30, acc_train:0.9061, acc_test:0.6724
alpha:0.40, acc_train:0.9061, acc_test:0.6724
alpha:0.50, acc_train:0.9061, acc_test:0.6724
alpha:0.60, acc_train:0.9061, acc_test:0.6724
alpha:0.70, acc_train:0.9061, acc_test:0.6724
alpha:0.80, acc_train:0.9061, acc_test:0.6724
alpha:0.90, acc_train:0.9061, acc_test:0.6724
alpha:1.00, acc_train:0.9061, acc_test:0.6724


In [68]:
from sklearn.naive_bayes import GaussianNB

gnb_train_results = []
gnb_test_results = []


gnb = GaussianNB()
gnb.fit(X_train, y_train)

train_acc = gnb.score(X_train, y_train)
test_acc = gnb.score(X_test, y_test)

print("acc_train:{:.4f}, acc_test:{:.4f}".format(train_acc, test_acc))
gnb_train_results.append(train_acc)
gnb_test_results.append(test_acc)
    
# Prob of being one
Y_train_proba = gnb.predict_proba(X_test)
#print("Probabilities :", Y_proba[:10,1])

acc_train:0.9512, acc_test:0.7875


In [64]:
from sklearn.ensemble import RandomForestClassifier

depth = range(1,10+1)
rf_train_results = []
rf_test_results = []

for d in depth:
    clf = RandomForestClassifier(max_depth=d, n_estimators=100)
    clf.fit(X_train, y_train)
    train_acc = clf.score(X_train,y_train)
    test_acc = clf.score(X_test, y_test)
    print("depth:{:2d}, n_tree:{:3d} acc_train:{:.4f}, acc_test:{:.4f}".format(d, 100, train_acc, test_acc))
    rf_train_results.append(train_acc)
    rf_test_results.append(test_acc)

depth: 1, n_tree:100 acc_train:0.9598, acc_test:0.8070
depth: 2, n_tree:100 acc_train:0.9602, acc_test:0.8070
depth: 3, n_tree:100 acc_train:0.9610, acc_test:0.8070
depth: 4, n_tree:100 acc_train:0.9625, acc_test:0.8023
depth: 5, n_tree:100 acc_train:0.9639, acc_test:0.7879
depth: 6, n_tree:100 acc_train:0.9655, acc_test:0.7861
depth: 7, n_tree:100 acc_train:0.9665, acc_test:0.7857
depth: 8, n_tree:100 acc_train:0.9679, acc_test:0.7813
depth: 9, n_tree:100 acc_train:0.9693, acc_test:0.7741
depth:10, n_tree:100 acc_train:0.9717, acc_test:0.7756


In [71]:
from sklearn.svm import SVC

C = [0.1, 0.5, 1, 1.5, 2]
svm_train_results = []
svm_test_results = []

for c_param in C:
    clf = SVC(C=c_param)
    clf.fit(X_train, y_train)
    train_acc = clf.score(X_train,y_train)
    test_acc = clf.score(X_test, y_test)
    print("C{:.2f} acc_train:{:.4f}, acc_test:{:.4f}".format(c_param, train_acc, test_acc))
    svm_train_results.append(train_acc)
    svm_test_results.append(test_acc)

C0.10 acc_train:0.9435, acc_test:0.7224
C0.50 acc_train:0.9579, acc_test:0.7647
C1.00 acc_train:0.9580, acc_test:0.8005
C1.50 acc_train:0.9584, acc_test:0.8025
C2.00 acc_train:0.9583, acc_test:0.8031


In [74]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

train_acc = clf.score(X_train,y_train)
test_acc = clf.score(X_test, y_test)
print("acc_train:{:.4f}, acc_test:{:.4f}".format(train_acc, test_acc))

acc_train:0.9599, acc_test:0.7879


In [92]:
# use selected model and hyperparameter to calculate evaluation metrics: AUC, Accuracy
from sklearn import metrics

# GNB
gnb = BernoulliNB(alpha=0.5)
gnb.fit(X_train, y_train)

train_acc = gnb.score(X_train, y_train)
test_acc = gnb.score(X_test, y_test)

# Prob of edge=1
y_test_proba = gnb.predict_proba(X_test)
# get roc fpr and tpr
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_proba[:,1])
# results
print("GNB: alpha:----, acc_train:{:.4f}, acc_test:{:.4f}, AUC_test:{:.4f}".format(
                                                                                train_acc, 
                                                                                test_acc,
                                                                                metrics.auc(fpr, tpr)))
# MNB
mnb = MultinomialNB(alpha=0.5)
mnb.fit(X_train, y_train)

train_acc = mnb.score(X_train, y_train)
test_acc = mnb.score(X_test, y_test)

# Prob of edge=1
y_test_proba = mnb.predict_proba(X_test)
# get roc fpr and tpr
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_proba[:,1])
# results
print("MNB: alpha:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}, AUC_test:{:.4f}".format(a, 
                                                                                train_acc, 
                                                                                test_acc,
                                                                                metrics.auc(fpr, tpr)))



GNB: alpha:----, acc_train:0.9577, acc_test:0.7628, AUC_test:0.8150
MNB: alpha:1.00, acc_train:0.9061, acc_test:0.6724, AUC_test:0.7642
