# Trying regressions with selected features based on entropy

In [49]:
import csv
import json
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from sklearn.metrics import roc_auc_score

In [50]:
train_set = pd.read_csv('../data/final/train_reconstructed.csv')

X_train = train_set.iloc[:,:-1].values
y_train = train_set['edge'].values

print('Training set X: {}'.format(X_train[:2]))
print('Training set Y: {}'.format(y_train[:2]))
X_train.shape

Training set X: [[22.          2.          0.          3.         12.          0.46153846
   3.          0.375       0.          0.        ]
 [14.          0.          1.          5.         13.          0.56521739
   2.          0.33333333  0.          0.        ]]
Training set Y: [1 1]


(95154, 10)

In [51]:
test_set = pd.read_csv('../data/final/dev-test.csv')

X_test = test_set.iloc[:,:-1].values
y_test = test_set['edge'].values

print('Test set X: {}'.format(X_test[:2]))
print('Test set Y: {}'.format(y_test[:2]))
X_test.shape

Test set X: [[ 9.          0.          1.          2.          6.          0.3
   0.          0.          0.          0.        ]
 [12.          0.          0.          2.          8.          0.34782609
   1.          0.14285714  0.          0.        ]]
Test set Y: [0 1]


(4866, 10)

In [52]:
# Perform univariate feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X_big = np.concatenate((X_train,X_test),0)
y_big = np.concatenate((y_train,y_test),0)

X_big.shape

(100020, 10)

In [53]:
X_new = SelectKBest(chi2, k=10).fit_transform(X_big, y_big)
X_new.shape

(100020, 10)

In [54]:
# Split big dataset back to training and test
X_train = X_new[:95154,]
print(X_train.shape)

X_test = X_new[95154:,]
print(X_test.shape)

(95154, 10)
(4866, 10)


In [55]:
from sklearn.naive_bayes import BernoulliNB
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

for a in alpha:
    clf = BernoulliNB(alpha=a)
    clf.fit(X_train, y_train)

    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)

    Y_proba = clf.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test,Y_proba)

    print("alpha:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}, AUC:{:.4f}".format(a, train_acc, test_acc, auc))


alpha:0.10, acc_train:0.9598, acc_test:0.7641, AUC:0.8147
alpha:0.20, acc_train:0.9598, acc_test:0.7641, AUC:0.8148
alpha:0.30, acc_train:0.9598, acc_test:0.7641, AUC:0.8148
alpha:0.40, acc_train:0.9598, acc_test:0.7641, AUC:0.8149
alpha:0.50, acc_train:0.9598, acc_test:0.7641, AUC:0.8150
alpha:0.60, acc_train:0.9598, acc_test:0.7641, AUC:0.8152
alpha:0.70, acc_train:0.9598, acc_test:0.7641, AUC:0.8152
alpha:0.80, acc_train:0.9598, acc_test:0.7641, AUC:0.8152
alpha:0.90, acc_train:0.9598, acc_test:0.7641, AUC:0.8155
alpha:1.00, acc_train:0.9598, acc_test:0.7641, AUC:0.8155


In [56]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)

train_acc = gnb.score(X_train, y_train)
test_acc = gnb.score(X_test, y_test)

Y_proba = gnb.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test,Y_proba)

print("GNB: acc_train:{:.4f}, acc_test:{:.4f}, AUC_test:{:.4f}".format(train_acc,test_acc,auc))

GNB: acc_train:0.9528, acc_test:0.7885, AUC_test:0.8122


In [57]:
from sklearn.ensemble import RandomForestClassifier

depth = range(1,10+1)

for d in depth:
    rf = RandomForestClassifier(max_depth=d, n_estimators=100)
    rf.fit(X_train, y_train)
    train_acc = rf.score(X_train,y_train)
    test_acc = rf.score(X_test,y_test)
    # Prob of edge=1
    Y_proba = rf.predict_proba(X_test)[:,1]

    auc_test = roc_auc_score(y_test, Y_proba)
    print("RF : depth:{:.2f}, acc_train:{:.4f}, acc_test:{:.4f}, AUC_test:{:.4f}".format(d, 
                                                                                            train_acc, 
                                                                                            test_acc,
                                                                                            auc_test))

RF : depth:1.00, acc_train:0.9609, acc_test:0.8070, AUC_test:0.8232
RF : depth:2.00, acc_train:0.9614, acc_test:0.8070, AUC_test:0.8281
RF : depth:3.00, acc_train:0.9621, acc_test:0.8070, AUC_test:0.8320
RF : depth:4.00, acc_train:0.9634, acc_test:0.8039, AUC_test:0.8366
RF : depth:5.00, acc_train:0.9650, acc_test:0.7877, AUC_test:0.8364
RF : depth:6.00, acc_train:0.9661, acc_test:0.7891, AUC_test:0.8381
RF : depth:7.00, acc_train:0.9673, acc_test:0.7861, AUC_test:0.8381
RF : depth:8.00, acc_train:0.9687, acc_test:0.7848, AUC_test:0.8388
RF : depth:9.00, acc_train:0.9704, acc_test:0.7857, AUC_test:0.8383
RF : depth:10.00, acc_train:0.9722, acc_test:0.7859, AUC_test:0.8387


In [58]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)

train_acc = lr.score(X_train,y_train)
test_acc = lr.score(X_test,y_test)

Y_test = lr.predict_proba(X_test)[:,1]

auc = roc_auc_score(y_test,Y_proba)

print("LR : acc_train:{:.4f}, acc_test:{:.4f}, AUC_test:{:.4f}".format(train_acc, test_acc,auc_test))

LR : acc_train:0.9615, acc_test:0.7900, AUC_test:0.8387


In [59]:
from sklearn.ensemble import StackingClassifier

estimators = [
                ('rf', RandomForestClassifier(max_depth=10, n_estimators=100)),
                ('bnb', BernoulliNB(alpha=0.8),
                ('lr',LogisticRegression(max_iter=500)))
            ]

clf = StackingClassifier(estimators=estimators, final_estimator=None)
clf.fit(X_train, y_train)

train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)

Y_test = clf.predict_proba(X_test)[:,1]

auc = roc_auc_score(y_test,Y_proba)

print("STACKING : acc_train:{:.4f}, acc_test:{:.4f}, AUC_test:{:.4f}".format(train_acc, test_acc,auc_test))

STACKING : acc_train:0.9749, acc_test:0.7861, AUC_test:0.8387


In [60]:
# from sklearn.svm import SVC

# clf = SVC(kernel='rbf')
# clf.fit(X_train,y_train)