In [243]:
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score
from sklearn.model_selection import train_test_split

In [138]:
with open('train_14F.txt','rb') as file:
    dataframe_train=pickle.load(file)
train, test = train_test_split(dataframe_train, test_size=0.2, random_state=1)
scaler = StandardScaler()

In [139]:
train.head()

Unnamed: 0,Label,Shortest_path,AAI,JC,PA,RA,HP,LHN,PageRank_Src,PageRank_Sink,ECentrality_Sour,ECentrality_Sink,Degree_Centrality_Sour,Degree_Centrality_Sink
5004,1,1,2.144076,0.142857,1170,0.1856,0.346154,0.007692,3.442924e-07,2.601575e-07,0.0006336294,0.000175,3.164079e-05,1.130028e-05
17676,1,1,3.049736,0.014627,70616,0.052476,0.339286,0.000269,2.636462e-07,2.499888e-07,0.0005706631,5.4e-05,0.0002820961,2.732614e-05
29485,1,1,0.717,0.013514,4794,0.017789,0.235294,0.000834,6.148255e-07,2.153731e-07,0.008175566,0.000155,0.000244497,1.397126e-05
6476,0,0,0.0,0.0,0,0.0,0.0,0.0,2.109946e-07,2.049483e-07,3.667891e-09,1e-06,2.054597e-07,2.054597e-07
15508,0,3,0.0,0.0,148,0.0,0.0,0.0,2.057044e-07,2.047606e-07,1.521246e-05,8.2e-05,1.58204e-05,4.109194e-07


In [140]:
# Train set
x_train = train.drop('Label',axis=1)
y_train = train.Label

# test set
x_test = test.drop('Label',axis=1)
y_test = test.Label

## 1. Naive Bayes

In [232]:
from sklearn.naive_bayes import GaussianNB

In [234]:
gnb_model = GaussianNB()
gnb_model.fit(x_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [235]:
labelnb = gnb_model.predict(x_test)
probnb = gnb_model.predict_proba(x_test)

In [256]:
nb=roc_auc_score(y_test,labelnb)
ac1=precision_score(y_test,labelnb)
print('The ROC of Naive Bayes: %5.3f' %(nb))

The ROC of Naive Bayes: 0.975


## 2. Logistic Regression

In [141]:
from sklearn.linear_model import LogisticRegression

In [142]:
LR_model = LogisticRegression(penalty ='l2')
LR_model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [143]:
label_predict=LR_model.predict(x_test)
prob_preidct=LR_model.predict_proba(x_test)

In [259]:
rl=roc_auc_score(y_test,label_predict)
ac2=precision_score(y_test,label_predict)
print('The ROC of Logistic Regression: %5.3f' %(rl))

The ROC of Logistic Regression: 0.954


## 3. Decision Tree and Random Forest

In [145]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [146]:
# Decision Tree
dt_model=DecisionTreeClassifier()

# Random Forest
rf_model=RandomForestClassifier()

In [147]:
# Decision Tree
dt_model.fit(x_train, y_train)

# Random Forest
rf_model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [148]:
# Decision Tree
labeldt=dt_model.predict(x_test)
probdt = dt_model.predict_proba(x_test)

# Random Forest
labelrf = rf_model.predict(x_test)
probrf = rf_model.predict_proba(x_test)

In [257]:
dt=roc_auc_score(y_test,labeldt)
ac3=precision_score(y_test,labeldt)
print('The ROC of Decision Tree: %5.3f' %(dt))
rf=roc_auc_score(y_test,labelrf)
ac4=precision_score(y_test,labelrf)
print('The ROC of Random Forest: %5.3f' %(rf))

The ROC of Decision Tree: 1.000
The ROC of Random Forest: 1.000


## 4. KNN

In [150]:
from sklearn.neighbors import KNeighborsClassifier

In [151]:
knn = KNeighborsClassifier()

In [152]:
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [153]:
labelknn = knn.predict(x_test)
probrknn = knn.predict_proba(x_test)

In [260]:
kn=roc_auc_score(y_test,labelknn)
ac5=precision_score(y_test,labelknn)
print('The ROC of KNN: %5.3f' %(kn))

The ROC of KNN: 0.940


## 5. SGDClassifier with Loss='log' 

In [155]:
from sklearn.linear_model import SGDClassifier

In [156]:
x_train = scaler.fit_transform(x_train)

x_test = scaler.transform(x_test)

In [157]:
SGDcl=SGDClassifier(loss='log',max_iter=1000, tol=1e-3,fit_intercept=False)  
SGDcl.fit(x_train,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=False,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [158]:
label_predictC=SGDcl.predict(x_test)
prob_preidctC=SGDcl.predict_proba(x_test)

In [261]:
cl=roc_auc_score(y_test,label_predictC)
ac6=precision_score(y_test,label_predictC)
print('The ROC of SGDClassifier: %5.3f' %(cl))

The ROC of SGDClassifier: 0.953


## 6. SVM

In [160]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

In [161]:
SVM_model = make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True))
SVM_model.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=True,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [162]:
labelsvm = SVM_model.predict(x_test)
probrsvm = SVM_model.predict_proba(x_test)

In [262]:
svm=roc_auc_score(y_test,labelsvm)
ac7=precision_score(y_test,labelsvm)
print('The ROC of SVM: %5.3f' %(svm))

The ROC of SVM: 1.000


## 7. MultiLayer Perceptron

In [164]:
from sklearn.neural_network import MLPClassifier

In [165]:
mlp_model = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10),activation='logistic', random_state=1,max_iter=3000)

In [166]:
mlp_model.fit(x_train, y_train)

MLPClassifier(activation='logistic', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=10, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=3000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [167]:
probmlp=mlp_model.predict_proba(x_test)
labelmlp=mlp_model.predict(x_test)

In [263]:
mlp=roc_auc_score(y_test,labelmlp)
ac8=precision_score(y_test,labelmlp)
print('The ROC of MLP: %5.3f' %(mlp))

The ROC of MLP: 1.000


## Show Results

In [266]:
score = [nb,rl,dt,rf,kn,cl,svm,mlp]
precision=[ac1,ac2,ac3,ac4,ac5,ac6,ac7,ac8]
index1= ['Naive Bayes','Logistic Regression','Decision Tree','Random Forest','K-Nearest Neighbor','SGDClassifier','SVM','Multilayer Perceptron']
result = pd.DataFrame(score,index=index1, columns=['ROC'])
result['Precision'] = precision
result

Unnamed: 0,ROC,Precision
Naive Bayes,0.974536,1.0
Logistic Regression,0.95436,0.95724
Decision Tree,1.0,1.0
Random Forest,1.0,1.0
K-Nearest Neighbor,0.940386,0.945135
SGDClassifier,0.953244,0.959268
SVM,1.0,1.0
Multilayer Perceptron,0.999915,0.999836


### Label the Given DataSet with Probability

Untill now **Random Forest** gives the higtest ROC score 0.7622

In [169]:
import pandas as pd

In [170]:
with open('teacher_test.txt','rb') as file:
    Test=pickle.load(file)

In [212]:
with open('ID.txt','rb') as file:
    ID=pickle.load(file)

In [173]:
def submission(probs, filename):
    with open(filename, 'w') as file:
        file.write('Id,Predicted\n')
        for i, p in zip(ID, probs):
            file.write("{},{}\n".format(i, p[1]))

In [217]:
# the models without transformed X
clf1 = [LR_model,dt_model,rf_model,knn,gnb_model]
prob1=[]
for clf in clf1:
    prob=clf.predict_proba(Test)
    prob1.append(prob)

In [224]:
submission(prob1[0], 'LR_submit.csv')
submission(prob1[1], 'DT_submit.csv')
submission(prob1[2], 'RF_submit.csv')
submission(prob1[3], 'KNN_submit.csv')
submission(prob1[4], 'NB_submit.csv')

In [225]:
# the models with transformed X
clf2=[SDGcl,SVM_model,mlp_model]
Test_=scaler.transform(Test)
prob2=[]
for clf in clf2:
    prob=clf.predict_proba(Test_)
    prob2.append(list(prob))

In [226]:
submission(prob2[0], 'SGD_submit.csv')
submission(prob2[1], 'SVM_submit.csv')
submission(prob2[2], 'MLP_submit.csv')

In [267]:
dd=pd.read_csv('SGD_submit.csv')
dd.head()

Unnamed: 0,Id,Predicted
0,1,0.000142
1,2,0.001889
2,3,0.081376
3,4,0.05945
4,5,0.002524
