In [None]:
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.metrics import roc_auc_score, precision_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the pickled training frame and hold out 20% of the rows for evaluation.
# NOTE(review): pickle.load can execute code embedded in the file -- only open trusted data.
with open('Train_10000.txt', 'rb') as fh:
    dataframe_train = pickle.load(fh)

train, test = train_test_split(
    dataframe_train,
    test_size=0.2,
    random_state=10,
)
# Created here, fitted later in the SGDClassifier section.
scaler = StandardScaler()

## Understanding the Distribution of the Training Dataset

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
# Silences every warning for the remainder of the kernel session.
warnings.filterwarnings('ignore')

# Columns for which at least one per-class distribution plot could not be drawn.
LI = []
for name in dataframe_train.columns:
    fig, ax = plt.subplots(figsize=(5, 4))
    # One overlaid distribution per class label (0 and 1).
    for group in [0, 1]:
        try:
            sns.distplot(
                dataframe_train.loc[dataframe_train.Label == group, name],
                kde=True,
                label=group,
                ax=ax,
            )
        except Exception:
            # Best effort: remember the column (once) and keep plotting the rest.
            # Unlike a bare `except:`, this still lets KeyboardInterrupt through.
            if name not in LI:
                LI.append(name)
    ax.set_ylabel('Total Count')
    ax.set_title(name)
    ax.legend()

## Delete Highly Correlated Features

In [None]:
# Before deletion: pairwise Pearson correlation of every column.
cor = dataframe_train.corr(method='pearson')

fig, ax = plt.subplots(figsize=(8, 10))
plt.title("Correlation Plot")
# The all-False mask hides no cell. `np.bool` was removed in NumPy 1.24; the
# builtin `bool` is the same dtype on every NumPy version.
sns.heatmap(
    cor,
    mask=np.zeros_like(cor, dtype=bool),
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=True,
    ax=ax,
)
plt.show()

In [None]:
# Columns removed from the training data: everything from position 6 up to (but
# not including) the last two columns, plus 'Shortest_path'.
# Other candidates once considered: 'Source_following', 'Sink_following', 'HD', 'AAI'.
dels = [*dataframe_train.columns[6:-2], 'Shortest_path']

In [None]:
# Columns excluded from the feature matrices. 'Label' is the target, so it is
# always removed -- whether or not the positional slice behind `dels` already
# contains it -- to rule out target leakage. dict.fromkeys de-duplicates while
# keeping the original order.
drop_cols = list(dict.fromkeys([*dels, 'Label']))

# Train set
X_train = train.drop(columns=drop_cols)
y_train = train.Label

# Test set (the held-out 20% of the training file)
X_test = test.drop(columns=drop_cols)
y_test = test.Label

In [None]:
# Correlation after deletion: Pearson correlation of the remaining training features.
cor = X_train.corr(method='pearson')
fig, ax = plt.subplots(figsize=(8, 10))
plt.title("Correlation Plot")
# The all-False mask hides no cell. `np.bool` was removed in NumPy 1.24; the
# builtin `bool` is the same dtype on every NumPy version.
sns.heatmap(
    cor,
    mask=np.zeros_like(cor, dtype=bool),
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=True,
    ax=ax,
)
plt.show()

## Feature Selection

Statistical feature selection; see:
https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif

In [None]:
# Number of features to keep. Passed as `k` to SelectKBest and as
# `n_features_to_select` to RFE further down.
k_= 'all' # 'all' makes SelectKBest keep every feature

In [None]:
# Score every feature against the label with the chi-squared statistic.
# NOTE(review): chi2 only accepts non-negative feature values -- confirm that
# holds for every remaining column.
fs = SelectKBest(score_func=chi2, k=k_)
fs.fit(X_train, y_train)

col = list(X_train.columns)
# Print the score of every feature with three decimals ('%3f' only set a
# minimum width and printed six decimals).
for col_name, chi2_score in zip(col, fs.scores_):
    print('%s: %.3f' % (col_name, chi2_score))
    

In [None]:
# Apply the fitted selector to both splits; the lowercase names hold the
# selected-feature versions used by the models below.
x_train, x_test = (fs.transform(split) for split in (X_train, X_test))

In [None]:
#x_train.shape

## 1. Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes with default settings, trained on the selected features.
gnb_model = GaussianNB()
gnb_model.fit(X=x_train, y=y_train)

In [None]:
# Hard class predictions and per-class probabilities on the held-out split.
labelnb, probnb = gnb_model.predict(x_test), gnb_model.predict_proba(x_test)

In [None]:
# ROC AUC is a ranking metric, so score it on the predicted probability of the
# positive class (second predict_proba column) rather than on hard 0/1 labels,
# which collapse the ROC curve to a single operating point.
nb = roc_auc_score(y_test, probnb[:, 1])
# Precision is threshold-based and keeps using the hard predictions.
ac1 = precision_score(y_test, labelnb)
print('The ROC of Naive Bayes: %5.3f' %(nb))

## 2. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Unregularised logistic regression, capped at 500 solver iterations.
# NOTE(review): newer scikit-learn releases spell this `penalty=None`; the
# string form is kept for the version this notebook was written against.
LR_model = LogisticRegression(max_iter=500, penalty ='none')
LR_model.fit(X=x_train, y=y_train)

In [None]:
# Hard class predictions and per-class probabilities on the held-out split.
label_predict, prob_preidct = LR_model.predict(x_test), LR_model.predict_proba(x_test)

In [None]:
# ROC AUC is a ranking metric, so score it on the predicted probability of the
# positive class (second predict_proba column) rather than on hard 0/1 labels.
rl = roc_auc_score(y_test, prob_preidct[:, 1])
# Precision is threshold-based and keeps using the hard predictions.
ac2 = precision_score(y_test, label_predict)
print('The ROC of Logistic Regression: %5.3f' %(rl))

## 3. Decision Tree and Random Forest

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Decision Tree: a single tree with library defaults.
dt_model = DecisionTreeClassifier()

# Random Forest: 200 entropy-split trees, 5 candidate features per split.
# NOTE(review): neither model sets random_state, so scores vary between runs.
rf_model = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_features=5,
)

In [None]:
# Train both tree-based models on the selected training features.
dt_model.fit(X=x_train, y=y_train)   # Decision Tree
rf_model.fit(X=x_train, y=y_train)   # Random Forest

In [None]:
# Hard predictions and class probabilities on the held-out split.
# Decision Tree
labeldt, probdt = dt_model.predict(x_test), dt_model.predict_proba(x_test)

# Random Forest
labelrf, probrf = rf_model.predict(x_test), rf_model.predict_proba(x_test)

In [None]:
# ROC AUC is a ranking metric, so score it on the predicted probability of the
# positive class (second predict_proba column) rather than on hard 0/1 labels.
dt = roc_auc_score(y_test, probdt[:, 1])
# Precision is threshold-based and keeps using the hard predictions.
ac3 = precision_score(y_test, labeldt)
print('The ROC of Decision Tree: %5.3f' %(dt))

In [None]:
# ROC AUC is a ranking metric, so score it on the predicted probability of the
# positive class (second predict_proba column) rather than on hard 0/1 labels.
rf = roc_auc_score(y_test, probrf[:, 1])
# Precision is threshold-based and keeps using the hard predictions.
ac4 = precision_score(y_test, labelrf)
print('The ROC of Random Forest: %5.3f' %(rf))

## 4. KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier that considers the 20 closest training points.
knn = KNeighborsClassifier(n_neighbors=20)

In [None]:
# Train on the selected (still unscaled) training features.
knn.fit(X=x_train, y=y_train)

In [None]:
# Hard class predictions and per-class probabilities on the held-out split.
labelknn, probrknn = knn.predict(x_test), knn.predict_proba(x_test)

In [None]:
# ROC AUC is a ranking metric, so score it on the predicted probability of the
# positive class (second predict_proba column) rather than on hard 0/1 labels.
kn = roc_auc_score(y_test, probrknn[:, 1])
# Precision is threshold-based and keeps using the hard predictions.
ac5 = precision_score(y_test, labelknn)
print('The ROC of KNN: %5.3f' %(kn))

## 5. SGDClassifier with Loss='log' 

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Standardise the features for the models below. The scaler's statistics are
# learned from the training split only and then reused for the held-out split.
# NOTE: from here on x_train / x_test hold the *scaled* arrays.
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Linear classifier trained by SGD on the logistic loss (so predict_proba works).
# NOTE(review): newer scikit-learn releases rename this loss to 'log_loss'; the
# original spelling is kept for the version this notebook was written against.
SGDcl = SGDClassifier(
    loss='log',
    max_iter=1000,
    tol=1e-3,
)
SGDcl.fit(X=x_train, y=y_train)

In [None]:
# Hard class predictions and per-class probabilities on the scaled held-out split.
label_predictC, prob_preidctC = SGDcl.predict(x_test), SGDcl.predict_proba(x_test)

In [None]:
# ROC AUC is a ranking metric, so score it on the predicted probability of the
# positive class (second predict_proba column) rather than on hard 0/1 labels.
cl = roc_auc_score(y_test, prob_preidctC[:, 1])
# Precision is threshold-based and keeps using the hard predictions.
ac6 = precision_score(y_test, label_predictC)
print('The ROC of SGDClassifier: %5.3f' %(cl))

## 6. SVM 

In [None]:
from sklearn.svm import SVC,SVR
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
# Linear-kernel support-vector regressor. The SVM cells in this section do not
# read it; the RFE section creates the same estimator again before using it.
estimator = SVR(kernel="linear")

In [None]:
# Support-vector classifier with gamma='scale'; probability=True enables
# predict_proba. Trained on the standardised features.
SVM_model = SVC(probability=True, gamma='scale')
SVM_model.fit(X=x_train, y=y_train)

In [None]:
# Hard class predictions and per-class probabilities on the scaled held-out split.
labelsvm, probrsvm = SVM_model.predict(x_test), SVM_model.predict_proba(x_test)

In [None]:
# ROC AUC is a ranking metric, so score it on the predicted probability of the
# positive class (second predict_proba column) rather than on hard 0/1 labels.
svm = roc_auc_score(y_test, probrsvm[:, 1])
# Precision is threshold-based and keeps using the hard predictions.
ac7 = precision_score(y_test, labelsvm)
print('The ROC of SVM: %5.3f' %(svm))

## 7. MultiLayer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# One hidden layer of 10 logistic units, trained with Adam.
# `(10)` is just the integer 10; `(10,)` is the one-element tuple the
# parameter is meant to receive (one entry per hidden layer).
mlp_model = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    hidden_layer_sizes=(10,),
    activation='logistic',
    random_state=1,
    max_iter=3000,
)

In [None]:
# Train on the standardised training features.
mlp_model.fit(X=x_train, y=y_train)

In [None]:
# Per-class probabilities and hard class predictions on the scaled held-out split.
probmlp, labelmlp = mlp_model.predict_proba(x_test), mlp_model.predict(x_test)

In [None]:
# ROC AUC is a ranking metric, so score it on the predicted probability of the
# positive class (second predict_proba column) rather than on hard 0/1 labels.
mlp = roc_auc_score(y_test, probmlp[:, 1])
# Precision is threshold-based and keeps using the hard predictions.
ac8 = precision_score(y_test, labelmlp)
print('The ROC of MLP: %5.3f' %(mlp))

## 8. RFE Model

In [None]:
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
# Linear-kernel SVR handed to RFE below as the estimator it wraps.
estimator = SVR(kernel="linear")

In [None]:
# RFE expects a numeric feature count, whereas `k_` may hold the SelectKBest
# sentinel 'all'; translate that sentinel into "keep every column".
n_rfe_features = X_train.shape[1] if k_ == 'all' else k_
RFE_model = RFE(estimator, n_features_to_select=n_rfe_features, step=1)
# Fitted on the unscaled, pre-selection feature frame.
RFE_model.fit(X_train, y_train)

In [None]:
# Generate the label prediction.
# RFE_model was fitted on the unscaled X_train frame, so it has to be scored on
# the matching X_test frame; `x_test` has been standardised by this point.
labelrfe = RFE_model.predict(X_test)
# The wrapped SVR returns real-valued scores: threshold at 0.5 for 0/1 labels.
ll = [0 if value <= 0.5 else 1 for value in labelrfe]

In [None]:
# ROC AUC is a ranking metric: use the raw regression scores, not the
# thresholded labels, which collapse the curve to one operating point.
rfe = roc_auc_score(y_test, labelrfe)
# Precision is threshold-based and uses the 0/1 labels.
ac9 = precision_score(y_test, ll)
print('The ROC score: %5.3f' %(rfe))

## Show Results

In [None]:
# Collect the held-out metrics of every model into one table (row order = index1).
index1 = [
    'Naive Bayes',
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'K-Nearest Neighbor',
    'SGDClassifier',
    'SVM',
    'Multilayer Perceptron',
    'Recursive Feature Elimination',
]
score = [nb, rl, dt, rf, kn, cl, svm, mlp, rfe]
precision = [ac1, ac2, ac3, ac4, ac5, ac6, ac7, ac8, ac9]
result = pd.DataFrame({'ROC': score, 'Precision': precision}, index=index1)
result

## Labelling the Given Dataset with Probability

Until now, **Random Forest** gives the highest ROC score: 0.7622

In [None]:
import pandas as pd

In [None]:
# Load the pickled dataset that is to be labelled.
# NOTE: this rebinds `test`, which until now held the 20% hold-out split.
with open('Test_22.txt', 'rb') as fh:
    test = pickle.load(fh)

In [None]:
# Deleted features: columns from position 6 up to (not including) the last two,
# plus 'Shortest_path'.
# NOTE: not idempotent -- re-running this cell slices the already-reduced frame.
delt = [*test.columns[6:-2], 'Shortest_path']
test = test.drop(columns=delt)

In [None]:
# Run the external data through the selector fitted on the training features.
Test = fs.transform(X=test)
Test.shape

In [None]:
# Identifiers written to the first column of every submission file.
with open('ID.txt', 'rb') as fh:
    ID = pickle.load(fh)

In [None]:
def submission(probs, filename):
    """Write an ``Id,Predicted`` CSV to ``filename``.

    Each entry of the notebook-level ``ID`` is paired, in order, with the
    element at index 1 of the matching row of ``probs``. Pairing stops at the
    shorter of the two sequences.
    """
    with open(filename, 'w') as out:
        out.write('Id,Predicted\n')
        out.writelines(
            "{},{}\n".format(row_id, row[1]) for row_id, row in zip(ID, probs)
        )

In [None]:
def submission1(probs, filename):
    """Write an ``Id,Predicted`` CSV to ``filename``.

    Each entry of the notebook-level ``ID`` is paired, in order, with the
    matching entry of ``probs``, which is written as-is. Pairing stops at the
    shorter of the two sequences.
    """
    with open(filename, 'w') as out:
        out.write('Id,Predicted\n')
        out.writelines(
            "{},{}\n".format(row_id, value) for row_id, value in zip(ID, probs)
        )

In [None]:
# Models trained on the unscaled features: score the external data as-is.
clf1 = [LR_model, dt_model, rf_model, knn, gnb_model]
prob1 = [clf.predict_proba(Test) for clf in clf1]

In [None]:
# One submission file per model, in the same order as `clf1`.
unscaled_outputs = [
    'LR_submit.csv',
    'DT_submit.csv',
    'RF_submit.csv',
    'KNN_submit.csv',
    'NB_submit.csv',
]
for position, out_name in enumerate(unscaled_outputs):
    submission(prob1[position], out_name)

In [None]:
# Models trained on standardised features: apply the fitted scaler first.
clf2 = [SGDcl, SVM_model, mlp_model]
Test_ = scaler.transform(Test)
prob2 = [list(clf.predict_proba(Test_)) for clf in clf2]

In [None]:
# One submission file per model, in the same order as `clf2`.
scaled_outputs = ['SGD_submit.csv', 'SVM_submit.csv', 'MLP_submit.csv']
for position, out_name in enumerate(scaled_outputs):
    submission(prob2[position], out_name)

In [None]:
#submission1(probrfe, 'RFE_submit.csv')

In [None]:
# Submission files in the same row order as the `result` table.
csvs = [
    'NB_submit.csv',
    'LR_submit.csv',
    'DT_submit.csv',
    'RF_submit.csv',
    'KNN_submit.csv',
    'SGD_submit.csv',
    'SVM_submit.csv',
    'MLP_submit.csv',
    'RFE_submit.csv',
]
P = []  # rows with Predicted > 0.5, per file
N = []  # rows with Predicted <= 0.5, per file
for path in csvs:
    try:
        predicted = pd.read_csv(path).Predicted
    except FileNotFoundError:
        # A file that was never written (the RFE submission call is commented
        # out above) is reported as missing instead of aborting the whole cell.
        P.append(float('nan'))
        N.append(float('nan'))
        continue
    negatives = int((predicted <= 0.5).sum())
    # Everything that is not <= 0.5 counts as positive, as in the manual loop.
    P.append(len(predicted) - negatives)
    N.append(negatives)
result['Positive'] = P
result['Negative'] = N
result

## Self Test on Different Dataset

In [None]:
# Load a separate labelled dataset, draw a reproducible 2000-row sample from it
# and remove the same columns that were removed from the external test data.
with open('total_5850.txt', 'rb') as fh:
    self_test = pickle.load(fh)
self_test_sample = self_test.sample(n=2000, random_state=1).drop(columns=delt)

In [None]:
# Separate the target from the features of the self-test sample.
self_test_sample_label = self_test_sample.Label
self_test_sample = self_test_sample.drop(columns=['Label'])

In [None]:
# Apply the selector fitted on the training features (rebinds the name to the result).
self_test_sample = fs.transform(X=self_test_sample)

In [None]:
# Evaluate every model on the self-test sample in the SAME order as `index1`
# (Naive Bayes first, Random Forest once), so the rows of the summary table
# line up with their labels.
unscaled_models = [gnb_model, LR_model, dt_model, rf_model, knn]
scaled_models = [SGDcl, SVM_model, mlp_model]
model = unscaled_models + scaled_models

# SGD, SVM and MLP were trained on standardised features, so they must be
# given inputs passed through the same fitted scaler.
self_test_scaled = scaler.transform(self_test_sample)
model_inputs = (
    [self_test_sample] * len(unscaled_models)
    + [self_test_scaled] * len(scaled_models)
)

one = []  # ROC AUC per model
two = []  # precision per model
for mod, features in zip(model, model_inputs):
    labelrf_self = mod.predict(features)
    probrf_self = mod.predict_proba(features)
    # AUC is a ranking metric: use the positive-class probability, not hard labels.
    rf_self = roc_auc_score(self_test_sample_label, probrf_self[:, 1])
    ac4_self = precision_score(self_test_sample_label, labelrf_self)
    one.append(rf_self)
    two.append(ac4_self)

# Ninth row of `index1`: RFE wraps an SVR, so it has no predict_proba; rank with
# its raw scores and threshold at 0.5 for precision.
# NOTE(review): assumes the selected columns equal the columns RFE was fitted
# on, which holds while k_ == 'all' -- confirm if k_ is changed.
rfe_scores_self = RFE_model.predict(self_test_sample)
one.append(roc_auc_score(self_test_sample_label, rfe_scores_self))
two.append(precision_score(
    self_test_sample_label,
    [0 if value <= 0.5 else 1 for value in rfe_scores_self],
))

In [None]:
# Side-by-side comparison: metrics on the hold-out split of the training file
# ("_Train") versus the separate self-test sample ("_Test").
rult = pd.DataFrame(
    {
        'ROC_Train': score,
        'ROC_Test': one,
        'Precision_Train': precision,
        'Precision_Test': two,
    },
    index=index1,
)
rult