## Feature Selection using Random Shuffling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score

## Read Data

In [2]:
data = pd.read_csv('../DoHBrwTest.csv')
data.shape

(53860, 35)

In [3]:
data.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,is_intrusion
0,7.0,2.0,52742,443,18355.0,0.046455,55,1183.941449,66,1420.729738,...,1.0,0.0,0.0,0.046455,0.046455,0.046455,-10.0,-10.0,0.0,0
1,7.0,2.0,54640,443,18365.0,96.750105,42044,434.562836,44920,464.288902,...,0.561877,0.0017,0.041234,0.033549,0.026931,0.026952,0.481463,0.159978,1.229096,0
2,7.0,2.0,56611,443,18373.0,96.365606,41539,431.056284,44577,462.582054,...,0.549156,0.000956,0.030926,0.026551,0.026848,0.026879,-0.028834,-0.010614,1.164778,0
3,7.0,2.0,56611,443,18374.0,121.35682,60659,499.840058,67897,559.48236,...,0.55657,0.001013,0.031829,0.027571,0.026862,0.026941,0.066819,0.019791,1.154439,0
4,7.0,2.0,56611,443,18375.0,104.669253,30409,290.524668,30718,293.476825,...,0.331633,0.001226,0.035013,0.029797,0.026867,0.026908,0.251063,0.082517,1.175049,0


### Train - Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),
    data['is_intrusion'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((37702, 34), (16158, 34))

In [5]:
# Reset the indexes of the returned datasets
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

### Train ML algo with all features

In [6]:
rf = RandomForestClassifier(
    n_estimators=50, max_depth=2, random_state=2909, n_jobs=4)

rf.fit(X_train, y_train)

# print roc-auc in train and testing sets
print('train auc score: ',
      roc_auc_score(y_train, (rf.predict_proba(X_train.fillna(0)))[:, 1]))
print('test auc score: ',
      roc_auc_score(y_test, (rf.predict_proba(X_test.fillna(0)))[:, 1]))

train auc score:  0.9992995804653517
test auc score:  0.999453661981075


### Shuffling resources and assessing performance loss

In [7]:
# overall train roc-auc: using all the features
train_roc = roc_auc_score(y_train, (rf.predict_proba(X_train))[:, 1])

# list to capture the performance shift
performance_shift = []

# selection  logic
for feature in X_train.columns:

    X_train_c = X_train.copy()

    # shuffle individual feature
    X_train_c[feature] = X_train_c[feature].sample(
        frac=1, random_state=10).reset_index(drop=True)

    # make prediction with shuffled feature and calculate roc-auc
    shuff_roc = roc_auc_score(y_train, rf.predict_proba(X_train_c)[:, 1])
    
    drift = train_roc - shuff_roc

    # save the drop in roc-auc
    performance_shift.append(drift)

In [8]:
# list of performances
performance_shift

[0.0003651522014044284,
 1.7772712931352075e-06,
 4.581867033226494e-06,
 5.383180101736773e-05,
 0.021139722576375197,
 -4.742129646861937e-05,
 -3.862739920335034e-05,
 5.1540867500809995e-05,
 0.0007866223289872964,
 4.70514596684346e-06,
 6.903620283138512e-05,
 5.167441967879327e-06,
 -3.30284809676451e-05,
 0.00011411519956094907,
 0.00018739425235803076,
 5.537278768841247e-06,
 4.355855654947227e-06,
 1.9991733736679507e-05,
 7.781982685239885e-05,
 -0.0002581974532379627,
 -5.250655248079106e-05,
 -5.547552013096535e-06,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -2.4563327525317646e-05,
 3.353186994670132e-05,
 3.175459865356611e-05,
 0.0,
 0.0,
 2.9997873850495793e-06,
 -1.94677982685576e-05]

In [9]:
# Transform the list into a pandas Series for easy manipulation
feature_importance = pd.Series(performance_shift)

# add variable names in the index
feature_importance.index = X_train.columns

feature_importance.head()

SourceIP           0.000365
DestinationIP      0.000002
SourcePort         0.000005
DestinationPort    0.000054
TimeStamp          0.021140
dtype: float64

In [10]:
# Sort the dataframe according to the drop in performance # caused by feature shuffling
feature_importance.sort_values(ascending=False)

TimeStamp                                 0.021140
FlowBytesReceived                         0.000787
SourceIP                                  0.000365
PacketLengthMode                          0.000187
PacketLengthMedian                        0.000114
PacketTimeVariance                        0.000078
PacketLengthVariance                      0.000069
DestinationPort                           0.000054
FlowSentRate                              0.000052
ResponseTimeTimeMean                      0.000034
ResponseTimeTimeMedian                    0.000032
PacketLengthCoefficientofVariation        0.000020
PacketLengthSkewFromMedian                0.000006
PacketLengthStandardDeviation             0.000005
FlowReceivedRate                          0.000005
SourcePort                                0.000005
PacketLengthSkewFromMode                  0.000004
ResponseTimeTimeSkewFromMode              0.000003
DestinationIP                             0.000002
PacketTimeSkewFromMode         

In [11]:
# List the top 10 features that caused the major drop in the roc-auc (aka model performance)

feature_importance.sort_values(ascending=False).head(10)

TimeStamp               0.021140
FlowBytesReceived       0.000787
SourceIP                0.000365
PacketLengthMode        0.000187
PacketLengthMedian      0.000114
PacketTimeVariance      0.000078
PacketLengthVariance    0.000069
DestinationPort         0.000054
FlowSentRate            0.000052
ResponseTimeTimeMean    0.000034
dtype: float64

In [12]:
# original number of features (rows in this case)
feature_importance.shape[0]

34

In [13]:
# number of features that cause a drop in performance when shuffled

feature_importance[feature_importance>0].shape[0]

19

23 out of the 41 features caused a drop in the performance of the random forests when their values were permuted. This means that we could select those features and discard the rest, and should keep the original random forest performance. 

In [14]:
# print the important features

feature_importance[feature_importance>0].index

Index(['SourceIP', 'DestinationIP', 'SourcePort', 'DestinationPort',
       'TimeStamp', 'FlowSentRate', 'FlowBytesReceived', 'FlowReceivedRate',
       'PacketLengthVariance', 'PacketLengthStandardDeviation',
       'PacketLengthMedian', 'PacketLengthMode', 'PacketLengthSkewFromMedian',
       'PacketLengthSkewFromMode', 'PacketLengthCoefficientofVariation',
       'PacketTimeVariance', 'ResponseTimeTimeMean', 'ResponseTimeTimeMedian',
       'ResponseTimeTimeSkewFromMode'],
      dtype='object')

### Select features

In [15]:
# Building a random forests only with the selected features capture the selected features

selected_features = feature_importance[feature_importance > 0].index

# train a new random forests using only the selected features
rf = RandomForestClassifier(n_estimators=50,
                            max_depth=2,
                            random_state=2909,
                            n_jobs=4)

rf.fit(X_train[selected_features], y_train)

# print roc-auc in train and testing sets
print('train auc score: ', roc_auc_score(y_train, (rf.predict_proba(X_train[selected_features]))[:,1]))
print('test auc score: ', roc_auc_score(y_test, (rf.predict_proba(X_test[selected_features]))[:,1]))

train auc score:  0.9999999794535109
test auc score:  1.0


The random forests with the selected features show a similar performance (or even slightly higher) to the random forests built using all of the features. And it provides a simpler, faster and more reliable model.

In [16]:
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [17]:
X_train.shape, X_test.shape

((37702, 19), (16158, 19))

## Standardize Data




In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers




In [22]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation




In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression





In [24]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1).fit(X_train, y_train)

CPU times: user 73 ms, sys: 202 ms, total: 275 ms
Wall time: 2.65 s


In [25]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.7216114308394522
F1 Score: 0.7551661547053895
FPR: 0.4542759745071884
TPR: 0.9244509016323391


### Naive Bayes





In [26]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-09).fit(X_train, y_train)

CPU times: user 40.4 ms, sys: 9.21 ms, total: 49.6 ms
Wall time: 47.8 ms


In [27]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
FPR: 1.0
TPR: 1.0


### Random Forest




In [29]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 5.17 s, sys: 46.4 ms, total: 5.21 s
Wall time: 5.21 s


In [30]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
FPR: 1.0
TPR: 1.0


### KNN





In [31]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance').fit(X_train, y_train)

CPU times: user 8.38 ms, sys: 1.34 ms, total: 9.72 ms
Wall time: 8.01 ms


In [32]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.22909307402262355
f1: 0.32070786556150105
fpr: 0.912034978508967
tpr: 0.39184685069652164


### CatBoost




In [33]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.5804378	total: 72.9ms	remaining: 3.57s
1:	learn: 0.4787832	total: 90.5ms	remaining: 2.17s
2:	learn: 0.4169377	total: 109ms	remaining: 1.71s
3:	learn: 0.3578408	total: 126ms	remaining: 1.45s
4:	learn: 0.3069717	total: 145ms	remaining: 1.3s
5:	learn: 0.2680144	total: 163ms	remaining: 1.19s
6:	learn: 0.2247380	total: 182ms	remaining: 1.11s
7:	learn: 0.1932684	total: 200ms	remaining: 1.05s
8:	learn: 0.1690601	total: 219ms	remaining: 996ms
9:	learn: 0.1471479	total: 237ms	remaining: 948ms
10:	learn: 0.1325914	total: 255ms	remaining: 904ms
11:	learn: 0.1198930	total: 273ms	remaining: 863ms
12:	learn: 0.1046867	total: 292ms	remaining: 830ms
13:	learn: 0.0950989	total: 310ms	remaining: 796ms
14:	learn: 0.0847987	total: 326ms	remaining: 762ms
15:	learn: 0.0781709	total: 344ms	remaining: 731ms
16:	learn: 0.0737168	total: 363ms	remaining: 705ms
17:	learn: 0.0699039	total: 381ms	remaining: 678ms
18:	learn: 0.0663385	total: 400ms	remaining: 652ms
19:	learn: 0.0633386	total: 417ms	remain

In [34]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
FPR: 1.0
TPR: 1.0


## Model Evaluation





In [35]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../KDDTest.csv")
test_df.shape

(22543, 42)

In [36]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion'])

In [37]:
X_eval = X_eval[selected_features]

In [38]:
X_eval.shape

(22543, 23)

### Model Evaluation - Logistic Regression





In [39]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=0.1, n_jobs=-1, random_state=42)

In [40]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [41]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9541958978735228
Testing accuracy is  0.7216114308394522


In [42]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.7216114308394522
F1 Score: 0.7551661547053895
Precision Score: 0.6382840620758836
Recall Score: 0.9244509016323391
Confusion Matrix:
 [[ 7364  6130]
 [  884 10817]]


### Cross validation - Logistic Regression





In [43]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.82602 (+/- 0.01807)
F1 Score: 0.84294 (+/- 0.01758)
Precision: 0.86717 (+/- 0.03095)
Recall: 0.82060 (+/- 0.04153)


### Model Evaluation - Naive Bayes





In [44]:
modelNB = GaussianNB(var_smoothing=1e-09)
modelNB.fit(X_train, y_train)

GaussianNB()

In [45]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [46]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.9129464064221002
Testing accuracy is  0.46441754316332606


In [47]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
Precision Score: 0.46441754316332606
Recall Score: 1.0
Confusion Matrix:
 [[    0 13494]
 [    0 11701]]


### Cross validation - Naive Bayes






In [49]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.78619 (+/- 0.03889)
F1 Score: 0.79034 (+/- 0.04827)
Precision: 0.89172 (+/- 0.02340)
Recall: 0.71135 (+/- 0.09363)


### Model Evaluation - Random Forest




In [52]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [53]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [54]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  0.9999702313027774
Testing accuracy is  0.5355824568366739


In [55]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
Precision Score: 0.28684856807120773
Recall Score: 0.5355824568366739
Confusion Matrix:
 [[13494     0]
 [11701     0]]


### Cross validation - Random Forest




In [56]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98620 (+/- 0.00378)
F1 Score: 0.98788 (+/- 0.00331)
Precision: 0.98808 (+/- 0.00453)
Recall: 0.98769 (+/- 0.00367)


### Model Evaluation - KNN




In [57]:
modelKNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=1, n_neighbors=2,
                     weights='distance')

In [58]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [59]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9999702313027774
Testing accuracy is  0.22909307402262355


In [60]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.22909307402262355
F1 Score: 0.32070786556150105
Precision Score: 0.27143026284631777
Recall Score: 0.39184685069652164
Confusion Matrix:
 [[ 1187 12307]
 [ 7116  4585]]


### Cross validation - KNN




In [61]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.97742 (+/- 0.00705)
F1 Score: 0.98016 (+/- 0.00613)
Precision: 0.98053 (+/- 0.01008)
Recall: 0.97982 (+/- 0.00533)


### Model Evaluation - CatBoost





In [62]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.5804378	total: 18.7ms	remaining: 914ms
1:	learn: 0.4787832	total: 36.6ms	remaining: 879ms
2:	learn: 0.4169377	total: 54.3ms	remaining: 851ms
3:	learn: 0.3578408	total: 70.5ms	remaining: 811ms
4:	learn: 0.3069717	total: 88.1ms	remaining: 793ms
5:	learn: 0.2680144	total: 105ms	remaining: 772ms
6:	learn: 0.2247380	total: 123ms	remaining: 757ms
7:	learn: 0.1932684	total: 142ms	remaining: 744ms
8:	learn: 0.1690601	total: 159ms	remaining: 726ms
9:	learn: 0.1471479	total: 177ms	remaining: 708ms
10:	learn: 0.1325914	total: 194ms	remaining: 688ms
11:	learn: 0.1198930	total: 212ms	remaining: 671ms
12:	learn: 0.1046867	total: 230ms	remaining: 655ms
13:	learn: 0.0950989	total: 247ms	remaining: 636ms
14:	learn: 0.0847987	total: 264ms	remaining: 616ms
15:	learn: 0.0781709	total: 281ms	remaining: 597ms
16:	learn: 0.0737168	total: 298ms	remaining: 579ms
17:	learn: 0.0699039	total: 314ms	remaining: 559ms
18:	learn: 0.0663385	total: 332ms	remaining: 541ms
19:	learn: 0.0633386	total: 349ms	re

<catboost.core.CatBoostClassifier at 0x7f80f06a3460>

In [63]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [64]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.995901842682358
Testing accuracy is  0.5355824568366739


In [65]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
Precision Score: 0.28684856807120773
Recall Score: 0.5355824568366739
Confusion Matrix:
 [[13494     0]
 [11701     0]]


### Cross validation - CatBoost






In [67]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6099204	total: 11.5ms	remaining: 564ms
1:	learn: 0.5389880	total: 23ms	remaining: 553ms
2:	learn: 0.4829256	total: 34.4ms	remaining: 538ms
3:	learn: 0.4245444	total: 45.7ms	remaining: 525ms
4:	learn: 0.3939788	total: 56.9ms	remaining: 512ms
5:	learn: 0.3532837	total: 68.3ms	remaining: 501ms
6:	learn: 0.3251590	total: 79.5ms	remaining: 488ms
7:	learn: 0.2990027	total: 90.1ms	remaining: 473ms
8:	learn: 0.2801524	total: 101ms	remaining: 462ms
9:	learn: 0.2551476	total: 113ms	remaining: 450ms
10:	learn: 0.2365535	total: 124ms	remaining: 439ms
11:	learn: 0.2210805	total: 135ms	remaining: 426ms
12:	learn: 0.2091443	total: 146ms	remaining: 416ms
13:	learn: 0.1976915	total: 157ms	remaining: 404ms
14:	learn: 0.1860839	total: 169ms	remaining: 393ms
15:	learn: 0.1753556	total: 180ms	remaining: 383ms
16:	learn: 0.1664201	total: 192ms	remaining: 372ms
17:	learn: 0.1552737	total: 203ms	remaining: 360ms
18:	learn: 0.1467833	total: 214ms	remaining: 349ms
19:	learn: 0.1401141	total: 224ms	r

18:	learn: 0.1441059	total: 213ms	remaining: 348ms
19:	learn: 0.1373312	total: 224ms	remaining: 336ms
20:	learn: 0.1310751	total: 235ms	remaining: 324ms
21:	learn: 0.1234993	total: 246ms	remaining: 313ms
22:	learn: 0.1193641	total: 257ms	remaining: 301ms
23:	learn: 0.1151547	total: 268ms	remaining: 290ms
24:	learn: 0.1117801	total: 278ms	remaining: 278ms
25:	learn: 0.1074485	total: 289ms	remaining: 267ms
26:	learn: 0.1049006	total: 300ms	remaining: 256ms
27:	learn: 0.1023182	total: 311ms	remaining: 244ms
28:	learn: 0.0999359	total: 322ms	remaining: 233ms
29:	learn: 0.0975297	total: 333ms	remaining: 222ms
30:	learn: 0.0951438	total: 344ms	remaining: 211ms
31:	learn: 0.0927897	total: 356ms	remaining: 200ms
32:	learn: 0.0902388	total: 367ms	remaining: 189ms
33:	learn: 0.0870094	total: 379ms	remaining: 178ms
34:	learn: 0.0846120	total: 390ms	remaining: 167ms
35:	learn: 0.0833646	total: 402ms	remaining: 156ms
36:	learn: 0.0815325	total: 413ms	remaining: 145ms
37:	learn: 0.0801510	total: 424

29:	learn: 0.0994253	total: 348ms	remaining: 232ms
30:	learn: 0.0962219	total: 360ms	remaining: 220ms
31:	learn: 0.0937515	total: 370ms	remaining: 208ms
32:	learn: 0.0914156	total: 382ms	remaining: 197ms
33:	learn: 0.0893254	total: 393ms	remaining: 185ms
34:	learn: 0.0869698	total: 404ms	remaining: 173ms
35:	learn: 0.0853424	total: 414ms	remaining: 161ms
36:	learn: 0.0837277	total: 426ms	remaining: 150ms
37:	learn: 0.0819006	total: 437ms	remaining: 138ms
38:	learn: 0.0799357	total: 448ms	remaining: 126ms
39:	learn: 0.0783811	total: 459ms	remaining: 115ms
40:	learn: 0.0771800	total: 470ms	remaining: 103ms
41:	learn: 0.0758937	total: 481ms	remaining: 91.6ms
42:	learn: 0.0745731	total: 492ms	remaining: 80.1ms
43:	learn: 0.0732704	total: 503ms	remaining: 68.6ms
44:	learn: 0.0725616	total: 514ms	remaining: 57.1ms
45:	learn: 0.0713997	total: 525ms	remaining: 45.7ms
46:	learn: 0.0704370	total: 537ms	remaining: 34.3ms
47:	learn: 0.0700840	total: 548ms	remaining: 22.8ms
48:	learn: 0.0693889	tot

48:	learn: 0.0663366	total: 544ms	remaining: 11.1ms
49:	learn: 0.0655018	total: 555ms	remaining: 0us
0:	learn: 0.6099204	total: 11.7ms	remaining: 574ms
1:	learn: 0.5389880	total: 23.3ms	remaining: 560ms
2:	learn: 0.4829256	total: 34.5ms	remaining: 540ms
3:	learn: 0.4245444	total: 45.4ms	remaining: 522ms
4:	learn: 0.3939788	total: 56.7ms	remaining: 510ms
5:	learn: 0.3532837	total: 67.9ms	remaining: 498ms
6:	learn: 0.3251590	total: 79.6ms	remaining: 489ms
7:	learn: 0.2990027	total: 90.6ms	remaining: 476ms
8:	learn: 0.2801524	total: 102ms	remaining: 463ms
9:	learn: 0.2551476	total: 113ms	remaining: 450ms
10:	learn: 0.2365535	total: 124ms	remaining: 440ms
11:	learn: 0.2210805	total: 135ms	remaining: 428ms
12:	learn: 0.2091443	total: 147ms	remaining: 418ms
13:	learn: 0.1976915	total: 158ms	remaining: 405ms
14:	learn: 0.1860839	total: 170ms	remaining: 396ms
15:	learn: 0.1753556	total: 181ms	remaining: 385ms
16:	learn: 0.1664201	total: 193ms	remaining: 374ms
17:	learn: 0.1552737	total: 204ms	

9:	learn: 0.2504151	total: 120ms	remaining: 480ms
10:	learn: 0.2295737	total: 132ms	remaining: 466ms
11:	learn: 0.2091229	total: 143ms	remaining: 454ms
12:	learn: 0.1956858	total: 156ms	remaining: 444ms
13:	learn: 0.1857945	total: 169ms	remaining: 433ms
14:	learn: 0.1757594	total: 181ms	remaining: 422ms
15:	learn: 0.1664076	total: 192ms	remaining: 408ms
16:	learn: 0.1585034	total: 204ms	remaining: 395ms
17:	learn: 0.1484996	total: 215ms	remaining: 382ms
18:	learn: 0.1441059	total: 226ms	remaining: 369ms
19:	learn: 0.1373312	total: 237ms	remaining: 356ms
20:	learn: 0.1310751	total: 248ms	remaining: 343ms
21:	learn: 0.1234993	total: 260ms	remaining: 331ms
22:	learn: 0.1193641	total: 271ms	remaining: 318ms
23:	learn: 0.1151547	total: 283ms	remaining: 306ms
24:	learn: 0.1117801	total: 294ms	remaining: 294ms
25:	learn: 0.1074485	total: 306ms	remaining: 282ms
26:	learn: 0.1049006	total: 317ms	remaining: 270ms
27:	learn: 0.1023182	total: 328ms	remaining: 258ms
28:	learn: 0.0999359	total: 339m

28:	learn: 0.1018875	total: 325ms	remaining: 235ms
29:	learn: 0.0994253	total: 336ms	remaining: 224ms
30:	learn: 0.0962219	total: 347ms	remaining: 213ms
31:	learn: 0.0937515	total: 358ms	remaining: 201ms
32:	learn: 0.0914156	total: 369ms	remaining: 190ms
33:	learn: 0.0893254	total: 381ms	remaining: 179ms
34:	learn: 0.0869698	total: 392ms	remaining: 168ms
35:	learn: 0.0853424	total: 403ms	remaining: 157ms
36:	learn: 0.0837277	total: 414ms	remaining: 145ms
37:	learn: 0.0819006	total: 425ms	remaining: 134ms
38:	learn: 0.0799357	total: 436ms	remaining: 123ms
39:	learn: 0.0783811	total: 447ms	remaining: 112ms
40:	learn: 0.0771800	total: 458ms	remaining: 101ms
41:	learn: 0.0758937	total: 470ms	remaining: 89.4ms
42:	learn: 0.0745731	total: 481ms	remaining: 78.2ms
43:	learn: 0.0732704	total: 491ms	remaining: 67ms
44:	learn: 0.0725616	total: 502ms	remaining: 55.8ms
45:	learn: 0.0713997	total: 513ms	remaining: 44.6ms
46:	learn: 0.0704370	total: 524ms	remaining: 33.5ms
47:	learn: 0.0700840	total:

40:	learn: 0.0746947	total: 460ms	remaining: 101ms
41:	learn: 0.0732493	total: 471ms	remaining: 89.8ms
42:	learn: 0.0717308	total: 482ms	remaining: 78.5ms
43:	learn: 0.0710176	total: 493ms	remaining: 67.2ms
44:	learn: 0.0702863	total: 505ms	remaining: 56.1ms
45:	learn: 0.0691793	total: 516ms	remaining: 44.8ms
46:	learn: 0.0679866	total: 527ms	remaining: 33.6ms
47:	learn: 0.0669607	total: 538ms	remaining: 22.4ms
48:	learn: 0.0663366	total: 549ms	remaining: 11.2ms
49:	learn: 0.0655018	total: 559ms	remaining: 0us
0:	learn: 0.6099204	total: 11.4ms	remaining: 559ms
1:	learn: 0.5389880	total: 22.6ms	remaining: 544ms
2:	learn: 0.4829256	total: 33.9ms	remaining: 531ms
3:	learn: 0.4245444	total: 44.8ms	remaining: 516ms
4:	learn: 0.3939788	total: 56ms	remaining: 504ms
5:	learn: 0.3532837	total: 66.9ms	remaining: 490ms
6:	learn: 0.3251590	total: 78.2ms	remaining: 480ms
7:	learn: 0.2990027	total: 89.5ms	remaining: 470ms
8:	learn: 0.2801524	total: 101ms	remaining: 461ms
9:	learn: 0.2551476	total: 1

8:	learn: 0.2742213	total: 102ms	remaining: 463ms
9:	learn: 0.2504151	total: 113ms	remaining: 452ms
10:	learn: 0.2295737	total: 124ms	remaining: 439ms
11:	learn: 0.2091229	total: 135ms	remaining: 426ms
12:	learn: 0.1956858	total: 145ms	remaining: 414ms
13:	learn: 0.1857945	total: 156ms	remaining: 402ms
14:	learn: 0.1757594	total: 168ms	remaining: 391ms
15:	learn: 0.1664076	total: 179ms	remaining: 380ms
16:	learn: 0.1585034	total: 190ms	remaining: 368ms
17:	learn: 0.1484996	total: 200ms	remaining: 356ms
18:	learn: 0.1441059	total: 212ms	remaining: 346ms
19:	learn: 0.1373312	total: 223ms	remaining: 335ms
20:	learn: 0.1310751	total: 234ms	remaining: 324ms
21:	learn: 0.1234993	total: 245ms	remaining: 312ms
22:	learn: 0.1193641	total: 256ms	remaining: 301ms
23:	learn: 0.1151547	total: 268ms	remaining: 290ms
24:	learn: 0.1117801	total: 279ms	remaining: 279ms
25:	learn: 0.1074485	total: 290ms	remaining: 268ms
26:	learn: 0.1049006	total: 301ms	remaining: 256ms
27:	learn: 0.1023182	total: 312ms

20:	learn: 0.1363229	total: 238ms	remaining: 328ms
21:	learn: 0.1301656	total: 249ms	remaining: 316ms
22:	learn: 0.1253690	total: 259ms	remaining: 305ms
23:	learn: 0.1222043	total: 271ms	remaining: 294ms
24:	learn: 0.1170350	total: 283ms	remaining: 283ms
25:	learn: 0.1125149	total: 294ms	remaining: 271ms
26:	learn: 0.1093256	total: 305ms	remaining: 260ms
27:	learn: 0.1049069	total: 316ms	remaining: 248ms
28:	learn: 0.1018875	total: 327ms	remaining: 237ms
29:	learn: 0.0994253	total: 339ms	remaining: 226ms
30:	learn: 0.0962219	total: 350ms	remaining: 214ms
31:	learn: 0.0937515	total: 361ms	remaining: 203ms
32:	learn: 0.0914156	total: 372ms	remaining: 192ms
33:	learn: 0.0893254	total: 383ms	remaining: 180ms
34:	learn: 0.0869698	total: 394ms	remaining: 169ms
35:	learn: 0.0853424	total: 405ms	remaining: 158ms
36:	learn: 0.0837277	total: 417ms	remaining: 146ms
37:	learn: 0.0819006	total: 428ms	remaining: 135ms
38:	learn: 0.0799357	total: 439ms	remaining: 124ms
39:	learn: 0.0783811	total: 450

40:	learn: 0.0746947	total: 456ms	remaining: 100ms
41:	learn: 0.0732493	total: 466ms	remaining: 88.8ms
42:	learn: 0.0717308	total: 477ms	remaining: 77.6ms
43:	learn: 0.0710176	total: 487ms	remaining: 66.5ms
44:	learn: 0.0702863	total: 499ms	remaining: 55.4ms
45:	learn: 0.0691793	total: 509ms	remaining: 44.3ms
46:	learn: 0.0679866	total: 520ms	remaining: 33.2ms
47:	learn: 0.0669607	total: 531ms	remaining: 22.1ms
48:	learn: 0.0663366	total: 543ms	remaining: 11.1ms
49:	learn: 0.0655018	total: 553ms	remaining: 0us
0:	learn: 0.6099204	total: 11.3ms	remaining: 553ms
1:	learn: 0.5389880	total: 23ms	remaining: 551ms
2:	learn: 0.4829256	total: 34.4ms	remaining: 538ms
3:	learn: 0.4245444	total: 45.5ms	remaining: 523ms
4:	learn: 0.3939788	total: 56.8ms	remaining: 511ms
5:	learn: 0.3532837	total: 67.8ms	remaining: 497ms
6:	learn: 0.3251590	total: 78.9ms	remaining: 484ms
7:	learn: 0.2990027	total: 89.7ms	remaining: 471ms
8:	learn: 0.2801524	total: 101ms	remaining: 458ms
9:	learn: 0.2551476	total: 1

12:	learn: 0.1956858	total: 145ms	remaining: 412ms
13:	learn: 0.1857945	total: 155ms	remaining: 399ms
14:	learn: 0.1757594	total: 166ms	remaining: 388ms
15:	learn: 0.1664076	total: 178ms	remaining: 378ms
16:	learn: 0.1585034	total: 189ms	remaining: 368ms
17:	learn: 0.1484996	total: 201ms	remaining: 357ms
18:	learn: 0.1441059	total: 212ms	remaining: 346ms
19:	learn: 0.1373312	total: 223ms	remaining: 335ms
20:	learn: 0.1310751	total: 235ms	remaining: 324ms
21:	learn: 0.1234993	total: 246ms	remaining: 313ms
22:	learn: 0.1193641	total: 257ms	remaining: 302ms
23:	learn: 0.1151547	total: 268ms	remaining: 291ms
24:	learn: 0.1117801	total: 280ms	remaining: 280ms
25:	learn: 0.1074485	total: 291ms	remaining: 268ms
26:	learn: 0.1049006	total: 302ms	remaining: 257ms
27:	learn: 0.1023182	total: 313ms	remaining: 246ms
28:	learn: 0.0999359	total: 324ms	remaining: 235ms
29:	learn: 0.0975297	total: 335ms	remaining: 224ms
30:	learn: 0.0951438	total: 347ms	remaining: 212ms
31:	learn: 0.0927897	total: 358

24:	learn: 0.1170350	total: 278ms	remaining: 278ms
25:	learn: 0.1125149	total: 289ms	remaining: 267ms
26:	learn: 0.1093256	total: 300ms	remaining: 255ms
27:	learn: 0.1049069	total: 311ms	remaining: 245ms
28:	learn: 0.1018875	total: 323ms	remaining: 234ms
29:	learn: 0.0994253	total: 334ms	remaining: 222ms
30:	learn: 0.0962219	total: 345ms	remaining: 211ms
31:	learn: 0.0937515	total: 356ms	remaining: 200ms
32:	learn: 0.0914156	total: 367ms	remaining: 189ms
33:	learn: 0.0893254	total: 378ms	remaining: 178ms
34:	learn: 0.0869698	total: 389ms	remaining: 167ms
35:	learn: 0.0853424	total: 400ms	remaining: 156ms
36:	learn: 0.0837277	total: 412ms	remaining: 145ms
37:	learn: 0.0819006	total: 423ms	remaining: 134ms
38:	learn: 0.0799357	total: 434ms	remaining: 122ms
39:	learn: 0.0783811	total: 445ms	remaining: 111ms
40:	learn: 0.0771800	total: 456ms	remaining: 100ms
41:	learn: 0.0758937	total: 466ms	remaining: 88.8ms
42:	learn: 0.0745731	total: 478ms	remaining: 77.8ms
43:	learn: 0.0732704	total: 4

46:	learn: 0.0679866	total: 520ms	remaining: 33.2ms
47:	learn: 0.0669607	total: 530ms	remaining: 22.1ms
48:	learn: 0.0663366	total: 542ms	remaining: 11.1ms
49:	learn: 0.0655018	total: 552ms	remaining: 0us


In [68]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98106 (+/- 0.00465)
F1 Score: 0.98347 (+/- 0.00405)
Precision: 0.97717 (+/- 0.00683)
Recall: 0.98987 (+/- 0.00650)
