## Exhaustive Feature Selection

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

## Read Data

In [19]:
data = pd.read_csv('../DoHBrwTest.csv')
data.shape

(53860, 35)

In [20]:
data.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,is_intrusion
0,7.0,2.0,52742,443,18355.0,0.046455,55,1183.941449,66,1420.729738,...,1.0,0.0,0.0,0.046455,0.046455,0.046455,-10.0,-10.0,0.0,0
1,7.0,2.0,54640,443,18365.0,96.750105,42044,434.562836,44920,464.288902,...,0.561877,0.0017,0.041234,0.033549,0.026931,0.026952,0.481463,0.159978,1.229096,0
2,7.0,2.0,56611,443,18373.0,96.365606,41539,431.056284,44577,462.582054,...,0.549156,0.000956,0.030926,0.026551,0.026848,0.026879,-0.028834,-0.010614,1.164778,0
3,7.0,2.0,56611,443,18374.0,121.35682,60659,499.840058,67897,559.48236,...,0.55657,0.001013,0.031829,0.027571,0.026862,0.026941,0.066819,0.019791,1.154439,0
4,7.0,2.0,56611,443,18375.0,104.669253,30409,290.524668,30718,293.476825,...,0.331633,0.001226,0.035013,0.029797,0.026867,0.026908,0.251063,0.082517,1.175049,0


### Train - Test Split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),
    data['is_intrusion'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((37702, 34), (16158, 34))

### Remove correlated features

The Exhaustive Feature Selection takes a long time to run, so to speed it up we will reduce the feature space by removing correlated features first.

In [22]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_test, 0.6)
print('correlated features: ', len(set(corr_features)) )

correlated features:  15


In [32]:
corr_features

{'DestinationPort',
 'FlowBytesReceived',
 'FlowReceivedRate',
 'PacketLengthCoefficientofVariation',
 'PacketLengthMean',
 'PacketLengthSkewFromMode',
 'PacketLengthStandardDeviation',
 'PacketTimeCoefficientofVariation',
 'PacketTimeMean',
 'PacketTimeMedian',
 'PacketTimeStandardDeviation',
 'PacketTimeVariance',
 'ResponseTimeTimeMedian',
 'ResponseTimeTimeMode',
 'ResponseTimeTimeStandardDeviation'}

In [23]:
# removed correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((37702, 19), (16158, 19))

###  Exhaustive Feature Selection

In [24]:
# 1. the algorithm to create, in this case RandomForests
# 2. the number of minimum features we want our model to have
# 3. the number of maximum features we want our model to have 
# 4. the evaluation metric: in this case the roc_auc
# 5. the cross-validation

efs = EFS(RandomForestClassifier(n_estimators=5,
                                 n_jobs=4,
                                 random_state=0,
                                 max_depth=2),
          min_features=3,
          max_features=4,
          scoring='roc_auc',
          print_progress=True,
          cv=2)

efs = efs.fit(np.array(X_test), y_test)

Features: 4845/4845

The log above means that the search evaluated 3276 feature combinations!

In [25]:
efs.best_idx_

(0, 1, 3)

In [26]:
selected_feat = X_test.columns[list(efs.best_idx_)]
selected_feat

Index(['SourceIP', 'DestinationIP', 'TimeStamp'], dtype='object')

### Compare performance of feature subsets

In [27]:
# function to train random forests and evaluate the performance

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [28]:
# evaluate performance of classifier using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 1.0


In [29]:
# and for comparison, we train random forests using
# all features (except the correlated ones, which we removed already)

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 1.0


Even with 3 features, the performance is not super far off of that model built using 27 features.

In [30]:
X_train = X_train[selected_feat]
X_test = X_test[selected_feat]

In [31]:
X_train.shape, X_test.shape

((37702, 3), (16158, 3))

## Standardize Data




In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers




In [16]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation



In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression




In [18]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1).fit(X_train, y_train)

CPU times: user 56.5 ms, sys: 180 ms, total: 236 ms
Wall time: 1.46 s


In [19]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
FPR: 1.0
TPR: 1.0


### Naive Bayes




In [20]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-09).fit(X_train, y_train)

CPU times: user 138 ms, sys: 5.99 ms, total: 144 ms
Wall time: 17.8 ms


In [21]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.470410795792816
F1 Score: 0.6294125815858909
FPR: 0.9613902475174152
TPR: 0.9683787710452099


### Random Forest




In [22]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 4.5 s, sys: 56.5 ms, total: 4.56 s
Wall time: 3.75 s


In [23]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.5302639412581861
F1 Score: 0.4009495500051521
FPR: 0.044390099303394104
TPR: 0.039740193145884964


### KNN




In [24]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance').fit(X_train, y_train)

CPU times: user 5.97 ms, sys: 1.63 ms, total: 7.6 ms
Wall time: 5.28 ms


In [25]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.08358801349474101
f1: 0.14081047891936144
fpr: 0.9841410997480362
tpr: 0.16169558157422442


### CatBoost





In [26]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6590119	total: 62.8ms	remaining: 3.08s
1:	learn: 0.6280406	total: 69.6ms	remaining: 1.67s
2:	learn: 0.5986940	total: 76ms	remaining: 1.19s
3:	learn: 0.5720143	total: 82.7ms	remaining: 952ms
4:	learn: 0.5467693	total: 89.7ms	remaining: 807ms
5:	learn: 0.5228950	total: 96.9ms	remaining: 710ms
6:	learn: 0.5005614	total: 103ms	remaining: 635ms
7:	learn: 0.4796689	total: 110ms	remaining: 577ms
8:	learn: 0.4598923	total: 116ms	remaining: 530ms
9:	learn: 0.4419481	total: 123ms	remaining: 491ms
10:	learn: 0.4245110	total: 129ms	remaining: 457ms
11:	learn: 0.4082962	total: 135ms	remaining: 429ms
12:	learn: 0.3929668	total: 142ms	remaining: 405ms
13:	learn: 0.3784173	total: 149ms	remaining: 384ms
14:	learn: 0.3646854	total: 156ms	remaining: 363ms
15:	learn: 0.3517412	total: 162ms	remaining: 344ms
16:	learn: 0.3396057	total: 168ms	remaining: 326ms
17:	learn: 0.3280424	total: 175ms	remaining: 310ms
18:	learn: 0.3169731	total: 181ms	remaining: 295ms
19:	learn: 0.3066439	total: 187ms	rem

In [27]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.39615002976781105
F1 Score: 0.35575344514217877
FPR: 0.3631984585741811
TPR: 0.11862233997094265


## Model Evaluation




In [28]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../KDDTest.csv")
test_df.shape

(22543, 42)

In [29]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion'])

In [30]:
X_eval = X_eval[selected_feat]

In [31]:
X_eval.shape

(22543, 3)

### Model Evaluation - Logistic Regression



In [32]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=0.1, n_jobs=-1, random_state=42)

In [33]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [34]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.8161782946505651
Testing accuracy is  0.46441754316332606


In [35]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
Precision Score: 0.46441754316332606
Recall Score: 1.0
Confusion Matrix:
 [[    0 13494]
 [    0 11701]]


### Cross validation - Logistic Regression





In [36]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.74112 (+/- 0.01798)
F1 Score: 0.78188 (+/- 0.05280)
Precision: 0.75387 (+/- 0.08661)
Recall: 0.82800 (+/- 0.22796)


### Model Evaluation - Naive Bayes





In [37]:
modelNB = GaussianNB(var_smoothing=1e-09)
modelNB.fit(X_train, y_train)

GaussianNB()

In [38]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [39]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.6291415700010915
Testing accuracy is  0.470410795792816


In [40]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.470410795792816
F1 Score: 0.6294125815858909
Precision Score: 0.46621955233706386
Recall Score: 0.9683787710452099
Confusion Matrix:
 [[  521 12973]
 [  370 11331]]


### Cross validation - Naive Bayes




In [41]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.61469 (+/- 0.06101)
F1 Score: 0.54327 (+/- 0.18170)
Precision: 0.84091 (+/- 0.14223)
Recall: 0.43291 (+/- 0.37372)


### Model Evaluation - Random Forest





In [42]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [43]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [44]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  0.9797572858886452
Testing accuracy is  0.5302639412581861


In [45]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.5302639412581861
F1 Score: 0.4009495500051521
Precision Score: 0.4891662398206622
Recall Score: 0.5302639412581861
Confusion Matrix:
 [[12895   599]
 [11236   465]]


### Cross validation - Random Forest





In [46]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.97747 (+/- 0.00525)
F1 Score: 0.98026 (+/- 0.00450)
Precision: 0.97808 (+/- 0.01023)
Recall: 0.98247 (+/- 0.00447)


### Model Evaluation - KNN

In [47]:
modelKNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=1, n_neighbors=2,
                     weights='distance')

In [48]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [49]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9682864145588775
Testing accuracy is  0.08358801349474101


In [50]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.08358801349474101
F1 Score: 0.14081047891936144
Precision Score: 0.1247034010018455
Recall Score: 0.16169558157422442
Confusion Matrix:
 [[  214 13280]
 [ 9809  1892]]


### Cross validation - KNN





In [51]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.96868 (+/- 0.01713)
F1 Score: 0.97232 (+/- 0.01590)
Precision: 0.97610 (+/- 0.00874)
Recall: 0.96875 (+/- 0.03297)


### Model Evaluation - CatBoost




In [52]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6590119	total: 7.51ms	remaining: 368ms
1:	learn: 0.6280406	total: 14.8ms	remaining: 354ms
2:	learn: 0.5986940	total: 21.5ms	remaining: 337ms
3:	learn: 0.5720143	total: 27.7ms	remaining: 318ms
4:	learn: 0.5467693	total: 33.6ms	remaining: 303ms
5:	learn: 0.5228950	total: 40.3ms	remaining: 296ms
6:	learn: 0.5005614	total: 46.5ms	remaining: 286ms
7:	learn: 0.4796689	total: 52.7ms	remaining: 277ms
8:	learn: 0.4598923	total: 58.8ms	remaining: 268ms
9:	learn: 0.4419481	total: 65.2ms	remaining: 261ms
10:	learn: 0.4245110	total: 71.7ms	remaining: 254ms
11:	learn: 0.4082962	total: 77.8ms	remaining: 246ms
12:	learn: 0.3929668	total: 84.2ms	remaining: 240ms
13:	learn: 0.3784173	total: 90.2ms	remaining: 232ms
14:	learn: 0.3646854	total: 96.2ms	remaining: 225ms
15:	learn: 0.3517412	total: 103ms	remaining: 218ms
16:	learn: 0.3396057	total: 109ms	remaining: 211ms
17:	learn: 0.3280424	total: 115ms	remaining: 204ms
18:	learn: 0.3169731	total: 121ms	remaining: 198ms
19:	learn: 0.3066439	total

<catboost.core.CatBoostClassifier at 0x7fb5d9b8e3a0>

In [53]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [54]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.9647042479930936
Testing accuracy is  0.39615002976781105


In [55]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.39615002976781105
F1 Score: 0.35575344514217877
Precision Score: 0.34592680278349713
Recall Score: 0.39615002976781105
Confusion Matrix:
 [[ 8593  4901]
 [10313  1388]]


### Cross validation - CatBoost




In [56]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6634155	total: 4.16ms	remaining: 204ms
1:	learn: 0.6344239	total: 8ms	remaining: 192ms
2:	learn: 0.6084946	total: 11.9ms	remaining: 186ms
3:	learn: 0.5835297	total: 15.3ms	remaining: 175ms
4:	learn: 0.5601797	total: 19.5ms	remaining: 175ms
5:	learn: 0.5384516	total: 23.4ms	remaining: 172ms
6:	learn: 0.5186498	total: 27.2ms	remaining: 167ms
7:	learn: 0.4997127	total: 30.9ms	remaining: 162ms
8:	learn: 0.4819605	total: 34.8ms	remaining: 158ms
9:	learn: 0.4655933	total: 38.4ms	remaining: 153ms
10:	learn: 0.4501825	total: 42.4ms	remaining: 150ms
11:	learn: 0.4352704	total: 46.2ms	remaining: 146ms
12:	learn: 0.4212312	total: 50ms	remaining: 142ms
13:	learn: 0.4077347	total: 53.9ms	remaining: 139ms
14:	learn: 0.3954354	total: 57.7ms	remaining: 135ms
15:	learn: 0.3835612	total: 61.4ms	remaining: 131ms
16:	learn: 0.3727266	total: 65.4ms	remaining: 127ms
17:	learn: 0.3623003	total: 69.2ms	remaining: 123ms
18:	learn: 0.3524062	total: 72.9ms	remaining: 119ms
19:	learn: 0.3422993	total:

23:	learn: 0.3052596	total: 88.7ms	remaining: 96.1ms
24:	learn: 0.2976422	total: 92.2ms	remaining: 92.2ms
25:	learn: 0.2904600	total: 96.2ms	remaining: 88.8ms
26:	learn: 0.2839127	total: 99.8ms	remaining: 85ms
27:	learn: 0.2774596	total: 103ms	remaining: 81.2ms
28:	learn: 0.2704764	total: 107ms	remaining: 77.3ms
29:	learn: 0.2635094	total: 110ms	remaining: 73.5ms
30:	learn: 0.2582041	total: 114ms	remaining: 69.8ms
31:	learn: 0.2530655	total: 117ms	remaining: 65.9ms
32:	learn: 0.2481658	total: 121ms	remaining: 62.2ms
33:	learn: 0.2427159	total: 124ms	remaining: 58.6ms
34:	learn: 0.2374436	total: 128ms	remaining: 55ms
35:	learn: 0.2321643	total: 132ms	remaining: 51.2ms
36:	learn: 0.2274610	total: 136ms	remaining: 47.6ms
37:	learn: 0.2232799	total: 139ms	remaining: 44ms
38:	learn: 0.2188667	total: 143ms	remaining: 40.3ms
39:	learn: 0.2149669	total: 147ms	remaining: 36.7ms
40:	learn: 0.2106693	total: 150ms	remaining: 33ms
41:	learn: 0.2067286	total: 154ms	remaining: 29.3ms
42:	learn: 0.203

0:	learn: 0.6634973	total: 3.47ms	remaining: 170ms
1:	learn: 0.6345087	total: 6.91ms	remaining: 166ms
2:	learn: 0.6085636	total: 10.5ms	remaining: 164ms
3:	learn: 0.5834869	total: 14.4ms	remaining: 166ms
4:	learn: 0.5605174	total: 17.9ms	remaining: 161ms
5:	learn: 0.5388575	total: 21.5ms	remaining: 157ms
6:	learn: 0.5182839	total: 25.8ms	remaining: 159ms
7:	learn: 0.4996204	total: 29.6ms	remaining: 156ms
8:	learn: 0.4817295	total: 33.2ms	remaining: 151ms
9:	learn: 0.4652004	total: 36.8ms	remaining: 147ms
10:	learn: 0.4497917	total: 40.2ms	remaining: 143ms
11:	learn: 0.4349054	total: 44ms	remaining: 139ms
12:	learn: 0.4199378	total: 47.7ms	remaining: 136ms
13:	learn: 0.4065459	total: 51.4ms	remaining: 132ms
14:	learn: 0.3943095	total: 55.1ms	remaining: 129ms
15:	learn: 0.3824492	total: 59ms	remaining: 125ms
16:	learn: 0.3713585	total: 62.6ms	remaining: 121ms
17:	learn: 0.3606697	total: 66.3ms	remaining: 118ms
18:	learn: 0.3508572	total: 70.1ms	remaining: 114ms
19:	learn: 0.3407675	total

9:	learn: 0.4655933	total: 35.9ms	remaining: 144ms
10:	learn: 0.4501825	total: 39.8ms	remaining: 141ms
11:	learn: 0.4352704	total: 43.2ms	remaining: 137ms
12:	learn: 0.4212312	total: 46.7ms	remaining: 133ms
13:	learn: 0.4077347	total: 50.2ms	remaining: 129ms
14:	learn: 0.3954354	total: 53.7ms	remaining: 125ms
15:	learn: 0.3835612	total: 57.3ms	remaining: 122ms
16:	learn: 0.3727266	total: 60.9ms	remaining: 118ms
17:	learn: 0.3623003	total: 64.3ms	remaining: 114ms
18:	learn: 0.3524062	total: 67.7ms	remaining: 110ms
19:	learn: 0.3422993	total: 71.4ms	remaining: 107ms
20:	learn: 0.3333600	total: 74.9ms	remaining: 103ms
21:	learn: 0.3237123	total: 78.4ms	remaining: 99.7ms
22:	learn: 0.3150853	total: 82.1ms	remaining: 96.3ms
23:	learn: 0.3066568	total: 85.8ms	remaining: 93ms
24:	learn: 0.2989555	total: 89.5ms	remaining: 89.5ms
25:	learn: 0.2917362	total: 93.4ms	remaining: 86.2ms
26:	learn: 0.2846720	total: 97.1ms	remaining: 82.7ms
27:	learn: 0.2783008	total: 101ms	remaining: 79.3ms
28:	learn

0:	learn: 0.6629951	total: 3.74ms	remaining: 183ms
1:	learn: 0.6342058	total: 7.36ms	remaining: 177ms
2:	learn: 0.6074905	total: 11ms	remaining: 172ms
3:	learn: 0.5826234	total: 14.5ms	remaining: 166ms
4:	learn: 0.5600783	total: 18ms	remaining: 162ms
5:	learn: 0.5384675	total: 21.4ms	remaining: 157ms
6:	learn: 0.5191682	total: 25ms	remaining: 154ms
7:	learn: 0.5000511	total: 28.9ms	remaining: 152ms
8:	learn: 0.4822438	total: 32.6ms	remaining: 149ms
9:	learn: 0.4657575	total: 36.2ms	remaining: 145ms
10:	learn: 0.4503375	total: 40ms	remaining: 142ms
11:	learn: 0.4354607	total: 43.7ms	remaining: 138ms
12:	learn: 0.4205422	total: 47.4ms	remaining: 135ms
13:	learn: 0.4071576	total: 51.3ms	remaining: 132ms
14:	learn: 0.3935234	total: 54.8ms	remaining: 128ms
15:	learn: 0.3816855	total: 58.7ms	remaining: 125ms
16:	learn: 0.3703375	total: 62.3ms	remaining: 121ms
17:	learn: 0.3599022	total: 65.9ms	remaining: 117ms
18:	learn: 0.3496640	total: 69.7ms	remaining: 114ms
19:	learn: 0.3398075	total: 73

0:	learn: 0.6635041	total: 3.41ms	remaining: 167ms
1:	learn: 0.6346459	total: 6.93ms	remaining: 166ms
2:	learn: 0.6088681	total: 10.5ms	remaining: 164ms
3:	learn: 0.5839539	total: 14.1ms	remaining: 162ms
4:	learn: 0.5607169	total: 17.4ms	remaining: 157ms
5:	learn: 0.5391719	total: 21ms	remaining: 154ms
6:	learn: 0.5195957	total: 24.8ms	remaining: 152ms
7:	learn: 0.5005866	total: 28.5ms	remaining: 150ms
8:	learn: 0.4828488	total: 32.4ms	remaining: 148ms
9:	learn: 0.4668762	total: 36.1ms	remaining: 144ms
10:	learn: 0.4515124	total: 39.8ms	remaining: 141ms
11:	learn: 0.4367863	total: 43.6ms	remaining: 138ms
12:	learn: 0.4225354	total: 47.5ms	remaining: 135ms
13:	learn: 0.4095237	total: 51.1ms	remaining: 132ms
14:	learn: 0.3971981	total: 55ms	remaining: 128ms
15:	learn: 0.3852780	total: 58.6ms	remaining: 125ms
16:	learn: 0.3741371	total: 62.5ms	remaining: 121ms
17:	learn: 0.3637962	total: 66.4ms	remaining: 118ms
18:	learn: 0.3537543	total: 70.2ms	remaining: 114ms
19:	learn: 0.3436940	total

0:	learn: 0.6634931	total: 3.48ms	remaining: 171ms
1:	learn: 0.6353426	total: 7.04ms	remaining: 169ms
2:	learn: 0.6093161	total: 10.5ms	remaining: 164ms
3:	learn: 0.5843471	total: 14.1ms	remaining: 163ms
4:	learn: 0.5611902	total: 18.2ms	remaining: 164ms
5:	learn: 0.5394991	total: 22ms	remaining: 162ms
6:	learn: 0.5189061	total: 25.7ms	remaining: 158ms
7:	learn: 0.4999837	total: 29.5ms	remaining: 155ms
8:	learn: 0.4825229	total: 33.4ms	remaining: 152ms
9:	learn: 0.4664043	total: 37.2ms	remaining: 149ms
10:	learn: 0.4509594	total: 40.8ms	remaining: 145ms
11:	learn: 0.4359939	total: 44.8ms	remaining: 142ms
12:	learn: 0.4219241	total: 48.6ms	remaining: 138ms
13:	learn: 0.4084758	total: 52.2ms	remaining: 134ms
14:	learn: 0.3961289	total: 55.8ms	remaining: 130ms
15:	learn: 0.3841465	total: 59.5ms	remaining: 126ms
16:	learn: 0.3726096	total: 63.3ms	remaining: 123ms
17:	learn: 0.3614009	total: 67ms	remaining: 119ms
18:	learn: 0.3509676	total: 70.8ms	remaining: 116ms
19:	learn: 0.3411282	total

0:	learn: 0.6634359	total: 3.63ms	remaining: 178ms
1:	learn: 0.6352872	total: 7.35ms	remaining: 176ms
2:	learn: 0.6092431	total: 10.9ms	remaining: 171ms
3:	learn: 0.5842529	total: 14.5ms	remaining: 167ms
4:	learn: 0.5610594	total: 18.1ms	remaining: 163ms
5:	learn: 0.5393329	total: 21.6ms	remaining: 159ms
6:	learn: 0.5192692	total: 24.9ms	remaining: 153ms
7:	learn: 0.5002846	total: 28.6ms	remaining: 150ms
8:	learn: 0.4825228	total: 32.6ms	remaining: 149ms
9:	learn: 0.4661573	total: 36.5ms	remaining: 146ms
10:	learn: 0.4507281	total: 40.2ms	remaining: 143ms
11:	learn: 0.4358626	total: 44.1ms	remaining: 140ms
12:	learn: 0.4209321	total: 47.8ms	remaining: 136ms
13:	learn: 0.4080014	total: 51.7ms	remaining: 133ms
14:	learn: 0.3955278	total: 55.4ms	remaining: 129ms
15:	learn: 0.3835089	total: 59.2ms	remaining: 126ms
16:	learn: 0.3719288	total: 62.9ms	remaining: 122ms
17:	learn: 0.3607198	total: 66.6ms	remaining: 118ms
18:	learn: 0.3509562	total: 70.6ms	remaining: 115ms
19:	learn: 0.3410261	t

0:	learn: 0.6634155	total: 3.43ms	remaining: 168ms
1:	learn: 0.6344239	total: 6.93ms	remaining: 166ms
2:	learn: 0.6084946	total: 10.5ms	remaining: 164ms
3:	learn: 0.5835297	total: 14ms	remaining: 161ms
4:	learn: 0.5601797	total: 17.4ms	remaining: 156ms
5:	learn: 0.5384516	total: 21.2ms	remaining: 155ms
6:	learn: 0.5186498	total: 24.9ms	remaining: 153ms
7:	learn: 0.4997127	total: 28.8ms	remaining: 151ms
8:	learn: 0.4819605	total: 32.3ms	remaining: 147ms
9:	learn: 0.4655933	total: 36.1ms	remaining: 144ms
10:	learn: 0.4501825	total: 40ms	remaining: 142ms
11:	learn: 0.4352704	total: 43.7ms	remaining: 139ms
12:	learn: 0.4212312	total: 47.6ms	remaining: 135ms
13:	learn: 0.4077347	total: 51.5ms	remaining: 132ms
14:	learn: 0.3954354	total: 55.2ms	remaining: 129ms
15:	learn: 0.3835612	total: 59ms	remaining: 125ms
16:	learn: 0.3727266	total: 62.6ms	remaining: 122ms
17:	learn: 0.3623003	total: 66.4ms	remaining: 118ms
18:	learn: 0.3524062	total: 70.3ms	remaining: 115ms
19:	learn: 0.3422993	total: 

0:	learn: 0.6629951	total: 3.35ms	remaining: 164ms
1:	learn: 0.6342058	total: 6.96ms	remaining: 167ms
2:	learn: 0.6074905	total: 10.5ms	remaining: 165ms
3:	learn: 0.5826234	total: 14ms	remaining: 161ms
4:	learn: 0.5600783	total: 17.6ms	remaining: 158ms
5:	learn: 0.5384675	total: 21.1ms	remaining: 155ms
6:	learn: 0.5191682	total: 24.5ms	remaining: 150ms
7:	learn: 0.5000511	total: 28.2ms	remaining: 148ms
8:	learn: 0.4822438	total: 31.8ms	remaining: 145ms
9:	learn: 0.4657575	total: 35.5ms	remaining: 142ms
10:	learn: 0.4503375	total: 39ms	remaining: 138ms
11:	learn: 0.4354607	total: 42.7ms	remaining: 135ms
12:	learn: 0.4205422	total: 46.3ms	remaining: 132ms
13:	learn: 0.4071576	total: 50ms	remaining: 128ms
14:	learn: 0.3935234	total: 53.7ms	remaining: 125ms
15:	learn: 0.3816855	total: 57.3ms	remaining: 122ms
16:	learn: 0.3703375	total: 60.9ms	remaining: 118ms
17:	learn: 0.3599022	total: 64.5ms	remaining: 115ms
18:	learn: 0.3496640	total: 68.3ms	remaining: 111ms
19:	learn: 0.3398075	total: 

0:	learn: 0.6635041	total: 3.35ms	remaining: 164ms
1:	learn: 0.6346459	total: 6.98ms	remaining: 168ms
2:	learn: 0.6088681	total: 10.6ms	remaining: 166ms
3:	learn: 0.5839539	total: 13.9ms	remaining: 160ms
4:	learn: 0.5607169	total: 17.5ms	remaining: 157ms
5:	learn: 0.5391719	total: 21ms	remaining: 154ms
6:	learn: 0.5195957	total: 24.8ms	remaining: 153ms
7:	learn: 0.5005866	total: 28.8ms	remaining: 151ms
8:	learn: 0.4828488	total: 32.5ms	remaining: 148ms
9:	learn: 0.4668762	total: 36.1ms	remaining: 144ms
10:	learn: 0.4515124	total: 39.9ms	remaining: 141ms
11:	learn: 0.4367863	total: 43.9ms	remaining: 139ms
12:	learn: 0.4225354	total: 47.5ms	remaining: 135ms
13:	learn: 0.4095237	total: 51.2ms	remaining: 132ms
14:	learn: 0.3971981	total: 55.2ms	remaining: 129ms
15:	learn: 0.3852780	total: 59.2ms	remaining: 126ms
16:	learn: 0.3741371	total: 62.9ms	remaining: 122ms
17:	learn: 0.3637962	total: 66.6ms	remaining: 118ms
18:	learn: 0.3537543	total: 70.2ms	remaining: 115ms
19:	learn: 0.3436940	tot

In [57]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.95564 (+/- 0.00993)
F1 Score: 0.96181 (+/- 0.00839)
Precision: 0.94337 (+/- 0.01227)
Recall: 0.98099 (+/- 0.00608)
