## Lasso regularisation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

## Read Data

In [2]:
data = pd.read_csv('../DoHBrwTest.csv')
data.shape

(53860, 35)

In [3]:
data.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,is_intrusion
0,7.0,2.0,52742,443,18355.0,0.046455,55,1183.941449,66,1420.729738,...,1.0,0.0,0.0,0.046455,0.046455,0.046455,-10.0,-10.0,0.0,0
1,7.0,2.0,54640,443,18365.0,96.750105,42044,434.562836,44920,464.288902,...,0.561877,0.0017,0.041234,0.033549,0.026931,0.026952,0.481463,0.159978,1.229096,0
2,7.0,2.0,56611,443,18373.0,96.365606,41539,431.056284,44577,462.582054,...,0.549156,0.000956,0.030926,0.026551,0.026848,0.026879,-0.028834,-0.010614,1.164778,0
3,7.0,2.0,56611,443,18374.0,121.35682,60659,499.840058,67897,559.48236,...,0.55657,0.001013,0.031829,0.027571,0.026862,0.026941,0.066819,0.019791,1.154439,0
4,7.0,2.0,56611,443,18375.0,104.669253,30409,290.524668,30718,293.476825,...,0.331633,0.001226,0.035013,0.029797,0.026867,0.026908,0.251063,0.082517,1.175049,0


In [4]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(data.select_dtypes(include=numerics).columns)
data = data[numerical_vars]
data.shape

(53860, 35)

### Train - Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),
    data['is_intrusion'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((37702, 34), (16158, 34))

In [6]:
# linear models benefit from feature scaling

scaler = StandardScaler()
scaler.fit(X_train.fillna(0))

StandardScaler()

### Select features with Lasso

In [7]:
sel_ = SelectFromModel(
    LogisticRegression(C=0.001, penalty='l1', solver='liblinear', random_state=42))

sel_.fit(scaler.transform(X_train), y_train)

SelectFromModel(estimator=LogisticRegression(C=0.001, penalty='l1',
                                             random_state=42,
                                             solver='liblinear'))

In [8]:
# Visualise the index of the # features that were selected

sel_.get_support()

array([ True, False, False, False,  True,  True, False, False,  True,
       False, False, False, False, False,  True, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False, False])

In [9]:
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 34
selected features: 7
features with coefficients shrank to zero: 27


### Examine coefficients that shrank to zero

In [10]:
# the number of features which coefficient was shrank to zero:
np.sum(sel_.estimator_.coef_ == 0)

27

In [11]:
# we can identify the removed features like this:

removed_feats = X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index(['DestinationIP', 'SourcePort', 'DestinationPort', 'FlowBytesSent',
       'FlowSentRate', 'FlowReceivedRate', 'PacketLengthVariance',
       'PacketLengthStandardDeviation', 'PacketLengthMean',
       'PacketLengthMedian', 'PacketLengthSkewFromMedian',
       'PacketLengthSkewFromMode', 'PacketLengthCoefficientofVariation',
       'PacketTimeVariance', 'PacketTimeStandardDeviation', 'PacketTimeMedian',
       'PacketTimeMode', 'PacketTimeSkewFromMedian', 'PacketTimeSkewFromMode',
       'PacketTimeCoefficientofVariation', 'ResponseTimeTimeVariance',
       'ResponseTimeTimeStandardDeviation', 'ResponseTimeTimeMean',
       'ResponseTimeTimeMedian', 'ResponseTimeTimeMode',
       'ResponseTimeTimeSkewFromMode',
       'ResponseTimeTimeCoefficientofVariation'],
      dtype='object')

In [12]:
# we can then remove the features from the training and testing set
# like this:

X_train_selected = sel_.transform(X_train)
X_test_selected = sel_.transform(X_test)

X_train_selected.shape, X_test_selected.shape

((37702, 7), (16158, 7))

## Classifiers




In [72]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation




In [73]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression




In [74]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1).fit(X_train_selected, y_train)

CPU times: user 71.9 ms, sys: 209 ms, total: 281 ms
Wall time: 3.41 s


In [75]:
pred_y_test = clf_LR.predict(X_test_selected)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9491962691010121
F1 Score: 0.9447704521919227
FPR: 0.03905439454572403
TPR: 0.935646525937954


### Naive Bayes




In [76]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-09).fit(X_train_selected, y_train)

CPU times: user 440 ms, sys: 45.2 ms, total: 485 ms
Wall time: 61.7 ms


In [77]:
pred_y_testNB = clf_NB.predict(X_test_selected)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.908989878944235
F1 Score: 0.9011254365917813
FPR: 0.0771453979546465
TPR: 0.8930005982394668


### Random Forest




In [78]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train_selected, y_train)

CPU times: user 5.42 s, sys: 88.7 ms, total: 5.51 s
Wall time: 5.11 s


In [79]:
pred_y_testRF = clf_RF.predict(X_test_selected)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9969041476483429
F1 Score: 0.996903987842693
FPR: 0.0022232103156958646
TPR: 0.9958977865139732


### KNN




In [80]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance').fit(X_train_selected, y_train)

CPU times: user 8.44 ms, sys: 2.93 ms, total: 11.4 ms
Wall time: 9.19 ms


In [81]:
pred_y_testKNN = clf_KNN.predict(X_test_selected)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.9886485413772574
f1: 0.9878059179670845
fpr: 0.012524084778420038
tpr: 0.9900008546278096


### CatBoost




In [82]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train_selected, y_train)

0:	learn: 0.6585221	total: 9.11ms	remaining: 446ms
1:	learn: 0.6262089	total: 17.4ms	remaining: 416ms
2:	learn: 0.5970020	total: 29.4ms	remaining: 460ms
3:	learn: 0.5693399	total: 37.5ms	remaining: 431ms
4:	learn: 0.5429960	total: 46.2ms	remaining: 416ms
5:	learn: 0.5189499	total: 54.1ms	remaining: 396ms
6:	learn: 0.4964442	total: 62ms	remaining: 381ms
7:	learn: 0.4757434	total: 70.3ms	remaining: 369ms
8:	learn: 0.4557317	total: 78.2ms	remaining: 356ms
9:	learn: 0.4365645	total: 86ms	remaining: 344ms
10:	learn: 0.4190039	total: 94.2ms	remaining: 334ms
11:	learn: 0.4023201	total: 102ms	remaining: 324ms
12:	learn: 0.3865211	total: 110ms	remaining: 313ms
13:	learn: 0.3715625	total: 118ms	remaining: 303ms
14:	learn: 0.3575209	total: 127ms	remaining: 296ms
15:	learn: 0.3441613	total: 135ms	remaining: 286ms
16:	learn: 0.3314723	total: 144ms	remaining: 279ms
17:	learn: 0.3192954	total: 152ms	remaining: 271ms
18:	learn: 0.3079326	total: 163ms	remaining: 265ms
19:	learn: 0.2969655	total: 172ms	

In [83]:
pred_y_testCB = clf_CB.predict(X_test_selected)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9833697162135344
F1 Score: 0.9833661278685017
FPR: 0.01282051282051282
TPR: 0.9789761558841125


## Model Evaluation




In [84]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../KDDTest.csv")
test_df.shape

(22543, 42)

In [85]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion'])

In [86]:
X_eval = X_eval[selected_feat]

In [87]:
X_eval.shape

(22543, 19)

### Model Evaluation - Logistic Regression




In [88]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1)
modelLR.fit(X_train_selected, y_train)

LogisticRegression(C=0.1, n_jobs=-1, random_state=42)

In [89]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test_selected)

In [90]:
train_scoreLR = modelLR.score(X_train_selected, y_train)
test_scoreLR = modelLR.score(X_test_selected, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9493237544280937
Testing accuracy is  0.9491962691010121


In [91]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.9491962691010121
F1 Score: 0.9447704521919227
Precision Score: 0.9540740740740741
Recall Score: 0.935646525937954
Confusion Matrix:
 [[12967   527]
 [  753 10948]]


### Cross validation - Logistic Regression




In [92]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.89057 (+/- 0.01153)
F1 Score: 0.90125 (+/- 0.01043)
Precision: 0.92658 (+/- 0.01551)
Recall: 0.87734 (+/- 0.01559)


### Model Evaluation - Naive Bayes



In [93]:
modelNB = GaussianNB(var_smoothing=1e-09)
modelNB.fit(X_train_selected, y_train)

GaussianNB()

In [94]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test_selected)

In [95]:
train_scoreNB = modelNB.score(X_train_selected, y_train)
test_scoreNB = modelNB.score(X_test_selected, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.9109717495063357
Testing accuracy is  0.908989878944235


In [96]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.908989878944235
F1 Score: 0.9011254365917813
Precision Score: 0.9093994778067885
Recall Score: 0.8930005982394668
Confusion Matrix:
 [[12453  1041]
 [ 1252 10449]]


### Cross validation - Naive Bayes




In [97]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.81254 (+/- 0.01514)
F1 Score: 0.81151 (+/- 0.01723)
Precision: 0.94851 (+/- 0.01077)
Recall: 0.70916 (+/- 0.02338)


### Model Evaluation - Random Forest




In [98]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train_selected, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [99]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test_selected)

In [100]:
train_scoreRF = modelRF.score(X_train_selected, y_train)
test_scoreRF = modelRF.score(X_test_selected, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  0.999434394752771
Testing accuracy is  0.9969041476483429


In [101]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.9969041476483429
F1 Score: 0.996903987842693
Precision Score: 0.996904851048095
Recall Score: 0.9969041476483429
Confusion Matrix:
 [[13464    30]
 [   48 11653]]


### Cross validation - Random Forest




In [102]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.97551 (+/- 0.00461)
F1 Score: 0.97855 (+/- 0.00404)
Precision: 0.97598 (+/- 0.00574)
Recall: 0.98114 (+/- 0.00595)


### Model Evaluation - KNN

In [103]:
modelKNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance')
modelKNN.fit(X_train_selected, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=1, n_neighbors=2,
                     weights='distance')

In [104]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test_selected)

In [105]:
train_scoreKNN = modelKNN.score(X_train_selected, y_train)
test_scoreKNN = modelKNN.score(X_test_selected, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9992657054685097
Testing accuracy is  0.9886485413772574


In [106]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.9886485413772574
F1 Score: 0.9878059179670845
Precision Score: 0.9856206925891262
Recall Score: 0.9900008546278096
Confusion Matrix:
 [[13325   169]
 [  117 11584]]


### Cross validation - KNN



In [107]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.93337 (+/- 0.01177)
F1 Score: 0.94175 (+/- 0.01024)
Precision: 0.93743 (+/- 0.01376)
Recall: 0.94615 (+/- 0.01290)


### Model Evaluation - CatBoost




In [108]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train_selected, y_train)

0:	learn: 0.6585221	total: 8.58ms	remaining: 420ms
1:	learn: 0.6262089	total: 16.6ms	remaining: 398ms
2:	learn: 0.5970020	total: 24.3ms	remaining: 381ms
3:	learn: 0.5693399	total: 32.1ms	remaining: 369ms
4:	learn: 0.5429960	total: 40.5ms	remaining: 365ms
5:	learn: 0.5189499	total: 50.1ms	remaining: 367ms
6:	learn: 0.4964442	total: 58ms	remaining: 356ms
7:	learn: 0.4757434	total: 65.7ms	remaining: 345ms
8:	learn: 0.4557317	total: 73.2ms	remaining: 334ms
9:	learn: 0.4365645	total: 81.1ms	remaining: 325ms
10:	learn: 0.4190039	total: 89.3ms	remaining: 317ms
11:	learn: 0.4023201	total: 97.2ms	remaining: 308ms
12:	learn: 0.3865211	total: 105ms	remaining: 299ms
13:	learn: 0.3715625	total: 113ms	remaining: 290ms
14:	learn: 0.3575209	total: 120ms	remaining: 280ms
15:	learn: 0.3441613	total: 128ms	remaining: 272ms
16:	learn: 0.3314723	total: 136ms	remaining: 264ms
17:	learn: 0.3192954	total: 144ms	remaining: 256ms
18:	learn: 0.3079326	total: 152ms	remaining: 247ms
19:	learn: 0.2969655	total: 160

<catboost.core.CatBoostClassifier at 0x7fdd802fd160>

In [109]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test_selected)

In [110]:
train_scoreCB = modelCB.score(X_train_selected, y_train)
test_scoreCB = modelCB.score(X_test_selected, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.9828234617025711
Testing accuracy is  0.9833697162135344


In [111]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.9833697162135344
F1 Score: 0.9833661278685017
Precision Score: 0.9833791453728986
Recall Score: 0.9833697162135344
Confusion Matrix:
 [[13321   173]
 [  246 11455]]


### Cross validation - CatBoost





In [112]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6651532	total: 4.98ms	remaining: 244ms
1:	learn: 0.6402880	total: 9.47ms	remaining: 227ms
2:	learn: 0.6150734	total: 13.7ms	remaining: 215ms
3:	learn: 0.5908968	total: 17.9ms	remaining: 206ms
4:	learn: 0.5692891	total: 22.2ms	remaining: 199ms
5:	learn: 0.5484332	total: 26.8ms	remaining: 197ms
6:	learn: 0.5293679	total: 32.4ms	remaining: 199ms
7:	learn: 0.5115686	total: 37.6ms	remaining: 197ms
8:	learn: 0.4938518	total: 42.9ms	remaining: 195ms
9:	learn: 0.4785212	total: 48ms	remaining: 192ms
10:	learn: 0.4630961	total: 52.5ms	remaining: 186ms
11:	learn: 0.4479192	total: 57.1ms	remaining: 181ms
12:	learn: 0.4340350	total: 62ms	remaining: 176ms
13:	learn: 0.4217504	total: 66.9ms	remaining: 172ms
14:	learn: 0.4095452	total: 72ms	remaining: 168ms
15:	learn: 0.3972860	total: 76.5ms	remaining: 162ms
16:	learn: 0.3859868	total: 80.9ms	remaining: 157ms
17:	learn: 0.3753544	total: 86.1ms	remaining: 153ms
18:	learn: 0.3658161	total: 91.4ms	remaining: 149ms
19:	learn: 0.3561806	total: 

10:	learn: 0.4635863	total: 52.1ms	remaining: 185ms
11:	learn: 0.4484016	total: 57.4ms	remaining: 182ms
12:	learn: 0.4342162	total: 62.7ms	remaining: 179ms
13:	learn: 0.4215121	total: 67.8ms	remaining: 174ms
14:	learn: 0.4091225	total: 73.5ms	remaining: 171ms
15:	learn: 0.3974305	total: 79.1ms	remaining: 168ms
16:	learn: 0.3861798	total: 83.9ms	remaining: 163ms
17:	learn: 0.3758890	total: 88.9ms	remaining: 158ms
18:	learn: 0.3663574	total: 93.3ms	remaining: 152ms
19:	learn: 0.3565122	total: 97.8ms	remaining: 147ms
20:	learn: 0.3478354	total: 102ms	remaining: 141ms
21:	learn: 0.3392597	total: 107ms	remaining: 136ms
22:	learn: 0.3307435	total: 113ms	remaining: 132ms
23:	learn: 0.3227156	total: 117ms	remaining: 127ms
24:	learn: 0.3148288	total: 122ms	remaining: 122ms
25:	learn: 0.3072044	total: 127ms	remaining: 117ms
26:	learn: 0.2998507	total: 133ms	remaining: 113ms
27:	learn: 0.2940238	total: 137ms	remaining: 108ms
28:	learn: 0.2875107	total: 142ms	remaining: 103ms
29:	learn: 0.2814392	

29:	learn: 0.2804829	total: 138ms	remaining: 92.1ms
30:	learn: 0.2743952	total: 143ms	remaining: 87.7ms
31:	learn: 0.2684718	total: 148ms	remaining: 83.2ms
32:	learn: 0.2633465	total: 152ms	remaining: 78.4ms
33:	learn: 0.2579187	total: 156ms	remaining: 73.6ms
34:	learn: 0.2531097	total: 161ms	remaining: 68.9ms
35:	learn: 0.2487314	total: 166ms	remaining: 64.4ms
36:	learn: 0.2442908	total: 170ms	remaining: 59.9ms
37:	learn: 0.2396011	total: 176ms	remaining: 55.5ms
38:	learn: 0.2359022	total: 180ms	remaining: 50.9ms
39:	learn: 0.2324132	total: 185ms	remaining: 46.2ms
40:	learn: 0.2289077	total: 189ms	remaining: 41.5ms
41:	learn: 0.2249586	total: 194ms	remaining: 36.9ms
42:	learn: 0.2213540	total: 198ms	remaining: 32.3ms
43:	learn: 0.2179566	total: 202ms	remaining: 27.6ms
44:	learn: 0.2144059	total: 207ms	remaining: 23ms
45:	learn: 0.2112381	total: 212ms	remaining: 18.4ms
46:	learn: 0.2080027	total: 216ms	remaining: 13.8ms
47:	learn: 0.2048225	total: 220ms	remaining: 9.18ms
48:	learn: 0.2

3:	learn: 0.5908968	total: 16.6ms	remaining: 191ms
4:	learn: 0.5692891	total: 21ms	remaining: 189ms
5:	learn: 0.5484332	total: 25.6ms	remaining: 188ms
6:	learn: 0.5293679	total: 29.9ms	remaining: 184ms
7:	learn: 0.5115686	total: 34.3ms	remaining: 180ms
8:	learn: 0.4938518	total: 39ms	remaining: 177ms
9:	learn: 0.4785212	total: 43.5ms	remaining: 174ms
10:	learn: 0.4630961	total: 48.1ms	remaining: 171ms
11:	learn: 0.4479192	total: 52.5ms	remaining: 166ms
12:	learn: 0.4340350	total: 57.2ms	remaining: 163ms
13:	learn: 0.4217504	total: 61.5ms	remaining: 158ms
14:	learn: 0.4095452	total: 66.6ms	remaining: 155ms
15:	learn: 0.3972860	total: 70.9ms	remaining: 151ms
16:	learn: 0.3859868	total: 75.5ms	remaining: 147ms
17:	learn: 0.3753544	total: 80ms	remaining: 142ms
18:	learn: 0.3658161	total: 84.2ms	remaining: 137ms
19:	learn: 0.3561806	total: 88.6ms	remaining: 133ms
20:	learn: 0.3474185	total: 92.7ms	remaining: 128ms
21:	learn: 0.3388222	total: 97.7ms	remaining: 124ms
22:	learn: 0.3305371	tota

27:	learn: 0.2940238	total: 129ms	remaining: 101ms
28:	learn: 0.2875107	total: 134ms	remaining: 96.9ms
29:	learn: 0.2814392	total: 138ms	remaining: 92.1ms
30:	learn: 0.2755619	total: 142ms	remaining: 87.3ms
31:	learn: 0.2695880	total: 147ms	remaining: 82.6ms
32:	learn: 0.2643983	total: 151ms	remaining: 78ms
33:	learn: 0.2588856	total: 157ms	remaining: 73.7ms
34:	learn: 0.2535951	total: 161ms	remaining: 69ms
35:	learn: 0.2489687	total: 166ms	remaining: 64.5ms
36:	learn: 0.2446975	total: 170ms	remaining: 59.8ms
37:	learn: 0.2402135	total: 175ms	remaining: 55.2ms
38:	learn: 0.2359469	total: 179ms	remaining: 50.6ms
39:	learn: 0.2317127	total: 184ms	remaining: 45.9ms
40:	learn: 0.2282336	total: 188ms	remaining: 41.3ms
41:	learn: 0.2244065	total: 193ms	remaining: 36.7ms
42:	learn: 0.2205654	total: 197ms	remaining: 32.1ms
43:	learn: 0.2172131	total: 202ms	remaining: 27.6ms
44:	learn: 0.2135907	total: 207ms	remaining: 22.9ms
45:	learn: 0.2101746	total: 211ms	remaining: 18.4ms
46:	learn: 0.2073

47:	learn: 0.2048225	total: 218ms	remaining: 9.09ms
48:	learn: 0.2021993	total: 224ms	remaining: 4.58ms
49:	learn: 0.1992742	total: 229ms	remaining: 0us
0:	learn: 0.6648711	total: 4.49ms	remaining: 220ms
1:	learn: 0.6389859	total: 8.94ms	remaining: 215ms
2:	learn: 0.6137806	total: 13.2ms	remaining: 206ms
3:	learn: 0.5899227	total: 17.7ms	remaining: 203ms
4:	learn: 0.5682193	total: 21.6ms	remaining: 194ms
5:	learn: 0.5476141	total: 25.7ms	remaining: 188ms
6:	learn: 0.5285959	total: 29.7ms	remaining: 183ms
7:	learn: 0.5108279	total: 34ms	remaining: 179ms
8:	learn: 0.4931726	total: 38.3ms	remaining: 175ms
9:	learn: 0.4776284	total: 43.1ms	remaining: 172ms
10:	learn: 0.4623783	total: 47.7ms	remaining: 169ms
11:	learn: 0.4473228	total: 52.2ms	remaining: 165ms
12:	learn: 0.4334925	total: 56.6ms	remaining: 161ms
13:	learn: 0.4212617	total: 61.2ms	remaining: 157ms
14:	learn: 0.4086243	total: 65.8ms	remaining: 153ms
15:	learn: 0.3963995	total: 69.9ms	remaining: 149ms
16:	learn: 0.3851903	total:

15:	learn: 0.3972860	total: 72.4ms	remaining: 154ms
16:	learn: 0.3859868	total: 76.9ms	remaining: 149ms
17:	learn: 0.3753544	total: 81.5ms	remaining: 145ms
18:	learn: 0.3658161	total: 86.1ms	remaining: 140ms
19:	learn: 0.3561806	total: 90.5ms	remaining: 136ms
20:	learn: 0.3474185	total: 94.9ms	remaining: 131ms
21:	learn: 0.3388222	total: 99.4ms	remaining: 126ms
22:	learn: 0.3305371	total: 104ms	remaining: 123ms
23:	learn: 0.3225304	total: 109ms	remaining: 118ms
24:	learn: 0.3146432	total: 114ms	remaining: 114ms
25:	learn: 0.3070165	total: 118ms	remaining: 109ms
26:	learn: 0.2997349	total: 123ms	remaining: 105ms
27:	learn: 0.2936282	total: 128ms	remaining: 100ms
28:	learn: 0.2871700	total: 132ms	remaining: 95.8ms
29:	learn: 0.2810291	total: 136ms	remaining: 91ms
30:	learn: 0.2751468	total: 141ms	remaining: 86.3ms
31:	learn: 0.2692382	total: 145ms	remaining: 81.7ms
32:	learn: 0.2640658	total: 150ms	remaining: 77.4ms
33:	learn: 0.2585714	total: 155ms	remaining: 73ms
34:	learn: 0.2536899	t

42:	learn: 0.2205654	total: 196ms	remaining: 32ms
43:	learn: 0.2172131	total: 201ms	remaining: 27.4ms
44:	learn: 0.2135907	total: 205ms	remaining: 22.8ms
45:	learn: 0.2101746	total: 210ms	remaining: 18.3ms
46:	learn: 0.2073625	total: 215ms	remaining: 13.7ms
47:	learn: 0.2042660	total: 219ms	remaining: 9.14ms
48:	learn: 0.2012114	total: 224ms	remaining: 4.57ms
49:	learn: 0.1983653	total: 229ms	remaining: 0us
0:	learn: 0.6650889	total: 4.32ms	remaining: 212ms
1:	learn: 0.6392126	total: 8.61ms	remaining: 207ms
2:	learn: 0.6132293	total: 12.8ms	remaining: 200ms
3:	learn: 0.5892842	total: 17ms	remaining: 195ms
4:	learn: 0.5675069	total: 21.2ms	remaining: 191ms
5:	learn: 0.5462235	total: 25.5ms	remaining: 187ms
6:	learn: 0.5272324	total: 30.1ms	remaining: 185ms
7:	learn: 0.5094074	total: 34.8ms	remaining: 183ms
8:	learn: 0.4918057	total: 39.6ms	remaining: 180ms
9:	learn: 0.4766223	total: 44.4ms	remaining: 177ms
10:	learn: 0.4618974	total: 48.9ms	remaining: 173ms
11:	learn: 0.4467714	total: 5

9:	learn: 0.4776284	total: 43.5ms	remaining: 174ms
10:	learn: 0.4623783	total: 48.1ms	remaining: 171ms
11:	learn: 0.4473228	total: 52.5ms	remaining: 166ms
12:	learn: 0.4334925	total: 57.3ms	remaining: 163ms
13:	learn: 0.4212617	total: 62.4ms	remaining: 161ms
14:	learn: 0.4086243	total: 67.8ms	remaining: 158ms
15:	learn: 0.3963995	total: 73.8ms	remaining: 157ms
16:	learn: 0.3851903	total: 79.7ms	remaining: 155ms
17:	learn: 0.3745057	total: 85.7ms	remaining: 152ms
18:	learn: 0.3649734	total: 91.2ms	remaining: 149ms
19:	learn: 0.3554888	total: 96.4ms	remaining: 145ms
20:	learn: 0.3467926	total: 102ms	remaining: 141ms
21:	learn: 0.3376733	total: 107ms	remaining: 137ms
22:	learn: 0.3293527	total: 113ms	remaining: 132ms
23:	learn: 0.3213924	total: 119ms	remaining: 129ms
24:	learn: 0.3135628	total: 124ms	remaining: 124ms
25:	learn: 0.3059282	total: 130ms	remaining: 120ms
26:	learn: 0.2986617	total: 136ms	remaining: 116ms
27:	learn: 0.2925738	total: 141ms	remaining: 111ms
28:	learn: 0.2861072	

27:	learn: 0.2936282	total: 125ms	remaining: 98.5ms
28:	learn: 0.2871700	total: 130ms	remaining: 94.3ms
29:	learn: 0.2810291	total: 135ms	remaining: 90.1ms
30:	learn: 0.2751468	total: 140ms	remaining: 85.8ms
31:	learn: 0.2692382	total: 145ms	remaining: 81.4ms
32:	learn: 0.2640658	total: 149ms	remaining: 76.9ms
33:	learn: 0.2585714	total: 154ms	remaining: 72.4ms
34:	learn: 0.2536899	total: 159ms	remaining: 68ms
35:	learn: 0.2492111	total: 163ms	remaining: 63.4ms
36:	learn: 0.2452522	total: 169ms	remaining: 59.2ms
37:	learn: 0.2406447	total: 173ms	remaining: 54.7ms
38:	learn: 0.2365194	total: 178ms	remaining: 50.2ms
39:	learn: 0.2320801	total: 182ms	remaining: 45.6ms
40:	learn: 0.2285392	total: 187ms	remaining: 41ms
41:	learn: 0.2246077	total: 191ms	remaining: 36.4ms
42:	learn: 0.2209322	total: 195ms	remaining: 31.8ms
43:	learn: 0.2175655	total: 199ms	remaining: 27.2ms
44:	learn: 0.2140002	total: 204ms	remaining: 22.6ms
45:	learn: 0.2107791	total: 208ms	remaining: 18.1ms
46:	learn: 0.208

0:	learn: 0.6650889	total: 4.46ms	remaining: 219ms
1:	learn: 0.6392126	total: 9.9ms	remaining: 238ms
2:	learn: 0.6132293	total: 14.3ms	remaining: 225ms
3:	learn: 0.5892842	total: 18.9ms	remaining: 217ms
4:	learn: 0.5675069	total: 23.2ms	remaining: 209ms
5:	learn: 0.5462235	total: 27.5ms	remaining: 201ms
6:	learn: 0.5272324	total: 32.1ms	remaining: 197ms
7:	learn: 0.5094074	total: 36.4ms	remaining: 191ms
8:	learn: 0.4918057	total: 41.6ms	remaining: 190ms
9:	learn: 0.4766223	total: 46.6ms	remaining: 186ms
10:	learn: 0.4618974	total: 51.7ms	remaining: 183ms
11:	learn: 0.4467714	total: 56.8ms	remaining: 180ms
12:	learn: 0.4328917	total: 61.5ms	remaining: 175ms
13:	learn: 0.4205321	total: 66.4ms	remaining: 171ms
14:	learn: 0.4082009	total: 71.4ms	remaining: 167ms
15:	learn: 0.3963807	total: 76.4ms	remaining: 162ms
16:	learn: 0.3851414	total: 81.6ms	remaining: 158ms
17:	learn: 0.3744820	total: 86.5ms	remaining: 154ms
18:	learn: 0.3649896	total: 92.3ms	remaining: 151ms
19:	learn: 0.3552352	to

18:	learn: 0.3649734	total: 88.6ms	remaining: 145ms
19:	learn: 0.3554888	total: 93.5ms	remaining: 140ms
20:	learn: 0.3467926	total: 98ms	remaining: 135ms
21:	learn: 0.3376733	total: 103ms	remaining: 131ms
22:	learn: 0.3293527	total: 107ms	remaining: 126ms
23:	learn: 0.3213924	total: 112ms	remaining: 121ms
24:	learn: 0.3135628	total: 117ms	remaining: 117ms
25:	learn: 0.3059282	total: 123ms	remaining: 113ms
26:	learn: 0.2986617	total: 128ms	remaining: 109ms
27:	learn: 0.2925738	total: 133ms	remaining: 105ms
28:	learn: 0.2861072	total: 138ms	remaining: 100ms
29:	learn: 0.2800888	total: 143ms	remaining: 95.1ms
30:	learn: 0.2742414	total: 147ms	remaining: 90.1ms
31:	learn: 0.2683181	total: 152ms	remaining: 85.3ms
32:	learn: 0.2631697	total: 157ms	remaining: 80.7ms
33:	learn: 0.2577179	total: 162ms	remaining: 76.3ms
34:	learn: 0.2527640	total: 167ms	remaining: 71.5ms
35:	learn: 0.2478662	total: 171ms	remaining: 66.6ms
36:	learn: 0.2434447	total: 176ms	remaining: 61.8ms
37:	learn: 0.2387413	t

In [113]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.94522 (+/- 0.01566)
F1 Score: 0.95304 (+/- 0.01355)
Precision: 0.93039 (+/- 0.01285)
Recall: 0.97685 (+/- 0.01838)
