## Duplicated features

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Read Data

In [2]:
data = pd.read_csv('../DoHBrwTest.csv')
data.shape

(53860, 35)

In [3]:
# check the presence of missing data.
# (there are no missing data in this dataset)
[col for col in data.columns if data[col].isnull().sum() > 0]

[]

In [4]:
data.head(5)

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,is_intrusion
0,7.0,2.0,52742,443,18355.0,0.046455,55,1183.941449,66,1420.729738,...,1.0,0.0,0.0,0.046455,0.046455,0.046455,-10.0,-10.0,0.0,0
1,7.0,2.0,54640,443,18365.0,96.750105,42044,434.562836,44920,464.288902,...,0.561877,0.0017,0.041234,0.033549,0.026931,0.026952,0.481463,0.159978,1.229096,0
2,7.0,2.0,56611,443,18373.0,96.365606,41539,431.056284,44577,462.582054,...,0.549156,0.000956,0.030926,0.026551,0.026848,0.026879,-0.028834,-0.010614,1.164778,0
3,7.0,2.0,56611,443,18374.0,121.35682,60659,499.840058,67897,559.48236,...,0.55657,0.001013,0.031829,0.027571,0.026862,0.026941,0.066819,0.019791,1.154439,0
4,7.0,2.0,56611,443,18375.0,104.669253,30409,290.524668,30718,293.476825,...,0.331633,0.001226,0.035013,0.029797,0.026867,0.026908,0.251063,0.082517,1.175049,0


### Train - Test Split

In [5]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),         # drop the target
    data['is_intrusion'],                               # just the target
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((37702, 34), (16158, 34))

## Remove constant and quasi-constant (optional)

In [7]:
# remove constant and quasi-constant features first:
# we can remove the 2 types of features together with this code

# create an empty list
quasi_constant_feat = []

# iterate over every feature
for feature in X_train.columns:

    # find the predominant value, that is the value that is shared
    # by most observations
    predominant = (X_train[feature].value_counts() / np.float64(
        len(X_train))).sort_values(ascending=False).values[0]

    # evaluate predominant feature: do more than 99% of the observations
    # show 1 value?
    if predominant > 0.9:
        quasi_constant_feat.append(feature)

len(quasi_constant_feat)

1

In [8]:
quasi_constant_feat

['PacketTimeMode']

In [9]:
# we can then drop these columns from the train and test sets:

X_train.drop(labels=quasi_constant_feat, axis=1, inplace=True)
X_test.drop(labels=quasi_constant_feat, axis=1, inplace=True)

X_train.shape, X_test.shape

((37702, 33), (16158, 33))

## Remove duplicated features

In [10]:
# fiding duplicated features
duplicated_feat_pairs = {}
_duplicated_feat = []

for i in range(0, len(X_train.columns)):
    if i % 10 == 0:  
        print(i)
 
    feat_1 = X_train.columns[i]
    
    if feat_1 not in _duplicated_feat:
        duplicated_feat_pairs[feat_1] = []

        for feat_2 in X_train.columns[i + 1:]:
            if X_train[feat_1].equals(X_train[feat_2]):
                duplicated_feat_pairs[feat_1].append(feat_2)
                _duplicated_feat.append(feat_2)

0
10
20
30


In [11]:
# let's explore our list of duplicated features
len(_duplicated_feat)

0

We found 0 features that were duplicates of others.

In [12]:
# these are the ones:

_duplicated_feat

[]

In [13]:
# let's explore the dictionary we created:

duplicated_feat_pairs

{'SourceIP': [],
 'DestinationIP': [],
 'SourcePort': [],
 'DestinationPort': [],
 'TimeStamp': [],
 'Duration': [],
 'FlowBytesSent': [],
 'FlowSentRate': [],
 'FlowBytesReceived': [],
 'FlowReceivedRate': [],
 'PacketLengthVariance': [],
 'PacketLengthStandardDeviation': [],
 'PacketLengthMean': [],
 'PacketLengthMedian': [],
 'PacketLengthMode': [],
 'PacketLengthSkewFromMedian': [],
 'PacketLengthSkewFromMode': [],
 'PacketLengthCoefficientofVariation': [],
 'PacketTimeVariance': [],
 'PacketTimeStandardDeviation': [],
 'PacketTimeMean': [],
 'PacketTimeMedian': [],
 'PacketTimeSkewFromMedian': [],
 'PacketTimeSkewFromMode': [],
 'PacketTimeCoefficientofVariation': [],
 'ResponseTimeTimeVariance': [],
 'ResponseTimeTimeStandardDeviation': [],
 'ResponseTimeTimeMean': [],
 'ResponseTimeTimeMedian': [],
 'ResponseTimeTimeMode': [],
 'ResponseTimeTimeSkewFromMedian': [],
 'ResponseTimeTimeSkewFromMode': [],
 'ResponseTimeTimeCoefficientofVariation': []}

We see that for every feature, if it had duplicates, we have entries in the list, otherwise, we have empty lists. Let's explore those features with duplicates now:

In [14]:
# let's explore the number of keys in our dictionary
# we see it is 21, because 2 of the 23 were duplicates,
# so they were not included as keys

print(len(duplicated_feat_pairs.keys()))

33


In [15]:
# print the features with its duplicates
# iterate over every feature in our dict:
for feat in duplicated_feat_pairs.keys():
    # if it has duplicates, the list should not be empty:
    if len(duplicated_feat_pairs[feat]) > 0:
        # print the feature and its duplicates:
        print(feat, duplicated_feat_pairs[feat])
        print()

In [16]:
# to remove the duplicates (if necessary)
X_train = X_train[duplicated_feat_pairs.keys()]
X_test = X_test[duplicated_feat_pairs.keys()]
X_train.shape, X_test.shape

((37702, 33), (16158, 33))

0 duplicate features were found in the NSL-KDD dataset

## Standardize Data

In [17]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers

In [52]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation

In [53]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [54]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1).fit(X_train, y_train)

CPU times: user 83.7 ms, sys: 188 ms, total: 272 ms
Wall time: 4.75 s


In [55]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6230204405636038
F1 Score: 0.6914230019493178
FPR: 0.6253149547947235
TPR: 0.909409452183574


### Naive Bayes

In [56]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-09).fit(X_train, y_train)

CPU times: user 716 ms, sys: 36.6 ms, total: 753 ms
Wall time: 94.6 ms


In [57]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
FPR: 1.0
TPR: 1.0


### Random Forest

In [58]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 5.49 s, sys: 67.4 ms, total: 5.55 s
Wall time: 5.55 s


In [61]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
FPR: 1.0
TPR: 1.0


### KNN

In [62]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance').fit(X_train, y_train)

CPU times: user 10.4 ms, sys: 1.36 ms, total: 11.8 ms
Wall time: 9.85 ms


In [63]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.22667195872196866
f1: 0.316446814482178
fpr: 0.9109974803616422
tpr: 0.3854371421246047


### CatBoost

In [64]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.5672820	total: 20.2ms	remaining: 992ms
1:	learn: 0.4687045	total: 39.2ms	remaining: 940ms
2:	learn: 0.4033682	total: 57.8ms	remaining: 906ms
3:	learn: 0.3469832	total: 76ms	remaining: 874ms
4:	learn: 0.2970591	total: 94.7ms	remaining: 852ms
5:	learn: 0.2592355	total: 113ms	remaining: 828ms
6:	learn: 0.2244096	total: 131ms	remaining: 802ms
7:	learn: 0.1950752	total: 149ms	remaining: 784ms
8:	learn: 0.1699219	total: 168ms	remaining: 767ms
9:	learn: 0.1510645	total: 187ms	remaining: 747ms
10:	learn: 0.1319274	total: 205ms	remaining: 728ms
11:	learn: 0.1146620	total: 224ms	remaining: 710ms
12:	learn: 0.1036217	total: 243ms	remaining: 692ms
13:	learn: 0.0937029	total: 262ms	remaining: 674ms
14:	learn: 0.0854063	total: 281ms	remaining: 656ms
15:	learn: 0.0778330	total: 299ms	remaining: 636ms
16:	learn: 0.0728949	total: 317ms	remaining: 615ms
17:	learn: 0.0674587	total: 335ms	remaining: 596ms
18:	learn: 0.0634831	total: 353ms	remaining: 577ms
19:	learn: 0.0603704	total: 372ms	rema

In [65]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.543004564397698
F1 Score: 0.3909009429957908
FPR: 0.0009633911368015414
TPR: 0.01709255619177848


## Model Evaluation

In [88]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../KDDTest.csv")
test_df.shape

(22543, 42)

In [89]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion'])

### Model Evaluation - Logistic Regression

In [90]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=0.1, n_jobs=-1, random_state=42)

In [91]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [92]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9538783651031485
Testing accuracy is  0.6230204405636038


In [93]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.6230204405636038
F1 Score: 0.6914230019493178
Precision Score: 0.5577336338382515
Recall Score: 0.909409452183574
Confusion Matrix:
 [[ 5056  8438]
 [ 1060 10641]]


### Cross validation - Logistic Regression



In [94]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.82744 (+/- 0.01983)
F1 Score: 0.84375 (+/- 0.01827)
Precision: 0.87070 (+/- 0.02536)
Recall: 0.81866 (+/- 0.02864)


### Model Evaluation - Naive Bayes



In [95]:
modelNB = GaussianNB(var_smoothing=1e-09)
modelNB.fit(X_train, y_train)

GaussianNB()

In [96]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [97]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.8957698681246713
Testing accuracy is  0.46441754316332606


In [98]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
Precision Score: 0.46441754316332606
Recall Score: 1.0
Confusion Matrix:
 [[    0 13494]
 [    0 11701]]


### Cross validation - Naive Bayes


In [99]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.79373 (+/- 0.04138)
F1 Score: 0.80066 (+/- 0.05102)
Precision: 0.88661 (+/- 0.02556)
Recall: 0.73208 (+/- 0.10511)


### Model Evaluation - Random Forest



In [100]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [101]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [102]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  0.9999702313027774
Testing accuracy is  0.5355824568366739


In [103]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
Precision Score: 0.28684856807120773
Recall Score: 0.5355824568366739
Confusion Matrix:
 [[13494     0]
 [11701     0]]


### Cross validation - Random Forest


In [104]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98660 (+/- 0.00500)
F1 Score: 0.98823 (+/- 0.00440)
Precision: 0.98854 (+/- 0.00462)
Recall: 0.98792 (+/- 0.00489)


### Model Evaluation - KNN

In [105]:
modelKNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=1, n_neighbors=2,
                     weights='distance')

In [106]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [107]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9999702313027774
Testing accuracy is  0.22667195872196866


In [108]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.22667195872196866
F1 Score: 0.316446814482178
Precision Score: 0.2684044515860263
Recall Score: 0.3854371421246047
Confusion Matrix:
 [[ 1201 12293]
 [ 7191  4510]]


### Cross validation - KNN



In [109]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.97738 (+/- 0.00678)
F1 Score: 0.98013 (+/- 0.00590)
Precision: 0.98045 (+/- 0.00987)
Recall: 0.97982 (+/- 0.00514)


### Model Evaluation - CatBoost

In [110]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.5672820	total: 18.2ms	remaining: 890ms
1:	learn: 0.4687045	total: 36.1ms	remaining: 866ms
2:	learn: 0.4033682	total: 53.7ms	remaining: 842ms
3:	learn: 0.3469832	total: 70.6ms	remaining: 812ms
4:	learn: 0.2970591	total: 88.7ms	remaining: 798ms
5:	learn: 0.2592355	total: 106ms	remaining: 779ms
6:	learn: 0.2244096	total: 124ms	remaining: 759ms
7:	learn: 0.1950752	total: 141ms	remaining: 740ms
8:	learn: 0.1699219	total: 160ms	remaining: 729ms
9:	learn: 0.1510645	total: 179ms	remaining: 716ms
10:	learn: 0.1319274	total: 198ms	remaining: 701ms
11:	learn: 0.1146620	total: 216ms	remaining: 683ms
12:	learn: 0.1036217	total: 235ms	remaining: 670ms
13:	learn: 0.0937029	total: 254ms	remaining: 654ms
14:	learn: 0.0854063	total: 273ms	remaining: 636ms
15:	learn: 0.0778330	total: 291ms	remaining: 618ms
16:	learn: 0.0728949	total: 308ms	remaining: 599ms
17:	learn: 0.0674587	total: 326ms	remaining: 580ms
18:	learn: 0.0634831	total: 343ms	remaining: 560ms
19:	learn: 0.0603704	total: 361ms	re

<catboost.core.CatBoostClassifier at 0x7fb5c07164f0>

In [111]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [112]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.9962392212508806
Testing accuracy is  0.543004564397698


In [113]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.543004564397698
F1 Score: 0.3909009429957908
Precision Score: 0.7250883857207762
Recall Score: 0.543004564397698
Confusion Matrix:
 [[13481    13]
 [11501   200]]


### Cross validation - CatBoost

In [114]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6077712	total: 11.2ms	remaining: 549ms
1:	learn: 0.5517188	total: 22.9ms	remaining: 549ms
2:	learn: 0.4866925	total: 34.7ms	remaining: 543ms
3:	learn: 0.4392301	total: 46.3ms	remaining: 532ms
4:	learn: 0.4068634	total: 58.2ms	remaining: 524ms
5:	learn: 0.3651297	total: 70.1ms	remaining: 514ms
6:	learn: 0.3359935	total: 81.7ms	remaining: 502ms
7:	learn: 0.3022749	total: 93.1ms	remaining: 489ms
8:	learn: 0.2826473	total: 105ms	remaining: 477ms
9:	learn: 0.2587920	total: 116ms	remaining: 465ms
10:	learn: 0.2377668	total: 128ms	remaining: 453ms
11:	learn: 0.2185554	total: 139ms	remaining: 441ms
12:	learn: 0.2025339	total: 151ms	remaining: 431ms
13:	learn: 0.1908587	total: 163ms	remaining: 419ms
14:	learn: 0.1800125	total: 175ms	remaining: 408ms
15:	learn: 0.1695332	total: 186ms	remaining: 396ms
16:	learn: 0.1624371	total: 198ms	remaining: 385ms
17:	learn: 0.1555761	total: 210ms	remaining: 373ms
18:	learn: 0.1501569	total: 221ms	remaining: 360ms
19:	learn: 0.1444446	total: 233ms

17:	learn: 0.1597762	total: 209ms	remaining: 372ms
18:	learn: 0.1531731	total: 220ms	remaining: 359ms
19:	learn: 0.1459578	total: 232ms	remaining: 347ms
20:	learn: 0.1363746	total: 243ms	remaining: 336ms
21:	learn: 0.1310036	total: 255ms	remaining: 324ms
22:	learn: 0.1254235	total: 267ms	remaining: 314ms
23:	learn: 0.1211109	total: 279ms	remaining: 302ms
24:	learn: 0.1158244	total: 291ms	remaining: 291ms
25:	learn: 0.1111377	total: 302ms	remaining: 279ms
26:	learn: 0.1076379	total: 314ms	remaining: 268ms
27:	learn: 0.1045010	total: 326ms	remaining: 256ms
28:	learn: 0.1019631	total: 337ms	remaining: 244ms
29:	learn: 0.0985676	total: 349ms	remaining: 232ms
30:	learn: 0.0966948	total: 360ms	remaining: 221ms
31:	learn: 0.0941558	total: 372ms	remaining: 209ms
32:	learn: 0.0914431	total: 384ms	remaining: 198ms
33:	learn: 0.0896457	total: 395ms	remaining: 186ms
34:	learn: 0.0872277	total: 407ms	remaining: 175ms
35:	learn: 0.0859216	total: 418ms	remaining: 163ms
36:	learn: 0.0840150	total: 430

35:	learn: 0.0859950	total: 421ms	remaining: 164ms
36:	learn: 0.0844430	total: 433ms	remaining: 152ms
37:	learn: 0.0833863	total: 445ms	remaining: 140ms
38:	learn: 0.0814689	total: 456ms	remaining: 129ms
39:	learn: 0.0801516	total: 467ms	remaining: 117ms
40:	learn: 0.0788085	total: 478ms	remaining: 105ms
41:	learn: 0.0782121	total: 489ms	remaining: 93.2ms
42:	learn: 0.0774700	total: 500ms	remaining: 81.4ms
43:	learn: 0.0762740	total: 511ms	remaining: 69.7ms
44:	learn: 0.0753969	total: 522ms	remaining: 58ms
45:	learn: 0.0744440	total: 534ms	remaining: 46.4ms
46:	learn: 0.0734702	total: 546ms	remaining: 34.8ms
47:	learn: 0.0722450	total: 557ms	remaining: 23.2ms
48:	learn: 0.0713899	total: 568ms	remaining: 11.6ms
49:	learn: 0.0702807	total: 579ms	remaining: 0us
0:	learn: 0.6102082	total: 11.5ms	remaining: 563ms
1:	learn: 0.5427471	total: 23.7ms	remaining: 569ms
2:	learn: 0.4789235	total: 35ms	remaining: 548ms
3:	learn: 0.4292852	total: 46.6ms	remaining: 536ms
4:	learn: 0.3866261	total: 58

0:	learn: 0.6077712	total: 11.3ms	remaining: 555ms
1:	learn: 0.5517188	total: 22.7ms	remaining: 544ms
2:	learn: 0.4866925	total: 33.8ms	remaining: 530ms
3:	learn: 0.4392301	total: 45.7ms	remaining: 525ms
4:	learn: 0.4068634	total: 57.9ms	remaining: 521ms
5:	learn: 0.3651297	total: 69.6ms	remaining: 510ms
6:	learn: 0.3359935	total: 81.1ms	remaining: 498ms
7:	learn: 0.3022749	total: 92.6ms	remaining: 486ms
8:	learn: 0.2826473	total: 104ms	remaining: 474ms
9:	learn: 0.2587920	total: 116ms	remaining: 463ms
10:	learn: 0.2377668	total: 127ms	remaining: 451ms
11:	learn: 0.2185554	total: 139ms	remaining: 440ms
12:	learn: 0.2025339	total: 151ms	remaining: 429ms
13:	learn: 0.1908587	total: 163ms	remaining: 418ms
14:	learn: 0.1800125	total: 174ms	remaining: 407ms
15:	learn: 0.1695332	total: 186ms	remaining: 395ms
16:	learn: 0.1624371	total: 198ms	remaining: 384ms
17:	learn: 0.1555761	total: 209ms	remaining: 372ms
18:	learn: 0.1501569	total: 221ms	remaining: 360ms
19:	learn: 0.1444446	total: 233ms

17:	learn: 0.1597762	total: 212ms	remaining: 376ms
18:	learn: 0.1531731	total: 222ms	remaining: 363ms
19:	learn: 0.1459578	total: 234ms	remaining: 351ms
20:	learn: 0.1363746	total: 246ms	remaining: 340ms
21:	learn: 0.1310036	total: 258ms	remaining: 328ms
22:	learn: 0.1254235	total: 270ms	remaining: 316ms
23:	learn: 0.1211109	total: 282ms	remaining: 305ms
24:	learn: 0.1158244	total: 293ms	remaining: 293ms
25:	learn: 0.1111377	total: 306ms	remaining: 282ms
26:	learn: 0.1076379	total: 318ms	remaining: 271ms
27:	learn: 0.1045010	total: 329ms	remaining: 259ms
28:	learn: 0.1019631	total: 341ms	remaining: 247ms
29:	learn: 0.0985676	total: 353ms	remaining: 235ms
30:	learn: 0.0966948	total: 365ms	remaining: 223ms
31:	learn: 0.0941558	total: 376ms	remaining: 212ms
32:	learn: 0.0914431	total: 388ms	remaining: 200ms
33:	learn: 0.0896457	total: 400ms	remaining: 188ms
34:	learn: 0.0872277	total: 411ms	remaining: 176ms
35:	learn: 0.0859216	total: 423ms	remaining: 164ms
36:	learn: 0.0840150	total: 435

33:	learn: 0.0890390	total: 417ms	remaining: 196ms
34:	learn: 0.0873788	total: 429ms	remaining: 184ms
35:	learn: 0.0859950	total: 441ms	remaining: 171ms
36:	learn: 0.0844430	total: 453ms	remaining: 159ms
37:	learn: 0.0833863	total: 465ms	remaining: 147ms
38:	learn: 0.0814689	total: 477ms	remaining: 135ms
39:	learn: 0.0801516	total: 489ms	remaining: 122ms
40:	learn: 0.0788085	total: 501ms	remaining: 110ms
41:	learn: 0.0782121	total: 512ms	remaining: 97.5ms
42:	learn: 0.0774700	total: 523ms	remaining: 85.2ms
43:	learn: 0.0762740	total: 535ms	remaining: 73ms
44:	learn: 0.0753969	total: 546ms	remaining: 60.7ms
45:	learn: 0.0744440	total: 558ms	remaining: 48.5ms
46:	learn: 0.0734702	total: 570ms	remaining: 36.4ms
47:	learn: 0.0722450	total: 582ms	remaining: 24.2ms
48:	learn: 0.0713899	total: 593ms	remaining: 12.1ms
49:	learn: 0.0702807	total: 604ms	remaining: 0us
0:	learn: 0.6102082	total: 12ms	remaining: 587ms
1:	learn: 0.5427471	total: 23.7ms	remaining: 569ms
2:	learn: 0.4789235	total: 35

0:	learn: 0.6077712	total: 11.1ms	remaining: 543ms
1:	learn: 0.5517188	total: 22.6ms	remaining: 542ms
2:	learn: 0.4866925	total: 33.9ms	remaining: 531ms
3:	learn: 0.4392301	total: 46.2ms	remaining: 531ms
4:	learn: 0.4068634	total: 58.3ms	remaining: 524ms
5:	learn: 0.3651297	total: 69.9ms	remaining: 513ms
6:	learn: 0.3359935	total: 81.5ms	remaining: 500ms
7:	learn: 0.3022749	total: 93.3ms	remaining: 490ms
8:	learn: 0.2826473	total: 105ms	remaining: 476ms
9:	learn: 0.2587920	total: 116ms	remaining: 463ms
10:	learn: 0.2377668	total: 127ms	remaining: 452ms
11:	learn: 0.2185554	total: 139ms	remaining: 441ms
12:	learn: 0.2025339	total: 151ms	remaining: 430ms
13:	learn: 0.1908587	total: 163ms	remaining: 418ms
14:	learn: 0.1800125	total: 174ms	remaining: 407ms
15:	learn: 0.1695332	total: 186ms	remaining: 395ms
16:	learn: 0.1624371	total: 197ms	remaining: 383ms
17:	learn: 0.1555761	total: 209ms	remaining: 372ms
18:	learn: 0.1501569	total: 220ms	remaining: 359ms
19:	learn: 0.1444446	total: 232ms

17:	learn: 0.1597762	total: 212ms	remaining: 378ms
18:	learn: 0.1531731	total: 224ms	remaining: 365ms
19:	learn: 0.1459578	total: 235ms	remaining: 353ms
20:	learn: 0.1363746	total: 248ms	remaining: 342ms
21:	learn: 0.1310036	total: 260ms	remaining: 330ms
22:	learn: 0.1254235	total: 272ms	remaining: 319ms
23:	learn: 0.1211109	total: 284ms	remaining: 307ms
24:	learn: 0.1158244	total: 296ms	remaining: 296ms
25:	learn: 0.1111377	total: 307ms	remaining: 284ms
26:	learn: 0.1076379	total: 319ms	remaining: 272ms
27:	learn: 0.1045010	total: 331ms	remaining: 260ms
28:	learn: 0.1019631	total: 343ms	remaining: 248ms
29:	learn: 0.0985676	total: 355ms	remaining: 236ms
30:	learn: 0.0966948	total: 367ms	remaining: 225ms
31:	learn: 0.0941558	total: 378ms	remaining: 213ms
32:	learn: 0.0914431	total: 390ms	remaining: 201ms
33:	learn: 0.0896457	total: 402ms	remaining: 189ms
34:	learn: 0.0872277	total: 414ms	remaining: 177ms
35:	learn: 0.0859216	total: 425ms	remaining: 165ms
36:	learn: 0.0840150	total: 437

34:	learn: 0.0873788	total: 410ms	remaining: 176ms
35:	learn: 0.0859950	total: 422ms	remaining: 164ms
36:	learn: 0.0844430	total: 433ms	remaining: 152ms
37:	learn: 0.0833863	total: 445ms	remaining: 141ms
38:	learn: 0.0814689	total: 457ms	remaining: 129ms
39:	learn: 0.0801516	total: 469ms	remaining: 117ms
40:	learn: 0.0788085	total: 480ms	remaining: 105ms
41:	learn: 0.0782121	total: 492ms	remaining: 93.7ms
42:	learn: 0.0774700	total: 504ms	remaining: 82ms
43:	learn: 0.0762740	total: 515ms	remaining: 70.2ms
44:	learn: 0.0753969	total: 527ms	remaining: 58.5ms
45:	learn: 0.0744440	total: 538ms	remaining: 46.8ms
46:	learn: 0.0734702	total: 550ms	remaining: 35.1ms
47:	learn: 0.0722450	total: 561ms	remaining: 23.4ms
48:	learn: 0.0713899	total: 573ms	remaining: 11.7ms
49:	learn: 0.0702807	total: 585ms	remaining: 0us
0:	learn: 0.6102082	total: 11.8ms	remaining: 576ms
1:	learn: 0.5427471	total: 23.6ms	remaining: 566ms
2:	learn: 0.4789235	total: 35.2ms	remaining: 552ms
3:	learn: 0.4292852	total: 

0:	learn: 0.6077712	total: 11.1ms	remaining: 545ms
1:	learn: 0.5517188	total: 22.9ms	remaining: 549ms
2:	learn: 0.4866925	total: 34.6ms	remaining: 542ms
3:	learn: 0.4392301	total: 46.6ms	remaining: 536ms
4:	learn: 0.4068634	total: 58.6ms	remaining: 528ms
5:	learn: 0.3651297	total: 70.3ms	remaining: 516ms
6:	learn: 0.3359935	total: 82.1ms	remaining: 504ms
7:	learn: 0.3022749	total: 93.7ms	remaining: 492ms
8:	learn: 0.2826473	total: 105ms	remaining: 479ms
9:	learn: 0.2587920	total: 117ms	remaining: 468ms
10:	learn: 0.2377668	total: 128ms	remaining: 455ms
11:	learn: 0.2185554	total: 140ms	remaining: 442ms
12:	learn: 0.2025339	total: 151ms	remaining: 429ms
13:	learn: 0.1908587	total: 162ms	remaining: 417ms
14:	learn: 0.1800125	total: 174ms	remaining: 405ms
15:	learn: 0.1695332	total: 185ms	remaining: 394ms
16:	learn: 0.1624371	total: 197ms	remaining: 382ms
17:	learn: 0.1555761	total: 208ms	remaining: 370ms
18:	learn: 0.1501569	total: 220ms	remaining: 358ms
19:	learn: 0.1444446	total: 232ms

17:	learn: 0.1597762	total: 211ms	remaining: 375ms
18:	learn: 0.1531731	total: 222ms	remaining: 362ms
19:	learn: 0.1459578	total: 234ms	remaining: 350ms
20:	learn: 0.1363746	total: 247ms	remaining: 340ms
21:	learn: 0.1310036	total: 259ms	remaining: 329ms
22:	learn: 0.1254235	total: 271ms	remaining: 318ms
23:	learn: 0.1211109	total: 283ms	remaining: 306ms
24:	learn: 0.1158244	total: 295ms	remaining: 295ms
25:	learn: 0.1111377	total: 307ms	remaining: 283ms
26:	learn: 0.1076379	total: 319ms	remaining: 271ms
27:	learn: 0.1045010	total: 331ms	remaining: 260ms
28:	learn: 0.1019631	total: 342ms	remaining: 248ms
29:	learn: 0.0985676	total: 354ms	remaining: 236ms
30:	learn: 0.0966948	total: 366ms	remaining: 224ms
31:	learn: 0.0941558	total: 378ms	remaining: 213ms
32:	learn: 0.0914431	total: 390ms	remaining: 201ms
33:	learn: 0.0896457	total: 401ms	remaining: 189ms
34:	learn: 0.0872277	total: 413ms	remaining: 177ms
35:	learn: 0.0859216	total: 425ms	remaining: 165ms
36:	learn: 0.0840150	total: 437

34:	learn: 0.0873788	total: 413ms	remaining: 177ms
35:	learn: 0.0859950	total: 425ms	remaining: 165ms
36:	learn: 0.0844430	total: 436ms	remaining: 153ms
37:	learn: 0.0833863	total: 448ms	remaining: 141ms
38:	learn: 0.0814689	total: 460ms	remaining: 130ms
39:	learn: 0.0801516	total: 472ms	remaining: 118ms
40:	learn: 0.0788085	total: 484ms	remaining: 106ms
41:	learn: 0.0782121	total: 495ms	remaining: 94.3ms
42:	learn: 0.0774700	total: 506ms	remaining: 82.4ms
43:	learn: 0.0762740	total: 518ms	remaining: 70.6ms
44:	learn: 0.0753969	total: 529ms	remaining: 58.8ms
45:	learn: 0.0744440	total: 541ms	remaining: 47ms
46:	learn: 0.0734702	total: 552ms	remaining: 35.2ms
47:	learn: 0.0722450	total: 564ms	remaining: 23.5ms
48:	learn: 0.0713899	total: 575ms	remaining: 11.7ms
49:	learn: 0.0702807	total: 587ms	remaining: 0us
0:	learn: 0.6102082	total: 11.4ms	remaining: 558ms
1:	learn: 0.5427471	total: 23.6ms	remaining: 566ms
2:	learn: 0.4789235	total: 35ms	remaining: 548ms
3:	learn: 0.4292852	total: 46

In [115]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.97959 (+/- 0.00371)
F1 Score: 0.98220 (+/- 0.00317)
Precision: 0.97566 (+/- 0.00952)
Recall: 0.98886 (+/- 0.00718)
