## Duplicated features

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Read Data

In [2]:
data = pd.read_csv('../Kyoto_Train.csv')
data.shape

(124055, 24)

In [3]:
# check the presence of missing data.
# (there are no missing data in this dataset)
[col for col in data.columns if data[col].isnull().sum() > 0]

[]

In [4]:
data.head(5)

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_port_rate,...,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code,Label_code
0,2.863309,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,90847.0,14024.0,25836.0,1.0,0.0
1,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,218531.0,8968.0,45541.0,1.0,0.0
2,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,176665.0,15997.0,59860.0,1.0,0.0
3,0.0,0,0,0,0.0,0.0,0.67,49,100,0.02,...,6.0,6.0,0.0,0.0,0.0,52769.0,473.0,40649.0,1.0,0.0
4,0.0,0,0,1,1.0,0.0,0.36,0,2,0.0,...,6.0,0.0,0.0,0.0,0.0,65048.0,16609.0,39283.0,1.0,0.0


### Train - Test Split

In [5]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Label_code'], axis=1),         # drop the target
    data['Label_code'],                               # just the target
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((99244, 23), (24811, 23))

## Remove constant and quasi-constant (optional)

In [6]:
# remove constant and quasi-constant features first:
# we can remove the 2 types of features together with this code

# create an empty list
quasi_constant_feat = []

# iterate over every feature
for feature in X_train.columns:

    # find the predominant value, that is the value that is shared
    # by most observations
    predominant = (X_train[feature].value_counts() / np.float64(
        len(X_train))).sort_values(ascending=False).values[0]

    # evaluate predominant feature: do more than 99% of the observations
    # show 1 value?
    if predominant > 0.998:
        quasi_constant_feat.append(feature)

len(quasi_constant_feat)

2

In [7]:
quasi_constant_feat

['Malware_detection_code', 'Ashula_detection_code']

In [8]:
# we can then drop these columns from the train and test sets:

X_train.drop(labels=quasi_constant_feat, axis=1, inplace=True)
X_test.drop(labels=quasi_constant_feat, axis=1, inplace=True)

X_train.shape, X_test.shape

((99244, 21), (24811, 21))

## Remove duplicated features

In [9]:
# fiding duplicated features
duplicated_feat_pairs = {}
_duplicated_feat = []

for i in range(0, len(X_train.columns)):
    if i % 10 == 0:  
        print(i)
 
    feat_1 = X_train.columns[i]
    
    if feat_1 not in _duplicated_feat:
        duplicated_feat_pairs[feat_1] = []

        for feat_2 in X_train.columns[i + 1:]:
            if X_train[feat_1].equals(X_train[feat_2]):
                duplicated_feat_pairs[feat_1].append(feat_2)
                _duplicated_feat.append(feat_2)

0
10
20


In [10]:
# let's explore our list of duplicated features
len(_duplicated_feat)

0

We found 0 features that were duplicates of others.

In [11]:
# these are the ones:

_duplicated_feat

[]

In [12]:
# let's explore the dictionary we created:

duplicated_feat_pairs

{'Duration': [],
 'Source': [],
 'Destination_bytes': [],
 'Count': [],
 'Same_srv_rate': [],
 'Serror_rate': [],
 'Srv_serror_rate': [],
 'Dst_host_count': [],
 'Dst_host_srv_count': [],
 'Dst_host_same_port_rate': [],
 'Dst_host_serror_rate': [],
 'Dst_host_srv_serror_rate': [],
 'Source_Port_Number': [],
 'Destination_Port_Number': [],
 'Service_code': [],
 'Flag_code': [],
 'IDS_detection_code': [],
 'Source_IP_Address_code': [],
 'Destination_IP_Address_code': [],
 'Start_Time_code': [],
 'Protocol_code': []}

We see that for every feature, if it had duplicates, we have entries in the list, otherwise, we have empty lists. Let's explore those features with duplicates now:

In [13]:
# let's explore the number of keys in our dictionary
# we see it is 21, because 2 of the 23 were duplicates,
# so they were not included as keys

print(len(duplicated_feat_pairs.keys()))

21


In [14]:
# print the features with its duplicates
# iterate over every feature in our dict:
for feat in duplicated_feat_pairs.keys():
    # if it has duplicates, the list should not be empty:
    if len(duplicated_feat_pairs[feat]) > 0:
        # print the feature and its duplicates:
        print(feat, duplicated_feat_pairs[feat])
        print()

In [15]:
# to remove the duplicates (if necessary)
X_train = X_train[duplicated_feat_pairs.keys()]
X_test = X_test[duplicated_feat_pairs.keys()]
X_train.shape, X_test.shape

((99244, 21), (24811, 21))

0 duplicate features were found in the Kyoto dataset

## Standardize Data

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers

In [17]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [19]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1).fit(X_train, y_train)

CPU times: user 76 ms, sys: 175 ms, total: 251 ms
Wall time: 2.25 s


In [20]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.449115311756882
F1 Score: 0.24519549370444005
FPR: 0.6029987542267308
TPR: 0.9507494646680942


### Naive Bayes

In [21]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-05).fit(X_train, y_train)

CPU times: user 40.4 ms, sys: 8.32 ms, total: 48.7 ms
Wall time: 46.9 ms


In [22]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9045584619725122
F1 Score: 0.0
FPR: 0.0014682327816337426
TPR: 0.0


### Random Forest

In [25]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 6.9 s, sys: 58.2 ms, total: 6.95 s
Wall time: 6.96 s


In [26]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


### KNN

In [27]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform').fit(X_train, y_train)

CPU times: user 6.79 s, sys: 36 ms, total: 6.83 s
Wall time: 6.82 s


In [28]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.9058885171899561
f1: 0.0
fpr: 1.0
tpr: 1.0


### CatBoost

In [29]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.5830950	total: 75.9ms	remaining: 3.72s
1:	learn: 0.4886781	total: 92.9ms	remaining: 2.23s
2:	learn: 0.4191358	total: 110ms	remaining: 1.72s
3:	learn: 0.3429678	total: 127ms	remaining: 1.46s
4:	learn: 0.2962291	total: 145ms	remaining: 1.3s
5:	learn: 0.2613715	total: 162ms	remaining: 1.19s
6:	learn: 0.2070733	total: 179ms	remaining: 1.1s
7:	learn: 0.1594691	total: 197ms	remaining: 1.03s
8:	learn: 0.1219426	total: 214ms	remaining: 974ms
9:	learn: 0.0938478	total: 231ms	remaining: 923ms
10:	learn: 0.0732574	total: 248ms	remaining: 881ms
11:	learn: 0.0578715	total: 266ms	remaining: 842ms
12:	learn: 0.0505377	total: 284ms	remaining: 807ms
13:	learn: 0.0392189	total: 302ms	remaining: 776ms
14:	learn: 0.0365829	total: 319ms	remaining: 744ms
15:	learn: 0.0286756	total: 336ms	remaining: 715ms
16:	learn: 0.0226511	total: 354ms	remaining: 688ms
17:	learn: 0.0203467	total: 372ms	remaining: 661ms
18:	learn: 0.0166118	total: 389ms	remaining: 635ms
19:	learn: 0.0152240	total: 407ms	remaini

In [30]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


## Model Evaluation

In [31]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../Kyoto_Test.csv")
test_df.shape

(62028, 24)

In [33]:
# Create feature matrix X and target vextor y
y_eval = test_df['Label_code']
X_eval = test_df.drop(columns=['Label_code','Malware_detection_code', 'Ashula_detection_code'])

### Model Evaluation - Logistic Regression

In [35]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=1, n_jobs=-1, random_state=42)

In [36]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [37]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9302426343154246
Testing accuracy is  0.449115311756882


In [38]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.449115311756882
F1 Score: 0.24519549370444005
Precision Score: 0.1407468458758638
Recall Score: 0.9507494646680942
Confusion Matrix:
 [[ 8923 13553]
 [  115  2220]]


### Cross validation - Logistic Regression



In [39]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.90063 (+/- 0.00236)
F1 Score: 0.00224 (+/- 0.00643)


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.02781 (+/- 0.07544)
Recall: 0.00117 (+/- 0.00336)


### Model Evaluation - Naive Bayes



In [40]:
modelNB = GaussianNB(var_smoothing=1e-05)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-05)

In [41]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [42]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.6893918020232961
Testing accuracy is  0.9045584619725122


In [43]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.9045584619725122
F1 Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
Confusion Matrix:
 [[22443    33]
 [ 2335     0]]


### Cross validation - Naive Bayes


In [44]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.51851 (+/- 0.28039)
F1 Score: 0.25979 (+/- 0.02808)
Precision: 0.21404 (+/- 0.38891)
Recall: 0.86306 (+/- 0.46302)


### Model Evaluation - Random Forest



In [45]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [46]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [47]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.9058885171899561


In [48]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - Random Forest


In [49]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99936 (+/- 0.00072)
F1 Score: 0.99665 (+/- 0.00376)
Precision: 0.99950 (+/- 0.00154)
Recall: 0.99382 (+/- 0.00686)


### Model Evaluation - KNN

In [50]:
modelKNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(leaf_size=1, n_neighbors=2)

In [51]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [52]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9948309217685704
Testing accuracy is  0.9058885171899561


In [53]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Cross validation - KNN



In [54]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98446 (+/- 0.00246)
F1 Score: 0.91715 (+/- 0.01310)
Precision: 0.94433 (+/- 0.01808)
Recall: 0.89156 (+/- 0.01604)


### Model Evaluation - CatBoost

In [55]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.5830950	total: 16.8ms	remaining: 822ms
1:	learn: 0.4886781	total: 33.1ms	remaining: 794ms
2:	learn: 0.4191358	total: 49.3ms	remaining: 772ms
3:	learn: 0.3429678	total: 66ms	remaining: 758ms
4:	learn: 0.2962291	total: 82.4ms	remaining: 741ms
5:	learn: 0.2613715	total: 98.5ms	remaining: 722ms
6:	learn: 0.2070733	total: 115ms	remaining: 704ms
7:	learn: 0.1594691	total: 131ms	remaining: 689ms
8:	learn: 0.1219426	total: 148ms	remaining: 673ms
9:	learn: 0.0938478	total: 165ms	remaining: 661ms
10:	learn: 0.0732574	total: 182ms	remaining: 644ms
11:	learn: 0.0578715	total: 198ms	remaining: 627ms
12:	learn: 0.0505377	total: 215ms	remaining: 612ms
13:	learn: 0.0392189	total: 233ms	remaining: 598ms
14:	learn: 0.0365829	total: 249ms	remaining: 581ms
15:	learn: 0.0286756	total: 266ms	remaining: 564ms
16:	learn: 0.0226511	total: 282ms	remaining: 548ms
17:	learn: 0.0203467	total: 300ms	remaining: 533ms
18:	learn: 0.0166118	total: 316ms	remaining: 516ms
19:	learn: 0.0152240	total: 332ms	rem

<catboost.core.CatBoostClassifier at 0x7fe8c87f7e50>

In [56]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [57]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  1.0
Testing accuracy is  0.9058885171899561


In [58]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - CatBoost

In [59]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.5953306	total: 13.5ms	remaining: 663ms
1:	learn: 0.4982996	total: 27.8ms	remaining: 668ms
2:	learn: 0.4287292	total: 41.4ms	remaining: 649ms
3:	learn: 0.3532792	total: 55.1ms	remaining: 634ms
4:	learn: 0.3000730	total: 69.5ms	remaining: 625ms
5:	learn: 0.2483823	total: 83.2ms	remaining: 610ms
6:	learn: 0.1973843	total: 98.2ms	remaining: 603ms
7:	learn: 0.1550101	total: 113ms	remaining: 596ms
8:	learn: 0.1354717	total: 129ms	remaining: 586ms
9:	learn: 0.1144453	total: 145ms	remaining: 579ms
10:	learn: 0.0900407	total: 160ms	remaining: 568ms
11:	learn: 0.0785206	total: 176ms	remaining: 557ms
12:	learn: 0.0707359	total: 191ms	remaining: 545ms
13:	learn: 0.0656920	total: 207ms	remaining: 532ms
14:	learn: 0.0594646	total: 221ms	remaining: 516ms
15:	learn: 0.0478001	total: 235ms	remaining: 500ms
16:	learn: 0.0384034	total: 249ms	remaining: 483ms
17:	learn: 0.0345135	total: 263ms	remaining: 468ms
18:	learn: 0.0285966	total: 278ms	remaining: 453ms
19:	learn: 0.0266089	total: 291ms	

15:	learn: 0.0432341	total: 220ms	remaining: 468ms
16:	learn: 0.0357080	total: 234ms	remaining: 455ms
17:	learn: 0.0291042	total: 248ms	remaining: 441ms
18:	learn: 0.0237276	total: 262ms	remaining: 427ms
19:	learn: 0.0195390	total: 275ms	remaining: 413ms
20:	learn: 0.0162131	total: 288ms	remaining: 398ms
21:	learn: 0.0134680	total: 302ms	remaining: 384ms
22:	learn: 0.0123605	total: 316ms	remaining: 371ms
23:	learn: 0.0112202	total: 330ms	remaining: 357ms
24:	learn: 0.0095560	total: 344ms	remaining: 344ms
25:	learn: 0.0081383	total: 357ms	remaining: 330ms
26:	learn: 0.0070126	total: 371ms	remaining: 316ms
27:	learn: 0.0060563	total: 385ms	remaining: 302ms
28:	learn: 0.0052409	total: 398ms	remaining: 288ms
29:	learn: 0.0046626	total: 412ms	remaining: 275ms
30:	learn: 0.0041141	total: 426ms	remaining: 261ms
31:	learn: 0.0036439	total: 440ms	remaining: 247ms
32:	learn: 0.0031690	total: 453ms	remaining: 234ms
33:	learn: 0.0028811	total: 467ms	remaining: 220ms
34:	learn: 0.0025331	total: 480

28:	learn: 0.0051849	total: 423ms	remaining: 306ms
29:	learn: 0.0045904	total: 437ms	remaining: 291ms
30:	learn: 0.0040617	total: 451ms	remaining: 276ms
31:	learn: 0.0036450	total: 464ms	remaining: 261ms
32:	learn: 0.0031863	total: 478ms	remaining: 246ms
33:	learn: 0.0028744	total: 491ms	remaining: 231ms
34:	learn: 0.0025443	total: 504ms	remaining: 216ms
35:	learn: 0.0022830	total: 518ms	remaining: 201ms
36:	learn: 0.0020711	total: 532ms	remaining: 187ms
37:	learn: 0.0018699	total: 546ms	remaining: 173ms
38:	learn: 0.0018049	total: 560ms	remaining: 158ms
39:	learn: 0.0017105	total: 573ms	remaining: 143ms
40:	learn: 0.0016590	total: 587ms	remaining: 129ms
41:	learn: 0.0016400	total: 600ms	remaining: 114ms
42:	learn: 0.0014875	total: 614ms	remaining: 99.9ms
43:	learn: 0.0013674	total: 628ms	remaining: 85.6ms
44:	learn: 0.0012594	total: 642ms	remaining: 71.3ms
45:	learn: 0.0012027	total: 656ms	remaining: 57ms
46:	learn: 0.0011688	total: 669ms	remaining: 42.7ms
47:	learn: 0.0011170	total: 

45:	learn: 0.0013580	total: 667ms	remaining: 58ms
46:	learn: 0.0013200	total: 681ms	remaining: 43.5ms
47:	learn: 0.0012843	total: 697ms	remaining: 29ms
48:	learn: 0.0012180	total: 713ms	remaining: 14.5ms
49:	learn: 0.0011921	total: 727ms	remaining: 0us
0:	learn: 0.5953306	total: 16.1ms	remaining: 790ms
1:	learn: 0.4982996	total: 31.9ms	remaining: 765ms
2:	learn: 0.4287292	total: 45.9ms	remaining: 719ms
3:	learn: 0.3532792	total: 60.2ms	remaining: 692ms
4:	learn: 0.3000730	total: 73.5ms	remaining: 662ms
5:	learn: 0.2483823	total: 87.7ms	remaining: 643ms
6:	learn: 0.1973843	total: 103ms	remaining: 634ms
7:	learn: 0.1550101	total: 117ms	remaining: 614ms
8:	learn: 0.1354717	total: 131ms	remaining: 595ms
9:	learn: 0.1144453	total: 144ms	remaining: 577ms
10:	learn: 0.0900407	total: 158ms	remaining: 560ms
11:	learn: 0.0785206	total: 172ms	remaining: 545ms
12:	learn: 0.0707359	total: 186ms	remaining: 530ms
13:	learn: 0.0656920	total: 201ms	remaining: 517ms
14:	learn: 0.0594646	total: 216ms	rem

9:	learn: 0.1371155	total: 143ms	remaining: 574ms
10:	learn: 0.1063697	total: 157ms	remaining: 558ms
11:	learn: 0.0958535	total: 172ms	remaining: 544ms
12:	learn: 0.0754675	total: 186ms	remaining: 529ms
13:	learn: 0.0616389	total: 199ms	remaining: 512ms
14:	learn: 0.0535908	total: 213ms	remaining: 498ms
15:	learn: 0.0432341	total: 228ms	remaining: 484ms
16:	learn: 0.0357080	total: 242ms	remaining: 470ms
17:	learn: 0.0291042	total: 256ms	remaining: 455ms
18:	learn: 0.0237276	total: 270ms	remaining: 441ms
19:	learn: 0.0195390	total: 284ms	remaining: 427ms
20:	learn: 0.0162131	total: 298ms	remaining: 412ms
21:	learn: 0.0134680	total: 312ms	remaining: 397ms
22:	learn: 0.0123605	total: 326ms	remaining: 383ms
23:	learn: 0.0112202	total: 340ms	remaining: 369ms
24:	learn: 0.0095560	total: 355ms	remaining: 355ms
25:	learn: 0.0081383	total: 369ms	remaining: 341ms
26:	learn: 0.0070126	total: 384ms	remaining: 327ms
27:	learn: 0.0060563	total: 398ms	remaining: 313ms
28:	learn: 0.0052409	total: 412m

26:	learn: 0.0069029	total: 378ms	remaining: 322ms
27:	learn: 0.0059563	total: 392ms	remaining: 308ms
28:	learn: 0.0051849	total: 406ms	remaining: 294ms
29:	learn: 0.0045904	total: 419ms	remaining: 280ms
30:	learn: 0.0040617	total: 433ms	remaining: 266ms
31:	learn: 0.0036450	total: 448ms	remaining: 252ms
32:	learn: 0.0031863	total: 461ms	remaining: 238ms
33:	learn: 0.0028744	total: 475ms	remaining: 223ms
34:	learn: 0.0025443	total: 488ms	remaining: 209ms
35:	learn: 0.0022830	total: 502ms	remaining: 195ms
36:	learn: 0.0020711	total: 516ms	remaining: 181ms
37:	learn: 0.0018699	total: 529ms	remaining: 167ms
38:	learn: 0.0018049	total: 543ms	remaining: 153ms
39:	learn: 0.0017105	total: 557ms	remaining: 139ms
40:	learn: 0.0016590	total: 571ms	remaining: 125ms
41:	learn: 0.0016400	total: 585ms	remaining: 111ms
42:	learn: 0.0014875	total: 599ms	remaining: 97.5ms
43:	learn: 0.0013674	total: 612ms	remaining: 83.5ms
44:	learn: 0.0012594	total: 626ms	remaining: 69.6ms
45:	learn: 0.0012027	total: 

38:	learn: 0.0020593	total: 564ms	remaining: 159ms
39:	learn: 0.0019394	total: 578ms	remaining: 145ms
40:	learn: 0.0018329	total: 592ms	remaining: 130ms
41:	learn: 0.0018024	total: 607ms	remaining: 116ms
42:	learn: 0.0016899	total: 621ms	remaining: 101ms
43:	learn: 0.0015138	total: 636ms	remaining: 86.7ms
44:	learn: 0.0014324	total: 651ms	remaining: 72.3ms
45:	learn: 0.0013580	total: 665ms	remaining: 57.8ms
46:	learn: 0.0013200	total: 679ms	remaining: 43.4ms
47:	learn: 0.0012843	total: 694ms	remaining: 28.9ms
48:	learn: 0.0012180	total: 708ms	remaining: 14.5ms
49:	learn: 0.0011921	total: 722ms	remaining: 0us
0:	learn: 0.5953306	total: 14ms	remaining: 686ms
1:	learn: 0.4982996	total: 28.4ms	remaining: 682ms
2:	learn: 0.4287292	total: 42.3ms	remaining: 663ms
3:	learn: 0.3532792	total: 56.7ms	remaining: 652ms
4:	learn: 0.3000730	total: 70.6ms	remaining: 635ms
5:	learn: 0.2483823	total: 84.2ms	remaining: 618ms
6:	learn: 0.1973843	total: 97.2ms	remaining: 597ms
7:	learn: 0.1550101	total: 11

0:	learn: 0.5925768	total: 15.5ms	remaining: 758ms
1:	learn: 0.4961523	total: 30.5ms	remaining: 732ms
2:	learn: 0.4278656	total: 46.5ms	remaining: 729ms
3:	learn: 0.3527303	total: 60.9ms	remaining: 700ms
4:	learn: 0.3073100	total: 75.8ms	remaining: 682ms
5:	learn: 0.2661476	total: 90.7ms	remaining: 665ms
6:	learn: 0.2349072	total: 106ms	remaining: 649ms
7:	learn: 0.1831458	total: 120ms	remaining: 631ms
8:	learn: 0.1589716	total: 134ms	remaining: 611ms
9:	learn: 0.1371155	total: 148ms	remaining: 592ms
10:	learn: 0.1063697	total: 162ms	remaining: 574ms
11:	learn: 0.0958535	total: 176ms	remaining: 557ms
12:	learn: 0.0754675	total: 190ms	remaining: 540ms
13:	learn: 0.0616389	total: 204ms	remaining: 524ms
14:	learn: 0.0535908	total: 218ms	remaining: 509ms
15:	learn: 0.0432341	total: 233ms	remaining: 494ms
16:	learn: 0.0357080	total: 247ms	remaining: 479ms
17:	learn: 0.0291042	total: 260ms	remaining: 463ms
18:	learn: 0.0237276	total: 274ms	remaining: 447ms
19:	learn: 0.0195390	total: 288ms	r

17:	learn: 0.0293208	total: 260ms	remaining: 463ms
18:	learn: 0.0262080	total: 275ms	remaining: 449ms
19:	learn: 0.0219407	total: 289ms	remaining: 434ms
20:	learn: 0.0182796	total: 303ms	remaining: 418ms
21:	learn: 0.0152304	total: 316ms	remaining: 403ms
22:	learn: 0.0130038	total: 330ms	remaining: 387ms
23:	learn: 0.0108617	total: 344ms	remaining: 373ms
24:	learn: 0.0092312	total: 358ms	remaining: 358ms
25:	learn: 0.0080112	total: 372ms	remaining: 344ms
26:	learn: 0.0069029	total: 387ms	remaining: 330ms
27:	learn: 0.0059563	total: 402ms	remaining: 316ms
28:	learn: 0.0051849	total: 417ms	remaining: 302ms
29:	learn: 0.0045904	total: 432ms	remaining: 288ms
30:	learn: 0.0040617	total: 446ms	remaining: 273ms
31:	learn: 0.0036450	total: 460ms	remaining: 259ms
32:	learn: 0.0031863	total: 474ms	remaining: 244ms
33:	learn: 0.0028744	total: 488ms	remaining: 230ms
34:	learn: 0.0025443	total: 502ms	remaining: 215ms
35:	learn: 0.0022830	total: 516ms	remaining: 201ms
36:	learn: 0.0020711	total: 530

30:	learn: 0.0037592	total: 445ms	remaining: 273ms
31:	learn: 0.0034056	total: 460ms	remaining: 259ms
32:	learn: 0.0029760	total: 475ms	remaining: 245ms
33:	learn: 0.0028332	total: 489ms	remaining: 230ms
34:	learn: 0.0027113	total: 504ms	remaining: 216ms
35:	learn: 0.0024663	total: 518ms	remaining: 202ms
36:	learn: 0.0022989	total: 534ms	remaining: 187ms
37:	learn: 0.0021432	total: 549ms	remaining: 173ms
38:	learn: 0.0020593	total: 566ms	remaining: 160ms
39:	learn: 0.0019394	total: 581ms	remaining: 145ms
40:	learn: 0.0018329	total: 596ms	remaining: 131ms
41:	learn: 0.0018024	total: 609ms	remaining: 116ms
42:	learn: 0.0016899	total: 623ms	remaining: 101ms
43:	learn: 0.0015138	total: 636ms	remaining: 86.8ms
44:	learn: 0.0014324	total: 650ms	remaining: 72.2ms
45:	learn: 0.0013580	total: 665ms	remaining: 57.8ms
46:	learn: 0.0013200	total: 679ms	remaining: 43.4ms
47:	learn: 0.0012843	total: 693ms	remaining: 28.9ms
48:	learn: 0.0012180	total: 708ms	remaining: 14.4ms
49:	learn: 0.0011921	tota

43:	learn: 0.0017374	total: 621ms	remaining: 84.7ms
44:	learn: 0.0016929	total: 635ms	remaining: 70.6ms
45:	learn: 0.0015147	total: 649ms	remaining: 56.5ms
46:	learn: 0.0014354	total: 663ms	remaining: 42.3ms
47:	learn: 0.0013650	total: 677ms	remaining: 28.2ms
48:	learn: 0.0013301	total: 690ms	remaining: 14.1ms
49:	learn: 0.0012989	total: 703ms	remaining: 0us
0:	learn: 0.5925768	total: 14.5ms	remaining: 710ms
1:	learn: 0.4961523	total: 28.5ms	remaining: 684ms
2:	learn: 0.4278656	total: 42.6ms	remaining: 667ms
3:	learn: 0.3527303	total: 56.9ms	remaining: 654ms
4:	learn: 0.3073100	total: 70.4ms	remaining: 634ms
5:	learn: 0.2661476	total: 84.6ms	remaining: 620ms
6:	learn: 0.2349072	total: 98.2ms	remaining: 603ms
7:	learn: 0.1831458	total: 112ms	remaining: 586ms
8:	learn: 0.1589716	total: 125ms	remaining: 570ms
9:	learn: 0.1371155	total: 138ms	remaining: 552ms
10:	learn: 0.1063697	total: 151ms	remaining: 536ms
11:	learn: 0.0958535	total: 165ms	remaining: 522ms
12:	learn: 0.0754675	total: 17

7:	learn: 0.1925622	total: 112ms	remaining: 589ms
8:	learn: 0.1686613	total: 127ms	remaining: 579ms
9:	learn: 0.1384586	total: 142ms	remaining: 567ms
10:	learn: 0.1084458	total: 156ms	remaining: 552ms
11:	learn: 0.0860644	total: 169ms	remaining: 536ms
12:	learn: 0.0682654	total: 183ms	remaining: 522ms
13:	learn: 0.0585681	total: 197ms	remaining: 508ms
14:	learn: 0.0472386	total: 212ms	remaining: 494ms
15:	learn: 0.0387275	total: 226ms	remaining: 481ms
16:	learn: 0.0311865	total: 241ms	remaining: 467ms
17:	learn: 0.0293208	total: 255ms	remaining: 453ms
18:	learn: 0.0262080	total: 269ms	remaining: 440ms
19:	learn: 0.0219407	total: 284ms	remaining: 426ms
20:	learn: 0.0182796	total: 298ms	remaining: 411ms
21:	learn: 0.0152304	total: 312ms	remaining: 397ms
22:	learn: 0.0130038	total: 327ms	remaining: 384ms
23:	learn: 0.0108617	total: 341ms	remaining: 370ms
24:	learn: 0.0092312	total: 355ms	remaining: 355ms
25:	learn: 0.0080112	total: 369ms	remaining: 341ms
26:	learn: 0.0069029	total: 384ms	

21:	learn: 0.0130817	total: 311ms	remaining: 395ms
22:	learn: 0.0109226	total: 325ms	remaining: 381ms
23:	learn: 0.0092703	total: 339ms	remaining: 367ms
24:	learn: 0.0080124	total: 353ms	remaining: 353ms
25:	learn: 0.0069244	total: 366ms	remaining: 338ms
26:	learn: 0.0060504	total: 380ms	remaining: 324ms
27:	learn: 0.0052964	total: 394ms	remaining: 310ms
28:	learn: 0.0045528	total: 408ms	remaining: 295ms
29:	learn: 0.0042615	total: 421ms	remaining: 281ms
30:	learn: 0.0037592	total: 436ms	remaining: 267ms
31:	learn: 0.0034056	total: 450ms	remaining: 253ms
32:	learn: 0.0029760	total: 464ms	remaining: 239ms
33:	learn: 0.0028332	total: 478ms	remaining: 225ms
34:	learn: 0.0027113	total: 492ms	remaining: 211ms
35:	learn: 0.0024663	total: 506ms	remaining: 197ms
36:	learn: 0.0022989	total: 521ms	remaining: 183ms
37:	learn: 0.0021432	total: 535ms	remaining: 169ms
38:	learn: 0.0020593	total: 549ms	remaining: 155ms
39:	learn: 0.0019394	total: 562ms	remaining: 141ms
40:	learn: 0.0018329	total: 576

In [60]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99994 (+/- 0.00030)
F1 Score: 0.99967 (+/- 0.00153)
Precision: 0.99950 (+/- 0.00214)
Recall: 0.99983 (+/- 0.00100)
