## Step backward feature selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

## Read Data

In [2]:
data = pd.read_csv('../DoHBrwTest.csv')
data.shape

(53860, 35)

In [3]:
data.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,is_intrusion
0,7.0,2.0,52742,443,18355.0,0.046455,55,1183.941449,66,1420.729738,...,1.0,0.0,0.0,0.046455,0.046455,0.046455,-10.0,-10.0,0.0,0
1,7.0,2.0,54640,443,18365.0,96.750105,42044,434.562836,44920,464.288902,...,0.561877,0.0017,0.041234,0.033549,0.026931,0.026952,0.481463,0.159978,1.229096,0
2,7.0,2.0,56611,443,18373.0,96.365606,41539,431.056284,44577,462.582054,...,0.549156,0.000956,0.030926,0.026551,0.026848,0.026879,-0.028834,-0.010614,1.164778,0
3,7.0,2.0,56611,443,18374.0,121.35682,60659,499.840058,67897,559.48236,...,0.55657,0.001013,0.031829,0.027571,0.026862,0.026941,0.066819,0.019791,1.154439,0
4,7.0,2.0,56611,443,18375.0,104.669253,30409,290.524668,30718,293.476825,...,0.331633,0.001226,0.035013,0.029797,0.026867,0.026908,0.251063,0.082517,1.175049,0


### Train - Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),
    data['is_intrusion'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((37702, 34), (16158, 34))

### Remove correlated features

Step Backward Feature Selection takes a long time to run, so to speed it up we will reduce the feature space by removing correlated features first.

In [5]:
# remove correlated features to reduce the feature space

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_test, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  14


In [6]:
corr_features

{'DestinationPort',
 'FlowBytesReceived',
 'FlowReceivedRate',
 'PacketLengthCoefficientofVariation',
 'PacketLengthMean',
 'PacketLengthSkewFromMode',
 'PacketLengthStandardDeviation',
 'PacketTimeMean',
 'PacketTimeMedian',
 'PacketTimeStandardDeviation',
 'PacketTimeVariance',
 'ResponseTimeTimeMedian',
 'ResponseTimeTimeMode',
 'ResponseTimeTimeStandardDeviation'}

In [7]:
# removed correlated  features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((37702, 20), (16158, 20))

### Step Backward Feature Selection

In [8]:
# 1. Algorithm to create, in this case RandomForests
# 2. Stopping criteria: 10 features
# 3. Perform step forward or step backward
# 4. Evaluation metric: in this case the roc_auc
# 5. and cross-validation

sfs = SFS(RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0),
          k_features=14, # the lower the features we want, the longer this will take
          forward=False,
          floating=False,
          verbose=2,
          scoring='roc_auc',
          cv=2)

sfs = sfs.fit(np.array(X_test), y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    2.3s finished

[2021-08-13 08:38:20] Features: 19/14 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:    2.1s finished

[2021-08-13 08:38:22] Features: 18/14 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    1.9s finished

[2021-08-13 08:38:24] Features: 17/14 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining

### Compare performance of feature subsets

In [9]:
def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [10]:
selected_feat= X_test.columns[list(sfs.k_feature_idx_)]

selected_feat

Index(['SourceIP', 'DestinationIP', 'SourcePort', 'TimeStamp', 'Duration',
       'FlowBytesSent', 'FlowSentRate', 'PacketLengthVariance',
       'PacketLengthMedian', 'PacketLengthMode', 'PacketLengthSkewFromMedian',
       'PacketTimeMode', 'PacketTimeSkewFromMedian', 'PacketTimeSkewFromMode'],
      dtype='object')

In [11]:
# evaluate performance of algorithm built using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9999999999999999
Test set
Random Forests roc-auc: 1.0


In [12]:
# and for comparison, we train random forests using all features

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 1.0


Performance, as expected is roughly the same.

In [13]:
X_train = X_train[selected_feat]
X_test = X_test[selected_feat]

In [14]:
X_train.shape, X_test.shape

((37702, 14), (16158, 14))

## Standardize Data




In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers



In [16]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation


In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression




In [18]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1).fit(X_train, y_train)

CPU times: user 61.1 ms, sys: 169 ms, total: 230 ms
Wall time: 1.98 s


In [19]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
FPR: 1.0
TPR: 1.0


### Naive Bayes




In [20]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-09).fit(X_train, y_train)

CPU times: user 29.4 ms, sys: 7.17 ms, total: 36.6 ms
Wall time: 34.9 ms


In [21]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.11200635046636237
F1 Score: 0.15346778160354155
FPR: 0.9411590336445828
TPR: 0.17331851978463378


### Random Forest




In [22]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 3.74 s, sys: 56.4 ms, total: 3.79 s
Wall time: 3.8 s


In [23]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
FPR: 1.0
TPR: 1.0


### KNN




In [24]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance').fit(X_train, y_train)

CPU times: user 7.3 ms, sys: 1.39 ms, total: 8.69 ms
Wall time: 6.94 ms


In [25]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.3394721174836277
f1: 0.4582329578748617
fpr: 0.8877278790573588
tpr: 0.6014870523886847


### CatBoost




In [26]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6566019	total: 64ms	remaining: 3.14s
1:	learn: 0.6228481	total: 71.2ms	remaining: 1.71s
2:	learn: 0.5916911	total: 78.3ms	remaining: 1.23s
3:	learn: 0.5630943	total: 85ms	remaining: 977ms
4:	learn: 0.5360494	total: 92.1ms	remaining: 829ms
5:	learn: 0.5107111	total: 99.4ms	remaining: 729ms
6:	learn: 0.4873145	total: 106ms	remaining: 653ms
7:	learn: 0.4651411	total: 113ms	remaining: 595ms
8:	learn: 0.4440234	total: 121ms	remaining: 549ms
9:	learn: 0.4242126	total: 127ms	remaining: 510ms
10:	learn: 0.4062084	total: 135ms	remaining: 478ms
11:	learn: 0.3887494	total: 142ms	remaining: 450ms
12:	learn: 0.3722184	total: 149ms	remaining: 424ms
13:	learn: 0.3571205	total: 157ms	remaining: 405ms
14:	learn: 0.3422961	total: 166ms	remaining: 388ms
15:	learn: 0.3287359	total: 173ms	remaining: 369ms
16:	learn: 0.3151333	total: 182ms	remaining: 354ms
17:	learn: 0.3026802	total: 189ms	remaining: 336ms
18:	learn: 0.2908295	total: 196ms	remaining: 320ms
19:	learn: 0.2794005	total: 203ms	remai

In [27]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
FPR: 1.0
TPR: 1.0


## Model Evaluation




In [28]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../KDDTest.csv")
test_df.shape

(22543, 42)

In [29]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion'])

In [30]:
X_eval = X_eval[selected_feat]

### Model Evaluation - Logistic Regression


In [31]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=0.1, n_jobs=-1, random_state=42)

In [32]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [33]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9244371235500164
Testing accuracy is  0.46441754316332606


In [34]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
Precision Score: 0.46441754316332606
Recall Score: 1.0
Confusion Matrix:
 [[    0 13494]
 [    0 11701]]


### Cross validation - Logistic Regression




In [35]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.86115 (+/- 0.01507)
F1 Score: 0.87085 (+/- 0.01453)
Precision: 0.92536 (+/- 0.01661)
Recall: 0.82247 (+/- 0.02022)


### Model Evaluation - Naive Bayes




In [36]:
modelNB = GaussianNB(var_smoothing=1e-09)
modelNB.fit(X_train, y_train)

GaussianNB()

In [37]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [38]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.47491987258997587
Testing accuracy is  0.11200635046636237


In [39]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.11200635046636237
F1 Score: 0.15346778160354155
Precision Score: 0.13769690385659966
Recall Score: 0.17331851978463378
Confusion Matrix:
 [[  794 12700]
 [ 9673  2028]]


### Cross validation - Naive Bayes




In [41]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.51727 (+/- 0.15945)
F1 Score: 0.24934 (+/- 0.35460)
Precision: 0.95204 (+/- 0.09177)
Recall: 0.16148 (+/- 0.31753)


### Model Evaluation - Random Forest




In [42]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [43]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [44]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  0.9999702313027774
Testing accuracy is  0.5355824568366739


In [45]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
Precision Score: 0.28684856807120773
Recall Score: 0.5355824568366739
Confusion Matrix:
 [[13494     0]
 [11701     0]]


### Cross validation - Random Forest





In [46]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98505 (+/- 0.00323)
F1 Score: 0.98689 (+/- 0.00283)
Precision: 0.98547 (+/- 0.00427)
Recall: 0.98831 (+/- 0.00451)


### Model Evaluation - KNN

In [47]:
modelKNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=1, n_neighbors=2,
                     weights='distance')

In [48]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [49]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9999603084037032
Testing accuracy is  0.3394721174836277


In [50]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.3394721174836277
F1 Score: 0.4582329578748617
Precision Score: 0.37008991954566967
Recall Score: 0.6014870523886847
Confusion Matrix:
 [[ 1515 11979]
 [ 4663  7038]]


### Cross validation - KNN




In [51]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.97547 (+/- 0.00572)
F1 Score: 0.97844 (+/- 0.00505)
Precision: 0.97917 (+/- 0.00679)
Recall: 0.97771 (+/- 0.00810)


### Model Evaluation - CatBoost




In [52]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6566019	total: 8.22ms	remaining: 403ms
1:	learn: 0.6228481	total: 15.4ms	remaining: 369ms
2:	learn: 0.5916911	total: 22.6ms	remaining: 353ms
3:	learn: 0.5630943	total: 29.2ms	remaining: 336ms
4:	learn: 0.5360494	total: 36ms	remaining: 324ms
5:	learn: 0.5107111	total: 42.9ms	remaining: 314ms
6:	learn: 0.4873145	total: 49.4ms	remaining: 304ms
7:	learn: 0.4651411	total: 56.4ms	remaining: 296ms
8:	learn: 0.4440234	total: 63.2ms	remaining: 288ms
9:	learn: 0.4242126	total: 69.7ms	remaining: 279ms
10:	learn: 0.4062084	total: 76.2ms	remaining: 270ms
11:	learn: 0.3887494	total: 83.1ms	remaining: 263ms
12:	learn: 0.3722184	total: 89.8ms	remaining: 256ms
13:	learn: 0.3571205	total: 96.5ms	remaining: 248ms
14:	learn: 0.3422961	total: 103ms	remaining: 241ms
15:	learn: 0.3287359	total: 110ms	remaining: 234ms
16:	learn: 0.3151333	total: 117ms	remaining: 228ms
17:	learn: 0.3026802	total: 124ms	remaining: 220ms
18:	learn: 0.2908295	total: 130ms	remaining: 213ms
19:	learn: 0.2794005	total: 1

<catboost.core.CatBoostClassifier at 0x7fe8b1442ee0>

In [53]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [54]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.9878642944322613
Testing accuracy is  0.5355824568366739


In [55]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
Precision Score: 0.28684856807120773
Recall Score: 0.5355824568366739
Confusion Matrix:
 [[13494     0]
 [11701     0]]


### Cross validation - CatBoost



In [56]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6616320	total: 4.27ms	remaining: 209ms
1:	learn: 0.6333111	total: 8.4ms	remaining: 202ms
2:	learn: 0.6063111	total: 12.5ms	remaining: 196ms
3:	learn: 0.5815218	total: 16.6ms	remaining: 191ms
4:	learn: 0.5578587	total: 20.7ms	remaining: 186ms
5:	learn: 0.5365127	total: 24.8ms	remaining: 182ms
6:	learn: 0.5158966	total: 28.8ms	remaining: 177ms
7:	learn: 0.4956064	total: 33.1ms	remaining: 174ms
8:	learn: 0.4771070	total: 37.1ms	remaining: 169ms
9:	learn: 0.4598688	total: 41.4ms	remaining: 165ms
10:	learn: 0.4427562	total: 45.5ms	remaining: 161ms
11:	learn: 0.4273352	total: 49.7ms	remaining: 158ms
12:	learn: 0.4124211	total: 54.1ms	remaining: 154ms
13:	learn: 0.3986297	total: 58.2ms	remaining: 150ms
14:	learn: 0.3851822	total: 62.5ms	remaining: 146ms
15:	learn: 0.3735815	total: 66.7ms	remaining: 142ms
16:	learn: 0.3618641	total: 70.7ms	remaining: 137ms
17:	learn: 0.3505456	total: 74.8ms	remaining: 133ms
18:	learn: 0.3394918	total: 78.9ms	remaining: 129ms
19:	learn: 0.3296269	to

23:	learn: 0.2922763	total: 99.1ms	remaining: 107ms
24:	learn: 0.2842650	total: 103ms	remaining: 103ms
25:	learn: 0.2770100	total: 107ms	remaining: 98.8ms
26:	learn: 0.2697361	total: 111ms	remaining: 94.7ms
27:	learn: 0.2629931	total: 115ms	remaining: 90.4ms
28:	learn: 0.2564169	total: 119ms	remaining: 86.3ms
29:	learn: 0.2497289	total: 123ms	remaining: 82.1ms
30:	learn: 0.2436273	total: 127ms	remaining: 78ms
31:	learn: 0.2378702	total: 131ms	remaining: 73.9ms
32:	learn: 0.2322220	total: 136ms	remaining: 69.8ms
33:	learn: 0.2269373	total: 140ms	remaining: 65.8ms
34:	learn: 0.2215053	total: 144ms	remaining: 61.7ms
35:	learn: 0.2163123	total: 148ms	remaining: 57.6ms
36:	learn: 0.2114145	total: 152ms	remaining: 53.5ms
37:	learn: 0.2070861	total: 156ms	remaining: 49.3ms
38:	learn: 0.2028070	total: 160ms	remaining: 45.2ms
39:	learn: 0.1985043	total: 164ms	remaining: 41.1ms
40:	learn: 0.1943416	total: 169ms	remaining: 37ms
41:	learn: 0.1902557	total: 172ms	remaining: 32.8ms
42:	learn: 0.1865

33:	learn: 0.2274122	total: 140ms	remaining: 65.7ms
34:	learn: 0.2219640	total: 144ms	remaining: 61.6ms
35:	learn: 0.2167144	total: 148ms	remaining: 57.4ms
36:	learn: 0.2119534	total: 151ms	remaining: 53.2ms
37:	learn: 0.2074170	total: 155ms	remaining: 49ms
38:	learn: 0.2030403	total: 159ms	remaining: 44.8ms
39:	learn: 0.1989299	total: 163ms	remaining: 40.7ms
40:	learn: 0.1947362	total: 167ms	remaining: 36.6ms
41:	learn: 0.1910058	total: 171ms	remaining: 32.5ms
42:	learn: 0.1875812	total: 175ms	remaining: 28.4ms
43:	learn: 0.1839424	total: 179ms	remaining: 24.4ms
44:	learn: 0.1803150	total: 183ms	remaining: 20.3ms
45:	learn: 0.1768910	total: 187ms	remaining: 16.3ms
46:	learn: 0.1738133	total: 191ms	remaining: 12.2ms
47:	learn: 0.1707129	total: 195ms	remaining: 8.14ms
48:	learn: 0.1682079	total: 200ms	remaining: 4.08ms
49:	learn: 0.1649203	total: 204ms	remaining: 0us
0:	learn: 0.6616728	total: 4.86ms	remaining: 238ms
1:	learn: 0.6331129	total: 9.17ms	remaining: 220ms
2:	learn: 0.6061686

0:	learn: 0.6616320	total: 4.98ms	remaining: 244ms
1:	learn: 0.6333111	total: 8.98ms	remaining: 216ms
2:	learn: 0.6063111	total: 13.4ms	remaining: 211ms
3:	learn: 0.5815218	total: 17.4ms	remaining: 200ms
4:	learn: 0.5578587	total: 21.2ms	remaining: 191ms
5:	learn: 0.5365127	total: 25.3ms	remaining: 185ms
6:	learn: 0.5158966	total: 29.4ms	remaining: 181ms
7:	learn: 0.4956064	total: 33.4ms	remaining: 175ms
8:	learn: 0.4771070	total: 37.6ms	remaining: 171ms
9:	learn: 0.4598688	total: 41.8ms	remaining: 167ms
10:	learn: 0.4427562	total: 46ms	remaining: 163ms
11:	learn: 0.4273352	total: 50.1ms	remaining: 159ms
12:	learn: 0.4124211	total: 54.4ms	remaining: 155ms
13:	learn: 0.3986297	total: 58.4ms	remaining: 150ms
14:	learn: 0.3851822	total: 62.6ms	remaining: 146ms
15:	learn: 0.3735815	total: 66.8ms	remaining: 142ms
16:	learn: 0.3618641	total: 71ms	remaining: 138ms
17:	learn: 0.3505456	total: 75.5ms	remaining: 134ms
18:	learn: 0.3394918	total: 79.6ms	remaining: 130ms
19:	learn: 0.3296269	total

21:	learn: 0.3095620	total: 92.4ms	remaining: 118ms
22:	learn: 0.3007005	total: 96.3ms	remaining: 113ms
23:	learn: 0.2922763	total: 101ms	remaining: 109ms
24:	learn: 0.2842650	total: 105ms	remaining: 105ms
25:	learn: 0.2770100	total: 109ms	remaining: 100ms
26:	learn: 0.2697361	total: 113ms	remaining: 96.1ms
27:	learn: 0.2629931	total: 117ms	remaining: 91.7ms
28:	learn: 0.2564169	total: 121ms	remaining: 87.5ms
29:	learn: 0.2497289	total: 125ms	remaining: 83.2ms
30:	learn: 0.2436273	total: 129ms	remaining: 79.1ms
31:	learn: 0.2378702	total: 133ms	remaining: 74.9ms
32:	learn: 0.2322220	total: 137ms	remaining: 70.7ms
33:	learn: 0.2269373	total: 141ms	remaining: 66.4ms
34:	learn: 0.2215053	total: 145ms	remaining: 62.3ms
35:	learn: 0.2163123	total: 149ms	remaining: 58.1ms
36:	learn: 0.2114145	total: 153ms	remaining: 53.9ms
37:	learn: 0.2070861	total: 157ms	remaining: 49.7ms
38:	learn: 0.2028070	total: 162ms	remaining: 45.6ms
39:	learn: 0.1985043	total: 166ms	remaining: 41.4ms
40:	learn: 0.19

47:	learn: 0.1707129	total: 197ms	remaining: 8.19ms
48:	learn: 0.1682079	total: 201ms	remaining: 4.1ms
49:	learn: 0.1649203	total: 205ms	remaining: 0us
0:	learn: 0.6616728	total: 4.22ms	remaining: 207ms
1:	learn: 0.6331129	total: 8.41ms	remaining: 202ms
2:	learn: 0.6061686	total: 12.5ms	remaining: 197ms
3:	learn: 0.5809270	total: 16.7ms	remaining: 192ms
4:	learn: 0.5570375	total: 20.8ms	remaining: 187ms
5:	learn: 0.5351154	total: 24.9ms	remaining: 183ms
6:	learn: 0.5144571	total: 28.9ms	remaining: 178ms
7:	learn: 0.4944814	total: 33ms	remaining: 174ms
8:	learn: 0.4756639	total: 37.1ms	remaining: 169ms
9:	learn: 0.4581271	total: 41.2ms	remaining: 165ms
10:	learn: 0.4410790	total: 45ms	remaining: 160ms
11:	learn: 0.4257687	total: 49ms	remaining: 155ms
12:	learn: 0.4116545	total: 53.2ms	remaining: 151ms
13:	learn: 0.3978842	total: 57.2ms	remaining: 147ms
14:	learn: 0.3844613	total: 61.5ms	remaining: 143ms
15:	learn: 0.3714144	total: 65.5ms	remaining: 139ms
16:	learn: 0.3599448	total: 69.6

15:	learn: 0.3735815	total: 66.9ms	remaining: 142ms
16:	learn: 0.3618641	total: 70.9ms	remaining: 138ms
17:	learn: 0.3505456	total: 75.1ms	remaining: 134ms
18:	learn: 0.3394918	total: 79.2ms	remaining: 129ms
19:	learn: 0.3296269	total: 83ms	remaining: 124ms
20:	learn: 0.3198717	total: 86.9ms	remaining: 120ms
21:	learn: 0.3106272	total: 91ms	remaining: 116ms
22:	learn: 0.3020971	total: 95.2ms	remaining: 112ms
23:	learn: 0.2935406	total: 99.2ms	remaining: 107ms
24:	learn: 0.2857067	total: 103ms	remaining: 103ms
25:	learn: 0.2777614	total: 108ms	remaining: 99.4ms
26:	learn: 0.2702109	total: 112ms	remaining: 95.1ms
27:	learn: 0.2636720	total: 116ms	remaining: 91ms
28:	learn: 0.2564387	total: 120ms	remaining: 86.8ms
29:	learn: 0.2498626	total: 124ms	remaining: 82.7ms
30:	learn: 0.2438778	total: 128ms	remaining: 78.6ms
31:	learn: 0.2380360	total: 132ms	remaining: 74.5ms
32:	learn: 0.2317547	total: 137ms	remaining: 70.4ms
33:	learn: 0.2267700	total: 141ms	remaining: 66.4ms
34:	learn: 0.221321

39:	learn: 0.1985043	total: 166ms	remaining: 41.6ms
40:	learn: 0.1943416	total: 170ms	remaining: 37.4ms
41:	learn: 0.1902557	total: 174ms	remaining: 33.2ms
42:	learn: 0.1865978	total: 178ms	remaining: 29ms
43:	learn: 0.1829932	total: 182ms	remaining: 24.8ms
44:	learn: 0.1794379	total: 186ms	remaining: 20.6ms
45:	learn: 0.1760607	total: 190ms	remaining: 16.5ms
46:	learn: 0.1730116	total: 194ms	remaining: 12.4ms
47:	learn: 0.1699390	total: 198ms	remaining: 8.24ms
48:	learn: 0.1674707	total: 202ms	remaining: 4.12ms
49:	learn: 0.1646087	total: 206ms	remaining: 0us
0:	learn: 0.6623559	total: 4.51ms	remaining: 221ms
1:	learn: 0.6338053	total: 8.43ms	remaining: 202ms
2:	learn: 0.6069712	total: 12.4ms	remaining: 194ms
3:	learn: 0.5818632	total: 16.4ms	remaining: 189ms
4:	learn: 0.5581816	total: 20.7ms	remaining: 187ms
5:	learn: 0.5361857	total: 24.8ms	remaining: 182ms
6:	learn: 0.5159851	total: 29.1ms	remaining: 178ms
7:	learn: 0.4958331	total: 33.2ms	remaining: 174ms
8:	learn: 0.4771791	total

7:	learn: 0.4944814	total: 32.9ms	remaining: 173ms
8:	learn: 0.4756639	total: 37ms	remaining: 168ms
9:	learn: 0.4581271	total: 40.7ms	remaining: 163ms
10:	learn: 0.4410790	total: 44.6ms	remaining: 158ms
11:	learn: 0.4257687	total: 48.8ms	remaining: 155ms
12:	learn: 0.4116545	total: 52.8ms	remaining: 150ms
13:	learn: 0.3978842	total: 56.6ms	remaining: 146ms
14:	learn: 0.3844613	total: 60.5ms	remaining: 141ms
15:	learn: 0.3714144	total: 64.6ms	remaining: 137ms
16:	learn: 0.3599448	total: 68.7ms	remaining: 133ms
17:	learn: 0.3488356	total: 72.9ms	remaining: 130ms
18:	learn: 0.3381233	total: 76.9ms	remaining: 126ms
19:	learn: 0.3281908	total: 80.9ms	remaining: 121ms
20:	learn: 0.3182924	total: 85.1ms	remaining: 118ms
21:	learn: 0.3090862	total: 89.2ms	remaining: 114ms
22:	learn: 0.3002618	total: 93.1ms	remaining: 109ms
23:	learn: 0.2917599	total: 97.2ms	remaining: 105ms
24:	learn: 0.2839989	total: 101ms	remaining: 101ms
25:	learn: 0.2767639	total: 105ms	remaining: 97.1ms
26:	learn: 0.26950

31:	learn: 0.2380360	total: 133ms	remaining: 74.5ms
32:	learn: 0.2317547	total: 137ms	remaining: 70.4ms
33:	learn: 0.2267700	total: 141ms	remaining: 66.2ms
34:	learn: 0.2213213	total: 145ms	remaining: 61.9ms
35:	learn: 0.2161159	total: 148ms	remaining: 57.7ms
36:	learn: 0.2113152	total: 152ms	remaining: 53.5ms
37:	learn: 0.2066764	total: 156ms	remaining: 49.3ms
38:	learn: 0.2026824	total: 160ms	remaining: 45.2ms
39:	learn: 0.1987766	total: 164ms	remaining: 41.1ms
40:	learn: 0.1944066	total: 168ms	remaining: 37ms
41:	learn: 0.1909465	total: 173ms	remaining: 32.9ms
42:	learn: 0.1871847	total: 177ms	remaining: 28.8ms
43:	learn: 0.1834054	total: 181ms	remaining: 24.7ms
44:	learn: 0.1795682	total: 186ms	remaining: 20.6ms
45:	learn: 0.1761407	total: 190ms	remaining: 16.5ms
46:	learn: 0.1730639	total: 194ms	remaining: 12.4ms
47:	learn: 0.1697036	total: 198ms	remaining: 8.26ms
48:	learn: 0.1663756	total: 202ms	remaining: 4.13ms
49:	learn: 0.1631848	total: 206ms	remaining: 0us
0:	learn: 0.66150

0:	learn: 0.6623559	total: 4.23ms	remaining: 207ms
1:	learn: 0.6338053	total: 8.19ms	remaining: 197ms
2:	learn: 0.6069712	total: 12ms	remaining: 188ms
3:	learn: 0.5818632	total: 16ms	remaining: 184ms
4:	learn: 0.5581816	total: 20ms	remaining: 180ms
5:	learn: 0.5361857	total: 24.1ms	remaining: 177ms
6:	learn: 0.5159851	total: 28.4ms	remaining: 174ms
7:	learn: 0.4958331	total: 32.5ms	remaining: 171ms
8:	learn: 0.4771791	total: 36.6ms	remaining: 167ms
9:	learn: 0.4596217	total: 40.7ms	remaining: 163ms
10:	learn: 0.4426497	total: 44.7ms	remaining: 159ms
11:	learn: 0.4273927	total: 48.9ms	remaining: 155ms
12:	learn: 0.4126172	total: 53ms	remaining: 151ms
13:	learn: 0.3988267	total: 57ms	remaining: 147ms
14:	learn: 0.3853193	total: 61.1ms	remaining: 143ms
15:	learn: 0.3733386	total: 65.4ms	remaining: 139ms
16:	learn: 0.3614219	total: 69.8ms	remaining: 136ms
17:	learn: 0.3505225	total: 73.9ms	remaining: 131ms
18:	learn: 0.3396563	total: 77.8ms	remaining: 127ms
19:	learn: 0.3297130	total: 81.7

22:	learn: 0.3002618	total: 96ms	remaining: 113ms
23:	learn: 0.2917599	total: 100ms	remaining: 108ms
24:	learn: 0.2839989	total: 104ms	remaining: 104ms
25:	learn: 0.2767639	total: 108ms	remaining: 99.9ms
26:	learn: 0.2695052	total: 112ms	remaining: 95.4ms
27:	learn: 0.2628584	total: 116ms	remaining: 91.3ms
28:	learn: 0.2559712	total: 120ms	remaining: 87.2ms
29:	learn: 0.2496799	total: 124ms	remaining: 83ms
30:	learn: 0.2434006	total: 128ms	remaining: 78.7ms
31:	learn: 0.2375763	total: 133ms	remaining: 74.7ms
32:	learn: 0.2320425	total: 137ms	remaining: 70.5ms
33:	learn: 0.2267792	total: 141ms	remaining: 66.4ms
34:	learn: 0.2215763	total: 145ms	remaining: 62.2ms
35:	learn: 0.2168199	total: 149ms	remaining: 58.1ms
36:	learn: 0.2119718	total: 154ms	remaining: 54ms
37:	learn: 0.2075978	total: 158ms	remaining: 49.8ms
38:	learn: 0.2034011	total: 162ms	remaining: 45.7ms
39:	learn: 0.1992748	total: 166ms	remaining: 41.6ms
40:	learn: 0.1953919	total: 171ms	remaining: 37.4ms
41:	learn: 0.1912276

In [57]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.96371 (+/- 0.00685)
F1 Score: 0.96886 (+/- 0.00578)
Precision: 0.94718 (+/- 0.00992)
Recall: 0.99158 (+/- 0.00631)
