## Step backward feature selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

## Read Data

In [2]:
data = pd.read_csv('../Kyoto_Train.csv')
data.shape

(124055, 24)

In [3]:
data.head()

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_port_rate,...,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code,Label_code
0,2.863309,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,90847.0,14024.0,25836.0,1.0,0.0
1,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,218531.0,8968.0,45541.0,1.0,0.0
2,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,176665.0,15997.0,59860.0,1.0,0.0
3,0.0,0,0,0,0.0,0.0,0.67,49,100,0.02,...,6.0,6.0,0.0,0.0,0.0,52769.0,473.0,40649.0,1.0,0.0
4,0.0,0,0,1,1.0,0.0,0.36,0,2,0.0,...,6.0,0.0,0.0,0.0,0.0,65048.0,16609.0,39283.0,1.0,0.0


### Train - Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Label_code'], axis=1),
    data['Label_code'],
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((99244, 23), (24811, 23))

### Remove correlated features

Step Backward Feature Selection takes a long time to run, so to speed it up we will reduce the feature space by removing correlated features first.

In [5]:
# remove correlated features to reduce the feature space

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_test, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  1


In [6]:
corr_features

{'Dst_host_srv_count'}

In [7]:
# removed correlated  features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((99244, 22), (24811, 22))

### Step Backward Feature Selection

In [8]:
# 1. Algorithm to create, in this case RandomForests
# 2. Stopping criteria: 10 features
# 3. Perform step forward or step backward
# 4. Evaluation metric: in this case the roc_auc
# 5. and cross-validation

sfs = SFS(RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0),
          k_features=15, # the lower the features we want, the longer this will take
          forward=False,
          floating=False,
          verbose=2,
          scoring='roc_auc',
          cv=2)

sfs = sfs.fit(np.array(X_test), y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  22 out of  22 | elapsed:    2.8s finished

[2021-05-29 14:34:49] Features: 21/15 -- score: 0.9999565091894865[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:    2.6s finished

[2021-05-29 14:34:52] Features: 20/15 -- score: 0.9999807790147792[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    2.4s finished

[2021-05-29 14:34:54] Features: 19/15 -- score: 0.9999772286872444[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Don

### Compare performance of feature subsets

In [9]:
def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [10]:
selected_feat= X_test.columns[list(sfs.k_feature_idx_)]

selected_feat

Index(['Duration', 'Source', 'Destination_bytes', 'Count', 'Same_srv_rate',
       'Srv_serror_rate', 'Dst_host_count', 'Dst_host_same_port_rate',
       'Dst_host_srv_serror_rate', 'Service_code', 'IDS_detection_code',
       'Ashula_detection_code', 'Source_IP_Address_code',
       'Destination_IP_Address_code', 'Protocol_code'],
      dtype='object')

In [11]:
# evaluate performance of algorithm built using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9958026803387605
Test set
Random Forests roc-auc: 0.9955979883181604


In [12]:
# and for comparison, we train random forests using all features

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9911323964302354
Test set
Random Forests roc-auc: 0.9907208755244232


Performance, as expected is roughly the same.

In [13]:
X_train = X_train[selected_feat]
X_test = X_test[selected_feat]

In [14]:
X_train.shape, X_test.shape

((99244, 15), (24811, 15))

## Standardize Data




In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers



In [16]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation


In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression




In [18]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1).fit(X_train, y_train)

CPU times: user 64.7 ms, sys: 180 ms, total: 245 ms
Wall time: 1.85 s


In [19]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.0
FPR: 1.0
TPR: 1.0


### Naive Bayes




In [20]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-05).fit(X_train, y_train)

CPU times: user 28.8 ms, sys: 3.61 ms, total: 32.4 ms
Wall time: 30.7 ms


In [21]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.0
FPR: 1.0
TPR: 1.0


### Random Forest




In [22]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 4.12 s, sys: 45.6 ms, total: 4.17 s
Wall time: 4.17 s


In [23]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


### KNN




In [24]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform').fit(X_train, y_train)

CPU times: user 9.35 s, sys: 38.9 ms, total: 9.39 s
Wall time: 9.37 s


In [25]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.9058885171899561
f1: 0.0
fpr: 1.0
tpr: 1.0


### CatBoost




In [26]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6555270	total: 61.2ms	remaining: 3s
1:	learn: 0.6223237	total: 70.5ms	remaining: 1.69s
2:	learn: 0.5899211	total: 78.8ms	remaining: 1.23s
3:	learn: 0.5600191	total: 87.3ms	remaining: 1s
4:	learn: 0.5325461	total: 96.7ms	remaining: 870ms
5:	learn: 0.5064573	total: 106ms	remaining: 778ms
6:	learn: 0.4823257	total: 114ms	remaining: 698ms
7:	learn: 0.4595131	total: 123ms	remaining: 643ms
8:	learn: 0.4380134	total: 131ms	remaining: 595ms
9:	learn: 0.4176995	total: 139ms	remaining: 554ms
10:	learn: 0.3992713	total: 147ms	remaining: 520ms
11:	learn: 0.3811583	total: 155ms	remaining: 491ms
12:	learn: 0.3633038	total: 163ms	remaining: 464ms
13:	learn: 0.3478406	total: 172ms	remaining: 442ms
14:	learn: 0.3323893	total: 180ms	remaining: 421ms
15:	learn: 0.3182369	total: 187ms	remaining: 398ms
16:	learn: 0.3046487	total: 195ms	remaining: 379ms
17:	learn: 0.2923097	total: 203ms	remaining: 361ms
18:	learn: 0.2800389	total: 210ms	remaining: 343ms
19:	learn: 0.2683302	total: 216ms	remainin

In [27]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9057676030792794
F1 Score: 0.861174585394256
FPR: 0.0001779676098949991
TPR: 0.00042826552462526765


## Model Evaluation




In [57]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../Kyoto_Test.csv")
test_df.shape

(62028, 24)

In [58]:
# Create feature matrix X and target vextor y
y_eval = test_df['Label_code']
X_eval = test_df.drop(columns=['Label_code'])

In [59]:
X_eval = X_eval[selected_feat]

### Model Evaluation - Logistic Regression


In [60]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=1, n_jobs=-1, random_state=42)

In [61]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [62]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.908024666478578
Testing accuracy is  0.9058885171899561


In [63]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR, average='weighted', zero_division=1))
print('Precision Score:',precision_score(y_test, y_predLR, average='weighted', zero_division=1))
print('Recall Score:', recall_score(y_test, y_predLR, average='weighted', zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.9147454883866614
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - Logistic Regression




In [64]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.90190 (+/- 0.00191)
F1 Score: 0.01603 (+/- 0.01662)
Precision: 0.25765 (+/- 0.18312)
Recall: 0.00835 (+/- 0.00883)


### Model Evaluation - Naive Bayes




In [36]:
modelNB = GaussianNB(var_smoothing=1e-05)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-05)

In [37]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [38]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.3312139776711942
Testing accuracy is  0.9058885171899561


In [65]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB, average='weighted', zero_division=1))
print('Precision Score:',precision_score(y_test, y_predNB, average='weighted', zero_division=1))
print('Recall Score:', recall_score(y_test, y_predNB, average='weighted', zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.9147454883866614
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - Naive Bayes




In [66]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.37957 (+/- 0.35504)
F1 Score: 0.18421 (+/- 0.12351)
Precision: 0.10391 (+/- 0.06980)
Recall: 0.81277 (+/- 0.54262)


### Model Evaluation - Random Forest




In [67]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [68]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [69]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.9058885171899561


In [70]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - Random Forest





In [71]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99947 (+/- 0.00069)
F1 Score: 0.99724 (+/- 0.00360)
Precision: 0.99899 (+/- 0.00268)
Recall: 0.99549 (+/- 0.00560)


### Model Evaluation - KNN

In [72]:
modelKNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(leaf_size=1, n_neighbors=2)

In [47]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [48]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9972492039821047
Testing accuracy is  0.9058885171899561


In [73]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN, average='weighted', zero_division=1))
print('Precision Score:', precision_score(y_test, y_predKNN, average='weighted', zero_division=1))
print('Recall Score:', recall_score(y_test, y_predKNN, average='weighted', zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.9147454883866614
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - KNN




In [74]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99468 (+/- 0.00205)
F1 Score: 0.97191 (+/- 0.01109)
Precision: 0.98961 (+/- 0.00490)
Recall: 0.95489 (+/- 0.01938)


### Model Evaluation - CatBoost




In [75]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6555270	total: 8.71ms	remaining: 427ms
1:	learn: 0.6223237	total: 16.4ms	remaining: 393ms
2:	learn: 0.5899211	total: 23.8ms	remaining: 373ms
3:	learn: 0.5600191	total: 31.2ms	remaining: 358ms
4:	learn: 0.5325461	total: 38.6ms	remaining: 347ms
5:	learn: 0.5064573	total: 46.1ms	remaining: 338ms
6:	learn: 0.4823257	total: 53.3ms	remaining: 328ms
7:	learn: 0.4595131	total: 60.7ms	remaining: 319ms
8:	learn: 0.4380134	total: 68.2ms	remaining: 311ms
9:	learn: 0.4176995	total: 75.6ms	remaining: 303ms
10:	learn: 0.3992713	total: 83ms	remaining: 294ms
11:	learn: 0.3811583	total: 90.4ms	remaining: 286ms
12:	learn: 0.3633038	total: 98.2ms	remaining: 280ms
13:	learn: 0.3478406	total: 106ms	remaining: 272ms
14:	learn: 0.3323893	total: 114ms	remaining: 266ms
15:	learn: 0.3182369	total: 122ms	remaining: 259ms
16:	learn: 0.3046487	total: 129ms	remaining: 251ms
17:	learn: 0.2923097	total: 137ms	remaining: 243ms
18:	learn: 0.2800389	total: 144ms	remaining: 235ms
19:	learn: 0.2683302	total: 15

<catboost.core.CatBoostClassifier at 0x7fbd00e72fd0>

In [76]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [77]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.996392729031478
Testing accuracy is  0.9057676030792794


In [78]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.9057676030792794
F1 Score: 0.861174585394256
Precision Score: 0.8394756368140724
Recall Score: 0.9057676030792794
Confusion Matrix:
 [[22472     4]
 [ 2334     1]]


### Cross validation - CatBoost



In [79]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6564460	total: 5.92ms	remaining: 290ms
1:	learn: 0.6229622	total: 11.7ms	remaining: 282ms
2:	learn: 0.5912783	total: 17.6ms	remaining: 275ms
3:	learn: 0.5612029	total: 23.3ms	remaining: 268ms
4:	learn: 0.5336658	total: 29.3ms	remaining: 263ms
5:	learn: 0.5079086	total: 35.2ms	remaining: 258ms
6:	learn: 0.4834936	total: 40.8ms	remaining: 251ms
7:	learn: 0.4605836	total: 46.7ms	remaining: 245ms
8:	learn: 0.4395135	total: 52.4ms	remaining: 239ms
9:	learn: 0.4194479	total: 58.5ms	remaining: 234ms
10:	learn: 0.4010215	total: 64.3ms	remaining: 228ms
11:	learn: 0.3824431	total: 70.2ms	remaining: 222ms
12:	learn: 0.3652005	total: 76.1ms	remaining: 216ms
13:	learn: 0.3482896	total: 82.1ms	remaining: 211ms
14:	learn: 0.3327165	total: 88ms	remaining: 205ms
15:	learn: 0.3185677	total: 94.2ms	remaining: 200ms
16:	learn: 0.3045650	total: 100ms	remaining: 195ms
17:	learn: 0.2924611	total: 106ms	remaining: 189ms
18:	learn: 0.2799031	total: 112ms	remaining: 183ms
19:	learn: 0.2681053	total:

18:	learn: 0.2806495	total: 105ms	remaining: 171ms
19:	learn: 0.2689276	total: 111ms	remaining: 167ms
20:	learn: 0.2579163	total: 118ms	remaining: 163ms
21:	learn: 0.2476445	total: 125ms	remaining: 158ms
22:	learn: 0.2371247	total: 131ms	remaining: 154ms
23:	learn: 0.2278537	total: 138ms	remaining: 150ms
24:	learn: 0.2182533	total: 145ms	remaining: 145ms
25:	learn: 0.2092519	total: 152ms	remaining: 140ms
26:	learn: 0.2015073	total: 159ms	remaining: 135ms
27:	learn: 0.1936732	total: 166ms	remaining: 130ms
28:	learn: 0.1860299	total: 172ms	remaining: 125ms
29:	learn: 0.1785679	total: 179ms	remaining: 119ms
30:	learn: 0.1718436	total: 185ms	remaining: 113ms
31:	learn: 0.1653595	total: 191ms	remaining: 108ms
32:	learn: 0.1583218	total: 198ms	remaining: 102ms
33:	learn: 0.1515985	total: 204ms	remaining: 95.9ms
34:	learn: 0.1451996	total: 211ms	remaining: 90.5ms
35:	learn: 0.1392085	total: 218ms	remaining: 84.8ms
36:	learn: 0.1340906	total: 224ms	remaining: 78.8ms
37:	learn: 0.1285304	total:

34:	learn: 0.1470036	total: 201ms	remaining: 86.3ms
35:	learn: 0.1416446	total: 207ms	remaining: 80.7ms
36:	learn: 0.1362240	total: 214ms	remaining: 75.1ms
37:	learn: 0.1309188	total: 220ms	remaining: 69.4ms
38:	learn: 0.1261866	total: 226ms	remaining: 63.7ms
39:	learn: 0.1217321	total: 231ms	remaining: 57.9ms
40:	learn: 0.1177056	total: 237ms	remaining: 52.1ms
41:	learn: 0.1132031	total: 243ms	remaining: 46.3ms
42:	learn: 0.1091619	total: 249ms	remaining: 40.5ms
43:	learn: 0.1054787	total: 255ms	remaining: 34.8ms
44:	learn: 0.1013623	total: 261ms	remaining: 29ms
45:	learn: 0.0979480	total: 267ms	remaining: 23.2ms
46:	learn: 0.0938982	total: 272ms	remaining: 17.4ms
47:	learn: 0.0907578	total: 278ms	remaining: 11.6ms
48:	learn: 0.0876463	total: 284ms	remaining: 5.79ms
49:	learn: 0.0845066	total: 290ms	remaining: 0us
0:	learn: 0.6563890	total: 5.64ms	remaining: 276ms
1:	learn: 0.6229387	total: 11.2ms	remaining: 268ms
2:	learn: 0.5912317	total: 17.1ms	remaining: 267ms
3:	learn: 0.5608017	

45:	learn: 0.0940440	total: 272ms	remaining: 23.6ms
46:	learn: 0.0905675	total: 278ms	remaining: 17.7ms
47:	learn: 0.0874077	total: 284ms	remaining: 11.8ms
48:	learn: 0.0838459	total: 290ms	remaining: 5.92ms
49:	learn: 0.0811943	total: 296ms	remaining: 0us
0:	learn: 0.6564460	total: 6.24ms	remaining: 306ms
1:	learn: 0.6229622	total: 12.2ms	remaining: 292ms
2:	learn: 0.5912783	total: 18.2ms	remaining: 286ms
3:	learn: 0.5612029	total: 24.1ms	remaining: 277ms
4:	learn: 0.5336658	total: 30.1ms	remaining: 271ms
5:	learn: 0.5079086	total: 35.7ms	remaining: 262ms
6:	learn: 0.4834936	total: 41.5ms	remaining: 255ms
7:	learn: 0.4605836	total: 47.7ms	remaining: 250ms
8:	learn: 0.4395135	total: 53.5ms	remaining: 244ms
9:	learn: 0.4194479	total: 59.4ms	remaining: 238ms
10:	learn: 0.4010215	total: 65.2ms	remaining: 231ms
11:	learn: 0.3824431	total: 71.5ms	remaining: 226ms
12:	learn: 0.3652005	total: 77.3ms	remaining: 220ms
13:	learn: 0.3482896	total: 82.9ms	remaining: 213ms
14:	learn: 0.3327165	tota

18:	learn: 0.2806495	total: 112ms	remaining: 183ms
19:	learn: 0.2689276	total: 119ms	remaining: 178ms
20:	learn: 0.2579163	total: 125ms	remaining: 172ms
21:	learn: 0.2476445	total: 131ms	remaining: 166ms
22:	learn: 0.2371247	total: 137ms	remaining: 160ms
23:	learn: 0.2278537	total: 142ms	remaining: 154ms
24:	learn: 0.2182533	total: 148ms	remaining: 148ms
25:	learn: 0.2092519	total: 154ms	remaining: 143ms
26:	learn: 0.2015073	total: 161ms	remaining: 137ms
27:	learn: 0.1936732	total: 166ms	remaining: 131ms
28:	learn: 0.1860299	total: 172ms	remaining: 125ms
29:	learn: 0.1785679	total: 178ms	remaining: 119ms
30:	learn: 0.1718436	total: 184ms	remaining: 113ms
31:	learn: 0.1653595	total: 189ms	remaining: 107ms
32:	learn: 0.1583218	total: 195ms	remaining: 101ms
33:	learn: 0.1515985	total: 202ms	remaining: 94.9ms
34:	learn: 0.1451996	total: 208ms	remaining: 89ms
35:	learn: 0.1392085	total: 213ms	remaining: 83ms
36:	learn: 0.1340906	total: 219ms	remaining: 77ms
37:	learn: 0.1285304	total: 225ms

30:	learn: 0.1724841	total: 203ms	remaining: 124ms
31:	learn: 0.1660426	total: 209ms	remaining: 118ms
32:	learn: 0.1589711	total: 216ms	remaining: 111ms
33:	learn: 0.1526883	total: 221ms	remaining: 104ms
34:	learn: 0.1470036	total: 227ms	remaining: 97.4ms
35:	learn: 0.1416446	total: 232ms	remaining: 90.4ms
36:	learn: 0.1362240	total: 238ms	remaining: 83.7ms
37:	learn: 0.1309188	total: 244ms	remaining: 77ms
38:	learn: 0.1261866	total: 249ms	remaining: 70.3ms
39:	learn: 0.1217321	total: 255ms	remaining: 63.8ms
40:	learn: 0.1177056	total: 261ms	remaining: 57.2ms
41:	learn: 0.1132031	total: 266ms	remaining: 50.7ms
42:	learn: 0.1091619	total: 272ms	remaining: 44.2ms
43:	learn: 0.1054787	total: 278ms	remaining: 37.8ms
44:	learn: 0.1013623	total: 283ms	remaining: 31.4ms
45:	learn: 0.0979480	total: 289ms	remaining: 25.1ms
46:	learn: 0.0938982	total: 294ms	remaining: 18.8ms
47:	learn: 0.0907578	total: 300ms	remaining: 12.5ms
48:	learn: 0.0876463	total: 305ms	remaining: 6.23ms
49:	learn: 0.08450

1:	learn: 0.6229622	total: 12.2ms	remaining: 292ms
2:	learn: 0.5912783	total: 18.1ms	remaining: 284ms
3:	learn: 0.5612029	total: 24.1ms	remaining: 277ms
4:	learn: 0.5336658	total: 30.3ms	remaining: 272ms
5:	learn: 0.5079086	total: 36.1ms	remaining: 265ms
6:	learn: 0.4834936	total: 41.9ms	remaining: 257ms
7:	learn: 0.4605836	total: 47.7ms	remaining: 251ms
8:	learn: 0.4395135	total: 54.5ms	remaining: 248ms
9:	learn: 0.4194479	total: 60.9ms	remaining: 244ms
10:	learn: 0.4010215	total: 67.7ms	remaining: 240ms
11:	learn: 0.3824431	total: 74.3ms	remaining: 235ms
12:	learn: 0.3652005	total: 81.1ms	remaining: 231ms
13:	learn: 0.3482896	total: 87.8ms	remaining: 226ms
14:	learn: 0.3327165	total: 94.5ms	remaining: 221ms
15:	learn: 0.3185677	total: 102ms	remaining: 217ms
16:	learn: 0.3045650	total: 109ms	remaining: 212ms
17:	learn: 0.2924611	total: 116ms	remaining: 207ms
18:	learn: 0.2799031	total: 123ms	remaining: 200ms
19:	learn: 0.2681053	total: 130ms	remaining: 195ms
20:	learn: 0.2571474	total

25:	learn: 0.2092519	total: 154ms	remaining: 142ms
26:	learn: 0.2015073	total: 160ms	remaining: 136ms
27:	learn: 0.1936732	total: 166ms	remaining: 130ms
28:	learn: 0.1860299	total: 172ms	remaining: 124ms
29:	learn: 0.1785679	total: 178ms	remaining: 119ms
30:	learn: 0.1718436	total: 185ms	remaining: 113ms
31:	learn: 0.1653595	total: 191ms	remaining: 107ms
32:	learn: 0.1583218	total: 197ms	remaining: 101ms
33:	learn: 0.1515985	total: 203ms	remaining: 95.7ms
34:	learn: 0.1451996	total: 210ms	remaining: 89.9ms
35:	learn: 0.1392085	total: 216ms	remaining: 83.9ms
36:	learn: 0.1340906	total: 221ms	remaining: 77.8ms
37:	learn: 0.1285304	total: 228ms	remaining: 71.9ms
38:	learn: 0.1233354	total: 234ms	remaining: 65.9ms
39:	learn: 0.1190838	total: 240ms	remaining: 59.9ms
40:	learn: 0.1149072	total: 246ms	remaining: 53.9ms
41:	learn: 0.1105054	total: 252ms	remaining: 47.9ms
42:	learn: 0.1059476	total: 258ms	remaining: 42ms
43:	learn: 0.1018932	total: 264ms	remaining: 36ms
44:	learn: 0.0980671	tot

48:	learn: 0.0876463	total: 290ms	remaining: 5.92ms
49:	learn: 0.0845066	total: 296ms	remaining: 0us
0:	learn: 0.6563890	total: 6.21ms	remaining: 304ms
1:	learn: 0.6229387	total: 12.2ms	remaining: 293ms
2:	learn: 0.5912317	total: 18ms	remaining: 282ms
3:	learn: 0.5608017	total: 23.9ms	remaining: 274ms
4:	learn: 0.5332978	total: 29.7ms	remaining: 267ms
5:	learn: 0.5076480	total: 35.6ms	remaining: 261ms
6:	learn: 0.4832563	total: 41.3ms	remaining: 254ms
7:	learn: 0.4603516	total: 47ms	remaining: 247ms
8:	learn: 0.4390959	total: 52.9ms	remaining: 241ms
9:	learn: 0.4184791	total: 58.9ms	remaining: 235ms
10:	learn: 0.4000957	total: 64.8ms	remaining: 230ms
11:	learn: 0.3815892	total: 70.6ms	remaining: 224ms
12:	learn: 0.3644186	total: 76.7ms	remaining: 218ms
13:	learn: 0.3479641	total: 83ms	remaining: 213ms
14:	learn: 0.3324030	total: 89.6ms	remaining: 209ms
15:	learn: 0.3179935	total: 95.4ms	remaining: 203ms
16:	learn: 0.3043760	total: 101ms	remaining: 197ms
17:	learn: 0.2923100	total: 107m

22:	learn: 0.2357082	total: 130ms	remaining: 153ms
23:	learn: 0.2266724	total: 136ms	remaining: 147ms
24:	learn: 0.2167712	total: 141ms	remaining: 141ms
25:	learn: 0.2078050	total: 147ms	remaining: 136ms
26:	learn: 0.2000660	total: 153ms	remaining: 130ms
27:	learn: 0.1916793	total: 158ms	remaining: 125ms
28:	learn: 0.1846077	total: 164ms	remaining: 119ms
29:	learn: 0.1773447	total: 170ms	remaining: 113ms
30:	learn: 0.1705900	total: 176ms	remaining: 108ms
31:	learn: 0.1641072	total: 181ms	remaining: 102ms
32:	learn: 0.1573360	total: 187ms	remaining: 96.2ms
33:	learn: 0.1513883	total: 192ms	remaining: 90.5ms
34:	learn: 0.1450616	total: 198ms	remaining: 84.8ms
35:	learn: 0.1393184	total: 204ms	remaining: 79.2ms
36:	learn: 0.1336008	total: 209ms	remaining: 73.6ms
37:	learn: 0.1280697	total: 215ms	remaining: 68ms
38:	learn: 0.1229861	total: 221ms	remaining: 62.3ms
39:	learn: 0.1183262	total: 227ms	remaining: 56.7ms
40:	learn: 0.1140964	total: 232ms	remaining: 51ms
41:	learn: 0.1096039	total

35:	learn: 0.1392085	total: 215ms	remaining: 83.7ms
36:	learn: 0.1340906	total: 221ms	remaining: 77.8ms
37:	learn: 0.1285304	total: 227ms	remaining: 71.8ms
38:	learn: 0.1233354	total: 233ms	remaining: 65.9ms
39:	learn: 0.1190838	total: 240ms	remaining: 59.9ms
40:	learn: 0.1149072	total: 246ms	remaining: 53.9ms
41:	learn: 0.1105054	total: 252ms	remaining: 47.9ms
42:	learn: 0.1059476	total: 258ms	remaining: 42ms
43:	learn: 0.1018932	total: 264ms	remaining: 36ms
44:	learn: 0.0980671	total: 270ms	remaining: 30ms
45:	learn: 0.0942157	total: 276ms	remaining: 24ms
46:	learn: 0.0907342	total: 282ms	remaining: 18ms
47:	learn: 0.0876705	total: 287ms	remaining: 12ms
48:	learn: 0.0846096	total: 293ms	remaining: 5.98ms
49:	learn: 0.0818384	total: 299ms	remaining: 0us
0:	learn: 0.6562349	total: 6.12ms	remaining: 300ms
1:	learn: 0.6223905	total: 12ms	remaining: 289ms
2:	learn: 0.5896276	total: 17.9ms	remaining: 280ms
3:	learn: 0.5599896	total: 23.6ms	remaining: 271ms
4:	learn: 0.5331899	total: 29.7ms

48:	learn: 0.0876463	total: 310ms	remaining: 6.32ms
49:	learn: 0.0845066	total: 316ms	remaining: 0us
0:	learn: 0.6563890	total: 6.11ms	remaining: 299ms
1:	learn: 0.6229387	total: 12.7ms	remaining: 304ms
2:	learn: 0.5912317	total: 18.4ms	remaining: 288ms
3:	learn: 0.5608017	total: 24ms	remaining: 276ms
4:	learn: 0.5332978	total: 29.7ms	remaining: 267ms
5:	learn: 0.5076480	total: 35.6ms	remaining: 261ms
6:	learn: 0.4832563	total: 42.1ms	remaining: 259ms
7:	learn: 0.4603516	total: 47.8ms	remaining: 251ms
8:	learn: 0.4390959	total: 54ms	remaining: 246ms
9:	learn: 0.4184791	total: 59.9ms	remaining: 240ms
10:	learn: 0.4000957	total: 65.8ms	remaining: 233ms
11:	learn: 0.3815892	total: 71.7ms	remaining: 227ms
12:	learn: 0.3644186	total: 77.5ms	remaining: 221ms
13:	learn: 0.3479641	total: 83.2ms	remaining: 214ms
14:	learn: 0.3324030	total: 89.2ms	remaining: 208ms
15:	learn: 0.3179935	total: 94.9ms	remaining: 202ms
16:	learn: 0.3043760	total: 100ms	remaining: 195ms
17:	learn: 0.2923100	total: 10

In [80]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99628 (+/- 0.00188)
F1 Score: 0.98033 (+/- 0.01015)
Precision: 0.99845 (+/- 0.00287)
Recall: 0.96291 (+/- 0.02000)
