## Feature Selection using Random Shuffling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score

## Read Data

In [2]:
data = pd.read_csv('../Kyoto_Train.csv')
data.shape

(124055, 24)

In [3]:
data.head()

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_port_rate,...,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code,Label_code
0,2.863309,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,90847.0,14024.0,25836.0,1.0,0.0
1,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,218531.0,8968.0,45541.0,1.0,0.0
2,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,176665.0,15997.0,59860.0,1.0,0.0
3,0.0,0,0,0,0.0,0.0,0.67,49,100,0.02,...,6.0,6.0,0.0,0.0,0.0,52769.0,473.0,40649.0,1.0,0.0
4,0.0,0,0,1,1.0,0.0,0.36,0,2,0.0,...,6.0,0.0,0.0,0.0,0.0,65048.0,16609.0,39283.0,1.0,0.0


### Train - Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Label_code'], axis=1),
    data['Label_code'],
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((99244, 23), (24811, 23))

In [5]:
# Reset the indexes of the returned datasets
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

### Train ML algo with all features

In [6]:
rf = RandomForestClassifier(
    n_estimators=50, max_depth=2, random_state=2909, n_jobs=4)

rf.fit(X_train, y_train)

# print roc-auc in train and testing sets
print('train auc score: ',
      roc_auc_score(y_train, (rf.predict_proba(X_train.fillna(0)))[:, 1]))
print('test auc score: ',
      roc_auc_score(y_test, (rf.predict_proba(X_test.fillna(0)))[:, 1]))

train auc score:  0.9463076608966356
test auc score:  0.9459831147990166


### Shuffling resources and assessing performance loss

In [7]:
# overall train roc-auc: using all the features
train_roc = roc_auc_score(y_train, (rf.predict_proba(X_train))[:, 1])

# list to capture the performance shift
performance_shift = []

# selection  logic
for feature in X_train.columns:

    X_train_c = X_train.copy()

    # shuffle individual feature
    X_train_c[feature] = X_train_c[feature].sample(
        frac=1, random_state=10).reset_index(drop=True)

    # make prediction with shuffled feature and calculate roc-auc
    shuff_roc = roc_auc_score(y_train, rf.predict_proba(X_train_c)[:, 1])
    
    drift = train_roc - shuff_roc

    # save the drop in roc-auc
    performance_shift.append(drift)

In [8]:
# list of performances
performance_shift

[0.0002901670390693667,
 0.008205885494790288,
 0.0044710399999543515,
 0.00030743180318792973,
 0.003666586112004877,
 -0.0006455768879178869,
 0.0028087196010633253,
 0.01930730857158125,
 0.000665130332239805,
 0.00044121993251555747,
 0.0008276728986218451,
 -0.0006172056772535406,
 0.0021905350947959423,
 0.0030121950171231315,
 0.006937048427245074,
 0.00890084934891866,
 2.0593375995803243e-05,
 0.0,
 0.0,
 0.05044588598076416,
 0.0027546965447043936,
 0.0,
 -0.0021010873265745467]

In [9]:
# Transform the list into a pandas Series for easy manipulation
feature_importance = pd.Series(performance_shift)

# add variable names in the index
feature_importance.index = X_train.columns

feature_importance.head()

Duration             0.000290
Source               0.008206
Destination_bytes    0.004471
Count                0.000307
Same_srv_rate        0.003667
dtype: float64

In [10]:
# Sort the dataframe according to the drop in performance # caused by feature shuffling
feature_importance.sort_values(ascending=False)

Source_IP_Address_code         0.050446
Dst_host_count                 0.019307
Flag_code                      0.008901
Source                         0.008206
Service_code                   0.006937
Destination_bytes              0.004471
Same_srv_rate                  0.003667
Destination_Port_Number        0.003012
Srv_serror_rate                0.002809
Destination_IP_Address_code    0.002755
Source_Port_Number             0.002191
Dst_host_serror_rate           0.000828
Dst_host_srv_count             0.000665
Dst_host_same_port_rate        0.000441
Count                          0.000307
Duration                       0.000290
IDS_detection_code             0.000021
Start_Time_code                0.000000
Malware_detection_code         0.000000
Ashula_detection_code          0.000000
Dst_host_srv_serror_rate      -0.000617
Serror_rate                   -0.000646
Protocol_code                 -0.002101
dtype: float64

In [11]:
# List the top 10 features that caused the major drop in the roc-auc (aka model performance)

feature_importance.sort_values(ascending=False).head(10)

Source_IP_Address_code         0.050446
Dst_host_count                 0.019307
Flag_code                      0.008901
Source                         0.008206
Service_code                   0.006937
Destination_bytes              0.004471
Same_srv_rate                  0.003667
Destination_Port_Number        0.003012
Srv_serror_rate                0.002809
Destination_IP_Address_code    0.002755
dtype: float64

In [12]:
# original number of features (rows in this case)
feature_importance.shape[0]

23

In [13]:
# number of features that cause a drop in performance when shuffled

feature_importance[feature_importance>0].shape[0]

17

17 out of the 23 features caused a drop in the performance of the random forests when their values were permuted. This means that we could select those features and discard the rest, and should keep the original random forest performance. 

In [14]:
# print the important features

feature_importance[feature_importance>0].index

Index(['Duration', 'Source', 'Destination_bytes', 'Count', 'Same_srv_rate',
       'Srv_serror_rate', 'Dst_host_count', 'Dst_host_srv_count',
       'Dst_host_same_port_rate', 'Dst_host_serror_rate', 'Source_Port_Number',
       'Destination_Port_Number', 'Service_code', 'Flag_code',
       'IDS_detection_code', 'Source_IP_Address_code',
       'Destination_IP_Address_code'],
      dtype='object')

### Select features

In [15]:
# Building a random forests only with the selected features capture the selected features

selected_features = feature_importance[feature_importance > 0].index

# train a new random forests using only the selected features
rf = RandomForestClassifier(n_estimators=50,
                            max_depth=2,
                            random_state=2909,
                            n_jobs=4)

rf.fit(X_train[selected_features], y_train)

# print roc-auc in train and testing sets
print('train auc score: ', roc_auc_score(y_train, (rf.predict_proba(X_train[selected_features]))[:,1]))
print('test auc score: ', roc_auc_score(y_test, (rf.predict_proba(X_test[selected_features]))[:,1]))

train auc score:  0.9622166566597714
test auc score:  0.9624599048883167


The random forests with the selected features show a similar performance (or even slightly higher) to the random forests built using all of the features. And it provides a simpler, faster and more reliable model.

In [16]:
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [17]:
X_train.shape, X_test.shape

((99244, 17), (24811, 17))

## Standardize Data




In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers




In [19]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation




In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression





In [21]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1).fit(X_train, y_train)

CPU times: user 71.1 ms, sys: 211 ms, total: 282 ms
Wall time: 1.79 s


In [22]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.2134537100479626
F1 Score: 0.1875182147466589
FPR: 0.8645666488699056
TPR: 0.9644539614561027


### Naive Bayes





In [23]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-05).fit(X_train, y_train)

CPU times: user 32.6 ms, sys: 7.83 ms, total: 40.5 ms
Wall time: 38.7 ms


In [24]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.0
FPR: 1.0
TPR: 1.0


### Random Forest




In [25]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 5.78 s, sys: 62.6 ms, total: 5.85 s
Wall time: 5.85 s


In [26]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


### KNN





In [27]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform').fit(X_train, y_train)

CPU times: user 6.34 s, sys: 29.2 ms, total: 6.37 s
Wall time: 6.35 s


In [28]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.9058885171899561
f1: 0.0
fpr: 1.0
tpr: 1.0


### CatBoost




In [29]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6565281	total: 62.1ms	remaining: 3.04s
1:	learn: 0.6229214	total: 70.8ms	remaining: 1.7s
2:	learn: 0.5908028	total: 79.1ms	remaining: 1.24s
3:	learn: 0.5605137	total: 87.3ms	remaining: 1s
4:	learn: 0.5321501	total: 95.8ms	remaining: 862ms
5:	learn: 0.5070266	total: 104ms	remaining: 766ms
6:	learn: 0.4821326	total: 112ms	remaining: 689ms
7:	learn: 0.4590912	total: 120ms	remaining: 630ms
8:	learn: 0.4373909	total: 128ms	remaining: 583ms
9:	learn: 0.4170320	total: 136ms	remaining: 542ms
10:	learn: 0.3982614	total: 144ms	remaining: 509ms
11:	learn: 0.3792286	total: 153ms	remaining: 483ms
12:	learn: 0.3627073	total: 161ms	remaining: 458ms
13:	learn: 0.3466191	total: 170ms	remaining: 437ms
14:	learn: 0.3311118	total: 178ms	remaining: 414ms
15:	learn: 0.3162409	total: 186ms	remaining: 396ms
16:	learn: 0.3032527	total: 195ms	remaining: 378ms
17:	learn: 0.2895732	total: 203ms	remaining: 360ms
18:	learn: 0.2780523	total: 211ms	remaining: 344ms
19:	learn: 0.2663186	total: 219ms	remain

In [30]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


## Model Evaluation





In [31]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../Kyoto_Test.csv")
test_df.shape

(62028, 24)

In [32]:
# Create feature matrix X and target vextor y
y_eval = test_df['Label_code']
X_eval = test_df.drop(columns=['Label_code'])

In [33]:
X_eval = X_eval[selected_features]

In [34]:
X_eval.shape

(62028, 17)

### Model Evaluation - Logistic Regression





In [35]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=1, n_jobs=-1, random_state=42)

In [36]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [37]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9262826971907622
Testing accuracy is  0.2134537100479626


In [38]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.2134537100479626
F1 Score: 0.1875182147466589
Precision Score: 0.1038553772366722
Recall Score: 0.9644539614561027
Confusion Matrix:
 [[ 3044 19432]
 [   83  2252]]


### Cross validation - Logistic Regression





In [39]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.89864 (+/- 0.00354)
F1 Score: 0.00032 (+/- 0.00190)
Precision: 0.00303 (+/- 0.01818)
Recall: 0.00017 (+/- 0.00100)


### Model Evaluation - Naive Bayes





In [40]:
modelNB = GaussianNB(var_smoothing=1e-05)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-05)

In [41]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [42]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.6822477933174802
Testing accuracy is  0.9058885171899561


In [43]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Cross validation - Naive Bayes






In [44]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.51847 (+/- 0.28052)
F1 Score: 0.26001 (+/- 0.02883)
Precision: 0.21418 (+/- 0.38986)
Recall: 0.86323 (+/- 0.46202)


### Model Evaluation - Random Forest




In [45]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [46]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [47]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.9058885171899561


In [48]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - Random Forest




In [49]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99953 (+/- 0.00042)
F1 Score: 0.99757 (+/- 0.00218)
Precision: 0.99950 (+/- 0.00215)
Recall: 0.99565 (+/- 0.00341)


### Model Evaluation - KNN




In [50]:
modelKNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(leaf_size=1, n_neighbors=2)

In [51]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [52]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9961005199306759
Testing accuracy is  0.9058885171899561


In [53]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Cross validation - KNN




In [54]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98862 (+/- 0.00240)
F1 Score: 0.93916 (+/- 0.01303)
Precision: 0.96960 (+/- 0.01296)
Recall: 0.91061 (+/- 0.01713)


### Model Evaluation - CatBoost





In [55]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6565281	total: 7.64ms	remaining: 374ms
1:	learn: 0.6229214	total: 15.4ms	remaining: 370ms
2:	learn: 0.5908028	total: 22.9ms	remaining: 359ms
3:	learn: 0.5605137	total: 30.9ms	remaining: 355ms
4:	learn: 0.5321501	total: 38.2ms	remaining: 344ms
5:	learn: 0.5070266	total: 46ms	remaining: 338ms
6:	learn: 0.4821326	total: 53.8ms	remaining: 331ms
7:	learn: 0.4590912	total: 62.3ms	remaining: 327ms
8:	learn: 0.4373909	total: 70.1ms	remaining: 319ms
9:	learn: 0.4170320	total: 78.2ms	remaining: 313ms
10:	learn: 0.3982614	total: 86ms	remaining: 305ms
11:	learn: 0.3792286	total: 93.8ms	remaining: 297ms
12:	learn: 0.3627073	total: 101ms	remaining: 289ms
13:	learn: 0.3466191	total: 110ms	remaining: 282ms
14:	learn: 0.3311118	total: 118ms	remaining: 275ms
15:	learn: 0.3162409	total: 126ms	remaining: 268ms
16:	learn: 0.3032527	total: 134ms	remaining: 261ms
17:	learn: 0.2895732	total: 142ms	remaining: 253ms
18:	learn: 0.2780523	total: 151ms	remaining: 246ms
19:	learn: 0.2663186	total: 160ms

<catboost.core.CatBoostClassifier at 0x7fe7380f3e80>

In [56]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [57]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.9969670710571924
Testing accuracy is  0.9058885171899561


In [58]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - CatBoost






In [59]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6559864	total: 6.22ms	remaining: 305ms
1:	learn: 0.6224722	total: 12.4ms	remaining: 297ms
2:	learn: 0.5905940	total: 18.3ms	remaining: 286ms
3:	learn: 0.5602410	total: 24.1ms	remaining: 277ms
4:	learn: 0.5328195	total: 30.3ms	remaining: 272ms
5:	learn: 0.5078294	total: 36.1ms	remaining: 265ms
6:	learn: 0.4827645	total: 42.2ms	remaining: 259ms
7:	learn: 0.4606035	total: 48.3ms	remaining: 253ms
8:	learn: 0.4387625	total: 54.3ms	remaining: 247ms
9:	learn: 0.4185852	total: 59.9ms	remaining: 240ms
10:	learn: 0.3997686	total: 65.7ms	remaining: 233ms
11:	learn: 0.3823886	total: 71.5ms	remaining: 226ms
12:	learn: 0.3652511	total: 77.5ms	remaining: 220ms
13:	learn: 0.3492696	total: 83.2ms	remaining: 214ms
14:	learn: 0.3335665	total: 89ms	remaining: 208ms
15:	learn: 0.3185183	total: 95ms	remaining: 202ms
16:	learn: 0.3048804	total: 101ms	remaining: 197ms
17:	learn: 0.2923353	total: 107ms	remaining: 190ms
18:	learn: 0.2808006	total: 114ms	remaining: 186ms
19:	learn: 0.2693321	total: 1

13:	learn: 0.3489959	total: 83.5ms	remaining: 215ms
14:	learn: 0.3330638	total: 89.7ms	remaining: 209ms
15:	learn: 0.3194700	total: 95.8ms	remaining: 204ms
16:	learn: 0.3063656	total: 102ms	remaining: 197ms
17:	learn: 0.2937731	total: 108ms	remaining: 191ms
18:	learn: 0.2819150	total: 113ms	remaining: 185ms
19:	learn: 0.2701111	total: 120ms	remaining: 180ms
20:	learn: 0.2596357	total: 126ms	remaining: 174ms
21:	learn: 0.2487490	total: 132ms	remaining: 168ms
22:	learn: 0.2381685	total: 138ms	remaining: 162ms
23:	learn: 0.2286793	total: 144ms	remaining: 156ms
24:	learn: 0.2191001	total: 150ms	remaining: 150ms
25:	learn: 0.2105588	total: 155ms	remaining: 144ms
26:	learn: 0.2024326	total: 161ms	remaining: 138ms
27:	learn: 0.1936558	total: 167ms	remaining: 131ms
28:	learn: 0.1860381	total: 173ms	remaining: 125ms
29:	learn: 0.1793799	total: 179ms	remaining: 119ms
30:	learn: 0.1717522	total: 185ms	remaining: 113ms
31:	learn: 0.1657620	total: 191ms	remaining: 107ms
32:	learn: 0.1593845	total: 

33:	learn: 0.1532094	total: 201ms	remaining: 94.7ms
34:	learn: 0.1474242	total: 207ms	remaining: 88.9ms
35:	learn: 0.1423692	total: 213ms	remaining: 83ms
36:	learn: 0.1371298	total: 219ms	remaining: 77ms
37:	learn: 0.1320397	total: 225ms	remaining: 71.1ms
38:	learn: 0.1271725	total: 231ms	remaining: 65.2ms
39:	learn: 0.1226313	total: 237ms	remaining: 59.3ms
40:	learn: 0.1182366	total: 243ms	remaining: 53.3ms
41:	learn: 0.1134022	total: 249ms	remaining: 47.4ms
42:	learn: 0.1086409	total: 255ms	remaining: 41.5ms
43:	learn: 0.1040633	total: 261ms	remaining: 35.6ms
44:	learn: 0.1006256	total: 267ms	remaining: 29.7ms
45:	learn: 0.0967846	total: 273ms	remaining: 23.7ms
46:	learn: 0.0935889	total: 279ms	remaining: 17.8ms
47:	learn: 0.0903397	total: 285ms	remaining: 11.9ms
48:	learn: 0.0870472	total: 291ms	remaining: 5.94ms
49:	learn: 0.0834867	total: 297ms	remaining: 0us
0:	learn: 0.6558717	total: 6.03ms	remaining: 296ms
1:	learn: 0.6221516	total: 12.1ms	remaining: 290ms
2:	learn: 0.5902999	t

8:	learn: 0.4387625	total: 54.2ms	remaining: 247ms
9:	learn: 0.4185852	total: 60ms	remaining: 240ms
10:	learn: 0.3997686	total: 65.6ms	remaining: 233ms
11:	learn: 0.3823886	total: 71.7ms	remaining: 227ms
12:	learn: 0.3652511	total: 77.6ms	remaining: 221ms
13:	learn: 0.3492696	total: 83.2ms	remaining: 214ms
14:	learn: 0.3335665	total: 89.5ms	remaining: 209ms
15:	learn: 0.3185183	total: 95.7ms	remaining: 203ms
16:	learn: 0.3048804	total: 102ms	remaining: 198ms
17:	learn: 0.2923353	total: 108ms	remaining: 192ms
18:	learn: 0.2808006	total: 114ms	remaining: 186ms
19:	learn: 0.2693321	total: 120ms	remaining: 181ms
20:	learn: 0.2590187	total: 126ms	remaining: 175ms
21:	learn: 0.2478637	total: 133ms	remaining: 169ms
22:	learn: 0.2375244	total: 138ms	remaining: 162ms
23:	learn: 0.2281078	total: 144ms	remaining: 156ms
24:	learn: 0.2191425	total: 150ms	remaining: 150ms
25:	learn: 0.2104705	total: 156ms	remaining: 144ms
26:	learn: 0.2027808	total: 162ms	remaining: 138ms
27:	learn: 0.1943146	total:

33:	learn: 0.1533978	total: 215ms	remaining: 101ms
34:	learn: 0.1471464	total: 221ms	remaining: 94.6ms
35:	learn: 0.1411219	total: 226ms	remaining: 88ms
36:	learn: 0.1357510	total: 232ms	remaining: 81.5ms
37:	learn: 0.1312425	total: 238ms	remaining: 75.3ms
38:	learn: 0.1258263	total: 244ms	remaining: 68.9ms
39:	learn: 0.1206282	total: 250ms	remaining: 62.6ms
40:	learn: 0.1166692	total: 256ms	remaining: 56.3ms
41:	learn: 0.1120373	total: 262ms	remaining: 49.9ms
42:	learn: 0.1075229	total: 268ms	remaining: 43.6ms
43:	learn: 0.1039783	total: 274ms	remaining: 37.4ms
44:	learn: 0.0998697	total: 280ms	remaining: 31.1ms
45:	learn: 0.0961000	total: 286ms	remaining: 24.9ms
46:	learn: 0.0925548	total: 292ms	remaining: 18.6ms
47:	learn: 0.0893640	total: 298ms	remaining: 12.4ms
48:	learn: 0.0861517	total: 304ms	remaining: 6.2ms
49:	learn: 0.0831298	total: 310ms	remaining: 0us
0:	learn: 0.6558662	total: 5.87ms	remaining: 288ms
1:	learn: 0.6223882	total: 11.8ms	remaining: 283ms
2:	learn: 0.5908029	t

46:	learn: 0.0935889	total: 275ms	remaining: 17.5ms
47:	learn: 0.0903397	total: 283ms	remaining: 11.8ms
48:	learn: 0.0870472	total: 289ms	remaining: 5.9ms
49:	learn: 0.0834867	total: 295ms	remaining: 0us
0:	learn: 0.6558717	total: 5.98ms	remaining: 293ms
1:	learn: 0.6221516	total: 11.9ms	remaining: 285ms
2:	learn: 0.5902999	total: 17.6ms	remaining: 276ms
3:	learn: 0.5599462	total: 23.4ms	remaining: 269ms
4:	learn: 0.5323254	total: 29.1ms	remaining: 261ms
5:	learn: 0.5073679	total: 35ms	remaining: 256ms
6:	learn: 0.4820993	total: 40.8ms	remaining: 251ms
7:	learn: 0.4594464	total: 46.6ms	remaining: 245ms
8:	learn: 0.4376859	total: 52.3ms	remaining: 238ms
9:	learn: 0.4175675	total: 58.1ms	remaining: 232ms
10:	learn: 0.3988414	total: 63.8ms	remaining: 226ms
11:	learn: 0.3814933	total: 69.9ms	remaining: 221ms
12:	learn: 0.3649083	total: 75.6ms	remaining: 215ms
13:	learn: 0.3490137	total: 81.3ms	remaining: 209ms
14:	learn: 0.3330778	total: 87.2ms	remaining: 203ms
15:	learn: 0.3181384	total: 

8:	learn: 0.4387625	total: 53.9ms	remaining: 245ms
9:	learn: 0.4185852	total: 59.5ms	remaining: 238ms
10:	learn: 0.3997686	total: 65.2ms	remaining: 231ms
11:	learn: 0.3823886	total: 71.3ms	remaining: 226ms
12:	learn: 0.3652511	total: 77.4ms	remaining: 220ms
13:	learn: 0.3492696	total: 83.2ms	remaining: 214ms
14:	learn: 0.3335665	total: 89.2ms	remaining: 208ms
15:	learn: 0.3185183	total: 95.2ms	remaining: 202ms
16:	learn: 0.3048804	total: 101ms	remaining: 197ms
17:	learn: 0.2923353	total: 108ms	remaining: 192ms
18:	learn: 0.2808006	total: 114ms	remaining: 186ms
19:	learn: 0.2693321	total: 120ms	remaining: 180ms
20:	learn: 0.2590187	total: 126ms	remaining: 174ms
21:	learn: 0.2478637	total: 132ms	remaining: 168ms
22:	learn: 0.2375244	total: 138ms	remaining: 162ms
23:	learn: 0.2281078	total: 144ms	remaining: 156ms
24:	learn: 0.2191425	total: 150ms	remaining: 150ms
25:	learn: 0.2104705	total: 155ms	remaining: 143ms
26:	learn: 0.2027808	total: 161ms	remaining: 137ms
27:	learn: 0.1943146	tota

24:	learn: 0.2191001	total: 145ms	remaining: 145ms
25:	learn: 0.2105588	total: 151ms	remaining: 139ms
26:	learn: 0.2024326	total: 156ms	remaining: 133ms
27:	learn: 0.1936558	total: 162ms	remaining: 127ms
28:	learn: 0.1860381	total: 168ms	remaining: 121ms
29:	learn: 0.1793799	total: 173ms	remaining: 116ms
30:	learn: 0.1717522	total: 179ms	remaining: 110ms
31:	learn: 0.1657620	total: 185ms	remaining: 104ms
32:	learn: 0.1593845	total: 190ms	remaining: 98.1ms
33:	learn: 0.1533978	total: 196ms	remaining: 92.3ms
34:	learn: 0.1471464	total: 202ms	remaining: 86.5ms
35:	learn: 0.1411219	total: 207ms	remaining: 80.5ms
36:	learn: 0.1357510	total: 213ms	remaining: 74.7ms
37:	learn: 0.1312425	total: 218ms	remaining: 68.9ms
38:	learn: 0.1258263	total: 224ms	remaining: 63.2ms
39:	learn: 0.1206282	total: 230ms	remaining: 57.5ms
40:	learn: 0.1166692	total: 235ms	remaining: 51.7ms
41:	learn: 0.1120373	total: 241ms	remaining: 46ms
42:	learn: 0.1075229	total: 247ms	remaining: 40.2ms
43:	learn: 0.1039783	t

1:	learn: 0.6221516	total: 12.5ms	remaining: 300ms
2:	learn: 0.5902999	total: 18.4ms	remaining: 288ms
3:	learn: 0.5599462	total: 24.4ms	remaining: 281ms
4:	learn: 0.5323254	total: 30ms	remaining: 270ms
5:	learn: 0.5073679	total: 36ms	remaining: 264ms
6:	learn: 0.4820993	total: 42ms	remaining: 258ms
7:	learn: 0.4594464	total: 47.9ms	remaining: 251ms
8:	learn: 0.4376859	total: 53.6ms	remaining: 244ms
9:	learn: 0.4175675	total: 59.3ms	remaining: 237ms
10:	learn: 0.3988414	total: 64.9ms	remaining: 230ms
11:	learn: 0.3814933	total: 70.4ms	remaining: 223ms
12:	learn: 0.3649083	total: 76.2ms	remaining: 217ms
13:	learn: 0.3490137	total: 81.8ms	remaining: 210ms
14:	learn: 0.3330778	total: 87.2ms	remaining: 203ms
15:	learn: 0.3181384	total: 92.7ms	remaining: 197ms
16:	learn: 0.3046470	total: 98.3ms	remaining: 191ms
17:	learn: 0.2921299	total: 104ms	remaining: 185ms
18:	learn: 0.2804744	total: 110ms	remaining: 179ms
19:	learn: 0.2687667	total: 115ms	remaining: 173ms
20:	learn: 0.2582737	total: 12

34:	learn: 0.1478941	total: 201ms	remaining: 86ms
35:	learn: 0.1419772	total: 206ms	remaining: 80.3ms
36:	learn: 0.1366178	total: 212ms	remaining: 74.5ms
37:	learn: 0.1319563	total: 218ms	remaining: 68.8ms
38:	learn: 0.1272942	total: 224ms	remaining: 63.1ms
39:	learn: 0.1220325	total: 229ms	remaining: 57.4ms
40:	learn: 0.1179810	total: 235ms	remaining: 51.7ms
41:	learn: 0.1132694	total: 241ms	remaining: 45.9ms
42:	learn: 0.1088096	total: 247ms	remaining: 40.2ms
43:	learn: 0.1052081	total: 253ms	remaining: 34.4ms
44:	learn: 0.1008625	total: 258ms	remaining: 28.7ms
45:	learn: 0.0973311	total: 264ms	remaining: 22.9ms
46:	learn: 0.0938118	total: 269ms	remaining: 17.2ms
47:	learn: 0.0901156	total: 275ms	remaining: 11.5ms
48:	learn: 0.0869076	total: 281ms	remaining: 5.73ms
49:	learn: 0.0841618	total: 287ms	remaining: 0us
0:	learn: 0.6560489	total: 5.99ms	remaining: 294ms
1:	learn: 0.6226092	total: 11.9ms	remaining: 285ms
2:	learn: 0.5909950	total: 19.1ms	remaining: 299ms
3:	learn: 0.5608007	

9:	learn: 0.4198670	total: 60.8ms	remaining: 243ms
10:	learn: 0.4010493	total: 66.5ms	remaining: 236ms
11:	learn: 0.3828360	total: 72.1ms	remaining: 228ms
12:	learn: 0.3662160	total: 77.7ms	remaining: 221ms
13:	learn: 0.3499351	total: 83.5ms	remaining: 215ms
14:	learn: 0.3344584	total: 89.3ms	remaining: 208ms
15:	learn: 0.3192385	total: 94.8ms	remaining: 202ms
16:	learn: 0.3061391	total: 101ms	remaining: 195ms
17:	learn: 0.2932005	total: 106ms	remaining: 189ms
18:	learn: 0.2816608	total: 112ms	remaining: 183ms
19:	learn: 0.2698659	total: 118ms	remaining: 177ms
20:	learn: 0.2595607	total: 124ms	remaining: 171ms
21:	learn: 0.2484659	total: 130ms	remaining: 165ms
22:	learn: 0.2377916	total: 136ms	remaining: 159ms
23:	learn: 0.2283877	total: 141ms	remaining: 153ms
24:	learn: 0.2194031	total: 147ms	remaining: 147ms
25:	learn: 0.2102248	total: 153ms	remaining: 141ms
26:	learn: 0.2012029	total: 159ms	remaining: 135ms
27:	learn: 0.1925403	total: 165ms	remaining: 129ms
28:	learn: 0.1851314	tota

44:	learn: 0.0993829	total: 263ms	remaining: 29.2ms
45:	learn: 0.0958813	total: 268ms	remaining: 23.3ms
46:	learn: 0.0923695	total: 274ms	remaining: 17.5ms
47:	learn: 0.0891952	total: 280ms	remaining: 11.7ms
48:	learn: 0.0861402	total: 286ms	remaining: 5.83ms
49:	learn: 0.0833618	total: 292ms	remaining: 0us
0:	learn: 0.6559984	total: 6.05ms	remaining: 297ms
1:	learn: 0.6225216	total: 13.4ms	remaining: 321ms
2:	learn: 0.5908908	total: 19ms	remaining: 298ms
3:	learn: 0.5606767	total: 24.8ms	remaining: 285ms
4:	learn: 0.5326164	total: 30.5ms	remaining: 275ms
5:	learn: 0.5075830	total: 36.3ms	remaining: 266ms
6:	learn: 0.4826310	total: 41.8ms	remaining: 257ms
7:	learn: 0.4606766	total: 47.4ms	remaining: 249ms
8:	learn: 0.4395824	total: 53ms	remaining: 241ms
9:	learn: 0.4192299	total: 58.4ms	remaining: 234ms
10:	learn: 0.4003914	total: 64ms	remaining: 227ms
11:	learn: 0.3821637	total: 69.6ms	remaining: 220ms
12:	learn: 0.3655766	total: 75.3ms	remaining: 214ms
13:	learn: 0.3494933	total: 81.

In [60]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99661 (+/- 0.00150)
F1 Score: 0.98215 (+/- 0.00803)
Precision: 0.99862 (+/- 0.00337)
Recall: 0.96625 (+/- 0.01553)
