## Feature Selection using Random Shuffling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score

## Read Data

In [2]:
data = pd.read_csv('../UNSW_Train.csv')
data.shape

(175341, 44)

In [3]:
data.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack,is_intrusion
0,0.121478,113.0,0.0,2.0,6,4,258,172,74.08749,252,...,1,1,0,0,0,1,1,0,0,0
1,0.649902,113.0,0.0,2.0,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,0,0
2,1.623129,113.0,0.0,2.0,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,0,0
3,1.681642,113.0,3.0,2.0,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,0,0
4,0.449454,113.0,0.0,2.0,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,0,0


### Train - Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),
    data['is_intrusion'],
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((140272, 43), (35069, 43))

In [5]:
# Reset the indexes of the returned datasets
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

### Train ML algo with all features

In [6]:
rf = RandomForestClassifier(
    n_estimators=50, max_depth=2, random_state=2909, n_jobs=4)

rf.fit(X_train, y_train)

# print roc-auc in train and testing sets
print('train auc score: ',
      roc_auc_score(y_train, (rf.predict_proba(X_train.fillna(0)))[:, 1]))
print('test auc score: ',
      roc_auc_score(y_test, (rf.predict_proba(X_test.fillna(0)))[:, 1]))

train auc score:  0.9999376473247632
test auc score:  0.999934448981656


### Shuffling resources and assessing performance loss

In [7]:
# overall train roc-auc: using all the features
train_roc = roc_auc_score(y_train, (rf.predict_proba(X_train))[:, 1])

# list to capture the performance shift
performance_shift = []

# selection  logic
for feature in X_train.columns:

    X_train_c = X_train.copy()

    # shuffle individual feature
    X_train_c[feature] = X_train_c[feature].sample(
        frac=1, random_state=10).reset_index(drop=True)

    # make prediction with shuffled feature and calculate roc-auc
    shuff_roc = roc_auc_score(y_train, rf.predict_proba(X_train_c)[:, 1])
    
    drift = train_roc - shuff_roc

    # save the drop in roc-auc
    performance_shift.append(drift)

In [8]:
# list of performances
performance_shift

[-3.3543458943463555e-05,
 -6.555141197073233e-06,
 1.3485971380733375e-06,
 -3.9850168195854074e-05,
 -5.137067329297196e-07,
 -1.664377064536371e-05,
 -7.883386565765704e-07,
 1.0092606230971413e-05,
 3.9837536063069834e-05,
 0.0001837914497458959,
 2.55688403080212e-05,
 -3.0640173760532186e-05,
 3.884638149698194e-05,
 -2.6885389257103043e-06,
 0.0,
 2.3289909982793233e-06,
 -2.2404023190869538e-05,
 0.0,
 -1.3067707427927289e-05,
 4.760442628981565e-07,
 -6.2816789152542185e-06,
 9.127885571835748e-07,
 0.0,
 -2.7890813454778396e-05,
 -8.200827383619824e-06,
 -7.688290292628963e-06,
 6.104361199521513e-06,
 -3.395727825605821e-05,
 0.0,
 0.0,
 0.0,
 -3.6388495958550315e-05,
 3.4200329847022815e-06,
 0.0,
 1.039273634839688e-05,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -2.582569368358989e-07,
 0.08173297954359993]

In [9]:
# Transform the list into a pandas Series for easy manipulation
feature_importance = pd.Series(performance_shift)

# add variable names in the index
feature_importance.index = X_train.columns

feature_importance.head()

dur       -3.354346e-05
proto     -6.555141e-06
service    1.348597e-06
state     -3.985017e-05
spkts     -5.137067e-07
dtype: float64

In [10]:
# Sort the dataframe according to the drop in performance # caused by feature shuffling
feature_importance.sort_values(ascending=False)

attack               8.173298e-02
sttl                 1.837914e-04
rate                 3.983754e-05
dload                3.884638e-05
dttl                 2.556884e-05
ct_dst_sport_ltm     1.039274e-05
dbytes               1.009261e-05
smean                6.104361e-06
ct_dst_ltm           3.420033e-06
sinpkt               2.328991e-06
service              1.348597e-06
dtcpb                9.127886e-07
swin                 4.760443e-07
ct_srv_src           0.000000e+00
ct_srv_dst           0.000000e+00
ct_src_ltm           0.000000e+00
ct_flw_http_mthd     0.000000e+00
ct_ftp_cmd           0.000000e+00
is_ftp_login         0.000000e+00
ct_dst_src_ltm       0.000000e+00
dloss                0.000000e+00
response_body_len    0.000000e+00
sjit                 0.000000e+00
ct_src_dport_ltm     0.000000e+00
dwin                 0.000000e+00
trans_depth          0.000000e+00
is_sm_ips_ports     -2.582569e-07
spkts               -5.137067e-07
sbytes              -7.883387e-07
sloss         

In [11]:
# List the top 10 features that caused the major drop in the roc-auc (aka model performance)

feature_importance.sort_values(ascending=False).head(10)

attack              0.081733
sttl                0.000184
rate                0.000040
dload               0.000039
dttl                0.000026
ct_dst_sport_ltm    0.000010
dbytes              0.000010
smean               0.000006
ct_dst_ltm          0.000003
sinpkt              0.000002
dtype: float64

In [12]:
# original number of features (rows in this case)
feature_importance.shape[0]

43

In [13]:
# number of features that cause a drop in performance when shuffled

feature_importance[feature_importance>0].shape[0]

13

13 out of the 43 features caused a drop in the performance of the random forests when their values were permuted. This means that we could select those features and discard the rest, and should keep the original random forest performance. 

In [14]:
# print the important features

feature_importance[feature_importance>0].index

Index(['service', 'dbytes', 'rate', 'sttl', 'dttl', 'dload', 'sinpkt', 'swin',
       'dtcpb', 'smean', 'ct_dst_ltm', 'ct_dst_sport_ltm', 'attack'],
      dtype='object')

### Select features

In [15]:
# Building a random forests only with the selected features capture the selected features

selected_features = feature_importance[feature_importance > 0].index

# train a new random forests using only the selected features
rf = RandomForestClassifier(n_estimators=50,
                            max_depth=2,
                            random_state=2909,
                            n_jobs=4)

rf.fit(X_train[selected_features], y_train)

# print roc-auc in train and testing sets
print('train auc score: ', roc_auc_score(y_train, (rf.predict_proba(X_train[selected_features]))[:,1]))
print('test auc score: ', roc_auc_score(y_test, (rf.predict_proba(X_test[selected_features]))[:,1]))

train auc score:  1.0
test auc score:  1.0


The random forests with the selected features show a similar performance (or even slightly higher) to the random forests built using all of the features. And it provides a simpler, faster and more reliable model.

In [16]:
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [17]:
X_train.shape, X_test.shape

((140272, 13), (35069, 13))

## Standardize Data




In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers




In [19]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation




In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression





In [21]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25).fit(X_train, y_train)

CPU times: user 74.8 ms, sys: 183 ms, total: 258 ms
Wall time: 2.19 s


In [22]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.7375459807807465
F1 Score: 0.7695427913265562
FPR: 0.06651253779121466
TPR: 0.6450489023212862


### Naive Bayes





In [23]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-08).fit(X_train, y_train)

CPU times: user 38.9 ms, sys: 9.52 ms, total: 48.4 ms
Wall time: 46.5 ms


In [24]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.7420798996264507
F1 Score: 0.7748487790306923
FPR: 0.06989151698381647
TPR: 0.6533182218864123


### Random Forest




In [25]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 4.53 s, sys: 89.2 ms, total: 4.62 s
Wall time: 4.62 s


In [26]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


### KNN





In [27]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform').fit(X_train, y_train)

CPU times: user 2.62 s, sys: 15.1 ms, total: 2.63 s
Wall time: 2.62 s


In [28]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.6739855713022898
f1: 0.7441308775149386
fpr: 0.37657833896496534
tpr: 0.6978550140620409


### CatBoost




In [29]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6539785	total: 62.8ms	remaining: 3.08s
1:	learn: 0.6177951	total: 69.2ms	remaining: 1.66s
2:	learn: 0.5843297	total: 78.3ms	remaining: 1.23s
3:	learn: 0.5532772	total: 86.9ms	remaining: 999ms
4:	learn: 0.5243606	total: 96.2ms	remaining: 866ms
5:	learn: 0.4973725	total: 106ms	remaining: 774ms
6:	learn: 0.4721488	total: 114ms	remaining: 701ms
7:	learn: 0.4485173	total: 123ms	remaining: 646ms
8:	learn: 0.4263319	total: 129ms	remaining: 586ms
9:	learn: 0.4055332	total: 138ms	remaining: 553ms
10:	learn: 0.3859791	total: 150ms	remaining: 530ms
11:	learn: 0.3675455	total: 161ms	remaining: 508ms
12:	learn: 0.3501868	total: 170ms	remaining: 484ms
13:	learn: 0.3337885	total: 179ms	remaining: 461ms
14:	learn: 0.3182761	total: 188ms	remaining: 439ms
15:	learn: 0.3036257	total: 197ms	remaining: 418ms
16:	learn: 0.2897683	total: 206ms	remaining: 399ms
17:	learn: 0.2765959	total: 211ms	remaining: 376ms
18:	learn: 0.2641473	total: 221ms	remaining: 360ms
19:	learn: 0.2523041	total: 227ms	re

In [30]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


## Model Evaluation





In [31]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../UNSW_Test.csv")
test_df.shape

(175341, 44)

In [32]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion'])

In [33]:
X_eval = X_eval[selected_features]

In [34]:
X_eval.shape

(175341, 13)

### Model Evaluation - Logistic Regression





In [35]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25)
modelLR.fit(X_train, y_train)

LogisticRegression(C=25, n_jobs=-1, random_state=42)

In [36]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [37]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  1.0
Testing accuracy is  0.7375459807807465


In [38]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.7375459807807465
F1 Score: 0.7695427913265562
Precision Score: 0.9535836177474403
Recall Score: 0.6450489023212862
Confusion Matrix:
 [[10498   748]
 [ 8456 15367]]


### Cross validation - Logistic Regression





In [39]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.83956 (+/- 0.13330)
F1 Score: 0.89091 (+/- 0.07405)
Precision: 0.85152 (+/- 0.15401)
Recall: 0.94068 (+/- 0.05418)


### Model Evaluation - Naive Bayes





In [40]:
modelNB = GaussianNB(var_smoothing=1e-08)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-08)

In [41]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [42]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  1.0
Testing accuracy is  0.7420798996264507


In [43]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.7420798996264507
F1 Score: 0.7748487790306923
Precision Score: 0.9519266055045872
Recall Score: 0.6533182218864123
Confusion Matrix:
 [[10460   786]
 [ 8259 15564]]


### Cross validation - Naive Bayes






In [44]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.82265 (+/- 0.16633)
F1 Score: 0.88604 (+/- 0.09078)
Precision: 0.80758 (+/- 0.14967)
Recall: 0.98678 (+/- 0.01324)


### Model Evaluation - Random Forest




In [45]:
modelRF = RandomForestClassifier(random_state=0,max_depth=100,n_estimators=1000)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=100, n_estimators=1000, random_state=0)

In [46]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [47]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [48]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - Random Forest




In [49]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)


### Model Evaluation - KNN




In [50]:
modelKNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=1)

In [51]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [52]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9999429679479868
Testing accuracy is  0.6739855713022898


In [53]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.6739855713022898
F1 Score: 0.7441308775149386
Precision Score: 0.7969798657718121
Recall Score: 0.6978550140620409
Confusion Matrix:
 [[ 7011  4235]
 [ 7198 16625]]


### Cross validation - KNN




In [54]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.82827 (+/- 0.07217)
F1 Score: 0.87857 (+/- 0.04654)
Precision: 0.85223 (+/- 0.09368)
Recall: 0.91136 (+/- 0.10345)


### Model Evaluation - CatBoost





In [55]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6539785	total: 7ms	remaining: 343ms
1:	learn: 0.6177951	total: 12.4ms	remaining: 297ms
2:	learn: 0.5843297	total: 20.9ms	remaining: 328ms
3:	learn: 0.5532772	total: 29.7ms	remaining: 341ms
4:	learn: 0.5243606	total: 38.6ms	remaining: 348ms
5:	learn: 0.4973725	total: 47.5ms	remaining: 348ms
6:	learn: 0.4721488	total: 56.1ms	remaining: 344ms
7:	learn: 0.4485173	total: 64.7ms	remaining: 340ms
8:	learn: 0.4263319	total: 69.8ms	remaining: 318ms
9:	learn: 0.4055332	total: 78.4ms	remaining: 314ms
10:	learn: 0.3859791	total: 87.2ms	remaining: 309ms
11:	learn: 0.3675455	total: 95.9ms	remaining: 304ms
12:	learn: 0.3501868	total: 105ms	remaining: 298ms
13:	learn: 0.3337885	total: 113ms	remaining: 291ms
14:	learn: 0.3182761	total: 122ms	remaining: 284ms
15:	learn: 0.3036257	total: 130ms	remaining: 276ms
16:	learn: 0.2897683	total: 138ms	remaining: 269ms
17:	learn: 0.2765959	total: 145ms	remaining: 258ms
18:	learn: 0.2641473	total: 154ms	remaining: 251ms
19:	learn: 0.2523041	total: 160m

<catboost.core.CatBoostClassifier at 0x7fe2b06fd9a0>

In [56]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [57]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [58]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - CatBoost






In [59]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6539740	total: 9.02ms	remaining: 442ms
1:	learn: 0.6177903	total: 14.9ms	remaining: 359ms
2:	learn: 0.5843210	total: 23.9ms	remaining: 374ms
3:	learn: 0.5532591	total: 33ms	remaining: 380ms
4:	learn: 0.5243380	total: 41.8ms	remaining: 376ms
5:	learn: 0.4973472	total: 51.2ms	remaining: 376ms
6:	learn: 0.4721208	total: 61.2ms	remaining: 376ms
7:	learn: 0.4484884	total: 70.7ms	remaining: 371ms
8:	learn: 0.4263043	total: 76.8ms	remaining: 350ms
9:	learn: 0.4055027	total: 86.5ms	remaining: 346ms
10:	learn: 0.3859450	total: 96.1ms	remaining: 341ms
11:	learn: 0.3675084	total: 105ms	remaining: 334ms
12:	learn: 0.3501459	total: 115ms	remaining: 326ms
13:	learn: 0.3337457	total: 125ms	remaining: 322ms
14:	learn: 0.3182333	total: 134ms	remaining: 313ms
15:	learn: 0.3035810	total: 143ms	remaining: 305ms
16:	learn: 0.2897206	total: 152ms	remaining: 296ms
17:	learn: 0.2765501	total: 158ms	remaining: 282ms
18:	learn: 0.2640992	total: 168ms	remaining: 273ms
19:	learn: 0.2522574	total: 174m

22:	learn: 0.2202119	total: 206ms	remaining: 241ms
23:	learn: 0.2105795	total: 217ms	remaining: 235ms
24:	learn: 0.2014155	total: 227ms	remaining: 227ms
25:	learn: 0.1926812	total: 235ms	remaining: 217ms
26:	learn: 0.1843803	total: 245ms	remaining: 208ms
27:	learn: 0.1764729	total: 254ms	remaining: 200ms
28:	learn: 0.1689076	total: 260ms	remaining: 188ms
29:	learn: 0.1617151	total: 270ms	remaining: 180ms
30:	learn: 0.1548487	total: 279ms	remaining: 171ms
31:	learn: 0.1483051	total: 288ms	remaining: 162ms
32:	learn: 0.1420677	total: 299ms	remaining: 154ms
33:	learn: 0.1360900	total: 306ms	remaining: 144ms
34:	learn: 0.1303812	total: 313ms	remaining: 134ms
35:	learn: 0.1249631	total: 323ms	remaining: 126ms
36:	learn: 0.1197645	total: 332ms	remaining: 117ms
37:	learn: 0.1148014	total: 342ms	remaining: 108ms
38:	learn: 0.1100389	total: 349ms	remaining: 98.5ms
39:	learn: 0.1055054	total: 359ms	remaining: 89.8ms
40:	learn: 0.1011461	total: 365ms	remaining: 80.2ms
41:	learn: 0.0969926	total: 

44:	learn: 0.0855587	total: 401ms	remaining: 44.6ms
45:	learn: 0.0820649	total: 410ms	remaining: 35.6ms
46:	learn: 0.0787356	total: 422ms	remaining: 26.9ms
47:	learn: 0.0755501	total: 431ms	remaining: 18ms
48:	learn: 0.0724764	total: 438ms	remaining: 8.94ms
49:	learn: 0.0695324	total: 444ms	remaining: 0us
0:	learn: 0.6539751	total: 7.56ms	remaining: 370ms
1:	learn: 0.6177913	total: 13.3ms	remaining: 319ms
2:	learn: 0.5843192	total: 22.5ms	remaining: 353ms
3:	learn: 0.5532572	total: 31.7ms	remaining: 364ms
4:	learn: 0.5243357	total: 41.9ms	remaining: 377ms
5:	learn: 0.4973450	total: 51.2ms	remaining: 376ms
6:	learn: 0.4721192	total: 60.3ms	remaining: 370ms
7:	learn: 0.4484871	total: 70ms	remaining: 367ms
8:	learn: 0.4263031	total: 75.6ms	remaining: 344ms
9:	learn: 0.4055007	total: 85.1ms	remaining: 341ms
10:	learn: 0.3859428	total: 95.4ms	remaining: 338ms
11:	learn: 0.3675073	total: 105ms	remaining: 333ms
12:	learn: 0.3501447	total: 115ms	remaining: 327ms
13:	learn: 0.3337448	total: 125

7:	learn: 0.4484884	total: 68.3ms	remaining: 359ms
8:	learn: 0.4263043	total: 74.2ms	remaining: 338ms
9:	learn: 0.4055027	total: 84.1ms	remaining: 336ms
10:	learn: 0.3859450	total: 95.4ms	remaining: 338ms
11:	learn: 0.3675084	total: 105ms	remaining: 334ms
12:	learn: 0.3501459	total: 115ms	remaining: 328ms
13:	learn: 0.3337457	total: 126ms	remaining: 323ms
14:	learn: 0.3182333	total: 135ms	remaining: 315ms
15:	learn: 0.3035810	total: 145ms	remaining: 308ms
16:	learn: 0.2897206	total: 154ms	remaining: 300ms
17:	learn: 0.2765501	total: 161ms	remaining: 286ms
18:	learn: 0.2640992	total: 170ms	remaining: 278ms
19:	learn: 0.2522574	total: 178ms	remaining: 268ms
20:	learn: 0.2410437	total: 188ms	remaining: 259ms
21:	learn: 0.2303589	total: 194ms	remaining: 246ms
22:	learn: 0.2202112	total: 201ms	remaining: 236ms
23:	learn: 0.2105790	total: 211ms	remaining: 228ms
24:	learn: 0.2014154	total: 221ms	remaining: 221ms
25:	learn: 0.1926811	total: 230ms	remaining: 212ms
26:	learn: 0.1843795	total: 24

22:	learn: 0.2202119	total: 201ms	remaining: 236ms
23:	learn: 0.2105795	total: 211ms	remaining: 228ms
24:	learn: 0.2014155	total: 221ms	remaining: 221ms
25:	learn: 0.1926812	total: 230ms	remaining: 212ms
26:	learn: 0.1843803	total: 240ms	remaining: 204ms
27:	learn: 0.1764729	total: 249ms	remaining: 196ms
28:	learn: 0.1689076	total: 255ms	remaining: 185ms
29:	learn: 0.1617151	total: 264ms	remaining: 176ms
30:	learn: 0.1548487	total: 273ms	remaining: 167ms
31:	learn: 0.1483051	total: 283ms	remaining: 159ms
32:	learn: 0.1420677	total: 294ms	remaining: 151ms
33:	learn: 0.1360900	total: 301ms	remaining: 142ms
34:	learn: 0.1303812	total: 308ms	remaining: 132ms
35:	learn: 0.1249631	total: 318ms	remaining: 124ms
36:	learn: 0.1197645	total: 328ms	remaining: 115ms
37:	learn: 0.1148014	total: 338ms	remaining: 107ms
38:	learn: 0.1100389	total: 345ms	remaining: 97.2ms
39:	learn: 0.1055054	total: 355ms	remaining: 88.7ms
40:	learn: 0.1011461	total: 361ms	remaining: 79.3ms
41:	learn: 0.0969926	total: 

44:	learn: 0.0855587	total: 405ms	remaining: 45ms
45:	learn: 0.0820649	total: 413ms	remaining: 35.9ms
46:	learn: 0.0787356	total: 424ms	remaining: 27ms
47:	learn: 0.0755501	total: 435ms	remaining: 18.1ms
48:	learn: 0.0724764	total: 442ms	remaining: 9.01ms
49:	learn: 0.0695324	total: 448ms	remaining: 0us
0:	learn: 0.6539751	total: 7.55ms	remaining: 370ms
1:	learn: 0.6177913	total: 14.1ms	remaining: 339ms
2:	learn: 0.5843192	total: 23.1ms	remaining: 362ms
3:	learn: 0.5532572	total: 33ms	remaining: 380ms
4:	learn: 0.5243357	total: 42.1ms	remaining: 378ms
5:	learn: 0.4973450	total: 51.5ms	remaining: 378ms
6:	learn: 0.4721192	total: 61.9ms	remaining: 380ms
7:	learn: 0.4484871	total: 71.3ms	remaining: 374ms
8:	learn: 0.4263031	total: 77.7ms	remaining: 354ms
9:	learn: 0.4055007	total: 89.3ms	remaining: 357ms
10:	learn: 0.3859428	total: 99ms	remaining: 351ms
11:	learn: 0.3675073	total: 109ms	remaining: 345ms
12:	learn: 0.3501447	total: 120ms	remaining: 341ms
13:	learn: 0.3337448	total: 130ms	r

6:	learn: 0.4721208	total: 63.3ms	remaining: 389ms
7:	learn: 0.4484884	total: 73.2ms	remaining: 384ms
8:	learn: 0.4263043	total: 79.3ms	remaining: 361ms
9:	learn: 0.4055027	total: 89.4ms	remaining: 357ms
10:	learn: 0.3859450	total: 99.1ms	remaining: 351ms
11:	learn: 0.3675084	total: 109ms	remaining: 344ms
12:	learn: 0.3501459	total: 118ms	remaining: 337ms
13:	learn: 0.3337457	total: 128ms	remaining: 329ms
14:	learn: 0.3182333	total: 138ms	remaining: 322ms
15:	learn: 0.3035810	total: 148ms	remaining: 314ms
16:	learn: 0.2897206	total: 158ms	remaining: 307ms
17:	learn: 0.2765501	total: 165ms	remaining: 293ms
18:	learn: 0.2640992	total: 175ms	remaining: 286ms
19:	learn: 0.2522574	total: 182ms	remaining: 273ms
20:	learn: 0.2410437	total: 192ms	remaining: 265ms
21:	learn: 0.2303589	total: 198ms	remaining: 252ms
22:	learn: 0.2202112	total: 207ms	remaining: 243ms
23:	learn: 0.2105790	total: 217ms	remaining: 235ms
24:	learn: 0.2014154	total: 227ms	remaining: 227ms
25:	learn: 0.1926811	total: 23

22:	learn: 0.2202119	total: 205ms	remaining: 241ms
23:	learn: 0.2105795	total: 215ms	remaining: 233ms
24:	learn: 0.2014155	total: 225ms	remaining: 225ms
25:	learn: 0.1926812	total: 234ms	remaining: 216ms
26:	learn: 0.1843803	total: 244ms	remaining: 208ms
27:	learn: 0.1764729	total: 253ms	remaining: 199ms
28:	learn: 0.1689076	total: 259ms	remaining: 188ms
29:	learn: 0.1617151	total: 269ms	remaining: 179ms
30:	learn: 0.1548487	total: 278ms	remaining: 170ms
31:	learn: 0.1483051	total: 289ms	remaining: 163ms
32:	learn: 0.1420677	total: 300ms	remaining: 155ms
33:	learn: 0.1360900	total: 308ms	remaining: 145ms
34:	learn: 0.1303812	total: 315ms	remaining: 135ms
35:	learn: 0.1249631	total: 325ms	remaining: 126ms
36:	learn: 0.1197645	total: 334ms	remaining: 117ms
37:	learn: 0.1148014	total: 344ms	remaining: 109ms
38:	learn: 0.1100389	total: 352ms	remaining: 99.2ms
39:	learn: 0.1055054	total: 361ms	remaining: 90.2ms
40:	learn: 0.1011461	total: 367ms	remaining: 80.6ms
41:	learn: 0.0969926	total: 

44:	learn: 0.0855587	total: 405ms	remaining: 45ms
45:	learn: 0.0820649	total: 414ms	remaining: 36ms
46:	learn: 0.0787356	total: 426ms	remaining: 27.2ms
47:	learn: 0.0755501	total: 436ms	remaining: 18.2ms
48:	learn: 0.0724764	total: 443ms	remaining: 9.04ms
49:	learn: 0.0695324	total: 450ms	remaining: 0us
0:	learn: 0.6539751	total: 7.4ms	remaining: 363ms
1:	learn: 0.6177913	total: 13.3ms	remaining: 319ms
2:	learn: 0.5843192	total: 22.4ms	remaining: 351ms
3:	learn: 0.5532572	total: 31.5ms	remaining: 362ms
4:	learn: 0.5243357	total: 40.5ms	remaining: 365ms
5:	learn: 0.4973450	total: 50.2ms	remaining: 368ms
6:	learn: 0.4721192	total: 59.2ms	remaining: 364ms
7:	learn: 0.4484871	total: 67.9ms	remaining: 356ms
8:	learn: 0.4263031	total: 74.3ms	remaining: 338ms
9:	learn: 0.4055007	total: 84.9ms	remaining: 340ms
10:	learn: 0.3859428	total: 94.9ms	remaining: 337ms
11:	learn: 0.3675073	total: 105ms	remaining: 332ms
12:	learn: 0.3501447	total: 116ms	remaining: 331ms
13:	learn: 0.3337448	total: 126m

7:	learn: 0.4484884	total: 70.7ms	remaining: 371ms
8:	learn: 0.4263043	total: 77.1ms	remaining: 351ms
9:	learn: 0.4055027	total: 87.3ms	remaining: 349ms
10:	learn: 0.3859450	total: 97.3ms	remaining: 345ms
11:	learn: 0.3675084	total: 108ms	remaining: 342ms
12:	learn: 0.3501459	total: 118ms	remaining: 336ms
13:	learn: 0.3337457	total: 128ms	remaining: 330ms
14:	learn: 0.3182333	total: 138ms	remaining: 323ms
15:	learn: 0.3035810	total: 149ms	remaining: 317ms
16:	learn: 0.2897206	total: 160ms	remaining: 310ms
17:	learn: 0.2765501	total: 167ms	remaining: 297ms
18:	learn: 0.2640992	total: 177ms	remaining: 289ms
19:	learn: 0.2522574	total: 185ms	remaining: 277ms
20:	learn: 0.2410437	total: 195ms	remaining: 269ms
21:	learn: 0.2303589	total: 201ms	remaining: 255ms
22:	learn: 0.2202112	total: 209ms	remaining: 245ms
23:	learn: 0.2105790	total: 219ms	remaining: 237ms
24:	learn: 0.2014154	total: 229ms	remaining: 229ms
25:	learn: 0.1926811	total: 237ms	remaining: 219ms
26:	learn: 0.1843795	total: 24

23:	learn: 0.2105795	total: 210ms	remaining: 228ms
24:	learn: 0.2014155	total: 221ms	remaining: 221ms
25:	learn: 0.1926812	total: 230ms	remaining: 212ms
26:	learn: 0.1843803	total: 240ms	remaining: 204ms
27:	learn: 0.1764729	total: 250ms	remaining: 196ms
28:	learn: 0.1689076	total: 256ms	remaining: 185ms
29:	learn: 0.1617151	total: 266ms	remaining: 177ms
30:	learn: 0.1548487	total: 275ms	remaining: 169ms
31:	learn: 0.1483051	total: 286ms	remaining: 161ms
32:	learn: 0.1420677	total: 295ms	remaining: 152ms
33:	learn: 0.1360900	total: 303ms	remaining: 143ms
34:	learn: 0.1303812	total: 311ms	remaining: 133ms
35:	learn: 0.1249631	total: 321ms	remaining: 125ms
36:	learn: 0.1197645	total: 331ms	remaining: 116ms
37:	learn: 0.1148014	total: 341ms	remaining: 108ms
38:	learn: 0.1100389	total: 349ms	remaining: 98.4ms
39:	learn: 0.1055054	total: 359ms	remaining: 89.8ms
40:	learn: 0.1011461	total: 366ms	remaining: 80.3ms
41:	learn: 0.0969926	total: 375ms	remaining: 71.5ms
42:	learn: 0.0930077	total:

44:	learn: 0.0855587	total: 406ms	remaining: 45.1ms
45:	learn: 0.0820649	total: 414ms	remaining: 36ms
46:	learn: 0.0787356	total: 425ms	remaining: 27.1ms
47:	learn: 0.0755501	total: 435ms	remaining: 18.1ms
48:	learn: 0.0724764	total: 442ms	remaining: 9.01ms
49:	learn: 0.0695324	total: 448ms	remaining: 0us
0:	learn: 0.6539751	total: 7.59ms	remaining: 372ms
1:	learn: 0.6177913	total: 13.4ms	remaining: 322ms
2:	learn: 0.5843192	total: 22.4ms	remaining: 351ms
3:	learn: 0.5532572	total: 33ms	remaining: 380ms
4:	learn: 0.5243357	total: 42.2ms	remaining: 380ms
5:	learn: 0.4973450	total: 51.9ms	remaining: 380ms
6:	learn: 0.4721192	total: 61.2ms	remaining: 376ms
7:	learn: 0.4484871	total: 70.7ms	remaining: 371ms
8:	learn: 0.4263031	total: 76.8ms	remaining: 350ms
9:	learn: 0.4055007	total: 86.4ms	remaining: 346ms
10:	learn: 0.3859428	total: 96.4ms	remaining: 342ms
11:	learn: 0.3675073	total: 107ms	remaining: 338ms
12:	learn: 0.3501447	total: 118ms	remaining: 336ms
13:	learn: 0.3337448	total: 129

In [60]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)
