## Quasi-constant features

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

## Read Data

In [2]:
data = pd.read_csv('../DoHBrwTest.csv')
data.shape

(53860, 35)

In [3]:
data.head(5)

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,is_intrusion
0,7.0,2.0,52742,443,18355.0,0.046455,55,1183.941449,66,1420.729738,...,1.0,0.0,0.0,0.046455,0.046455,0.046455,-10.0,-10.0,0.0,0
1,7.0,2.0,54640,443,18365.0,96.750105,42044,434.562836,44920,464.288902,...,0.561877,0.0017,0.041234,0.033549,0.026931,0.026952,0.481463,0.159978,1.229096,0
2,7.0,2.0,56611,443,18373.0,96.365606,41539,431.056284,44577,462.582054,...,0.549156,0.000956,0.030926,0.026551,0.026848,0.026879,-0.028834,-0.010614,1.164778,0
3,7.0,2.0,56611,443,18374.0,121.35682,60659,499.840058,67897,559.48236,...,0.55657,0.001013,0.031829,0.027571,0.026862,0.026941,0.066819,0.019791,1.154439,0
4,7.0,2.0,56611,443,18375.0,104.669253,30409,290.524668,30718,293.476825,...,0.331633,0.001226,0.035013,0.029797,0.026867,0.026908,0.251063,0.082517,1.175049,0


### Train - Test Split

In [4]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),       # drop the target
    data['is_intrusion'],                             # just the target
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((37702, 34), (16158, 34))

## Remove constant features (optional)

First, I will remove constant features like I did in the previous lecture. This will allow a better visualisation of the quasi-constant ones.

In [10]:
constant_features = [
    feat for feat in X_train.columns if X_train[feat].std() == 0.5]

X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)
X_train.shape, X_test.shape

((37702, 34), (16158, 34))

## Remove quasi-constant features

### Using the VarianceThreshold from sklearn

In [17]:
sel = VarianceThreshold(threshold=0.5)
sel.fit(X_train)                         # fit finds the features with low variance

VarianceThreshold(threshold=0.5)

In [18]:
# If we sum over get_support, we get the number of features that are not quasi-constant
sum(sel.get_support())

31

In [19]:
# let's print the number of quasi-constant features
quasi_constant = X_train.columns[~sel.get_support()]
len(quasi_constant)

3

We can see that 11 columns / variables are constant. This means that 11 variable show the same value, just one value, for all the observations of the training set.

In [20]:
# let's print the variable names
quasi_constant

Index(['PacketLengthCoefficientofVariation', 'PacketTimeSkewFromMode',
       'PacketTimeCoefficientofVariation'],
      dtype='object')

In [21]:
# percentage of observations showing each of the different values of the variable
X_train['PacketLengthCoefficientofVariation'].value_counts() / np.float64(len(X_train))

1.683583    0.039361
1.667320    0.036576
0.050847    0.020715
1.683891    0.019840
1.683275    0.019256
              ...   
1.835228    0.000027
0.619285    0.000027
1.717092    0.000027
0.996976    0.000027
0.657215    0.000027
Name: PacketLengthCoefficientofVariation, Length: 15717, dtype: float64

We can see that > 99% of the observations show one value, 0. Therefore, this features is fairly constant.

In [22]:
# capture feature names
feat_names = X_train.columns[sel.get_support()]

In [23]:
#remove the quasi-constant features
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)
X_train.shape, X_test.shape

((37702, 31), (16158, 31))

In [24]:
# transform the array into a dataframe
X_train = pd.DataFrame(X_train, columns=feat_names)
X_test = pd.DataFrame(X_test, columns=feat_names)
X_test.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeMode,PacketTimeSkewFromMedian,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation
0,17.0,8.0,443.0,55192.0,68833.0,120.013525,256859.0,2140.250443,134304.0,1119.073871,...,0.0,-0.439983,0.06649272,0.257862,0.088398,4.8e-05,3e-05,1.027881,0.342697,2.917041
1,6.0,0.0,44418.0,443.0,141485.0,107.193834,25287.0,235.899763,45678.0,426.125256,...,0.0,0.105784,0.3676699,0.606358,0.159472,0.004897,1.8e-05,0.764768,0.26297,3.802289
2,1.0,6.0,443.0,48522.0,38766.0,75.519956,408.0,5.402546,340.0,4.502121,...,0.0,-0.5094,5.14424e-09,7.2e-05,15.077243,15.077233,15.077156,0.435005,1.218571,5e-06
3,6.0,19.0,52998.0,443.0,92904.0,27.35295,961.0,35.133322,3567.0,130.40641,...,0.0,1.930553,11.94903,3.456736,1.607994,0.015824,0.000127,1.381798,0.46514,2.149719
4,8.0,19.0,49296.0,443.0,117092.0,4.091509,1579.0,385.921185,4551.0,1112.303554,...,0.0,0.91202,5.885625e-05,0.007672,0.008971,0.01525,1.9e-05,-2.455417,1.166855,0.85519


In the dataset, 3 features are classified as Quasi constant, thus, 31 features remain

## Standardize Data

In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers

In [15]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [17]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1).fit(X_train, y_train)

CPU times: user 78 ms, sys: 179 ms, total: 257 ms
Wall time: 3.89 s


In [18]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.7401468545346299
F1 Score: 0.7647756260554018
FPR: 0.40677338076182007
TPR: 0.9095803777454918


### Naive Bayes

In [19]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-09).fit(X_train, y_train)

CPU times: user 595 ms, sys: 32.8 ms, total: 627 ms
Wall time: 78.7 ms


In [20]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
FPR: 1.0
TPR: 1.0


### Random Forest

In [21]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 5.8 s, sys: 81 ms, total: 5.88 s
Wall time: 5.56 s


In [30]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.5355030760071443
F1 Score: 0.37356623711533093
FPR: 0.00014821402104639098
TPR: 0.0


### KNN

In [23]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance').fit(X_train, y_train)

CPU times: user 9.5 ms, sys: 1.78 ms, total: 11.3 ms
Wall time: 9.06 ms


In [24]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.22667195872196866
f1: 0.316446814482178
fpr: 0.9109974803616422
tpr: 0.3854371421246047


### CatBoost

In [25]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.5791418	total: 76.8ms	remaining: 3.76s
1:	learn: 0.4792561	total: 96ms	remaining: 2.31s
2:	learn: 0.4042028	total: 116ms	remaining: 1.82s
3:	learn: 0.3510174	total: 134ms	remaining: 1.54s
4:	learn: 0.3031374	total: 153ms	remaining: 1.38s
5:	learn: 0.2580909	total: 173ms	remaining: 1.27s
6:	learn: 0.2251009	total: 193ms	remaining: 1.18s
7:	learn: 0.1982565	total: 211ms	remaining: 1.11s
8:	learn: 0.1759089	total: 230ms	remaining: 1.04s
9:	learn: 0.1514185	total: 249ms	remaining: 995ms
10:	learn: 0.1383632	total: 267ms	remaining: 945ms
11:	learn: 0.1251152	total: 286ms	remaining: 905ms
12:	learn: 0.1115081	total: 304ms	remaining: 866ms
13:	learn: 0.1005587	total: 323ms	remaining: 830ms
14:	learn: 0.0897549	total: 341ms	remaining: 796ms
15:	learn: 0.0824570	total: 360ms	remaining: 765ms
16:	learn: 0.0755109	total: 379ms	remaining: 735ms
17:	learn: 0.0688225	total: 397ms	remaining: 705ms
18:	learn: 0.0640477	total: 415ms	remaining: 678ms
19:	learn: 0.0596990	total: 435ms	remaini

In [26]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.5427267314943441
F1 Score: 0.03127890355671403
FPR: 0.00044464206313917296
TPR: 0.015896077258353986


## Model Evaluation

In [65]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../KDDTest.csv")
test_df.shape

(22543, 42)

In [66]:
test_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,is_intrusion
0,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,1
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,0
2,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,1
3,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,1
4,0,tcp,http,SF,267,14515,0,0,0,0,...,255,1.0,0.0,0.01,0.03,0.01,0.0,0.0,0.0,0


In [35]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion','land', 'urgent', 'num_failed_logins', 'root_shell', 'su_attempted',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'dst_host_srv_diff_host_rate'])

### Model Evaluation - Logistic Regression

In [36]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=0.1, n_jobs=-1, random_state=42)

In [37]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [39]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9536402155253679
Testing accuracy is  0.7401468545346299


In [40]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.7401468545346299
F1 Score: 0.7647756260554018
Precision Score: 0.6597446069923134
Recall Score: 0.9095803777454918
Confusion Matrix:
 [[ 8005  5489]
 [ 1058 10643]]


### Cross validation - Logistic Regression

In [41]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.82309 (+/- 0.01630)
F1 Score: 0.83971 (+/- 0.01394)
Precision: 0.86752 (+/- 0.03254)
Recall: 0.81406 (+/- 0.03026)


### Model Evaluation - Naive Bayes


In [42]:
modelNB = GaussianNB(var_smoothing=1e-09)
modelNB.fit(X_train, y_train)

GaussianNB()

In [43]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [44]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.8952638002718875
Testing accuracy is  0.46441754316332606


In [45]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.46441754316332606
F1 Score: 0.6342692974848223
Precision Score: 0.46441754316332606
Recall Score: 1.0
Confusion Matrix:
 [[    0 13494]
 [    0 11701]]


### Cross validation - Naive Bayes

In [46]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.79373 (+/- 0.04138)
F1 Score: 0.80066 (+/- 0.05102)
Precision: 0.88661 (+/- 0.02556)
Recall: 0.73208 (+/- 0.10511)


### Model Evaluation - Random Forest


In [47]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [48]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [49]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  0.9999603084037032
Testing accuracy is  0.5355030760071443


In [50]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.5355030760071443
F1 Score: 0.37356623711533093
Precision Score: 0.28682882180131014
Recall Score: 0.5355030760071443
Confusion Matrix:
 [[13492     2]
 [11701     0]]


### Cross validation - Random Forest

In [51]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98700 (+/- 0.00387)
F1 Score: 0.98858 (+/- 0.00340)
Precision: 0.98885 (+/- 0.00410)
Recall: 0.98831 (+/- 0.00418)


### Model Evaluation - KNN

In [52]:
modelKNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=1, n_neighbors=2,
                     weights='distance')

In [53]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [54]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9999603084037032
Testing accuracy is  0.22667195872196866


In [55]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.22667195872196866
F1 Score: 0.316446814482178
Precision Score: 0.2684044515860263
Recall Score: 0.3854371421246047
Confusion Matrix:
 [[ 1201 12293]
 [ 7191  4510]]


### Cross validation - KNN


In [56]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.97738 (+/- 0.00678)
F1 Score: 0.98013 (+/- 0.00590)
Precision: 0.98045 (+/- 0.00987)
Recall: 0.97982 (+/- 0.00514)


### Model Evaluation - CatBoost

In [57]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.5791418	total: 18.2ms	remaining: 894ms
1:	learn: 0.4792561	total: 35.7ms	remaining: 856ms
2:	learn: 0.4042028	total: 53.4ms	remaining: 836ms
3:	learn: 0.3510174	total: 71.3ms	remaining: 820ms
4:	learn: 0.3031374	total: 89.1ms	remaining: 802ms
5:	learn: 0.2580909	total: 107ms	remaining: 786ms
6:	learn: 0.2251009	total: 125ms	remaining: 770ms
7:	learn: 0.1982565	total: 142ms	remaining: 746ms
8:	learn: 0.1759089	total: 161ms	remaining: 731ms
9:	learn: 0.1514185	total: 180ms	remaining: 721ms
10:	learn: 0.1383632	total: 198ms	remaining: 703ms
11:	learn: 0.1251152	total: 216ms	remaining: 685ms
12:	learn: 0.1115081	total: 234ms	remaining: 667ms
13:	learn: 0.1005587	total: 253ms	remaining: 652ms
14:	learn: 0.0897549	total: 271ms	remaining: 632ms
15:	learn: 0.0824570	total: 288ms	remaining: 613ms
16:	learn: 0.0755109	total: 306ms	remaining: 594ms
17:	learn: 0.0688225	total: 323ms	remaining: 575ms
18:	learn: 0.0640477	total: 341ms	remaining: 557ms
19:	learn: 0.0596990	total: 359ms	re

<catboost.core.CatBoostClassifier at 0x7fc8801b0580>

In [58]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [59]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.995951457177729
Testing accuracy is  0.5427267314943441


In [60]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.5427267314943441
F1 Score: 0.3898251274282043
Precision Score: 0.7388272713188289
Recall Score: 0.5427267314943441
Confusion Matrix:
 [[13488     6]
 [11515   186]]


### Cross validation - CatBoost

In [61]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6137734	total: 11.9ms	remaining: 583ms
1:	learn: 0.5442277	total: 23.4ms	remaining: 561ms
2:	learn: 0.4925864	total: 35.1ms	remaining: 549ms
3:	learn: 0.4372124	total: 46.9ms	remaining: 539ms
4:	learn: 0.4028360	total: 58.5ms	remaining: 527ms
5:	learn: 0.3641417	total: 69.9ms	remaining: 513ms
6:	learn: 0.3292007	total: 81.3ms	remaining: 500ms
7:	learn: 0.2980747	total: 93ms	remaining: 488ms
8:	learn: 0.2704006	total: 105ms	remaining: 476ms
9:	learn: 0.2506436	total: 116ms	remaining: 465ms
10:	learn: 0.2367563	total: 128ms	remaining: 452ms
11:	learn: 0.2222244	total: 139ms	remaining: 442ms
12:	learn: 0.2116249	total: 151ms	remaining: 429ms
13:	learn: 0.1951046	total: 163ms	remaining: 418ms
14:	learn: 0.1832170	total: 174ms	remaining: 407ms
15:	learn: 0.1735467	total: 186ms	remaining: 395ms
16:	learn: 0.1650579	total: 198ms	remaining: 383ms
17:	learn: 0.1571248	total: 209ms	remaining: 372ms
18:	learn: 0.1505409	total: 220ms	remaining: 360ms
19:	learn: 0.1414548	total: 232ms	r

12:	learn: 0.2095282	total: 149ms	remaining: 424ms
13:	learn: 0.1923347	total: 160ms	remaining: 411ms
14:	learn: 0.1788250	total: 171ms	remaining: 400ms
15:	learn: 0.1675458	total: 183ms	remaining: 389ms
16:	learn: 0.1580589	total: 194ms	remaining: 377ms
17:	learn: 0.1504103	total: 206ms	remaining: 365ms
18:	learn: 0.1436914	total: 217ms	remaining: 354ms
19:	learn: 0.1360091	total: 229ms	remaining: 343ms
20:	learn: 0.1298868	total: 241ms	remaining: 332ms
21:	learn: 0.1238204	total: 252ms	remaining: 321ms
22:	learn: 0.1187157	total: 264ms	remaining: 310ms
23:	learn: 0.1142004	total: 275ms	remaining: 298ms
24:	learn: 0.1107601	total: 287ms	remaining: 287ms
25:	learn: 0.1057941	total: 299ms	remaining: 276ms
26:	learn: 0.1029173	total: 311ms	remaining: 265ms
27:	learn: 0.1000242	total: 322ms	remaining: 253ms
28:	learn: 0.0966674	total: 333ms	remaining: 241ms
29:	learn: 0.0940485	total: 345ms	remaining: 230ms
30:	learn: 0.0921785	total: 356ms	remaining: 218ms
31:	learn: 0.0894914	total: 367

31:	learn: 0.0943919	total: 370ms	remaining: 208ms
32:	learn: 0.0920509	total: 382ms	remaining: 197ms
33:	learn: 0.0904182	total: 395ms	remaining: 186ms
34:	learn: 0.0884972	total: 408ms	remaining: 175ms
35:	learn: 0.0859744	total: 421ms	remaining: 164ms
36:	learn: 0.0846618	total: 433ms	remaining: 152ms
37:	learn: 0.0835069	total: 445ms	remaining: 141ms
38:	learn: 0.0819522	total: 458ms	remaining: 129ms
39:	learn: 0.0794394	total: 470ms	remaining: 117ms
40:	learn: 0.0774466	total: 482ms	remaining: 106ms
41:	learn: 0.0761150	total: 494ms	remaining: 94ms
42:	learn: 0.0749276	total: 505ms	remaining: 82.3ms
43:	learn: 0.0739101	total: 517ms	remaining: 70.5ms
44:	learn: 0.0731554	total: 528ms	remaining: 58.7ms
45:	learn: 0.0722342	total: 539ms	remaining: 46.9ms
46:	learn: 0.0712628	total: 551ms	remaining: 35.1ms
47:	learn: 0.0706313	total: 562ms	remaining: 23.4ms
48:	learn: 0.0695001	total: 574ms	remaining: 11.7ms
49:	learn: 0.0686480	total: 586ms	remaining: 0us
0:	learn: 0.6139404	total: 

49:	learn: 0.0694408	total: 572ms	remaining: 0us
0:	learn: 0.6137734	total: 11.7ms	remaining: 572ms
1:	learn: 0.5442277	total: 23.2ms	remaining: 558ms
2:	learn: 0.4925864	total: 34.9ms	remaining: 547ms
3:	learn: 0.4372124	total: 46.5ms	remaining: 535ms
4:	learn: 0.4028360	total: 58.1ms	remaining: 523ms
5:	learn: 0.3641417	total: 70ms	remaining: 513ms
6:	learn: 0.3292007	total: 81.4ms	remaining: 500ms
7:	learn: 0.2980747	total: 93.2ms	remaining: 489ms
8:	learn: 0.2704006	total: 104ms	remaining: 475ms
9:	learn: 0.2506436	total: 116ms	remaining: 464ms
10:	learn: 0.2367563	total: 128ms	remaining: 453ms
11:	learn: 0.2222244	total: 139ms	remaining: 441ms
12:	learn: 0.2116249	total: 151ms	remaining: 431ms
13:	learn: 0.1951046	total: 163ms	remaining: 420ms
14:	learn: 0.1832170	total: 175ms	remaining: 409ms
15:	learn: 0.1735467	total: 187ms	remaining: 397ms
16:	learn: 0.1650579	total: 198ms	remaining: 385ms
17:	learn: 0.1571248	total: 210ms	remaining: 373ms
18:	learn: 0.1505409	total: 221ms	rem

22:	learn: 0.1187157	total: 265ms	remaining: 311ms
23:	learn: 0.1142004	total: 276ms	remaining: 299ms
24:	learn: 0.1107601	total: 288ms	remaining: 288ms
25:	learn: 0.1057941	total: 300ms	remaining: 277ms
26:	learn: 0.1029173	total: 311ms	remaining: 265ms
27:	learn: 0.1000242	total: 323ms	remaining: 254ms
28:	learn: 0.0966674	total: 335ms	remaining: 242ms
29:	learn: 0.0940485	total: 347ms	remaining: 231ms
30:	learn: 0.0921785	total: 358ms	remaining: 219ms
31:	learn: 0.0894914	total: 370ms	remaining: 208ms
32:	learn: 0.0871595	total: 382ms	remaining: 197ms
33:	learn: 0.0850795	total: 393ms	remaining: 185ms
34:	learn: 0.0833512	total: 404ms	remaining: 173ms
35:	learn: 0.0817246	total: 415ms	remaining: 161ms
36:	learn: 0.0797038	total: 427ms	remaining: 150ms
37:	learn: 0.0786833	total: 438ms	remaining: 138ms
38:	learn: 0.0777926	total: 450ms	remaining: 127ms
39:	learn: 0.0769700	total: 461ms	remaining: 115ms
40:	learn: 0.0757433	total: 472ms	remaining: 104ms
41:	learn: 0.0749191	total: 484

41:	learn: 0.0761150	total: 488ms	remaining: 93ms
42:	learn: 0.0749276	total: 499ms	remaining: 81.3ms
43:	learn: 0.0739101	total: 511ms	remaining: 69.6ms
44:	learn: 0.0731554	total: 522ms	remaining: 58ms
45:	learn: 0.0722342	total: 534ms	remaining: 46.4ms
46:	learn: 0.0712628	total: 545ms	remaining: 34.8ms
47:	learn: 0.0706313	total: 556ms	remaining: 23.2ms
48:	learn: 0.0695001	total: 568ms	remaining: 11.6ms
49:	learn: 0.0686480	total: 579ms	remaining: 0us
0:	learn: 0.6139404	total: 11.3ms	remaining: 553ms
1:	learn: 0.5487187	total: 22.8ms	remaining: 548ms
2:	learn: 0.4953343	total: 34.3ms	remaining: 537ms
3:	learn: 0.4391187	total: 45.6ms	remaining: 525ms
4:	learn: 0.3969543	total: 57ms	remaining: 513ms
5:	learn: 0.3582253	total: 68.3ms	remaining: 501ms
6:	learn: 0.3237225	total: 79.9ms	remaining: 491ms
7:	learn: 0.3001264	total: 90.8ms	remaining: 477ms
8:	learn: 0.2718870	total: 102ms	remaining: 465ms
9:	learn: 0.2530270	total: 114ms	remaining: 456ms
10:	learn: 0.2383991	total: 125ms

10:	learn: 0.2367563	total: 126ms	remaining: 446ms
11:	learn: 0.2222244	total: 137ms	remaining: 435ms
12:	learn: 0.2116249	total: 149ms	remaining: 423ms
13:	learn: 0.1951046	total: 160ms	remaining: 411ms
14:	learn: 0.1832170	total: 172ms	remaining: 402ms
15:	learn: 0.1735467	total: 184ms	remaining: 390ms
16:	learn: 0.1650579	total: 195ms	remaining: 379ms
17:	learn: 0.1571248	total: 206ms	remaining: 367ms
18:	learn: 0.1505409	total: 218ms	remaining: 356ms
19:	learn: 0.1414548	total: 229ms	remaining: 344ms
20:	learn: 0.1357879	total: 240ms	remaining: 332ms
21:	learn: 0.1298681	total: 252ms	remaining: 321ms
22:	learn: 0.1239138	total: 264ms	remaining: 309ms
23:	learn: 0.1186607	total: 275ms	remaining: 298ms
24:	learn: 0.1137098	total: 287ms	remaining: 287ms
25:	learn: 0.1105146	total: 298ms	remaining: 275ms
26:	learn: 0.1064076	total: 310ms	remaining: 264ms
27:	learn: 0.1027187	total: 321ms	remaining: 252ms
28:	learn: 0.1001838	total: 333ms	remaining: 241ms
29:	learn: 0.0975719	total: 344

31:	learn: 0.0894914	total: 368ms	remaining: 207ms
32:	learn: 0.0871595	total: 380ms	remaining: 196ms
33:	learn: 0.0850795	total: 392ms	remaining: 184ms
34:	learn: 0.0833512	total: 403ms	remaining: 173ms
35:	learn: 0.0817246	total: 415ms	remaining: 161ms
36:	learn: 0.0797038	total: 427ms	remaining: 150ms
37:	learn: 0.0786833	total: 438ms	remaining: 138ms
38:	learn: 0.0777926	total: 450ms	remaining: 127ms
39:	learn: 0.0769700	total: 461ms	remaining: 115ms
40:	learn: 0.0757433	total: 473ms	remaining: 104ms
41:	learn: 0.0749191	total: 484ms	remaining: 92.3ms
42:	learn: 0.0741055	total: 496ms	remaining: 80.7ms
43:	learn: 0.0735139	total: 507ms	remaining: 69.2ms
44:	learn: 0.0724716	total: 518ms	remaining: 57.6ms
45:	learn: 0.0716591	total: 530ms	remaining: 46.1ms
46:	learn: 0.0707149	total: 541ms	remaining: 34.6ms
47:	learn: 0.0695273	total: 553ms	remaining: 23ms
48:	learn: 0.0682111	total: 565ms	remaining: 11.5ms
49:	learn: 0.0675768	total: 576ms	remaining: 0us
0:	learn: 0.6051955	total: 

2:	learn: 0.4953343	total: 34.5ms	remaining: 540ms
3:	learn: 0.4391187	total: 45.7ms	remaining: 525ms
4:	learn: 0.3969543	total: 57.4ms	remaining: 516ms
5:	learn: 0.3582253	total: 68.9ms	remaining: 506ms
6:	learn: 0.3237225	total: 80.7ms	remaining: 496ms
7:	learn: 0.3001264	total: 92.1ms	remaining: 483ms
8:	learn: 0.2718870	total: 103ms	remaining: 471ms
9:	learn: 0.2530270	total: 115ms	remaining: 459ms
10:	learn: 0.2383991	total: 126ms	remaining: 447ms
11:	learn: 0.2223832	total: 137ms	remaining: 435ms
12:	learn: 0.2096491	total: 149ms	remaining: 424ms
13:	learn: 0.1937736	total: 160ms	remaining: 412ms
14:	learn: 0.1795842	total: 171ms	remaining: 400ms
15:	learn: 0.1693142	total: 182ms	remaining: 388ms
16:	learn: 0.1617241	total: 194ms	remaining: 377ms
17:	learn: 0.1526805	total: 205ms	remaining: 365ms
18:	learn: 0.1458393	total: 217ms	remaining: 354ms
19:	learn: 0.1379604	total: 229ms	remaining: 343ms
20:	learn: 0.1321403	total: 240ms	remaining: 331ms
21:	learn: 0.1258511	total: 252ms

20:	learn: 0.1357879	total: 238ms	remaining: 329ms
21:	learn: 0.1298681	total: 249ms	remaining: 317ms
22:	learn: 0.1239138	total: 261ms	remaining: 306ms
23:	learn: 0.1186607	total: 273ms	remaining: 295ms
24:	learn: 0.1137098	total: 284ms	remaining: 284ms
25:	learn: 0.1105146	total: 296ms	remaining: 273ms
26:	learn: 0.1064076	total: 307ms	remaining: 262ms
27:	learn: 0.1027187	total: 319ms	remaining: 251ms
28:	learn: 0.1001838	total: 331ms	remaining: 240ms
29:	learn: 0.0975719	total: 342ms	remaining: 228ms
30:	learn: 0.0948637	total: 353ms	remaining: 217ms
31:	learn: 0.0918436	total: 365ms	remaining: 205ms
32:	learn: 0.0902463	total: 376ms	remaining: 194ms
33:	learn: 0.0880853	total: 388ms	remaining: 182ms
34:	learn: 0.0861895	total: 399ms	remaining: 171ms
35:	learn: 0.0836505	total: 410ms	remaining: 159ms
36:	learn: 0.0823834	total: 421ms	remaining: 148ms
37:	learn: 0.0810999	total: 433ms	remaining: 137ms
38:	learn: 0.0802425	total: 444ms	remaining: 125ms
39:	learn: 0.0788950	total: 456

39:	learn: 0.0769700	total: 464ms	remaining: 116ms
40:	learn: 0.0757433	total: 475ms	remaining: 104ms
41:	learn: 0.0749191	total: 488ms	remaining: 92.9ms
42:	learn: 0.0741055	total: 499ms	remaining: 81.3ms
43:	learn: 0.0735139	total: 511ms	remaining: 69.6ms
44:	learn: 0.0724716	total: 522ms	remaining: 58ms
45:	learn: 0.0716591	total: 534ms	remaining: 46.4ms
46:	learn: 0.0707149	total: 546ms	remaining: 34.8ms
47:	learn: 0.0695273	total: 557ms	remaining: 23.2ms
48:	learn: 0.0682111	total: 570ms	remaining: 11.6ms
49:	learn: 0.0675768	total: 581ms	remaining: 0us
0:	learn: 0.6051955	total: 11.8ms	remaining: 578ms
1:	learn: 0.5319314	total: 23.2ms	remaining: 557ms
2:	learn: 0.4863374	total: 34.9ms	remaining: 546ms
3:	learn: 0.4328517	total: 46.3ms	remaining: 533ms
4:	learn: 0.3914230	total: 57.9ms	remaining: 521ms
5:	learn: 0.3534057	total: 69.1ms	remaining: 507ms
6:	learn: 0.3202377	total: 81.3ms	remaining: 499ms
7:	learn: 0.2962933	total: 92.1ms	remaining: 483ms
8:	learn: 0.2704777	total: 

7:	learn: 0.3001264	total: 92.9ms	remaining: 488ms
8:	learn: 0.2718870	total: 104ms	remaining: 473ms
9:	learn: 0.2530270	total: 115ms	remaining: 460ms
10:	learn: 0.2383991	total: 126ms	remaining: 448ms
11:	learn: 0.2223832	total: 138ms	remaining: 437ms
12:	learn: 0.2096491	total: 150ms	remaining: 427ms
13:	learn: 0.1937736	total: 161ms	remaining: 414ms
14:	learn: 0.1795842	total: 172ms	remaining: 402ms
15:	learn: 0.1693142	total: 184ms	remaining: 391ms
16:	learn: 0.1617241	total: 195ms	remaining: 379ms
17:	learn: 0.1526805	total: 207ms	remaining: 368ms
18:	learn: 0.1458393	total: 218ms	remaining: 356ms
19:	learn: 0.1379604	total: 229ms	remaining: 344ms
20:	learn: 0.1321403	total: 241ms	remaining: 332ms
21:	learn: 0.1258511	total: 252ms	remaining: 321ms
22:	learn: 0.1223553	total: 264ms	remaining: 310ms
23:	learn: 0.1176185	total: 275ms	remaining: 298ms
24:	learn: 0.1127960	total: 287ms	remaining: 287ms
25:	learn: 0.1083939	total: 298ms	remaining: 275ms
26:	learn: 0.1045879	total: 310ms

In [62]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98017 (+/- 0.00461)
F1 Score: 0.98272 (+/- 0.00399)
Precision: 0.97508 (+/- 0.00717)
Recall: 0.99049 (+/- 0.00557)
