## Duplicated features

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Read Data

In [2]:
data = pd.read_csv('../UNSW_Train.csv')
data.shape

(175341, 44)

In [3]:
# check the presence of missing data.
# (there are no missing data in this dataset)
[col for col in data.columns if data[col].isnull().sum() > 0]

[]

In [4]:
data.head(5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack,is_intrusion
0,0.121478,113.0,0.0,2.0,6,4,258,172,74.08749,252,...,1,1,0,0,0,1,1,0,0,0
1,0.649902,113.0,0.0,2.0,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,0,0
2,1.623129,113.0,0.0,2.0,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,0,0
3,1.681642,113.0,3.0,2.0,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,0,0
4,0.449454,113.0,0.0,2.0,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,0,0


### Train - Test Split

In [5]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),         # drop the target
    data['is_intrusion'],                               # just the target
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((140272, 43), (35069, 43))

## Remove constant and quasi-constant (optional)

In [6]:
# remove constant and quasi-constant features first:
# we can remove the 2 types of features together with this code

# create an empty list
quasi_constant_feat = []

# iterate over every feature
for feature in X_train.columns:

    # find the predominant value, that is the value that is shared
    # by most observations
    predominant = (X_train[feature].value_counts() / np.float64(
        len(X_train))).sort_values(ascending=False).values[0]

    # evaluate predominant feature: do more than 99% of the observations
    # show 1 value?
    if predominant > 0.998:
        quasi_constant_feat.append(feature)

len(quasi_constant_feat)

0

In [7]:
quasi_constant_feat

[]

In [8]:
# we can then drop these columns from the train and test sets:

X_train.drop(labels=quasi_constant_feat, axis=1, inplace=True)
X_test.drop(labels=quasi_constant_feat, axis=1, inplace=True)

X_train.shape, X_test.shape

((140272, 43), (35069, 43))

## Remove duplicated features

In [9]:
# fiding duplicated features
duplicated_feat_pairs = {}
_duplicated_feat = []

for i in range(0, len(X_train.columns)):
    if i % 10 == 0:  
        print(i)
 
    feat_1 = X_train.columns[i]
    
    if feat_1 not in _duplicated_feat:
        duplicated_feat_pairs[feat_1] = []

        for feat_2 in X_train.columns[i + 1:]:
            if X_train[feat_1].equals(X_train[feat_2]):
                duplicated_feat_pairs[feat_1].append(feat_2)
                _duplicated_feat.append(feat_2)

0
10
20
30
40


In [10]:
# let's explore our list of duplicated features
len(_duplicated_feat)

1

We found 1 features that were duplicates of others.

In [11]:
# these are the ones:

_duplicated_feat

['ct_ftp_cmd']

In [12]:
# let's explore the dictionary we created:

duplicated_feat_pairs

{'dur': [],
 'proto': [],
 'service': [],
 'state': [],
 'spkts': [],
 'dpkts': [],
 'sbytes': [],
 'dbytes': [],
 'rate': [],
 'sttl': [],
 'dttl': [],
 'sload': [],
 'dload': [],
 'sloss': [],
 'dloss': [],
 'sinpkt': [],
 'dinpkt': [],
 'sjit': [],
 'djit': [],
 'swin': [],
 'stcpb': [],
 'dtcpb': [],
 'dwin': [],
 'tcprtt': [],
 'synack': [],
 'ackdat': [],
 'smean': [],
 'dmean': [],
 'trans_depth': [],
 'response_body_len': [],
 'ct_srv_src': [],
 'ct_state_ttl': [],
 'ct_dst_ltm': [],
 'ct_src_dport_ltm': [],
 'ct_dst_sport_ltm': [],
 'ct_dst_src_ltm': [],
 'is_ftp_login': ['ct_ftp_cmd'],
 'ct_flw_http_mthd': [],
 'ct_src_ltm': [],
 'ct_srv_dst': [],
 'is_sm_ips_ports': [],
 'attack': []}

We see that for every feature, if it had duplicates, we have entries in the list, otherwise, we have empty lists. Let's explore those features with duplicates now:

In [13]:
# let's explore the number of keys in our dictionary
# we see it is 21, because 2 of the 23 were duplicates,
# so they were not included as keys

print(len(duplicated_feat_pairs.keys()))

42


In [14]:
# print the features with its duplicates
# iterate over every feature in our dict:
for feat in duplicated_feat_pairs.keys():
    # if it has duplicates, the list should not be empty:
    if len(duplicated_feat_pairs[feat]) > 0:
        # print the feature and its duplicates:
        print(feat, duplicated_feat_pairs[feat])
        print()

is_ftp_login ['ct_ftp_cmd']



In [15]:
# to remove the duplicates (if necessary)
X_train = X_train[duplicated_feat_pairs.keys()]
X_test = X_test[duplicated_feat_pairs.keys()]
X_train.shape, X_test.shape

((140272, 42), (35069, 42))

1 duplicate features were found in the UNSW-NB15 dataset

## Standardize Data

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers

In [17]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [19]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25).fit(X_train, y_train)

CPU times: user 96.8 ms, sys: 193 ms, total: 290 ms
Wall time: 4.45 s


In [20]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.33471156862185975
F1 Score: 0.04479017400204708
FPR: 0.0048906277787657835
TPR: 0.02296100407169542


### Naive Bayes

In [21]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-08).fit(X_train, y_train)

CPU times: user 110 ms, sys: 22.7 ms, total: 133 ms
Wall time: 131 ms


In [22]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.7423365365422453
F1 Score: 0.7751791401273885
FPR: 0.07033611950915881
TPR: 0.653905889266675


### Random Forest

In [23]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=100,n_estimators=1000).fit(X_train, y_train)

CPU times: user 1min 33s, sys: 698 ms, total: 1min 34s
Wall time: 1min 34s


In [24]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


### KNN

In [25]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform').fit(X_train, y_train)

CPU times: user 23.7 s, sys: 191 ms, total: 23.9 s
Wall time: 23.8 s


In [26]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.6842510479340729
f1: 0.7391703766518267
fpr: 0.26142628490129827
tpr: 0.6586072283087773


### CatBoost

In [27]:
%%time
clf_CB = CatBoostClassifier(depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.5217342	total: 77.2ms	remaining: 3.78s
1:	learn: 0.3893232	total: 99.2ms	remaining: 2.38s
2:	learn: 0.2867859	total: 122ms	remaining: 1.91s
3:	learn: 0.2108494	total: 145ms	remaining: 1.66s
4:	learn: 0.1560807	total: 166ms	remaining: 1.5s
5:	learn: 0.1117287	total: 182ms	remaining: 1.33s
6:	learn: 0.0822786	total: 203ms	remaining: 1.25s
7:	learn: 0.0617629	total: 223ms	remaining: 1.17s
8:	learn: 0.0484606	total: 244ms	remaining: 1.11s
9:	learn: 0.0374214	total: 265ms	remaining: 1.06s
10:	learn: 0.0289180	total: 288ms	remaining: 1.02s
11:	learn: 0.0227302	total: 311ms	remaining: 985ms
12:	learn: 0.0179617	total: 332ms	remaining: 946ms
13:	learn: 0.0143036	total: 354ms	remaining: 911ms
14:	learn: 0.0115591	total: 374ms	remaining: 874ms
15:	learn: 0.0094435	total: 394ms	remaining: 838ms
16:	learn: 0.0076586	total: 415ms	remaining: 805ms
17:	learn: 0.0063433	total: 436ms	remaining: 774ms
18:	learn: 0.0052707	total: 456ms	remaining: 744ms
19:	learn: 0.0044306	total: 475ms	remain

In [28]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


## Model Evaluation

In [29]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../UNSW_Test.csv")
test_df.shape

(175341, 44)

In [30]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion','ct_ftp_cmd'])

### Model Evaluation - Logistic Regression

In [31]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25)
modelLR.fit(X_train, y_train)

LogisticRegression(C=25, n_jobs=-1, random_state=42)

In [32]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [33]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  1.0
Testing accuracy is  0.33471156862185975


In [34]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.33471156862185975
F1 Score: 0.04479017400204708
Precision Score: 0.9086378737541528
Recall Score: 0.02296100407169542
Confusion Matrix:
 [[11191    55]
 [23276   547]]


### Cross validation - Logistic Regression



In [35]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))



Accuracy: 0.84997 (+/- 0.14701)
F1 Score: 0.90011 (+/- 0.08196)
Precision: 0.84748 (+/- 0.15547)
Recall: 0.96586 (+/- 0.03799)


### Model Evaluation - Naive Bayes



In [36]:
modelNB = GaussianNB(var_smoothing=1e-08)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-08)

In [37]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [38]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  1.0
Testing accuracy is  0.7423365365422453


In [39]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.7423365365422453
F1 Score: 0.7751791401273885
Precision Score: 0.9516769503329464
Recall Score: 0.653905889266675
Confusion Matrix:
 [[10455   791]
 [ 8245 15578]]


### Cross validation - Naive Bayes


In [40]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.81069 (+/- 0.14598)
F1 Score: 0.87452 (+/- 0.07697)
Precision: 0.81743 (+/- 0.15330)
Recall: 0.94776 (+/- 0.06318)


### Model Evaluation - Random Forest



In [41]:
modelRF = RandomForestClassifier(random_state=0,max_depth=100,n_estimators=1000)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=100, n_estimators=1000, random_state=0)

In [42]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [43]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [44]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - Random Forest


In [45]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=5, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=5, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=5, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=5, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)


### Model Evaluation - KNN

In [46]:
modelKNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=1)

In [47]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [48]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9993441314018479
Testing accuracy is  0.6842510479340729


In [49]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.6842510479340729
F1 Score: 0.7391703766518267
Precision Score: 0.8421900161030595
Recall Score: 0.6586072283087773
Confusion Matrix:
 [[ 8306  2940]
 [ 8133 15690]]


### Cross validation - KNN



In [50]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.87901 (+/- 0.13180)
F1 Score: 0.91584 (+/- 0.07682)
Precision: 0.90359 (+/- 0.17653)
Recall: 0.93684 (+/- 0.07517)


### Model Evaluation - CatBoost

In [51]:
modelCB = CatBoostClassifier(depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.5217342	total: 20.7ms	remaining: 1.01s
1:	learn: 0.3893232	total: 42.3ms	remaining: 1.01s
2:	learn: 0.2867859	total: 62.8ms	remaining: 984ms
3:	learn: 0.2108494	total: 83.3ms	remaining: 958ms
4:	learn: 0.1560807	total: 104ms	remaining: 937ms
5:	learn: 0.1117287	total: 120ms	remaining: 881ms
6:	learn: 0.0822786	total: 140ms	remaining: 863ms
7:	learn: 0.0617629	total: 162ms	remaining: 849ms
8:	learn: 0.0484606	total: 183ms	remaining: 833ms
9:	learn: 0.0374214	total: 204ms	remaining: 815ms
10:	learn: 0.0289180	total: 225ms	remaining: 799ms
11:	learn: 0.0227302	total: 246ms	remaining: 780ms
12:	learn: 0.0179617	total: 266ms	remaining: 757ms
13:	learn: 0.0143036	total: 286ms	remaining: 736ms
14:	learn: 0.0115591	total: 307ms	remaining: 716ms
15:	learn: 0.0094435	total: 326ms	remaining: 693ms
16:	learn: 0.0076586	total: 347ms	remaining: 675ms
17:	learn: 0.0063433	total: 368ms	remaining: 655ms
18:	learn: 0.0052707	total: 389ms	remaining: 635ms
19:	learn: 0.0044306	total: 409ms	rem

<catboost.core.CatBoostClassifier at 0x7fe6b8a02f40>

In [52]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [53]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [54]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - CatBoost

In [55]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=5, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=5, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=5, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=5, scoring='recall')

0:	learn: 0.5217568	total: 20.7ms	remaining: 1.01s
1:	learn: 0.3892948	total: 42.6ms	remaining: 1.02s
2:	learn: 0.2909019	total: 63.8ms	remaining: 999ms
3:	learn: 0.2137098	total: 85ms	remaining: 977ms
4:	learn: 0.1581339	total: 106ms	remaining: 957ms
5:	learn: 0.1131897	total: 122ms	remaining: 895ms
6:	learn: 0.0839631	total: 142ms	remaining: 874ms
7:	learn: 0.0631400	total: 163ms	remaining: 857ms
8:	learn: 0.0484262	total: 184ms	remaining: 840ms
9:	learn: 0.0375332	total: 202ms	remaining: 810ms
10:	learn: 0.0293757	total: 224ms	remaining: 793ms
11:	learn: 0.0232955	total: 245ms	remaining: 777ms
12:	learn: 0.0182611	total: 266ms	remaining: 756ms
13:	learn: 0.0143901	total: 288ms	remaining: 740ms
14:	learn: 0.0115975	total: 309ms	remaining: 720ms
15:	learn: 0.0094226	total: 331ms	remaining: 703ms
16:	learn: 0.0075173	total: 347ms	remaining: 673ms
17:	learn: 0.0062402	total: 367ms	remaining: 653ms
18:	learn: 0.0051324	total: 388ms	remaining: 634ms
19:	learn: 0.0044304	total: 408ms	remai

20:	learn: 0.0038901	total: 425ms	remaining: 587ms
21:	learn: 0.0033149	total: 446ms	remaining: 567ms
22:	learn: 0.0028854	total: 466ms	remaining: 547ms
23:	learn: 0.0025240	total: 487ms	remaining: 527ms
24:	learn: 0.0021977	total: 507ms	remaining: 507ms
25:	learn: 0.0019384	total: 526ms	remaining: 486ms
26:	learn: 0.0017287	total: 546ms	remaining: 465ms
27:	learn: 0.0015298	total: 566ms	remaining: 444ms
28:	learn: 0.0013465	total: 586ms	remaining: 424ms
29:	learn: 0.0012245	total: 605ms	remaining: 403ms
30:	learn: 0.0011121	total: 625ms	remaining: 383ms
31:	learn: 0.0010112	total: 645ms	remaining: 363ms
32:	learn: 0.0009225	total: 664ms	remaining: 342ms
33:	learn: 0.0008463	total: 683ms	remaining: 322ms
34:	learn: 0.0007763	total: 703ms	remaining: 301ms
35:	learn: 0.0007229	total: 722ms	remaining: 281ms
36:	learn: 0.0006683	total: 742ms	remaining: 261ms
37:	learn: 0.0006250	total: 760ms	remaining: 240ms
38:	learn: 0.0005769	total: 778ms	remaining: 220ms
39:	learn: 0.0005411	total: 797

31:	learn: 0.0009957	total: 639ms	remaining: 359ms
32:	learn: 0.0009196	total: 659ms	remaining: 339ms
33:	learn: 0.0008366	total: 679ms	remaining: 320ms
34:	learn: 0.0007687	total: 698ms	remaining: 299ms
35:	learn: 0.0007086	total: 717ms	remaining: 279ms
36:	learn: 0.0006547	total: 737ms	remaining: 259ms
37:	learn: 0.0006036	total: 755ms	remaining: 238ms
38:	learn: 0.0005590	total: 773ms	remaining: 218ms
39:	learn: 0.0005203	total: 792ms	remaining: 198ms
40:	learn: 0.0004907	total: 810ms	remaining: 178ms
41:	learn: 0.0004638	total: 827ms	remaining: 158ms
42:	learn: 0.0004381	total: 847ms	remaining: 138ms
43:	learn: 0.0004152	total: 866ms	remaining: 118ms
44:	learn: 0.0003925	total: 884ms	remaining: 98.3ms
45:	learn: 0.0003716	total: 901ms	remaining: 78.4ms
46:	learn: 0.0003541	total: 918ms	remaining: 58.6ms
47:	learn: 0.0003362	total: 935ms	remaining: 39ms
48:	learn: 0.0003209	total: 952ms	remaining: 19.4ms
49:	learn: 0.0003057	total: 970ms	remaining: 0us
0:	learn: 0.5218547	total: 20.

0:	learn: 0.5217568	total: 20.6ms	remaining: 1.01s
1:	learn: 0.3892948	total: 42.4ms	remaining: 1.02s
2:	learn: 0.2909019	total: 63.2ms	remaining: 990ms
3:	learn: 0.2137098	total: 83.4ms	remaining: 959ms
4:	learn: 0.1581339	total: 104ms	remaining: 938ms
5:	learn: 0.1131897	total: 120ms	remaining: 880ms
6:	learn: 0.0839631	total: 139ms	remaining: 856ms
7:	learn: 0.0631400	total: 160ms	remaining: 840ms
8:	learn: 0.0484262	total: 181ms	remaining: 826ms
9:	learn: 0.0375332	total: 199ms	remaining: 797ms
10:	learn: 0.0293757	total: 221ms	remaining: 783ms
11:	learn: 0.0232955	total: 242ms	remaining: 766ms
12:	learn: 0.0182611	total: 261ms	remaining: 742ms
13:	learn: 0.0143901	total: 281ms	remaining: 723ms
14:	learn: 0.0115975	total: 301ms	remaining: 703ms
15:	learn: 0.0094226	total: 323ms	remaining: 686ms
16:	learn: 0.0075173	total: 339ms	remaining: 657ms
17:	learn: 0.0062402	total: 359ms	remaining: 639ms
18:	learn: 0.0051324	total: 380ms	remaining: 620ms
19:	learn: 0.0044304	total: 400ms	rem

20:	learn: 0.0038901	total: 425ms	remaining: 587ms
21:	learn: 0.0033149	total: 446ms	remaining: 568ms
22:	learn: 0.0028854	total: 467ms	remaining: 548ms
23:	learn: 0.0025240	total: 488ms	remaining: 529ms
24:	learn: 0.0021977	total: 509ms	remaining: 509ms
25:	learn: 0.0019384	total: 528ms	remaining: 487ms
26:	learn: 0.0017287	total: 548ms	remaining: 467ms
27:	learn: 0.0015298	total: 567ms	remaining: 446ms
28:	learn: 0.0013465	total: 588ms	remaining: 426ms
29:	learn: 0.0012245	total: 608ms	remaining: 405ms
30:	learn: 0.0011121	total: 628ms	remaining: 385ms
31:	learn: 0.0010112	total: 648ms	remaining: 364ms
32:	learn: 0.0009225	total: 667ms	remaining: 344ms
33:	learn: 0.0008463	total: 686ms	remaining: 323ms
34:	learn: 0.0007763	total: 705ms	remaining: 302ms
35:	learn: 0.0007229	total: 724ms	remaining: 282ms
36:	learn: 0.0006683	total: 743ms	remaining: 261ms
37:	learn: 0.0006250	total: 761ms	remaining: 240ms
38:	learn: 0.0005769	total: 780ms	remaining: 220ms
39:	learn: 0.0005411	total: 799

41:	learn: 0.0004638	total: 830ms	remaining: 158ms
42:	learn: 0.0004381	total: 849ms	remaining: 138ms
43:	learn: 0.0004152	total: 868ms	remaining: 118ms
44:	learn: 0.0003925	total: 886ms	remaining: 98.5ms
45:	learn: 0.0003716	total: 903ms	remaining: 78.5ms
46:	learn: 0.0003541	total: 920ms	remaining: 58.7ms
47:	learn: 0.0003362	total: 937ms	remaining: 39ms
48:	learn: 0.0003209	total: 954ms	remaining: 19.5ms
49:	learn: 0.0003057	total: 972ms	remaining: 0us
0:	learn: 0.5218547	total: 21.9ms	remaining: 1.07s
1:	learn: 0.3896602	total: 44.7ms	remaining: 1.07s
2:	learn: 0.2912479	total: 66.3ms	remaining: 1.04s
3:	learn: 0.2140785	total: 87.7ms	remaining: 1.01s
4:	learn: 0.1585607	total: 109ms	remaining: 977ms
5:	learn: 0.1134926	total: 124ms	remaining: 910ms
6:	learn: 0.0842337	total: 144ms	remaining: 882ms
7:	learn: 0.0633781	total: 165ms	remaining: 865ms
8:	learn: 0.0486446	total: 186ms	remaining: 848ms
9:	learn: 0.0376992	total: 204ms	remaining: 815ms
10:	learn: 0.0297976	total: 225ms	re

In [56]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)
