## Constant features

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

## Read Data

In [3]:
data.head(5)

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_port_rate,...,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code,Label_code
0,1.551192,1423,2077,1,1.0,0.0,0.0,12,12,0.0,...,13.0,10.0,0.0,0.0,0.0,175740.0,6721.0,27120.0,1.0,0.0
1,0.001476,45,104,9,1.0,0.0,0.03,24,31,0.0,...,1.0,10.0,0.0,0.0,0.0,137102.0,19045.0,28207.0,2.0,0.0
2,0.0,0,0,0,0.0,0.0,0.33,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,257402.0,9396.0,14579.0,1.0,0.0
3,2.095259,520,1745,1,1.0,0.0,0.0,12,12,0.0,...,13.0,10.0,255.0,0.0,0.0,288858.0,11847.0,39547.0,1.0,0.0
4,0.127928,54,106,3,1.0,0.0,0.0,6,62,0.0,...,1.0,10.0,0.0,0.0,0.0,210104.0,5573.0,35734.0,2.0,1.0


### Train - Test Split

In [4]:
data.convert_dtypes().dtypes

Duration                       float64
Source                           Int64
Destination_bytes                Int64
Count                            Int64
Same_srv_rate                  float64
Serror_rate                    float64
Srv_serror_rate                float64
Dst_host_count                   Int64
Dst_host_srv_count               Int64
Dst_host_same_port_rate        float64
Dst_host_serror_rate           float64
Dst_host_srv_serror_rate       float64
Source_Port_Number               Int64
Destination_Port_Number          Int64
Service_code                     Int64
Flag_code                        Int64
IDS_detection_code               Int64
Malware_detection_code           Int64
Ashula_detection_code            Int64
Source_IP_Address_code           Int64
Destination_IP_Address_code      Int64
Start_Time_code                  Int64
Protocol_code                    Int64
Label_code                       Int64
dtype: object

In [24]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Label_code'], axis=1),       # drop the target
    data['Label_code'],                             # just the target
    test_size=0.2,
    random_state=0)
X_train.shape, X_test.shape

((99244, 23), (24811, 23))

### Using VarianceThreshold from Scikit-learn

The VarianceThreshold from sklearn provides a simple baseline approach to feature selection. It removes all features which variance doesn’t meet a certain threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

In [25]:
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_train)  # fit finds the features with zero variance

VarianceThreshold(threshold=0.01)

In [26]:
# get_support is a boolean vector that indicates which features are retained
# if we sum over get_support, we get the number of features that are not constant
# (if necessary, print the result of sel.get_support() to understand its output)
sum(sel.get_support())

23

In [27]:
# now let's print the number of constant feautures
# (see how we use ~ to exclude non-constant features)
constant = X_train.columns[~sel.get_support()]
len(constant)

0

We can see that 0 columns / variables are constant. This means that 0 variables show the same value, just one value, for all the observations of the training set.

In [28]:
# let's print the constant variable names
constant

Index([], dtype='object')

In [29]:
# let's visualise the values of one of the constant variables
# as an example
X_train['Protocol_code'].unique()

array([1., 2., 0.])

In [30]:
# we can do the same for every feature:
for col in constant:
    print(col, X_train[col].unique())

We then use the transform() method of the VarianceThreshold to reduce the training and testing sets to its non-constant features.

Note that VarianceThreshold returns a NumPy array without feature names, so we need to capture the names first, and reconstitute the dataframe in a later step.

In [31]:
# capture non-constant feature names
feat_names = X_train.columns[sel.get_support()]

In [32]:
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)
X_train.shape, X_test.shape

((99244, 23), (24811, 23))

We have now 23 variables.

In [33]:
# X_ train is a NumPy array
X_train

array([[0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 1.228300e+04,
        2.410500e+04, 1.000000e+00],
       [4.000000e-04, 4.600000e+01, 1.040000e+02, ..., 5.573000e+03,
        4.040200e+04, 2.000000e+00],
       [1.417088e+00, 6.210000e+02, 1.813000e+03, ..., 1.184700e+04,
        2.841000e+03, 1.000000e+00],
       ...,
       [4.290000e-04, 4.400000e+01, 1.050000e+02, ..., 5.573000e+03,
        4.340000e+04, 2.000000e+00],
       [3.940000e-04, 6.100000e+01, 7.700000e+01, ..., 5.573000e+03,
        6.025800e+04, 2.000000e+00],
       [4.640000e-04, 6.100000e+01, 7.700000e+01, ..., 5.573000e+03,
        6.962900e+04, 2.000000e+00]])

In [34]:
# reconstitute de dataframe
X_train = pd.DataFrame(X_train, columns=feat_names)
X_train.head()

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_port_rate,...,Destination_Port_Number,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,22.0,6.0,6.0,0.0,0.0,0.0,84856.0,12283.0,24105.0,1.0
1,0.0004,46.0,104.0,6.0,1.0,0.0,0.14,98.0,98.0,0.0,...,53.0,1.0,10.0,0.0,0.0,0.0,255086.0,5573.0,40402.0,2.0
2,1.417088,621.0,1813.0,2.0,1.0,0.0,0.0,33.0,33.0,0.0,...,22.0,13.0,10.0,0.0,0.0,0.0,96829.0,11847.0,2841.0,1.0
3,2.791112,520.0,1564.0,0.0,0.0,0.0,0.0,37.0,36.0,0.0,...,22.0,13.0,10.0,0.0,0.0,0.0,178954.0,15256.0,67520.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,8080.0,6.0,6.0,0.0,0.0,0.0,12229.0,14405.0,39099.0,1.0


In the Kyoto dataset, 0 features was classified as constant were found, remaining the original 23 features of the dataset

## Standardize Data

In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Hyperparameter Optimization

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:
    
    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
    
    def fit(self, X, y, **grid_kwargs):
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            model = self.models[key]
            params = self.params[key]
            grid_search = GridSearchCV(model, params, **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')
    
    def score_summary(self, sort_by='mean_test_score'):
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            frame = frame.filter(regex='^(?!.*param_).*$')
            frame['estimator'] = len(frame)*[name]
            frames.append(frame)
        df = pd.concat(frames)
        
        df = df.sort_values([sort_by], ascending=False)
        df = df.reset_index()
        df = df.drop(['rank_test_score', 'index'], 1)
        
        columns = df.columns.tolist()
        columns.remove('estimator')
        columns = ['estimator']+columns
        df = df[columns]
        return df

In [18]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier


models = { 
    'LogisticRegression': linear_model.LogisticRegression(max_iter=1000),
    'GaussianNB': GaussianNB(),
    'RandomForest': RandomForestClassifier(random_state=123),
    'KNN': KNeighborsClassifier(n_jobs=-1),
    'CatBoost': CatBoostClassifier(),
}

params = { 
    'LogisticRegression': {'C':[0.1,0.5,1,2,3,4,5,10,20,25]},
    'GaussianNB': {'var_smoothing':[1e-9,1e-8,1e-7,1e-6,1e-5,1e-4]},
    'RandomForest': {'max_depth': [70,80,90,100],'n_estimators': [100,1000]},
    'KNN': {'n_neighbors':[2,3,4,5],'leaf_size':[1,2,3],'weights':['uniform', 'distance'],
          'algorithm':['auto','ball_tree','kd_tree','brute']},
    'CatBoost': {'depth':[4,5,6,7],'learning_rate':[0.01,0.02,0.03,0.04],'iterations':[10,20,30,40,50]}
}

In [19]:
%%time
helper = EstimatorSelectionHelper(models, params)
helper.fit(X_test, y_test, scoring='f1', n_jobs=2)

Running GridSearchCV for LogisticRegression.




Running GridSearchCV for GaussianNB.




Running GridSearchCV for RandomForest.




Running GridSearchCV for KNN.




Running GridSearchCV for CatBoost.




0:	learn: 0.5962215	total: 89ms	remaining: 4.36s
1:	learn: 0.5157400	total: 111ms	remaining: 2.67s
2:	learn: 0.4602641	total: 133ms	remaining: 2.09s
3:	learn: 0.4037906	total: 155ms	remaining: 1.78s
4:	learn: 0.3630171	total: 176ms	remaining: 1.58s
5:	learn: 0.3296365	total: 197ms	remaining: 1.45s
6:	learn: 0.2736792	total: 220ms	remaining: 1.35s
7:	learn: 0.2523208	total: 241ms	remaining: 1.26s
8:	learn: 0.2224886	total: 262ms	remaining: 1.19s
9:	learn: 0.1674967	total: 284ms	remaining: 1.14s
10:	learn: 0.1547081	total: 306ms	remaining: 1.08s
11:	learn: 0.1419578	total: 329ms	remaining: 1.04s
12:	learn: 0.1334022	total: 351ms	remaining: 998ms
13:	learn: 0.1001768	total: 375ms	remaining: 964ms
14:	learn: 0.0898917	total: 397ms	remaining: 926ms
15:	learn: 0.0783601	total: 418ms	remaining: 889ms
16:	learn: 0.0579665	total: 442ms	remaining: 857ms
17:	learn: 0.0445215	total: 463ms	remaining: 823ms
18:	learn: 0.0345035	total: 485ms	remaining: 792ms
19:	learn: 0.0315594	total: 507ms	remainin

In [20]:
helper.score_summary()

Unnamed: 0,estimator,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
0,CatBoost,4.319576,0.020049,0.391137,0.021617,"{'depth': 7, 'iterations': 50, 'learning_rate'...",1.000000,1.000000,1.000000,0.999686,0.999790,0.999895,0.000133
1,CatBoost,3.101321,0.025885,0.420988,0.049243,"{'depth': 7, 'iterations': 20, 'learning_rate'...",1.000000,1.000000,1.000000,0.999686,0.999790,0.999895,0.000133
2,CatBoost,3.249692,0.225021,0.421500,0.048314,"{'depth': 6, 'iterations': 20, 'learning_rate'...",1.000000,1.000000,1.000000,0.999686,0.999790,0.999895,0.000133
3,CatBoost,3.624060,0.159963,0.417535,0.045990,"{'depth': 6, 'iterations': 30, 'learning_rate'...",1.000000,1.000000,1.000000,0.999686,0.999790,0.999895,0.000133
4,CatBoost,3.662820,0.170793,0.518677,0.135676,"{'depth': 6, 'iterations': 30, 'learning_rate'...",1.000000,1.000000,1.000000,0.999686,0.999790,0.999895,0.000133
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,GaussianNB,0.161354,0.018309,0.047705,0.002726,{'var_smoothing': 1e-09},0.195950,0.191369,0.191935,0.192577,0.246552,0.203677,0.021497
196,GaussianNB,0.141394,0.002438,0.041558,0.001035,{'var_smoothing': 1e-07},0.198868,0.193671,0.194661,0.195713,0.231815,0.202946,0.014540
197,GaussianNB,0.141202,0.001735,0.041999,0.001101,{'var_smoothing': 1e-08},0.196382,0.191042,0.191756,0.192482,0.238356,0.202004,0.018270
198,GaussianNB,0.139142,0.004729,0.041513,0.001603,{'var_smoothing': 0.0001},0.188366,0.187153,0.187870,0.188410,0.255720,0.201504,0.027112


In [21]:
df_gridsearchcv_summary = helper.score_summary()

In [22]:
df_gridsearchcv_summary.to_csv("gridsearchcv_summaryBase.csv", index=False)

## Classifiers

In [38]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation

In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [40]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1).fit(X_train, y_train)

CPU times: user 77.6 ms, sys: 211 ms, total: 289 ms
Wall time: 3.2 s


In [41]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.4527830397807424
F1 Score: 0.2463502636691646
FPR: 0.5989054991991457
TPR: 0.9503211991434689


### Naive Bayes

In [42]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-05).fit(X_train, y_train)

CPU times: user 40.1 ms, sys: 7.54 ms, total: 47.7 ms
Wall time: 45.9 ms


In [43]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9045584619725122
F1 Score: 0.0
FPR: 0.0014682327816337426
TPR: 0.0


### Random Forest

In [44]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 6.16 s, sys: 45.8 ms, total: 6.2 s
Wall time: 6.2 s


In [45]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


### KNN

In [46]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform').fit(X_train, y_train)

CPU times: user 9.96 s, sys: 61.3 ms, total: 10 s
Wall time: 9.98 s


In [47]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.9058885171899561
f1: 0.0
fpr: 1.0
tpr: 1.0


### CatBoost

In [48]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.5950021	total: 21.5ms	remaining: 1.05s
1:	learn: 0.4943009	total: 39.9ms	remaining: 958ms
2:	learn: 0.4235946	total: 57.5ms	remaining: 901ms
3:	learn: 0.3620619	total: 75.7ms	remaining: 871ms
4:	learn: 0.3121757	total: 93.6ms	remaining: 842ms
5:	learn: 0.2371811	total: 111ms	remaining: 816ms
6:	learn: 0.2070297	total: 129ms	remaining: 795ms
7:	learn: 0.1610239	total: 148ms	remaining: 779ms
8:	learn: 0.1441127	total: 166ms	remaining: 756ms
9:	learn: 0.1140228	total: 186ms	remaining: 744ms
10:	learn: 0.1028733	total: 203ms	remaining: 721ms
11:	learn: 0.0790135	total: 221ms	remaining: 701ms
12:	learn: 0.0614288	total: 239ms	remaining: 681ms
13:	learn: 0.0478516	total: 259ms	remaining: 666ms
14:	learn: 0.0375712	total: 277ms	remaining: 647ms
15:	learn: 0.0312555	total: 295ms	remaining: 627ms
16:	learn: 0.0249900	total: 312ms	remaining: 606ms
17:	learn: 0.0196449	total: 330ms	remaining: 586ms
18:	learn: 0.0159795	total: 348ms	remaining: 567ms
19:	learn: 0.0128696	total: 365ms	re

In [49]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


## Model Evaluation

In [50]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../Kyoto_Test.csv")
test_df.shape

(62028, 24)

In [59]:
# Create feature matrix X and target vextor y
y_eval = test_df['Label_code']
X_eval = test_df.drop(columns=['Label_code'])

### Model Evaluation - Logistic Regression

In [60]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=1, n_jobs=-1, random_state=42)

In [61]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [62]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9304038531296602
Testing accuracy is  0.4527830397807424


In [63]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.4527830397807424
F1 Score: 0.2463502636691646
Precision Score: 0.14151785714285714
Recall Score: 0.9503211991434689
Confusion Matrix:
 [[ 9015 13461]
 [  116  2219]]


### Cross validation - Logistic Regression

In [64]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.90021 (+/- 0.00142)
F1 Score: 0.00064 (+/- 0.00385)
Precision: 0.00769 (+/- 0.04615)
Recall: 0.00033 (+/- 0.00201)


### Model Evaluation - Naive Bayes

In [65]:
modelNB = GaussianNB(var_smoothing=1e-05)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-05)

In [66]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [67]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.3204626979968562
Testing accuracy is  0.9045584619725122


In [68]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.9045584619725122
F1 Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
Confusion Matrix:
 [[22443    33]
 [ 2335     0]]


### Cross validation - Naive Bayes

In [69]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.51851 (+/- 0.28039)
F1 Score: 0.25979 (+/- 0.02808)
Precision: 0.21404 (+/- 0.38891)
Recall: 0.86306 (+/- 0.46302)


### Model Evaluation - Random Forest

In [70]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [71]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [72]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.9058885171899561


In [73]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [87]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=1))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=1))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.9147454883866614
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - Random Forest

In [88]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99929 (+/- 0.00060)
F1 Score: 0.99631 (+/- 0.00312)
Precision: 0.99950 (+/- 0.00154)
Recall: 0.99315 (+/- 0.00676)


### Model Evaluation - KNN

In [89]:
modelKNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(leaf_size=1, n_neighbors=2)

In [None]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [None]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN, average='weighted', zero_division=1))
print('Precision Score:', precision_score(y_test, y_predKNN, average='weighted', zero_division=1))
print('Recall Score:', recall_score(y_test, y_predKNN, average='weighted', zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

### Cross validation - KNN

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

### Model Evaluation - CatBoost

In [None]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

In [None]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [None]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=1))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=1))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

### Cross validation - CatBoost

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

In [None]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))