In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import recall_score
from sklearn.model_selection import RandomizedSearchCV
from timeit import default_timer
import datetime
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
print(f"Notebook last run (end-to-end): {datetime.datetime.now()}")

Notebook last run (end-to-end): 2024-03-11 21:25:55.628090


In [2]:
SEED_ORIG = 2024
SEED_NP = 2023
SEED = SEED_ORIG
NUM_SIMS = 10
rstate = np.random.default_rng(SEED_NP)

# Loading the dataset

In [3]:
Trained_Data = pd.read_csv("../data/KDDTrain+.txt" , header = None)
Tested_Data  = pd.read_csv("../data/KDDTest+.txt" , header = None)

In [4]:
Columns = (['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot',
            'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations',
            'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count',
            'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate',
            'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
            'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
            'dst_host_srv_rerror_rate','attack','level'])
Trained_Data.columns = Columns
Tested_Data.columns  = Columns

In [5]:
Trained_attack = Trained_Data.attack.map(lambda a: 0 if a == 'normal' else 1)
Tested_attack = Tested_Data.attack.map(lambda a: 0 if a == 'normal' else 1)

Trained_Data['attack'] = Trained_attack
Tested_Data['attack'] = Tested_attack

In [6]:
Trained_Data.isnull().sum()


duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [7]:
# duplicates
Trained_Data.duplicated().sum(), Tested_Data.duplicated().sum()



(9, 3)

In [8]:
cat_columns = ['protocol_type','service','flag']

In [9]:
Trained_Data = pd.get_dummies(Trained_Data,columns=cat_columns,prefix="",prefix_sep="")
Tested_Data = pd.get_dummies(Tested_Data,columns=cat_columns,prefix="",prefix_sep="")

In [10]:
Trained_Data["attack"]

0         0
1         0
2         1
3         0
4         0
         ..
125968    1
125969    0
125970    0
125971    1
125972    0
Name: attack, Length: 125973, dtype: int64

In [11]:
X_test = Tested_Data.drop("attack",axis=1)
y_test = Tested_Data['attack']

In [12]:
X_train_ = Trained_Data.drop("attack", axis = 1)
y_train_ = Trained_Data["attack"]

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_, y_train_, test_size= 0.25 , random_state=SEED)

In [14]:
Trained_Data

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,REJ,RSTO,RSTOS0,RSTR,S0,S1,S2,S3,SF,SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
125969,8,105,145,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
125970,0,2231,384,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
125971,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [15]:
num_columns = []
for col_name in Trained_Data.drop("attack",axis=1).columns:
    if col_name not in cat_columns:
        num_columns.append(col_name)
num_columns

['duration',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'level',
 'icmp',
 'tcp',
 'udp',
 'IRC',
 'X11',
 'Z39_50',
 'aol',
 'auth',
 'bgp',
 'courier',
 'csnet_ns',
 'ctf',
 'daytime',
 'discard',
 'domain',
 'domain_u',
 'echo',
 'eco_i',
 'ecr_i',
 'efs',
 'exec',
 'finger',
 'ftp',
 'ftp_data',
 'gopher',
 'harvest',
 

In [16]:
"attack" in num_columns

False

In [17]:
#scaler = StandardScaler()

In [18]:
#scaler.fit(X_train[num_columns])

In [19]:
len(X_train), len(X_valid), len(X_test), len(X_train.columns), len(X_valid.columns), len(X_test.columns)

(94479, 31494, 22544, 123, 123, 117)

In [20]:
notintestcols = [col for col in X_train.columns if col not in X_test.columns]

In [21]:
X_train = X_train.drop(notintestcols,axis=1)
X_valid = X_valid.drop(notintestcols,axis=1)

In [22]:
notintestcols = [col for col in X_train.columns if col not in X_test.columns]

In [23]:
notintestcols

[]

# Random Forest

In [24]:
# Space
space = {
    'n_estimators': hp.choice('n_estimators', [int(x) for x in np.linspace(start = 50, stop = 1000, num = 10)]),
    'max_depth' : hp.choice('max_depth', range(3,21,3)),
    'min_samples_split' : hp.choice('min_samples_split', [2, 5, 10, 14]),
    'min_samples_leaf' : hp.choice('min_samples_leaf', [1, 2, 4, 6, 8]),
}

In [25]:
# Objective function
def objective(params):

    rf=RandomForestClassifier(random_state=SEED,**params)

    """
    score = cross_val_score(estimator=xgboost,
                            X=X_train_,
                            y=y_train_,
                            cv=ps,
                            scoring='f1',
                            n_jobs=-1).mean()
    """
    rf.fit(X_train,y_train)
    y_pred = rf.predict(X_valid)
    score = f1_score(y_valid,y_pred)
    #score = recall_score(y_valid, y_pred)

    # Loss is negative score
    loss = - score

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [26]:
best_models = []
times_train = []
for i in range(NUM_SIMS):
    print(80*"-")
    print(80*"-")
    print(f"Sim id: {i}")
    print(80*"-")
    start_time = default_timer()
    SEED = SEED_ORIG + i
    best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 10, trials = Trials(), rstate=rstate)
    best_models.append(best)
    end_time = default_timer()
    times_train.append(end_time-start_time)
    print(f"Training time: {end_time-start_time:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
100%|██████████| 10/10 [04:03<00:00, 24.31s/trial, best loss: -0.9994551522168494]
Training time: 243.16s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
100%|██████████| 10/10 [04:59<00:00, 29.99s/trial, best loss: -0.9990804127924798]
Training time: 299.96s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
100%|██████████| 10/10 [02:09

In [27]:
# best hyperparameters
best

{'max_depth': 5,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 6}

In [28]:
results_valid = []
results_test = []
times_eval = []
for i in range(NUM_SIMS):
    print(80*"-")
    print(80*"-")
    print(f"Sim id: {i}")
    print(80*"-")
    
    SEED = SEED_ORIG + i
    
    # Train model using the best parameters
    rf_bo = RandomForestClassifier(random_state=SEED,
                               n_estimators=space_eval(space, best_models[i])["n_estimators"],
                               max_depth=space_eval(space,best_models[i])["max_depth"],
                               min_samples_split=space_eval(space,best_models[i])["min_samples_split"],
                               min_samples_leaf=space_eval(space,best_models[i])["min_samples_leaf"]
                              ).fit(X_train,y_train)
    
    start_time = default_timer()
    y_pred = rf_bo.predict(X_valid)
    recall = recall_score(y_valid, y_pred)
    f1score = f1_score(y_valid,y_pred)
    mcc = matthews_corrcoef(y_valid, y_pred)
    acc = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='macro')
    results_temp = {}
    results_temp["recall"] = recall
    results_temp["f1score"] = f1score
    results_temp["mcc"] = mcc
    results_temp["accuracy"] = acc
    results_temp["precision"] = precision
    results_valid.append(results_temp)
    
    y_pred = rf_bo.predict(X_test)
    recall = recall_score(y_test, y_pred)
    f1score = f1_score(y_test,y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    results_temp = {}
    results_temp["recall"] = recall
    results_temp["f1score"] = f1score
    results_temp["mcc"] = mcc
    results_temp["accuracy"] = acc
    results_temp["precision"] = precision
    results_test.append(results_temp)
    
    end_time = default_timer()
    times_eval.append(end_time-start_time)
    print(f"Evaluation time: {end_time-start_time:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
Evaluation time: 1.47s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
Evaluation time: 2.82s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
Evaluation time: 0.84s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 3


In [29]:
results_test

[{'recall': 0.6481726798098651,
  'f1score': 0.776874941626973,
  'mcc': 0.6334207469966896,
  'accuracy': 0.7880589070262598,
  'precision': 0.8229981557715802},
 {'recall': 0.618717369282319,
  'f1score': 0.7548604839093026,
  'mcc': 0.6088168988369279,
  'accuracy': 0.7712473385379702,
  'precision': 0.8133035931038843},
 {'recall': 0.6319644666095223,
  'f1score': 0.764409255855601,
  'mcc': 0.618338050408607,
  'accuracy': 0.7782558552164656,
  'precision': 0.8167478442585563},
 {'recall': 0.6329774799345438,
  'f1score': 0.76559849198869,
  'mcc': 0.6206268169091727,
  'accuracy': 0.7793647977288858,
  'precision': 0.8179125541493708},
 {'recall': 0.628925426634458,
  'f1score': 0.7624580794483019,
  'mcc': 0.6169002019996122,
  'accuracy': 0.7769251242015613,
  'precision': 0.8163837289552315},
 {'recall': 0.6618873217486169,
  'f1score': 0.7869551118728865,
  'mcc': 0.6453005796817237,
  'accuracy': 0.7959989354151881,
  'precision': 0.8278254979428858},
 {'recall': 0.659705446

In [30]:
times_train

[243.1589326510002,
 299.95735767899987,
 129.26237598299986,
 363.8975365030001,
 334.44866005899985,
 315.53638669400016,
 339.015356332,
 297.04506189400036,
 453.1774042020006,
 428.9848075030004]

In [31]:
results_valid

[{'recall': 0.9997275018734246,
  'f1score': 0.9994551522168494,
  'mcc': 0.9989794034966317,
  'accuracy': 0.9994919667238205,
  'precision': 0.9994724774179512},
 {'recall': 0.9991825056202739,
  'f1score': 0.9990804127924798,
  'mcc': 0.9982775048717707,
  'accuracy': 0.9991426938464469,
  'precision': 0.9991322824755626},
 {'recall': 0.9979562640506846,
  'f1score': 0.9979222725569673,
  'mcc': 0.9961083639751759,
  'accuracy': 0.9980631231345654,
  'precision': 0.9980520279746118},
 {'recall': 0.9996593773417808,
  'f1score': 0.9995231932429671,
  'mcc': 0.9991068812929952,
  'accuracy': 0.9995554708833428,
  'precision': 0.9995448109692207},
 {'recall': 0.9984331357721916,
  'f1score': 0.9986032092120055,
  'mcc': 0.9973842916875115,
  'accuracy': 0.9986981647297898,
  'precision': 0.9987029604320097},
 {'recall': 0.9997275018734246,
  'f1score': 0.9996253533598992,
  'mcc': 0.999298253578952,
  'accuracy': 0.9996507271226265,
  'precision': 0.9996426502135543},
 {'recall': 0.999

In [32]:
# validation f1-scores
values = [results_valid[i]["f1score"] for i in range(NUM_SIMS)]

In [33]:
# the best f1-score model
ind = np.argmax(np.array(values))
ind, np.max(np.array(values))

(5, 0.9996253533598992)

In [34]:
# results corresponding to the best validation f1-score
results_test[ind]

{'recall': 0.6618873217486169,
 'f1score': 0.7869551118728865,
 'mcc': 0.6453005796817237,
 'accuracy': 0.7959989354151881,
 'precision': 0.8278254979428858}

In [35]:
# average training times
np.mean(times_train)

320.44838795000015

In [36]:
# average inference times
np.mean(times_eval)

2.645695334899938

In [37]:
# average validation f1 scores
valid_f1score_values = [results_valid[i]["f1score"] for i in range(NUM_SIMS)]
np.mean(valid_f1score_values)

0.9991042274792454

In [38]:
# average test f1 scores
test_f1score_values = [results_test[i]["f1score"] for i in range(NUM_SIMS)]
np.mean(test_f1score_values)

0.7701171332152138

In [39]:
# average test mcc scores
test_mcc_values = [results_test[i]["mcc"] for i in range(NUM_SIMS)]
np.mean(test_mcc_values)

0.6256581616571211

In [40]:
# average test accuracy scores
test_acc_values = [results_test[i]["accuracy"] for i in range(NUM_SIMS)]
np.mean(test_acc_values)

0.7828513129879348

In [41]:
# average test precision scores
test_precision_values = [results_test[i]["precision"] for i in range(NUM_SIMS)]
np.mean(test_precision_values)

0.8198824404163204

In [42]:
# average test recall scores
test_recall_values = [results_test[i]["recall"] for i in range(NUM_SIMS)]
np.mean(test_recall_values)

0.6392425777292916