In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import recall_score
from sklearn.model_selection import RandomizedSearchCV
from timeit import default_timer
import datetime
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
print(f"Notebook last run (end-to-end): {datetime.datetime.now()}")

Notebook last run (end-to-end): 2024-03-11 19:19:31.336063


In [2]:
SEED_ORIG = 2024
SEED_NP = 2023
SEED = SEED_ORIG
NUM_SIMS = 10
rstate = np.random.default_rng(SEED_NP)

# Loading the dataset

In [3]:
Trained_Data = pd.read_csv("../data/KDDTrain+.txt" , header = None)
Tested_Data  = pd.read_csv("../data/KDDTest+.txt" , header = None)

In [4]:
Columns = (['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot',
            'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations',
            'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count',
            'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate',
            'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
            'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
            'dst_host_srv_rerror_rate','attack','level'])
Trained_Data.columns = Columns
Tested_Data.columns  = Columns

In [5]:
Trained_attack = Trained_Data.attack.map(lambda a: 0 if a == 'normal' else 1)
Tested_attack = Tested_Data.attack.map(lambda a: 0 if a == 'normal' else 1)

Trained_Data['attack'] = Trained_attack
Tested_Data['attack'] = Tested_attack

In [6]:
Trained_Data.isnull().sum()


duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [7]:
# duplicates
Trained_Data.duplicated().sum(), Tested_Data.duplicated().sum()



(9, 3)

In [8]:
cat_columns = ['protocol_type','service','flag']

In [9]:
Trained_Data = pd.get_dummies(Trained_Data,columns=cat_columns,prefix="",prefix_sep="")
Tested_Data = pd.get_dummies(Tested_Data,columns=cat_columns,prefix="",prefix_sep="")

In [10]:
Trained_Data["attack"]

0         0
1         0
2         1
3         0
4         0
         ..
125968    1
125969    0
125970    0
125971    1
125972    0
Name: attack, Length: 125973, dtype: int64

In [11]:
X_test = Tested_Data.drop("attack",axis=1)
y_test = Tested_Data['attack']

In [12]:
X_train_ = Trained_Data.drop("attack", axis = 1)
y_train_ = Trained_Data["attack"]

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_, y_train_, test_size= 0.25 , random_state=SEED)

In [14]:
Trained_Data

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,REJ,RSTO,RSTOS0,RSTR,S0,S1,S2,S3,SF,SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
125969,8,105,145,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
125970,0,2231,384,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
125971,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [15]:
num_columns = []
for col_name in Trained_Data.drop("attack",axis=1).columns:
    if col_name not in cat_columns:
        num_columns.append(col_name)
num_columns

['duration',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'level',
 'icmp',
 'tcp',
 'udp',
 'IRC',
 'X11',
 'Z39_50',
 'aol',
 'auth',
 'bgp',
 'courier',
 'csnet_ns',
 'ctf',
 'daytime',
 'discard',
 'domain',
 'domain_u',
 'echo',
 'eco_i',
 'ecr_i',
 'efs',
 'exec',
 'finger',
 'ftp',
 'ftp_data',
 'gopher',
 'harvest',
 

In [16]:
"attack" in num_columns

False

In [17]:
#scaler = StandardScaler()

In [18]:
#scaler.fit(X_train[num_columns])

In [19]:
len(X_train), len(X_valid), len(X_test), len(X_train.columns), len(X_valid.columns), len(X_test.columns)

(94479, 31494, 22544, 123, 123, 117)

In [20]:
notintestcols = [col for col in X_train.columns if col not in X_test.columns]

In [21]:
X_train = X_train.drop(notintestcols,axis=1)
X_valid = X_valid.drop(notintestcols,axis=1)

In [22]:
notintestcols = [col for col in X_train.columns if col not in X_test.columns]

In [23]:
notintestcols

[]

# XGBoost

In [24]:
# Space
space = {
    'learning_rate': hp.choice('learning_rate', [0.0001,0.001, 0.01, 0.1, 1]),
    'max_depth' : hp.choice('max_depth', range(3,21,3)),
    'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,10)]),
    'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,10)]),
    'reg_alpha' : hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100])
}

In [25]:
# Objective function
def objective(params):

    xgboost = XGBClassifier(seed=SEED, random_state=SEED, **params)
   
    xgboost.fit(X_train,y_train)
    y_pred = xgboost.predict(X_valid)
    score = f1_score(y_valid,y_pred)
    #score = matthews_corrcoef(y_valid, y_pred)

    # Loss is negative score
    loss = - score

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [26]:
best_models = []
times_train = []
for i in range(NUM_SIMS):
    print(80*"-")
    print(80*"-")
    print(f"Sim id: {i}")
    print(80*"-")
    start_time = default_timer()
    SEED = SEED_ORIG + i
    best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 10, trials = Trials(), rstate=rstate)
    best_models.append(best)
    end_time = default_timer()
    times_train.append(end_time-start_time)
    print(f"Training time: {end_time-start_time:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
100%|██████████| 10/10 [04:29<00:00, 26.99s/trial, best loss: -0.9996253023129067]
Training time: 269.93s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
100%|██████████| 10/10 [05:01<00:00, 30.13s/trial, best loss: -0.9995911692559281]
Training time: 301.30s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
100%|██████████| 10/10 [05:44

In [27]:
# best hyperparameters
best

{'colsample_bytree': 5,
 'gamma': 5,
 'learning_rate': 3,
 'max_depth': 4,
 'reg_alpha': 2,
 'reg_lambda': 1}

In [28]:
results_valid = []
results_test = []
times_eval = []
for i in range(NUM_SIMS):
    print(80*"-")
    print(80*"-")
    print(f"Sim id: {i}")
    print(80*"-")
    
    SEED = SEED_ORIG + i
    
    # Train model using the best parameters
    xgboost_bo = XGBClassifier(
                           seed=SEED,
                           random_state=SEED,
                           colsample_bytree=space_eval(space, best_models[i])['colsample_bytree'],
                           gamma=space_eval(space, best_models[i])['gamma'],
                           learning_rate=space_eval(space, best_models[i])['learning_rate'],
                           max_depth=space_eval(space, best_models[i])['max_depth'],
                           reg_alpha=space_eval(space, best_models[i])['reg_alpha'],
                           reg_lambda=space_eval(space, best_models[i])['reg_lambda']
                           ).fit(X_train,y_train)
    
    start_time = default_timer()
    y_pred = xgboost_bo.predict(X_valid)
    recall = recall_score(y_valid, y_pred)
    f1score = f1_score(y_valid,y_pred)
    mcc = matthews_corrcoef(y_valid, y_pred)
    acc = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='macro')
    results_temp = {}
    results_temp["recall"] = recall
    results_temp["f1score"] = f1score
    results_temp["mcc"] = mcc
    results_temp["accuracy"] = acc
    results_temp["precision"] = precision
    results_valid.append(results_temp)
    
    y_pred = xgboost_bo.predict(X_test)
    recall = recall_score(y_test, y_pred)
    f1score = f1_score(y_test,y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    results_temp = {}
    results_temp["recall"] = recall
    results_temp["f1score"] = f1score
    results_temp["mcc"] = mcc
    results_temp["accuracy"] = acc
    results_temp["precision"] = precision
    results_test.append(results_temp)
    
    end_time = default_timer()
    times_eval.append(end_time-start_time)
    print(f"Evaluation time: {end_time-start_time:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
Evaluation time: 0.17s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
Evaluation time: 0.17s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
Evaluation time: 0.18s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 3


In [29]:
results_test

[{'recall': 0.7111353541650433,
  'f1score': 0.8214221422142215,
  'mcc': 0.6873445648228398,
  'accuracy': 0.8239886444286728,
  'precision': 0.8452222152832184},
 {'recall': 0.7356035221694069,
  'f1score': 0.8377335049030483,
  'mcc': 0.7084801191669452,
  'accuracy': 0.8377838892831796,
  'precision': 0.8542711936883143},
 {'recall': 0.7114470505727422,
  'f1score': 0.8211908616657672,
  'mcc': 0.6862495305286355,
  'accuracy': 0.8236337828246983,
  'precision': 0.8445884473123876},
 {'recall': 0.7331878750097405,
  'f1score': 0.8360582903856406,
  'mcc': 0.7061184153916997,
  'accuracy': 0.8363200851667849,
  'precision': 0.8532203602873158},
 {'recall': 0.7077066936803553,
  'f1score': 0.8186407066882999,
  'mcc': 0.6829956472415968,
  'accuracy': 0.8215046132008517,
  'precision': 0.8432070373984377},
 {'recall': 0.7408244369983636,
  'f1score': 0.8411041316464656,
  'mcc': 0.7128903832101708,
  'accuracy': 0.8406671398154719,
  'precision': 0.8561741797995455},
 {'recall': 0.77

In [30]:
times_train

[269.93486166599905,
 301.30401660299685,
 344.89328810699953,
 248.22954225000285,
 298.4247551869994,
 221.93865056800132,
 262.14909128900035,
 234.37547559799714,
 317.8525154129966,
 230.90246411200496]

In [31]:
results_valid

[{'recall': 0.9995912528101369,
  'f1score': 0.9996253023129067,
  'mcc': 0.9992982251643784,
  'accuracy': 0.9996507271226265,
  'precision': 0.9996512755453608},
 {'recall': 0.9993868792152054,
  'f1score': 0.9995911692559281,
  'mcc': 0.9992344818803925,
  'accuracy': 0.9996189750428653,
  'precision': 0.9996302486774553},
 {'recall': 0.9995231282784931,
  'f1score': 0.9995571754607079,
  'mcc': 0.9991706293697044,
  'accuracy': 0.9995872229631041,
  'precision': 0.999587477371845},
 {'recall': 0.9995912528101369,
  'f1score': 0.9996593541354408,
  'mcc': 0.9993620264026466,
  'accuracy': 0.9996824792023877,
  'precision': 0.9996853414564193},
 {'recall': 0.9997275018734246,
  'f1score': 0.9994211189430313,
  'mcc': 0.9989156473227457,
  'accuracy': 0.9994602146440592,
  'precision': 0.9994384567564183},
 {'recall': 0.9995231282784931,
  'f1score': 0.9996933873880012,
  'mcc': 0.9994258598368967,
  'accuracy': 0.999714231282149,
  'precision': 0.9997237666433241},
 {'recall': 0.9994

In [32]:
# validation f1-scores
values = [results_valid[i]["f1score"] for i in range(NUM_SIMS)]

In [33]:
# the best f1-score model
ind = np.argmax(np.array(values))
ind, np.max(np.array(values))

(7, 0.9997615560173042)

In [34]:
# results corresponding to the best validation f1-score
results_test[ind]

{'recall': 0.7133951531208603,
 'f1score': 0.8226994967649174,
 'mcc': 0.6885158799419979,
 'accuracy': 0.8249645138396026,
 'precision': 0.8456224755221814}

In [35]:
# average training times
np.mean(times_train)

273.0004660792998

In [36]:
# average inference times
np.mean(times_eval)

0.16963826069841162

In [37]:
# average validation f1 scores
valid_f1score_values = [results_valid[i]["f1score"] for i in range(NUM_SIMS)]
np.mean(valid_f1score_values)

0.9995674320616617

In [38]:
# average test f1 scores
test_f1score_values = [results_test[i]["f1score"] for i in range(NUM_SIMS)]
np.mean(test_f1score_values)

0.8320278592135655

In [39]:
# average test mcc scores
test_mcc_values = [results_test[i]["mcc"] for i in range(NUM_SIMS)]
np.mean(test_mcc_values)

0.7009690001680844

In [40]:
# average test accuracy scores
test_acc_values = [results_test[i]["accuracy"] for i in range(NUM_SIMS)]
np.mean(test_acc_values)

0.8330021291696239

In [41]:
# average test precision scores
test_precision_values = [results_test[i]["precision"] for i in range(NUM_SIMS)]
np.mean(test_precision_values)

0.8510466324117612

In [42]:
# average test recall scores
test_recall_values = [results_test[i]["recall"] for i in range(NUM_SIMS)]
np.mean(test_recall_values)

0.7277020182342397