### Import Libraries

In [1]:
# General
import time
import pandas as pd
import numpy as np
import joblib
import json
import yaml

# ML Models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from sklearn.svm import SVC
import xgboost as xgb

# # DL Models
# import tensorflow as tf
# from tensorflow.keras import models
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import (Dense, Dropout, 
#                                      Bidirectional, LSTM, 
#                                      Conv1D, MaxPooling1D, Flatten, LeakyReLU)
# from tensorflow.keras.callbacks import EarlyStopping

# Evaluation
from sklearn.metrics import f1_score, classification_report

# # DVC stuff
# from dvclive import Live



In [3]:
# import os
# os.chdir('..')
%pwd

'c:\\Users\\Abraham Audu\\Documents\\BizDocs Files\\THE THREE\\Intrusion Detection System\\ids_project'

#### Setup Parameters

In [4]:
# Data
# x_train_path = './notebooks/X_train_scaled.csv'
# x_test_path = './notebooks/X_test_scaled.csv'
# y_train_path = './notebooks/y_train.csv'
# y_test_path = './notebooks/y_test.csv'

x_train_path = './data/processed/X_train_scaled.csv'
x_test_path = './data/processed/X_test_scaled.csv'
y_train_path = './data/processed/y_train.csv'
y_test_path = './data/processed/y_test.csv'

# # load params YAML file
# params_file_path = 'params.yaml'
# with open(params_file_path, 'r') as file:
#     params = yaml.safe_load(file)

# random_state = params['train']['params']['random_state']
# lr = params['train']['params']['lr']
# n_estimators = params['train']['params']['n_estimators']
# max_depth = params['train']['params']['max_depth']

# gbc Model
gbc_params = {
    'n_estimators': 76,
    'max_depth': 10,
    'learning_rate': 0.01,
    'random_state': 42
}

# LightGBM model
lgb_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_classes': 11,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# SVM model
svc_params = {
    'kernel': 'rbf',
    'C': 1.0
}

# XGB model
xgb_params = {
    
    'num_class': 11,  
    'max_depth': 5,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'merror'
}

# DL Models
epochs = 100
batch_size = 128
num_classes = 1

models_dir = './notebooks/'
# model_fname  = 'model.joblib'



#### Load Preprocessed Data

In [5]:
# Load data
X_train_scaled = np.genfromtxt(x_train_path, delimiter=',')
X_test_scaled = np.genfromtxt(x_test_path, delimiter=',')
y_train = np.genfromtxt(y_train_path, delimiter=',', skip_header=1)
y_test = np.genfromtxt(y_test_path, delimiter=',', skip_header=1)

In [6]:
X_train_scaled.shape

(363326, 40)

In [24]:
unprocessed_csv = pd.read_csv('./temp/mitm-arpspoofing-4-decunprocessed.csv')
processed_data = np.genfromtxt('./temp/mitm-arpspoofing-4-dec.csv', delimiter=",")

# processed_data = np.genfromtxt('./notebooks/test_path_scaled.csv', delimiter=",")

In [35]:
print(unprocessed_csv['ip_src'].unique())

['182.162.108.112' '192.168.0.24' '192.168.0.13' '3.0.230.212'
 '163.152.1.1' '192.168.0.1' '192.168.0.14' nan '74.125.203.189'
 '172.217.161.78' '172.217.25.238' '13.228.233.32' '223.39.123.194'
 '192.168.0.15' '223.39.123.193' '168.126.63.1' '8.8.8.8' '223.39.118.176'
 '223.39.123.191' '163.152.11.6' '52.219.32.0' '192.168.56.1'
 '210.220.163.82' '223.39.123.245' '150.109.183.123' '150.109.182.50'
 '223.39.118.149' '14.129.200.42' '14.129.200.24' '121.254.133.163'
 '175.158.10.9' '42.223.9.25' '216.58.197.131' '219.241.28.254'
 '198.168.0.24' '52.219.32.24' '121.254.133.161' '175.158.10.12'
 '211.188.147.64' '222.239.240.107' '178.140.139.52' '52.219.36.164'
 '216.58.197.142' '52.219.36.92' '192.168.72.152' '104.144.1.24'
 '101.48.191.163' '192.168.0.216' '223.39.118.40' '108.177.97.109'
 '108.177.125.108']


In [10]:
from src.features.build_features import convert_to_float, inference_preprocess

optimal_features =[
                'timestamp', 'ip_len', 'ip_id', 'ip_flags', 'ip_ttl', 'ip_proto',
                'ip_checksum', 'ip_dst', 'ip_dst_host','tcp_srcport', 'tcp_dstport',
                'tcp_port', 'tcp_stream', 'tcp_completeness', 'tcp_seq_raw', 'tcp_ack',
                'tcp_ack_raw', 'tcp_flags_reset', 'tcp_flags_syn', 'tcp_window_size_value',
                'tcp_window_size', 'tcp_window_size_scalefactor', 'tcp_', 'udp_srcport',
                'udp_dstport', 'udp_port', 'udp_length', 'udp_time_delta', 'eth_dst_oui',
                'eth_addr_oui', 'eth_dst_lg', 'eth_lg', 'eth_ig', 'eth_src_oui', 'eth_type',
                'icmp_type', 'icmp_code', 'icmp_checksum', 'icmp_checksum_status', 'arp_opcode'
                ]

In [11]:
data_optimal_features = unprocessed_csv[optimal_features]

In [12]:
data_floats = convert_to_float(data_optimal_features)

Converting to float64:   0%|          | 0/796560 [00:00<?, ? data points/s]

Converting to float64: 100%|██████████| 796560/796560 [00:02<00:00, 286215.51 data points/s]


In [14]:
data_scaled = scaler.transform(data_floats)

In [12]:
data_scaled_auto = inference_preprocess(unprocessed_csv)

Converting to float64:   4%|▍         | 32314/796560 [00:00<00:02, 320023.47 data points/s]

Converting to float64: 100%|██████████| 796560/796560 [00:02<00:00, 277742.73 data points/s]


In [13]:
data_scaled[3000]

array([ 0.08719434,  0.69084304, -0.25064864,  1.06972117, -0.42493424,
       -0.04538017,  1.23987242,  0.96496015,  0.93403895,  1.85566047,
       -0.78100494,  1.85757288, -0.54323185,  0.20733066,  0.46368109,
       -0.12038041, -0.04566362,  0.61554076,  0.44208432,  2.0184838 ,
       -0.00577267, -0.04979459, -0.30901828, -0.55983609, -0.43313485,
       -0.55978361, -0.13272147, -0.55067527,  1.62585703,  1.62585703,
       -0.08128793, -0.08127078,  0.15794467, -1.41100887, -0.01785071,
       -0.08568366, -0.0020799 , -0.07641807, -0.00257187, -0.08112283])

In [14]:
data_scaled_auto[3000]

array([ 0.08719434,  0.69084304, -0.25064864,  1.06972117, -0.42493424,
       -0.04538017,  1.23987242,  0.96496015,  0.93403895,  1.85566047,
       -0.78100494,  1.85757288, -0.54323185,  0.20733066,  0.46368109,
       -0.12038041, -0.04566362,  0.61554076,  0.44208432,  2.0184838 ,
       -0.00577267, -0.04979459, -0.30901828, -0.55983609, -0.43313485,
       -0.55978361, -0.13272147, -0.55067527,  1.62585703,  1.62585703,
       -0.08128793, -0.08127078,  0.15794467, -1.41100887, -0.01785071,
       -0.08568366, -0.0020799 , -0.07641807, -0.00257187, -0.08112283])

In [13]:
scaler = joblib.load('./src/features/scaler.pkl')
# scaled_processed_data = scaler.transform(processed_data)

In [81]:
processed_data[676]

array([ 8.71940823e-02, -2.36146395e-01,  9.27919773e-01,  1.06972117e+00,
       -2.36069334e+00, -4.99669188e-02,  1.05461757e+00, -6.32500550e-02,
       -4.53444566e-02, -6.75894388e-01, -8.00346873e-01, -6.74803314e-01,
       -5.53140595e-01, -5.14943731e-01, -7.04612266e-01, -1.20394180e-01,
       -6.44965802e-01, -2.30787404e+00, -2.19384033e+00, -5.42778950e-01,
       -1.21446802e-02, -5.08131234e-02, -3.09018279e-01, -5.59836091e-01,
       -4.33134849e-01, -5.59783606e-01, -1.32721473e-01, -5.50675267e-01,
       -1.98661424e+00, -1.98661424e+00, -4.96142351e+01, -4.96245368e+01,
       -5.93651196e+00, -1.46179552e+00, -1.78507065e-02, -8.56836628e-02,
       -2.07990121e-03, -7.64180691e-02, -2.57186849e-03, -8.11228337e-02])

In [82]:
X_test_scaled[-10]

array([ 0.11450821,  0.6865286 , -0.09820089,  1.06972117, -0.85193992,
       -0.04538017, -0.71136227, -0.06325006, -0.04534446, -0.65254355,
        1.07075798, -0.6514449 , -0.53982572,  2.03202386,  1.63406263,
       -0.12039417,  0.56746835,  0.61554076,  0.44208432, -0.52236794,
       -0.0120939 , -0.04979459, -0.30901828, -0.55983609, -0.43313485,
       -0.55978361, -0.13272147, -0.55067527, -1.9234349 , -1.9234349 ,
       -0.08128793, -0.08127078,  0.15794467,  0.18565013, -0.01785071,
       -0.08568366, -0.0020799 , -0.07641807, -0.00257187, -0.08112283])

In [15]:
data_scaled.shape

(19914, 40)

In [21]:
data_optimal_features = unprocessed_csv[optimal_features]
data_floats = convert_to_float(data_optimal_features)
data_scaled = scaler.transform(data_floats)

Converting to float64: 100%|██████████| 796560/796560 [00:04<00:00, 181468.37 data points/s]


In [25]:
from collections import OrderedDict
# Load model
# MODELS_DIR = './models/'
# model = xgb.Booster()
# model.load_model(MODELS_DIR+'xgb_model.bin')

model = xgb.XGBClassifier()
model.load_model(models_dir+'xgb_model.bin')
# model = joblib.load(models_dir+'lgb_model.joblib')

predictions = []

for packet_row in range(len(processed_data)):
    packet = processed_data[packet_row]
    packet = OrderedDict(enumerate(packet))
    packet = np.array(list(packet.values())).reshape(1, 40)
    # print(packet)
    # packet = xgb.DMatrix(packet)
    packet_class = model.predict(packet)
    predictions.append(packet_class.item())

In [39]:
dt_model = DecisionTreeClassifier()

dt_model = dt_model.fit(X_train_scaled, y_train)

predictions = dt_model.predict(processed_data)



In [26]:
count = 0
for i in predictions:
    if int(i) == 7:
        count+=1
count

4481

In [24]:
# Convert the training data to XGBoost's DMatrix format
# dtrain = xgb.DMatrix(X_train_scaled, label=y_train)

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softmax')

xgb_model = xgb_model.fit(X_train_scaled, y_train)


# Save model
xgb_model.save_model(models_dir+'xgb_model.bin')

In [25]:

# dtest = xgb.DMatrix(X_test_scaled[:10])
predictions = model.predict(X_test_scaled)


In [36]:
print_f1_scores(y_test, predictions)

Class 0.0: F1-score = 1.0
Class 1.0: F1-score = 1.0
Class 2.0: F1-score = 1.0
Class 3.0: F1-score = 0.995
Class 4.0: F1-score = 1.0
Class 5.0: F1-score = 1.0
Class 6.0: F1-score = 1.0
Class 7.0: F1-score = 1.0
Class 8.0: F1-score = 1.0
Class 9.0: F1-score = 1.0
Class 10.0: F1-score = 0.997
Class macro avg: F1-score = 0.999
Class weighted avg: F1-score = 1.0


In [15]:
print(X_train_scaled[:2])

[[ 1.14508781e-01 -2.09027023e-01 -7.99560979e-01 -7.81394714e-01
  -4.24934235e-01 -4.53801661e-02  1.32617652e+00  5.33283265e-01
   5.22861179e-01  1.67893589e+00 -4.63658772e-01  1.68079096e+00
   8.87033514e-01 -3.62885965e-01 -7.04581139e-01 -1.20394170e-01
  -6.44965799e-01  6.15540764e-01  4.42084320e-01  2.27346196e+00
  -5.13832234e-03 -4.92853161e-02 -3.09018279e-01 -5.59836091e-01
  -4.33134849e-01 -5.59783606e-01 -1.32721473e-01 -5.50675267e-01
  -1.98661424e+00 -1.98661424e+00 -8.12879335e-02 -8.12707776e-02
  -5.93651196e+00 -1.46179552e+00 -1.78507065e-02 -8.56836628e-02
  -2.07990121e-03 -7.64180691e-02 -2.57186849e-03 -8.11228337e-02]
 [ 1.07107181e-01 -2.06561625e-01  1.51064739e+00 -7.81394714e-01
  -7.38071738e-01 -4.53801661e-02 -7.15912947e-01 -6.32500771e-02
  -4.53444777e-02  1.57445786e+00 -6.31894646e-01  1.57627904e+00
  -3.33290322e-01 -3.24871523e-01 -5.68177759e-01 -1.20394172e-01
  -6.44965799e-01  6.15540764e-01  1.10106548e+00 -4.60815990e-01
  -1.1940

In [16]:
y_train

array([2., 9., 1., ..., 0., 1., 7.])

#### Build ML Models

Gradient Boosting Classifier

In [17]:
# Initialize Model
gbc = GradientBoostingClassifier(**gbc_params)

# Train model
st = time.process_time()

gbc_model = gbc.fit(X_train_scaled, y_train)

gbc_train_time = time.process_time() - st

# Save model
joblib.dump(gbc_model, models_dir+'gbc_model.joblib')


['./notebooks/gbc_model.joblib']

Light GBM

In [18]:
# Define the LightGBM dataset
train_data = lgb.Dataset(X_train_scaled, label=y_train)

# Train the LightGBM model
st = time.process_time()
lgb_model = lgb.train(lgb_params, train_data, num_boost_round=100)
lgb_train_time = time.process_time() - st

# Save model
joblib.dump(lgb_model, models_dir+'lgb_model.joblib')

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


['./notebooks/lgb_model.joblib']

Support Vector Classifier - SVM

In [19]:
# Initialize the SVM model with hyperparameters
svm = SVC(**svc_params)

# Train the SVM model
st = time.process_time()
svm_model = svm.fit(X_train_scaled, y_train)
svm_train_time = time.process_time() - st

# Save model
joblib.dump(svm_model, models_dir+'svm_model.joblib')

['./notebooks/svm_model.joblib']

XGBoost

In [20]:
# Convert the training data to XGBoost's DMatrix format
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)

# Train the XGBoost model
st = time.process_time()
xgb_model = xgb.train(xgb_params, dtrain)
xgb_train_time = time.process_time() - st

# Save model
joblib.dump(xgb_model, models_dir+'xgb_model.joblib')

Parameters: { "n_estimators" } are not used.



['./notebooks/xgb_model.joblib']

#### Build DL Models

In [6]:
# This is to configure GPU and avoid OOM error by setting GPU memory consumption growth

# To avoid OOM error by setting GPU memory consumption growth
gpus = tf.config.list_physical_devices('GPU')
gpus


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Feedforward Neural Network (FNN)

In [22]:
# # Define the architecture of the neural network
# fnn_model = Sequential()
# fnn_model.add(Dense(256, input_dim=40, activation='relu', kernel_initializer='he_uniform'))
# fnn_model.add(Dropout(0.5))
# fnn_model.add(Dense(128, activation='relu'))
# fnn_model.add(Dropout(0.5))
# fnn_model.add(Dense(64, activation='relu'))
# fnn_model.add(Dropout(0.5))
# fnn_model.add(Dense(1, activation='softmax'))

# # Compile the model
# learning_rate = 0.01
# sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
# fnn_model.compile(loss='categorical_crossentropy', optimizer=sgd_optimizer, metrics=['accuracy'])

# # Train the FNN model
# st = time.process_time()
# fnn_model.fit(X_train_scaled, 
#               y_train, 
#               epochs=100, 
#               batch_size=128,
#               validation_split=0.1,
#               callbacks=[EarlyStopping(patience=4)])
# fnn_train_time = time.process_time() - st

# # Save the model
# fnn_model.save(models_dir+'fnn_model.h5')

BiLSTM Neural Network

In [7]:
# # Define the architecture of the neural network
# blstm_model = Sequential()
# blstm_model.add(LSTM(256, input_shape=(X_train_scaled.shape[1], 1), return_sequences=True))
# blstm_model.add(LeakyReLU(alpha=0.5))
# blstm_model.add(Bidirectional(LSTM(128, return_sequences=True)))
# blstm_model.add(LeakyReLU(alpha=0.5))
# blstm_model.add(Dropout(0.3))
# blstm_model.add(Bidirectional(LSTM(64, return_sequences=False)))
# blstm_model.add(Dropout(0.3))
# blstm_model.add(Dense(1, activation='softmax'))

# # Compile the model
# learning_rate = 0.01
# sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
# blstm_model.compile(loss='categorical_crossentropy', optimizer=sgd_optimizer, metrics=['accuracy'])

# # Train the FNN model
# st = time.process_time()
# blstm_model.fit(X_train_scaled, 
#                 y_train, 
#                 epochs=epochs, 
#                 batch_size=batch_size,
#                 validation_split=0.1, 
#                 callbacks=[EarlyStopping(patience=2)])
# blstm_train_time = time.process_time() - st

# # Save the model
# blstm_model.save(models_dir+'blstm_model.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
  51/2555 [..............................] - ETA: 7:24 - loss: 0.0000e+00 - accuracy: 0.1265

KeyboardInterrupt: 

CNN model

In [24]:
# # Define the architecture of the neural network
# cnn_model = Sequential()
# cnn_model.add(Conv1D(16, kernel_size=3, activation='relu', input_shape=(X_train_scaled.shape[1], 1)))
# cnn_model.add(MaxPooling1D(pool_size=2))
# cnn_model.add(Flatten())
# cnn_model.add(Dense(32, activation='relu'))
# cnn_model.add(Dropout(0.5))
# cnn_model.add(Dense(num_classes, activation='softmax'))

# # Compile the model
# learning_rate = 0.001
# adam_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# cnn_model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

# # Train the FNN model
# st = time.process_time()
# cnn_model.fit(X_train_scaled, 
#                 y_train, 
#                 epochs=epochs, 
#                 batch_size=64,
#                 validation_split=0.1, 
#                 callbacks=[EarlyStopping(patience=4)])
# cnn_train_time = time.process_time() - st

# # Save the model
# cnn_model.save(models_dir+'cnn_model.h5')

BiLSTM-CNN Model

In [25]:
# # Define the architecture of the neural network
# bilstm_cnn_model = Sequential()
# bilstm_cnn_model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(X_train_scaled.shape[1], 1)))
# bilstm_cnn_model.add(Conv1D(32, kernel_size=3, activation='relu'))
# bilstm_cnn_model.add(MaxPooling1D(pool_size=2))
# bilstm_cnn_model.add(Flatten())
# bilstm_cnn_model.add(Dense(128, activation='relu'))
# bilstm_cnn_model.add(Dropout(0.5))
# bilstm_cnn_model.add(Dense(num_classes, activation='softmax'))

# # Compile the model
# learning_rate = 0.001
# adam_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# bilstm_cnn_model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

# # Train the FNN model
# st = time.process_time()
# bilstm_cnn_model.fit(X_train_scaled, 
#                 y_train, 
#                 epochs=epochs, 
#                 batch_size=batch_size,
#                 validation_split=0.1, 
#                 callbacks=[EarlyStopping(patience=2)])
# bilstm_cnn_train_time = time.process_time() - st

# # Save the model
# bilstm_cnn_model.save(models_dir+'bilstm_cnn_model.h5')

#### Evaluate Models

In [15]:
# Classification Report for each type of attack
def print_f1_scores(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)

    for class_label, metric in report.items():
        try:
            f1_score_float = metric['f1-score']
            print(f"Class {class_label}: F1-score = {round(f1_score_float, 3)}")
        except:
            pass

In [27]:
# GBC Predictions
st = time.process_time()
gbc_preds = gbc_model.predict(X_test_scaled)
gbc_inf_time = time.process_time() - st

# LGB Predictions
st = time.process_time()
lgb_preds = lgb_model.predict(X_test_scaled)
lgb_preds = [pred.argmax() for pred in lgb_preds]
lgb_inf_time = time.process_time() - st

# SVM Predictions
st = time.process_time()
svm_preds = svm_model.predict(X_test_scaled)
svm_inf_time = time.process_time() - st

# XGB Predictions
st = time.process_time()
dtest = xgb.DMatrix(X_test_scaled)
xgb_preds = xgb_model.predict(dtest)
xgb_inf_time = time.process_time() - st

# # FNN Predictions
# st = time.process_time()
# fnn_preds = fnn_model.predict(X_test_scaled)
# fnn_inf_time = time.process_time() - st

# # BiLSTM Predictions
# st = time.process_time()
# blstm_preds = blstm_model.predict(X_test_scaled)
# blstm_inf_time = time.process_time() - st

# # CNN Predictions
# st = time.process_time()
# cnn_preds = cnn_model.predict(X_test_scaled)
# cnn_inf_time = time.process_time() - st

# # BiLSTM-cNN Predictions
# st = time.process_time()
# bilstm_cnn_preds = bilstm_cnn_model.predict(X_test_scaled)
# bilstm_cnn_inf_time = time.process_time() - st

# Evaluate model (F1 Score)
# GBC
print('##----------GBC----------##')
f1_score_gbc = round(f1_score(y_test, gbc_preds, average='macro'), 3)
print_f1_scores(y_test, gbc_preds)
# LGB
print('\n##----------LGB----------##')
f1_score_lgb = round(f1_score(y_test, lgb_preds, average='macro'), 3)
print_f1_scores(y_test, lgb_preds)
# SVM
print('\n##----------SVM----------##')
f1_score_svm = round(f1_score(y_test, svm_preds, average='macro'), 3)
print_f1_scores(y_test, svm_preds)
# XGB
print('\n##----------XGB----------##')
f1_score_xgb = round(f1_score(y_test, xgb_preds, average='macro'), 3)
print_f1_scores(y_test, xgb_preds)


# # FNN
# print('\n##----------FNN----------##')
# f1_score_fnn = round(f1_score(y_test, fnn_preds, average='macro'), 3)
# print_f1_scores(y_test, fnn_preds)
# # BiLSTM
# print('\n##----------BiLSTM----------##')
# f1_score_blstm = round(f1_score(y_test, blstm_preds, average='macro'), 3)
# print_f1_scores(y_test, blstm_preds)
# # CNN
# print('\n##----------CNN----------##')
# f1_score_cnn = round(f1_score(y_test, cnn_preds, average='macro'), 3)
# print_f1_scores(y_test, cnn_preds)
# # BiLSTM-CNN
# print('\n##----------BiLSTM-CNN----------##')
# f1_score_bilstm_cnn = round(f1_score(y_test, bilstm_cnn_preds, average='macro'), 3)
# print_f1_scores(y_test, bilstm_cnn_preds)

##----------GBC----------##
Class 0.0: F1-score = 1.0
Class 1.0: F1-score = 1.0
Class 2.0: F1-score = 1.0
Class 3.0: F1-score = 0.995
Class 4.0: F1-score = 1.0
Class 5.0: F1-score = 1.0
Class 6.0: F1-score = 1.0
Class 7.0: F1-score = 1.0
Class 8.0: F1-score = 1.0
Class 9.0: F1-score = 1.0
Class 10.0: F1-score = 0.977
Class macro avg: F1-score = 0.997
Class weighted avg: F1-score = 1.0

##----------LGB----------##


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Class 0.0: F1-score = 0.938
Class 1.0: F1-score = 0.987
Class 2.0: F1-score = 0.925
Class 3.0: F1-score = 0.0
Class 4.0: F1-score = 0.0
Class 5.0: F1-score = 0.807
Class 6.0: F1-score = 0.917
Class 7.0: F1-score = 0.912
Class 8.0: F1-score = 0.56
Class 9.0: F1-score = 0.708
Class 10.0: F1-score = 0.0
Class macro avg: F1-score = 0.614
Class weighted avg: F1-score = 0.914

##----------SVM----------##
Class 0.0: F1-score = 0.999
Class 1.0: F1-score = 1.0
Class 2.0: F1-score = 1.0
Class 3.0: F1-score = 0.995
Class 4.0: F1-score = 1.0
Class 5.0: F1-score = 0.996
Class 6.0: F1-score = 1.0
Class 7.0: F1-score = 0.998
Class 8.0: F1-score = 1.0
Class 9.0: F1-score = 0.998
Class 10.0: F1-score = 0.985
Class macro avg: F1-score = 0.997
Class weighted avg: F1-score = 0.999

##----------XGB----------##
Class 0.0: F1-score = 1.0
Class 1.0: F1-score = 1.0
Class 2.0: F1-score = 1.0
Class 3.0: F1-score = 0.995
Class 4.0: F1-score = 1.0
Class 5.0: F1-score = 1.0
Class 6.0: F1-score = 1.0
Class 7.0: F1-s

In [28]:
# Save metrics
metrics = {
    'F1-Scores': {
        'F1_GBC':f1_score_gbc,
        'F1_LGB':f1_score_lgb,
        'F1_SVM':f1_score_svm,
        'F1_XGB':f1_score_xgb,
        # 'F1_FNN':f1_score_fnn,
        # 'F1_BiLSTM':f1_score_blstm,
        # 'F1_CNN':f1_score_cnn,
        # 'F1_BiLSTM_CNN':f1_score_bilstm_cnn
    },

    'Train_times': {
        'tt_GBC': gbc_train_time,
        'tt_LGB': lgb_train_time,
        'tt_SVM': svm_train_time,
        'tt_XGB': xgb_train_time,
        # 'tt_FNN': fnn_train_time,
        # 'tt_BiLSTM':blstm_train_time,
        # 'tt_CNN':cnn_train_time,
        # 'tt_BiLSTM_CNN':bilstm_cnn_train_time
    },

    'Inference_times': {
        'it_GBC': gbc_inf_time,
        'it_LGB': lgb_inf_time,
        'it_SVM': svm_inf_time,
        'it_XGB': xgb_inf_time,
    #     'it_FNN': fnn_inf_time,
    #     'it_BiLSTM': blstm_inf_time,
    #     'it_CNN': cnn_inf_time,
    #     'it_BiLSTM_CNN': bilstm_cnn_inf_time
    }
}

json.dump(
    obj=metrics,
    fp=open('./notebooks/exp_metrics_all.json', 'w'),
    indent = 4,
    sort_keys = True
    )



In [29]:
metrics

{'F1-Scores': {'F1_GBC': 0.997,
  'F1_LGB': 0.614,
  'F1_SVM': 0.997,
  'F1_XGB': 0.997},
 'Train_times': {'tt_GBC': 6436.265625,
  'tt_LGB': 281.5,
  'tt_SVM': 163.015625,
  'tt_XGB': 286.078125},
 'Inference_times': {'it_GBC': 5.03125,
  'it_LGB': 16.21875,
  'it_SVM': 61.265625,
  'it_XGB': 1.703125}}

In [30]:
# DVC command to run pipeline
"""
$ dvc run -n run_model_exp -d notebooks/X_train_scaled.csv -d notebooks/X_test_scaled.csv -d notebooks/y_train.csv -d notebooks/y_test.csv -d notebooks/model_exp.ipynb -o notebooks/model.joblib -M notebooks/metrics.json papermill notebooks/model_exp.ipynb notebooks/model_exp_out.ipynb
"""

'\n$ dvc run -n run_model_exp -d notebooks/X_train_scaled.csv -d notebooks/X_test_scaled.csv -d notebooks/y_train.csv -d notebooks/y_test.csv -d notebooks/model_exp.ipynb -o notebooks/model.joblib -M notebooks/metrics.json papermill notebooks/model_exp.ipynb notebooks/model_exp_out.ipynb\n'

In [31]:
"""

stages:
  one_stage_pipeline:
    cmd: papermill notebooks/model_exp.ipynb notebooks/model_exp_out.ipynb
    deps:
    - notebooks/X_test_scaled.csv
    - notebooks/X_train_scaled.csv
    - notebooks/model_exp.ipynb
    - notebooks/y_test.csv
    - notebooks/y_train.csv
    params:
    - base
    - train
    outs:
    - notebooks/model.joblib
    metrics:
    - notebooks/metrics.json:
        cache: false
    
    """



"""
base:
  project: ids_model

train:
  models_dir:
  model_fname: model.joblib
  params:
    random_state: 42
    lr: 0.01
    n_estimators: 75
    max_depth: 10

"""

'\nbase:\n  project: ids_model\n\ntrain:\n  models_dir:\n  model_fname: model.joblib\n  params:\n    random_state: 42\n    lr: 0.01\n    n_estimators: 75\n    max_depth: 10\n\n'