### Import Libraries

In [1]:
# General
import pandas as pd
import numpy as np
import joblib
import json
import yaml

# Model and Evaluation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report

# # DVC stuff
# from dvclive import Live

# import os
# os.chdir('..')

In [2]:
# Show the kernel's working directory: every path used below
# ('./notebooks/...', 'params.yaml') is relative and resolves against it.
%pwd

'C:\\Users\\Abraham Audu\\Documents\\BizDocs Files\\THE THREE\\Intrusion Detection System\\ids_project'

#### Setup Parameters

In [3]:
# Data: preprocessed train/test splits, resolved relative to the working directory
x_train_path = './notebooks/X_train_scaled.csv'
x_test_path = './notebooks/X_test_scaled.csv'
y_train_path = './notebooks/y_train.csv'
y_test_path = './notebooks/y_test.csv'

# Load the params YAML file. An empty file parses to None, so normalise to a dict
# to keep the lookups below safe.
params_file_path = 'params.yaml'
with open(params_file_path, 'r', encoding='utf-8') as file:
    params = yaml.safe_load(file) or {}

# gbc Model hyperparameters: read them from the `train -> params` section of the
# loaded file so the notebook and the params file cannot drift apart. A missing
# section or key falls back to the value this cell previously hard-coded.
# (The parameters cell that follows may still reassign these names.)
gbc_params = (params.get('train') or {}).get('params') or {}
random_state = gbc_params.get('random_state', 42)
lr = gbc_params.get('lr', 0.01)
n_estimators = gbc_params.get('n_estimators', 76)
max_depth = gbc_params.get('max_depth', 10)

# Destination of the fitted model. The two parts are joined by plain string
# concatenation when the model is saved, so models_dir must end with a separator.
models_dir = './notebooks/'
model_fname = 'model.joblib'

# Snapshot of the hyperparameters as configured in this cell
train_params = {'n_estimators': n_estimators,
                'max_depth': max_depth,
                'lr': lr}

In [4]:
# Parameters
# These assignments run after the setup cell, so the values here are the ones
# the model-building cell uses; `train_params` was already built in the setup
# cell and is not refreshed by this cell.
# NOTE(review): presumably the papermill `parameters` cell (the pipeline command
# recorded in this notebook runs it via papermill) — confirm the cell tag.
random_state = 42
lr = 0.01
n_estimators = 50
max_depth = 5


#### Load Preprocessed Data

In [5]:
# Load data: parse the four comma-separated split files into NumPy arrays.
# Order of the targets matches the order of the paths: train inputs, test inputs,
# train targets, test targets.
X_train_scaled, X_test_scaled, y_train, y_test = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in (x_train_path, x_test_path, y_train_path, y_test_path)
)

#### Build Models

In [6]:
# Initialize Model from the notebook-level hyperparameters
gbc_settings = {
    'random_state': random_state,
    'learning_rate': lr,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
}
gbc = GradientBoostingClassifier(**gbc_settings)

# Train model on the training split
gbc.fit(X_train_scaled, y_train)

# Save model; the value returned by joblib.dump is what the cell displays
model_path = models_dir + model_fname
joblib.dump(gbc, model_path)



['./notebooks/model.joblib']

#### Evaluate Models

In [7]:
# Classification Report for each type of attack
def print_f1_scores(y_true, y_pred):
    """Print the F1-score of every row of the classification report.

    Builds the classification report as a dict and prints one line for each
    entry that carries an ``'f1-score'`` value, rounded to 3 decimals.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. The scores are written to stdout.
    """
    report = classification_report(y_true, y_pred, output_dict=True)

    for class_label, metric in report.items():
        # Not every entry is a per-row metrics dict (e.g. 'accuracy' is a bare
        # number). Skip those explicitly rather than hiding every possible
        # error, including KeyboardInterrupt, behind a bare `except: pass`.
        if not isinstance(metric, dict) or 'f1-score' not in metric:
            continue
        f1_score_float = metric['f1-score']
        print(f"Class {class_label}: F1-score = {round(f1_score_float, 3)}")

In [8]:
# Make predictions on the held-out split
preds = gbc.predict(X_test_scaled)

# Evaluate model (F1 Score): same call for both averaging modes, rounded to 3 decimals
f1_score_weighted, f1_score_macro = (
    round(f1_score(y_test, preds, average=averaging), 3)
    for averaging in ('weighted', 'macro')
)

# Per-row breakdown printed to the cell output
print_f1_scores(y_test, preds)

Class 0.0: F1-score = 0.996
Class 1.0: F1-score = 0.999
Class 2.0: F1-score = 1.0
Class 3.0: F1-score = 1.0
Class 4.0: F1-score = 1.0
Class 5.0: F1-score = 0.965
Class 6.0: F1-score = 1.0
Class 7.0: F1-score = 1.0
Class 8.0: F1-score = 1.0
Class 9.0: F1-score = 0.996
Class 10.0: F1-score = 0.917
Class macro avg: F1-score = 0.988
Class weighted avg: F1-score = 0.997


In [9]:
# Save metrics
metrics = {
    'F1-Score_weighted': f1_score_weighted,
    'F1-Score_macro': f1_score_macro
    }

# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of leaving an open handle for the garbage collector.
with open('./notebooks/metrics.json', 'w') as metrics_file:
    json.dump(
        obj=metrics,
        fp=metrics_file,
        indent=4,
        sort_keys=True
        )



In [10]:
# DVC command to run pipeline (reference only: the cell just evaluates a string,
# nothing is executed from here).
# As written, the command names the stage `run_model_exp`, lists the four split
# CSVs and notebooks/model_exp.ipynb after -d, the saved model after -o and the
# metrics JSON after -M, and runs the notebook through papermill, writing the
# executed copy to notebooks/model_exp_out.ipynb.
# NOTE(review): the pipeline snippet in the next cell names the stage
# `one_stage_pipeline`, not `run_model_exp` — confirm which one is current.
# NOTE(review): recent DVC releases replace `dvc run` with `dvc stage add` plus
# `dvc repro` — confirm against the installed DVC version.
"""
$ dvc run -n run_model_exp -d notebooks/X_train_scaled.csv -d notebooks/X_test_scaled.csv -d notebooks/y_train.csv -d notebooks/y_test.csv -d notebooks/model_exp.ipynb -o notebooks/model.joblib -M notebooks/metrics.json papermill notebooks/model_exp.ipynb notebooks/model_exp_out.ipynb
"""

'\n$ dvc run -n run_model_exp -d notebooks/X_train_scaled.csv -d notebooks/X_test_scaled.csv -d notebooks/y_train.csv -d notebooks/y_test.csv -d notebooks/model_exp.ipynb -o notebooks/model.joblib -M notebooks/metrics.json papermill notebooks/model_exp.ipynb notebooks/model_exp_out.ipynb\n'

In [11]:
# Reference copy of a DVC pipeline definition (presumably dvc.yaml — confirm),
# kept as a string literal so it is never executed.
# It declares one stage, `one_stage_pipeline`, whose command runs the notebook
# through papermill, with the split CSVs and the notebook as deps, the `base`
# and `train` params sections, model.joblib as an output and metrics.json as a
# metric with caching turned off.
"""

stages:
  one_stage_pipeline:
    cmd: papermill notebooks/model_exp.ipynb notebooks/model_exp_out.ipynb
    deps:
    - notebooks/X_test_scaled.csv
    - notebooks/X_train_scaled.csv
    - notebooks/model_exp.ipynb
    - notebooks/y_test.csv
    - notebooks/y_train.csv
    params:
    - base
    - train
    outs:
    - notebooks/model.joblib
    metrics:
    - notebooks/metrics.json:
        cache: false
    
    """



# Reference copy of the parameters file (presumably params.yaml, which the setup
# cell loads — confirm). As the last expression in the cell, this string is what
# the cell displays.
# NOTE(review): `models_dir` has no value here, which YAML parses as null.
"""
base:
  project: ids_model

train:
  models_dir:
  model_fname: model.joblib
  params:
    random_state: 42
    lr: 0.01
    n_estimators: 75
    max_depth: 10

"""

'\nbase:\n  project: ids_model\n\ntrain:\n  models_dir:\n  model_fname: model.joblib\n  params:\n    random_state: 42\n    lr: 0.01\n    n_estimators: 75\n    max_depth: 10\n\n'