## This notebook specifically targets the binary classifier, i.e. to predict if it is an attack or a benign case.
**Sujal Adhikari**

In [31]:
### Libraries that we will be using:

import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, accuracy_score,recall_score,f1_score,roc_curve,roc_auc_score, precision_recall_curve
from sklearn.pipeline import Pipeline


In [24]:
### Dataset we will be using 
data = pd.read_csv('../Datasets/pre-processedData.csv')

## We will be excluding the column named 'Severity_Score'
binary_data = data.drop(columns=['Severity_Score', 'Unnamed: 0'])
binary_data.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,...,flag_REJ,flag_RSTO,flag_RSTR,flag_S1,flag_S3,flag_SF,flag_SH,is_malicious,bytes_ratio,total_bytes
0,0.0,6.198479,0.0,0.0,0.0,0.0,0.0,0,0.0,0,...,0,0,0,0,0,1,0,0,6.198479,6.198479
1,0.0,4.990433,0.0,0.0,0.0,0.0,0.0,0,0.0,0,...,0,0,0,0,0,1,0,0,4.990433,4.990433
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,...,0,0,0,0,0,0,0,1,0.0,0.0
3,0.0,5.451038,9.006264,0.0,0.0,0.0,0.0,1,0.0,0,...,0,0,0,0,0,1,0,0,0.544763,14.457302
4,0.0,5.298317,6.042633,0.0,0.0,0.0,0.0,1,0.0,0,...,0,0,0,0,0,1,0,0,0.752321,11.34095


### Test-Train Split

In [28]:
X = binary_data.drop(columns='is_malicious')
y = binary_data['is_malicious']

## Train_Test Split 
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=RANDOM_SEED)

## Models 

### Model 1: Logistic Regression

In [None]:
## Pipeline creation 
pipeline_steps= [('scaler', StandardScaler()), ('logit', LogisticRegression(solver='liblinear'))]
logit_pipeline = Pipeline(pipeline_steps)

## Hyper tuning the pipeline model 
parameters = {
    'logit__C':[0.01,0.1,10,100,1000,10000], ## Regularization parameter
    'logit__penalty':['l1','l2'], ## Penalty for the model
    'logit__class_weight':[None, 'balanced']
}
kf = StratifiedKFold(shuffle = True, n_splits=5, random_state=RANDOM_SEED)
hypertuning = GridSearchCV(logit_pipeline, param_grid= parameters,cv = kf, n_jobs=-1)
hypertuning.fit(X_train,y_train)


hypertuned_logit_model = hypertuning.best_estimator_




{'logit__C': 100, 'logit__class_weight': 'balanced', 'logit__penalty': 'l2'}


### As per the hypertuning results, the logistic regression model uses 100 as the regularization parameter, with balanced class_weight and the dimensional reduction that the model adapts is l2 which is to shrink the coefficents to near zero, but not remove them 

---
### Analyzing the model's performance

In [33]:
logit_testing_pred = hypertuned_logit_model.predict(X_test)
logit_training_pred = hypertuned_logit_model.predict(X_train)

logit_testing_proba = hypertuned_logit_model.predict_proba(X_test)
logit_training_proba = hypertuned_logit_model.predict_proba(X_train)

### Performance Matrix:

In [None]:
testing_accuracy = accuracy_score(y_test, logit_testing_pred)
training_accuracy = accuracy_score(y_train, logit_training_pred)

print(f"The testing accuracy of the model is {testing_accuracy*100:.2f} % and the training accuracy of the model is {training_accuracy*100:.2f} %. Hence proving that the model doesnot overfit")



The testing accuracy of the model is 97.16 % and the training accuracy of the model is 97.36 %. Hence proving that the model doesnot overfit
