# DDoS Detection on CSE-CIC-IDS2018 AWS Dataset

This project aims to create a DDoS Detection Classifier trained on labelled network traffic data.

We start by importing a few libraries:

In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.utils import class_weight

RANDOM_STATE_SEED = 12

In [82]:
# Read one capture file of the dataset into a DataFrame.
# NOTE(review): the path is absolute and specific to a Kaggle kernel; running
# the notebook elsewhere requires changing it.
df_dataset = pd.read_csv("/kaggle/input/ids-intrusion-csv/02-14-2018.csv")

In [83]:
# display data
df_dataset

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,80,6,14/02/2018 10:53:23,10156986,5,5,1089,1923,587,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
1048571,80,6,14/02/2018 10:53:33,117,2,0,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
1048572,80,6,14/02/2018 10:53:28,5095331,3,1,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
1048573,80,6,14/02/2018 10:53:28,5235511,3,1,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,Benign


In [84]:
# feature information
df_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

In [85]:
# Replace +ve and -ve infinity with NaN so that the later dropna() call
# removes those rows together with genuinely missing values.
# The frame is modified in place.
df_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)

In [86]:
df_dataset.describe()

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,...,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,4876.262,8.107557,6255555.0,6.206622,7.211191,447.9936,4521.803,174.5736,8.389535,38.79579,...,2.793536,23.2797,51524.49,21361.51,87891.57,39954.77,3101206.0,729721.8,4812391.0,2126920.0
std,14443.44,4.460625,1260291000.0,44.47851,104.8682,15735.41,151502.1,287.6713,19.48279,53.31882,...,5.557106,11.06185,581558.6,218640.5,739572.5,560269.3,541478000.0,382003100.0,1522117000.0,18170130.0
min,0.0,0.0,-919011000000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,6.0,7.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,53.0,6.0,1023.0,2.0,1.0,36.0,55.0,34.0,0.0,25.66667,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,443.0,6.0,406669.0,7.0,6.0,455.0,768.0,199.0,0.0,55.5,...,4.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65533.0,17.0,120000000.0,5115.0,9198.0,8591554.0,13397730.0,64440.0,1460.0,11217.03,...,1031.0,48.0,110240100.0,57234460.0,110240100.0,110240100.0,339450300000.0,243268200000.0,979781000000.0,12603000000.0


In [87]:
# Drop every row that contains at least one NaN (this includes the rows whose
# infinities were converted to NaN earlier). The frame is modified in place.
df_dataset.dropna(inplace=True)

In [88]:
df_dataset

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,80,6,14/02/2018 10:53:23,10156986,5,5,1089,1923,587,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
1048571,80,6,14/02/2018 10:53:33,117,2,0,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
1048572,80,6,14/02/2018 10:53:28,5095331,3,1,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,Benign
1048573,80,6,14/02/2018 10:53:28,5235511,3,1,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,Benign


In [89]:
df_dataset["Label"].value_counts()

Benign            663808
FTP-BruteForce    193354
SSH-Bruteforce    187589
Name: Label, dtype: int64

In [90]:
# `df` is a second name for the same DataFrame object, not a copy: in-place
# changes made through `df` are also visible through `df_dataset`.
df = df_dataset

We will now try to visualize the target feature's class distribution.

In [91]:
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
import plotly.express as px
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Count the rows of each class once instead of recomputing value_counts()
# for every bar.
label_counts = df["Label"].value_counts()

# One bar per class. The bar name comes from the counts index itself, so a
# label can never be paired with the wrong count if the frequency ordering of
# the classes changes, and additional classes get their own bar.
fig = go.Figure(data=[
    go.Bar(name=label,
           y=[int(count)],
           x=[label],
           text=[int(count)],
           orientation='v',
           textposition='outside',)
    for label, count in label_counts.items()
])
# Figure size, title and axis labels. The axes describe traffic classes and
# row counts because the "Benign" bar is not an attack.
fig.update_layout(
                  width=800,
                  height=600,
                  title='Class Distribution',
                  yaxis_title='Number of flows',
                  xaxis_title='Traffic class',)
iplot(fig)

As expected, most of the traffic is benign.

Since our goal is binary classification ("attack" vs "not attack"), we can group FTP and SSH brute force attacks under one name instead of keeping them separate. Let's name them "Malicious".

In [92]:
# Collapse both brute-force attack types into a single "Malicious" class.
# Only the Label column is rewritten, so no other column is scanned or can be
# altered by accident. `df` and `df_dataset` are the same object, so the
# change is visible through both names.
df["Label"] = df["Label"].replace(["FTP-BruteForce", "SSH-Bruteforce"], "Malicious")

In [93]:
df_dataset["Label"].value_counts()

Benign       663808
Malicious    380943
Name: Label, dtype: int64

In [94]:
# Count the rows of each class once instead of recomputing value_counts()
# for every bar.
label_counts = df["Label"].value_counts()

# One bar per class, named from the counts index so that a label is always
# paired with its own count regardless of which class is more frequent.
fig = go.Figure(data=[
    go.Bar(name=label,
           y=[int(count)],
           x=[label],
           text=[int(count)],
           orientation='v',
           textposition='outside',)
    for label, count in label_counts.items()
])
# Figure size, title and axis labels. The axes describe traffic classes and
# row counts because the "Benign" bar is not an attack.
fig.update_layout(
                  width=800,
                  height=600,
                  title='Class Distribution',
                  yaxis_title='Number of flows',
                  xaxis_title='Traffic class',)
iplot(fig)

To prevent our classification model from being biased, we should make sure that the class distribution is even (i.e. 50% benign and 50% malicious).

Since there are 380943 instances of Malicious and 663808 instances of Benign, we can take 380943 instances of both Malicious and Benign to train our model with an even class distribution.

In [95]:
# Size of the smaller class, computed from the data instead of being
# hard-coded, so the cell stays correct if the cleaning steps change.
n_per_class = int(df["Label"].value_counts().min())

# Draw the same number of rows from each class. A seeded random sample is
# used rather than the first rows in file order, so the retained benign
# traffic is not limited to the earliest part of the capture and the result
# is reproducible.
df1 = df[df["Label"] == "Benign"].sample(n=n_per_class, random_state=RANDOM_STATE_SEED)
df2 = df[df["Label"] == "Malicious"].sample(n=n_per_class, random_state=RANDOM_STATE_SEED)
df_equal = pd.concat([df1, df2], axis=0)

We now encode our class labels to integer values for the model to use.

In [96]:
# Encode the target as integers: Benign -> 0, Malicious -> 1.
# Only the Label column is touched, and the explicit cast guarantees an
# integer dtype whatever pandas infers after the replacement. Re-running the
# cell is harmless because already-encoded values are left as they are.
df_equal["Label"] = (
    df_equal["Label"]
    .replace({"Benign": 0, "Malicious": 1})
    .astype("int64")
)

We split the data into training and testing sets.

In [97]:
# Hold out 30% of the balanced rows for testing; the fixed seed makes the
# split reproducible.
train, test = train_test_split(df_equal, test_size=0.3, random_state=RANDOM_STATE_SEED)

In [98]:
# display columns
train.columns

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

In [99]:
# feature info
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533320 entries, 677869 to 92080
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           533320 non-null  int64  
 1   Protocol           533320 non-null  int64  
 2   Timestamp          533320 non-null  object 
 3   Flow Duration      533320 non-null  int64  
 4   Tot Fwd Pkts       533320 non-null  int64  
 5   Tot Bwd Pkts       533320 non-null  int64  
 6   TotLen Fwd Pkts    533320 non-null  int64  
 7   TotLen Bwd Pkts    533320 non-null  int64  
 8   Fwd Pkt Len Max    533320 non-null  int64  
 9   Fwd Pkt Len Min    533320 non-null  int64  
 10  Fwd Pkt Len Mean   533320 non-null  float64
 11  Fwd Pkt Len Std    533320 non-null  float64
 12  Bwd Pkt Len Max    533320 non-null  int64  
 13  Bwd Pkt Len Min    533320 non-null  int64  
 14  Bwd Pkt Len Mean   533320 non-null  float64
 15  Bwd Pkt Len Std    533320 non-null  float64
 16

All of our data except the 'Timestamp' column is numerical. These numerical columns contain different ranges of values, which can confuse our model.
Thus, we must normalize our data.

In [100]:
# Columns to rescale to the [0, 1] range. The names use the abbreviated
# spellings of this dataset ('Tot', 'Pkt', 'Cnt', 'Byts', ...); the long-form
# spellings used before do not exist in the frame and raised a KeyError.
# 'Label' is deliberately not listed: it is the prediction target, not a
# feature, and must keep its integer 0/1 encoding.
numerical_columns = [
    'Flow Duration', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
    'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot',
    'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
    'Fwd Pkts/s', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
    'FIN Flag Cnt', 'SYN Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
    'Pkt Size Avg', 'Bwd Seg Size Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
    'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min',
    'Dst Port', 'Protocol',
]

# Fail early with a readable message if any name does not match the dataset.
missing_columns = [column for column in numerical_columns if column not in train.columns]
assert not missing_columns, f"Columns not found in train: {missing_columns}"

# Fit on the training split only, so that no statistics of the test split
# leak into the scaling.
min_max_scaler = MinMaxScaler().fit(train[numerical_columns])

In [None]:
# Overwrite the selected training columns with their min-max scaled values.
train[numerical_columns] = min_max_scaler.transform(train[numerical_columns])

In [None]:
train

In [None]:
train.describe()

In [None]:
train.info()

In [None]:
# Remove the only non-numerical column from both splits (in place) so that
# the remaining columns can be turned into a numeric feature matrix.
train.drop(['Timestamp'], axis=1,inplace=True)
test.drop(['Timestamp'],axis=1,inplace=True)

In [None]:
train.info()

In [None]:
# Scale the test split with the scaler that was fitted on the training split;
# the scaler is only applied here, not refitted.
test[numerical_columns] = min_max_scaler.transform(test[numerical_columns])

In [None]:
test[numerical_columns]

In [None]:
# Report how many rows of each class are in the balanced dataset and in each
# split. Counting with a boolean mask yields 0 for an absent class instead of
# raising a KeyError, and the loop replaces three copy-pasted stanzas.
class_count_sections = (
    ("Full dataset", df_equal),
    ("Training set", train),
    ("Test set", test),
)
for section_index, (section_title, section_frame) in enumerate(class_count_sections):
    if section_index > 0:
        # Separator between sections (none after the last one).
        print("---------------")
    print(section_title + ":\n")
    print("Benign: " + str((section_frame["Label"] == 0).sum()))
    print("Malicious: " + str((section_frame["Label"] == 1).sum()))

In [None]:
# Detach the target: pop() removes "Label" from `train` and returns it, so
# whatever remains in the frame afterwards is the feature matrix.
y_train = np.array(train.pop("Label"))
X_train = train.values

# Sanity check of the container types and shapes, one value per line.
print(type(X_train), type(y_train), X_train.shape, y_train.shape, sep="\n")

In [None]:
# Detach the target: pop() removes "Label" from `test` and returns it, so
# whatever remains in the frame afterwards is the feature matrix.
y_test = np.array(test.pop("Label"))
X_test = test.values

# Sanity check of the container types and shapes, one value per line.
print(type(X_test), type(y_test), X_test.shape, y_test.shape, sep="\n")

In [None]:
# Base estimator for the grid search.
# - max_features='sqrt' is the explicit spelling of what 'auto' meant for
#   classifiers; 'auto' is rejected by current scikit-learn releases.
# - random_state uses the notebook-wide seed so the forest (and therefore the
#   reported scores) is reproducible between runs.
model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=RANDOM_STATE_SEED,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None
)

# Values tried by the grid search; n_estimators above is only the starting
# value and is overridden by each candidate.
hyperparameters = {
    'n_estimators': [50, 75, 100, 125, 150]
}

In [None]:
# Exhaustive search over `hyperparameters`, evaluating each candidate with
# 5-fold cross-validation on the data passed to fit().
clf = GridSearchCV(
    estimator=model,
    param_grid=hyperparameters,
    cv=5,
    verbose=1,
    n_jobs=-1  # Use all available CPU cores
)

In [None]:
clf.fit(X=X_train, y=y_train)

In [None]:
print("Accuracy score on Validation set: \n")
print(clf.best_score_ )
print("---------------")
print("Best performing hyperparameters on Validation set: ")
print(clf.best_params_)
print("---------------")
print(clf.best_estimator_)

In [None]:
model = clf.best_estimator_

In [None]:
model

In [None]:
predictions = model.predict(X_test)

In [None]:
print(accuracy_score(y_test, predictions))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Rows are the true classes and columns the predicted classes, in the order
# given by `labels` (0 = Benign, 1 = Malicious).
cf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])

# fmt="d" prints the cell counts as plain integers; the default format would
# show large counts in scientific notation.
ax = sns.heatmap(
    cf_matrix,
    annot=True,
    fmt="d",
    xticklabels=["Benign", "Malicious"],
    yticklabels=["Benign", "Malicious"],
)
ax.set(xlabel="Predicted label", ylabel="True label", title="Random Forest confusion matrix");

Let us also try making a Neural Network model using Keras.

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Width of the input layer, taken from the training matrix so the network
# always matches the data instead of relying on a hard-coded column count.
n_features = X_train.shape[1]

# Fully connected binary classifier: three hidden blocks of decreasing width
# (128 -> 64 -> 32), each preceded by batch normalisation and followed by
# dropout, ending in a single sigmoid unit.
neuralNetModel = keras.Sequential([
    layers.InputLayer(input_shape=(n_features,)),

    layers.BatchNormalization(renorm=True),
    layers.Dense(128, activation='relu'),
    layers.Dropout(rate = 0.3),
    layers.BatchNormalization(renorm=True),
    layers.Dense(64, activation='relu'),
    layers.Dropout(rate = 0.3),
    layers.BatchNormalization(renorm=True),
    layers.Dense(32, activation='relu'),
    layers.Dropout(rate = 0.3),
    layers.Dense(1, activation='sigmoid'),
])

neuralNetModel.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(epsilon=0.01)

from tensorflow.keras import callbacks

# Stop when the validation loss has not improved by at least min_delta for
# `patience` consecutive epochs, and roll back to the best weights seen.
# The monitored quantity is named explicitly; it only exists because fit()
# below is given validation data.
early_stopping = callbacks.EarlyStopping(
    monitor = 'val_loss',
    min_delta = 0.001,
    patience = 5,
    restore_best_weights = True
)

neuralNetModel.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

# validation_split holds out part of the training data so that 'val_loss' is
# actually computed each epoch; without it the early-stopping callback has
# nothing to monitor and training always runs for all epochs.
history = neuralNetModel.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size = 256,
    callbacks=[early_stopping]
)

# Learning curves: training vs. validation loss and accuracy per epoch.
history_frame = pd.DataFrame(history.history)
history_frame.loc[:, ['loss', 'val_loss']].plot()
history_frame.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot();

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain class labels. The network
# returns one column per sample, so the result is flattened to a 1-D array
# with the same shape as y_test; this avoids accidental broadcasting when the
# two are compared element-wise.
predictions = (neuralNetModel.predict(X_test) > 0.5).astype("int32").ravel()


In [None]:
print(accuracy_score(y_test, predictions))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Rows are the true classes and columns the predicted classes, in the order
# given by `labels` (0 = Benign, 1 = Malicious).
cf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])

# fmt="d" prints the cell counts as plain integers; the default format would
# show large counts in scientific notation.
ax = sns.heatmap(
    cf_matrix,
    annot=True,
    fmt="d",
    xticklabels=["Benign", "Malicious"],
    yticklabels=["Benign", "Malicious"],
)
ax.set(xlabel="Predicted label", ylabel="True label", title="Neural network confusion matrix");

Thus, this model gives us the same result as the Random Forest model.

While the models are accurate, it is worth noting that they work with labelled data. 

We should also try to come up with something for unlabelled data as the majority of recent data will be unlabelled. 
An Isolation Forest model can be used for this purpose, with our original data ratio where we had more Benign requests than Malicious ones.

In [None]:
# Encode the target as integers: Benign -> 0, Malicious -> 1.
# These are NOT yet the values Isolation Forest predicts (1 for inliers, -1
# for outliers); the labels are recoded to that convention in a later cell.
# Only the Label column is touched, and the explicit cast guarantees an
# integer dtype whatever pandas infers after the replacement.
df["Label"] = (
    df["Label"]
    .replace({"Benign": 0, "Malicious": 1})
    .astype("int64")
)

We split the data into training and testing sets.

In [None]:
# Hold out 30% of the full (unbalanced) data for testing, with a fixed seed.
# This rebinds `train` and `test`, replacing the balanced splits used by the
# earlier models.
train, test = train_test_split(df, test_size=0.3, random_state=RANDOM_STATE_SEED)

In [None]:
# display columns
train.columns

In [None]:
# feature info
train.info()

All of our data except the 'Timestamp' column is numerical. These numerical columns contain different ranges of values, which can confuse our model.
Thus, we must normalize our data.

In [None]:
# Columns to rescale to the [0, 1] range. The names use the abbreviated
# spellings of this dataset ('Tot', 'Pkt', 'Cnt', 'Byts', ...); the long-form
# spellings used before do not exist in the frame and raise a KeyError.
# 'Label' is deliberately not listed: it is the evaluation target, not a
# feature, and must keep its integer 0/1 encoding.
numerical_columns = [
    'Flow Duration', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
    'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot',
    'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
    'Fwd Pkts/s', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
    'FIN Flag Cnt', 'SYN Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
    'Pkt Size Avg', 'Bwd Seg Size Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
    'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min',
    'Dst Port', 'Protocol',
]

# Fail early with a readable message if any name does not match the dataset.
missing_columns = [column for column in numerical_columns if column not in train.columns]
assert not missing_columns, f"Columns not found in train: {missing_columns}"

# Fit on the training split only, so that no statistics of the test split
# leak into the scaling.
min_max_scaler = MinMaxScaler().fit(train[numerical_columns])

In [None]:
# Overwrite the selected training columns with their min-max scaled values.
train[numerical_columns] = min_max_scaler.transform(train[numerical_columns])

In [None]:
train

In [None]:
train.describe()

In [None]:
train.info()

In [None]:
# Remove the only non-numerical column from both splits (in place) so that
# the remaining columns can be turned into a numeric feature matrix.
train.drop(['Timestamp'], axis=1,inplace=True)
test.drop(['Timestamp'],axis=1,inplace=True)

In [None]:
train.info()

In [None]:
# Scale the test split with the scaler that was fitted on the training split;
# the scaler is only applied here, not refitted.
test[numerical_columns] = min_max_scaler.transform(test[numerical_columns])

In [None]:
test[numerical_columns]

In [None]:
test.describe()

In [None]:
test.info()

In [None]:
# Report how many rows of each class are in the full dataset and in each
# split. Counting with a boolean mask yields 0 for an absent class instead of
# raising a KeyError, and the loop replaces three copy-pasted stanzas.
class_count_sections = (
    ("Full dataset", df),
    ("Training set", train),
    ("Test set", test),
)
for section_index, (section_title, section_frame) in enumerate(class_count_sections):
    if section_index > 0:
        # Separator between sections (none after the last one).
        print("---------------")
    print(section_title + ":\n")
    print("Benign: " + str((section_frame["Label"] == 0).sum()))
    print("Malicious: " + str((section_frame["Label"] == 1).sum()))

In [None]:
# Detach the target column from each split: pop() removes "Label" from the
# frame and returns it, so what remains in `train` / `test` is the feature
# matrix. Unlike the earlier cells, the labels are kept as pandas Series here
# (there is no np.array conversion).
y_train = train.pop("Label")
X_train = train.values
y_test = test.pop("Label")
X_test = test.values


# Last expression of the cell: displays the test labels.
y_test

In [None]:
y_train

We need to change the labels as an Isolation Forest will give us 1 for a Benign input (inlier) and a -1 for a Malicious input (outlier).

In [None]:
def freq_count(data):
    """Count how often each distinct value occurs in ``data``.

    Args:
        data: Any iterable of hashable values.

    Returns:
        A dict mapping each distinct value to its number of occurrences,
        with keys in order of first appearance.
    """
    occurrences = {}
    for value in data:
        # get() supplies 0 the first time a value is seen.
        occurrences[value] = occurrences.get(value, 0) + 1
    return occurrences

# Recode the labels in place: malicious 1 -> -1 first, then benign 0 -> 1.
# The order matters: if 0 -> 1 ran first, the following 1 -> -1 step would
# turn every label into -1.
# NOTE: this cell must be run exactly once; running it again would map the
# benign labels (now 1) to -1 as well.
y_train[y_train == 1] = -1
y_train[y_train == 0] = 1
print(freq_count(y_train))

# Same recoding for the test labels; the final expression displays the counts.
y_test[y_test == 1] = -1
y_test[y_test == 0] = 1
freq_count(y_test)

In [None]:
from sklearn.ensemble import IsolationForest

# Base estimator for the grid search. The notebook-wide seed is used instead
# of a separate hard-coded number so that every stochastic step in the
# notebook is controlled from a single constant.
iFM = IsolationForest(
    n_estimators = 100,
    max_samples = "auto",
    random_state = RANDOM_STATE_SEED,
    warm_start = False
)

In [None]:
# Values of n_estimators tried by the grid search.
hyperparameters = {
    'n_estimators': [50, 75, 100, 125, 150]
}

# 5-fold cross-validated search scored by accuracy, i.e. by comparing the
# estimator's predictions with the labels passed to fit(). The labels are
# therefore used for model selection, so they must already use the 1 / -1
# encoding at this point.
isolationForestCV = GridSearchCV(
    estimator=iFM,
    scoring = 'accuracy',
    param_grid=hyperparameters,
    cv=5,
    verbose=1,
    n_jobs=-1  # Use all available CPU cores
)

In [None]:
isolationForestCV.fit(X = X_train, y = y_train)

In [None]:
print("Accuracy score on Validation set: \n")
print(isolationForestCV.best_score_ )
print("---------------")
print("Best performing hyperparameters on Validation set: ")
print(isolationForestCV.best_params_)
print("---------------")
print(isolationForestCV.best_estimator_)

In [None]:
model = isolationForestCV.best_estimator_

In [None]:
model

In [None]:
predictions = model.predict(X_test)
freq_count(predictions)

In [None]:
print(accuracy_score(y_test, predictions))

In [None]:
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y_test, predictions)
cf_matrix

In [None]:
import seaborn as sns

# fmt="d" prints the cell counts as plain integers; the default format would
# show large counts in scientific notation.
# The tick labels assume the matrix rows/columns are in sorted label order,
# i.e. -1 (malicious / outlier) first and 1 (benign / inlier) second, which
# is how confusion_matrix orders classes when no `labels` argument is given.
ax = sns.heatmap(
    cf_matrix,
    annot=True,
    fmt="d",
    xticklabels=["Malicious (-1)", "Benign (1)"],
    yticklabels=["Malicious (-1)", "Benign (1)"],
)
ax.set(xlabel="Predicted label", ylabel="True label", title="Isolation Forest confusion matrix");