<a href="https://colab.research.google.com/github/walleford/cybersecurity_ml_analysis/blob/main/RecurrentNeuralNetworks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To analyze our network traffic with recurrent neural networks we will begin by preprocessing the data. The below is importing and mounting the files needed.

In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount(
    '/datasets/'
)

from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from imblearn.over_sampling import SMOTE
import imblearn
import pandas as pd
import os
np.random.seed(0)

Mounted at /datasets/


Next we will create dataframes for our testing, training, and a dataframe containing all of the network packets.

In [2]:
testing_df = pd.read_csv('/datasets/MyDrive/datasets/UNSW_NB15_testing-set.csv')
train_df = pd.read_csv('/datasets/MyDrive/datasets/UNSW_NB15_training-set.csv')

data_files = [
    '/datasets/MyDrive/datasets/UNSW-NB15_1.csv',
    '/datasets/MyDrive/datasets/UNSW-NB15_2.csv',
    '/datasets/MyDrive/datasets/UNSW-NB15_3.csv',
    '/datasets/MyDrive/datasets/UNSW-NB15_4.csv'
]

total_df = pd.concat((pd.read_csv(f) for f in data_files), ignore_index=True)

  total_df = pd.concat((pd.read_csv(f) for f in data_files), ignore_index=True)
  total_df = pd.concat((pd.read_csv(f) for f in data_files), ignore_index=True)


I am going to resample and create a new training/testing dataset utilizing the two above, so I will combine them, one-hot encode the categorical variables, and then split them again with a 75/25 ratio.

In [3]:
combined_df = pd.concat([train_df, testing_df])

In [4]:
combined_df.select_dtypes(include=["object", "category"]).dtypes

proto         object
service       object
state         object
attack_cat    object
dtype: object

In [5]:
def one_hot(df):
    categorical_cols = ['proto','service','state']
    for col in categorical_cols:
        dummies = pd.get_dummies(df[col].astype({col: 'str'}),prefix=col, dtype=int)
        df = pd.concat([df,dummies],axis=1)
        df = df.drop(col,axis=1)

    return df

In [6]:
combined_df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,1,3,0,0,0,2,3,0,Normal,0


In [7]:
attack_cat = combined_df.pop('attack_cat')


In [8]:
combined_df = one_hot(combined_df)
combined_df.head()

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,state_CLO,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no
0,1,1.1e-05,2,0,496,0,90909.0902,254,0,180363632.0,...,0,0,0,0,1,0,0,0,0,0
1,2,8e-06,2,0,1762,0,125000.0003,254,0,881000000.0,...,0,0,0,0,1,0,0,0,0,0
2,3,5e-06,2,0,1068,0,200000.0051,254,0,854400000.0,...,0,0,0,0,1,0,0,0,0,0
3,4,6e-06,2,0,900,0,166666.6608,254,0,600000000.0,...,0,0,0,0,1,0,0,0,0,0
4,5,1e-05,2,0,2126,0,100000.0025,254,0,850400000.0,...,0,0,0,0,1,0,0,0,0,0


Next, to make the RNN more efficient and accurate I am going to scale all of the data to make it within the 0-1 range.

In [9]:
def scaling(df, df_columns):
    """
        This will be used to scale the data in the df to [0,1].

        Will be done using the Min-max feature scaling technique
        to bring all the values into the range [0,1]
    """
    new_normalized_df = df.copy()
    for column in df_columns:
        max_value = df[column].max()
        min_value = df[column].min()
        if max_value > min_value:
            new_normalized_df[column] = (new_normalized_df[column] - min_value) / (max_value - min_value)

    return new_normalized_df

In [10]:
scaled_df = scaling(combined_df, combined_df.columns)

In [11]:
scaled_df['label'].nunique()
scaled_df['label'].value_counts()

1.0    164673
0.0     93000
Name: label, dtype: int64

Our dataset is severely imbalanced between our labels, which we will be using to predict. So I will undersample the training data to get our training data in a sampled set. I will use sklearn's random under sampling method to undersample.

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
unlabled_df = scaled_df.drop('label', axis=1)
lables = scaled_df['label']

In [14]:
unlabled_df_train, unlabeled_df_test, lables_train, lables_test = train_test_split(unlabled_df, lables, train_size=0.75, random_state=42)

In [15]:
lables_train.value_counts()

1.0    123497
0.0     69757
Name: label, dtype: int64

In [16]:
from imblearn.under_sampling import RandomUnderSampler
under = RandomUnderSampler(sampling_strategy=1)
train_smote, label_train_smote = under.fit_resample(unlabled_df_train, lables_train)

In [17]:
label_train_smote.value_counts()

0.0    69757
1.0    69757
Name: label, dtype: int64

In [18]:
label_train_smote.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: label, dtype: float64

In [19]:
train_df_smote = pd.concat([train_smote, label_train_smote], axis=1)

In [20]:
train_df_smote['label'].value_counts()

0.0    69757
1.0    69757
Name: label, dtype: int64

In [21]:
train_df_smote.shape

(139514, 198)

I am going to begin using a keras model to see how accurately it predicts the label for malicious data.
To do so, we import numpy, tensorflow, keras, and from keras we import layers so we can set the specific layers and their types in our neural network.

In [24]:
model = keras.Sequential([
    keras.layers.Embedding(input_dim=1198, output_dim=64), #input layer
    keras.layers.LSTM(128), #hidden layer
    keras.layers.Dense(1, activation='sigmoid')
    ])

In [25]:
model.compile(
    optimizer='adam',
    loss=keras.losses.MeanSquaredError(),
    metrics=['accuracy']
)

In [None]:
model.fit(unlabled_df_train, lables_train, epochs=2)

Epoch 1/2
Epoch 2/2

In [33]:
keras.utils.plot_model(model)
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          76672     
                                                                 
 lstm (LSTM)                 (None, 128)               98816     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 175617 (686.00 KB)
Trainable params: 175617 (686.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [27]:
import tensorboard

In [30]:
modelGRU = keras.Sequential([
    keras.layers.Embedding(input_dim=1198, output_dim=64), #input layer
    keras.layers.GRU(128), #hidden layer
    keras.layers.Dense(1, activation='sigmoid')
    ])

In [31]:
modelGRU.compile(
    optimizer='adam',
    loss=keras.losses.MeanSquaredError(),
    metrics=['accuracy']
)

modelGRU.fit(unlabled_df_train, lables_train, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7ad3529558d0>

In [34]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          76672     
                                                                 
 lstm (LSTM)                 (None, 128)               98816     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 175617 (686.00 KB)
Trainable params: 175617 (686.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [35]:
modelGRU.fit(unlabled_df_train, lables_train, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7ad35c774e80>

In [39]:
modelGRU.evaluate(unlabeled_df_test, lables_test)



[0.156952366232872, 0.7800338268280029]