This is the portion where we import all the necessary modules and connect to our google drive to access the datasets. We import the datasets and convert them to Pandas dataframes, and we define our global functions for scaling and one-hot encoding our data.

I am going to use this function I found to downsize the data types in the dataset to reduce the memory usage. https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65

In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount(
    '/datasets/'
)
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from imblearn.over_sampling import SMOTE
import imblearn
import pandas as pd
import os
np.random.seed(0)
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

unsw_testing_df = pd.read_csv('/datasets/MyDrive/datasets/UNSW_NB15_testing-set.csv')
unsw_train_df = pd.read_csv('/datasets/MyDrive/datasets/UNSW_NB15_training-set.csv')
cicids_files = ['/datasets/MyDrive/datasets/Wednesday-workingHours.pcap_ISCX.csv',
                '/datasets/MyDrive/datasets/Tuesday-WorkingHours.pcap_ISCX.csv',
                '/datasets/MyDrive/datasets/Friday-WorkingHours-Morning.pcap_ISCX.csv',
                '/datasets/MyDrive/datasets/Monday-WorkingHours.pcap_ISCX.csv'
                ]
cicids_df = pd.concat((pd.read_csv(f) for f in cicids_files), ignore_index=True)


Mounted at /datasets/


Helper Functions

In [2]:

def one_hot(df):
    categorical_cols = ['proto','service','state']
    for col in categorical_cols:
        dummies = pd.get_dummies(df[col].astype({col: 'str'}),prefix=col, dtype=int)
        df = pd.concat([df,dummies],axis=1)
        df = df.drop(col,axis=1)

    return df

def scaling(df, df_columns):
    """
        This will be used to scale the data in the df to [0,1].

        Will be done using the Min-max feature scaling technique
        to bring all the values into the range [0,1]
    """
    new_normalized_df = df.copy()
    for column in df_columns:
        max_value = df[column].max()
        min_value = df[column].min()
        if max_value > min_value:
            new_normalized_df[column] = (new_normalized_df[column] - min_value) / (max_value - min_value)

    return new_normalized_df

def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings

            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()

            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all():
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)

            # test if column can be converted to an integer
            asint = props[col].fillna(0).replace([np.inf, -np.inf], 0).astype(np.int64) # Replace inf values with 0
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True


            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)

            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

loss_obj = keras.losses.BinaryCrossentropy()
def create_adversarial_examples(input, labels, model):
  with tf.GradientTape() as tape:
    tape.watch(input)
    predictions = model(input)
    loss = loss_obj(labels, predictions)
  gradient = tape.gradient(loss, input)
  signed_grad = tf.sign(gradient)
  return signed_grad

def calculate_metrics(lables, predictions):
  true_lables = lables.values.flatten()
  predicted_labels = predictions.flatten()

  accuracy = accuracy_score(true_lables, predicted_labels)
  precision = precision_score(true_lables, predicted_labels)
  recall = recall_score(true_lables, predicted_labels)
  f1 = f1_score(true_lables, predicted_labels)
  print(f'Accuracy: {accuracy:.2f}')
  print(f'Precision: {precision:.2f}')
  print(f'Recall: {recall:.2f}')
  print(f'F1 Score: {f1:.2f}')


def make_predictions(model, data, labels):
    predictions = model.predict(data)
    binary_predictions = np.where(predictions >= 0.5, 1, 0)
    return binary_predictions


This is the cleaning we do for the UNSW dataset. We have to reduce the amount of unique values in each feature to make training more efficient, we then remove the attack categories and one-hot encode every feature to bring it from a categorical to a numerical feature. Afterwards, we one hot encode every feature to bring it into the range of [0,1] which improves accuracy in training/testing.

In [3]:
df_cat = unsw_train_df.select_dtypes(exclude=[np.number])
print(df_cat.describe(include='all'))
DEBUG = 0
## reducing the amount of uniques in each feature
for feature in df_cat.columns:
    if DEBUG == 1:
        print(feature)
        print('nunique = '+str(df_cat[feature].nunique()))
        print(df_cat[feature].nunique()>7)
        print(sum(unsw_train_df[feature].isin(unsw_train_df[feature].value_counts().head().index)))
        print('----------------------------------------------------')

    if df_cat[feature].nunique()>8:
        unsw_train_df[feature] = np.where(unsw_train_df[feature].isin(unsw_train_df[feature].value_counts().head().index), unsw_train_df[feature], 'Combined')
# taking out the attack category
unsw_attack_cat = unsw_train_df.pop('attack_cat')

# going to reduce the amount of memory the dataset takes
unsw_train_df.columns = unsw_train_df.columns.str.strip()
unsw_df, NAList = reduce_mem_usage(unsw_train_df)
# replacing infinity, and neg infinity values with NaN then dropping them
unsw_df.replace([np.inf, -np.inf], np.nan, inplace=True)
unsw_df.dropna(inplace=True)
# dropping duplicates
print(f"Fully duplicate rows to drop: {unsw_df.duplicated().sum()}")
unsw_df.drop_duplicates(inplace=True)
unsw_df.reset_index(drop=True, inplace=True)
# one hot encoding the entire dataframe
unsw_e_df = one_hot(unsw_df)
# scaling the dataframe using our scaling() function
unsw_s_df = scaling(unsw_e_df, unsw_e_df.columns)
# dropping the label column in order to undersample
unsw_unlabled_df = unsw_s_df.drop('label', axis=1)
# setting the labels to a var so we don't lose them
lables = unsw_s_df['label']
print(f"UNSW Dataframe Shape After cleaning: {unsw_df.shape}")
acc_per_fold = []
loss_per_fold = []
# splitting the dataframe into testing and training with a 75/25 split
unsw_unlabled_df_train, unsw_unlabeled_df_test, unsw_lables_train, unsw_lables_test = train_test_split(unsw_unlabled_df, lables, train_size=0.75, random_state=42)
# combining them for kfold cross validation
inputs = pd.concat([unsw_unlabeled_df_test, unsw_unlabled_df_train])
targets = pd.concat([unsw_lables_test, unsw_lables_train])
# under-sampling the dataset using SMOTE
under = RandomUnderSampler(sampling_strategy=1)
unsw_train_smote, unsw_label_train_smote = under.fit_resample(unsw_unlabled_df_train, unsw_lables_train)
# adding the labels back to the dataframe
unsw_train_df_smote = pd.concat([unsw_train_smote, unsw_label_train_smote], axis=1)
# converting to a numpy array so we can reshape the input for our models
unsw_nump_train = unsw_train_smote.to_numpy()
unsw_nump_test = unsw_unlabeled_df_test.to_numpy()
unsw_train = unsw_nump_train.reshape(unsw_nump_train.shape[0], 1, unsw_nump_train.shape[1])
unsw_test = unsw_nump_test.reshape(unsw_nump_test.shape[0], 1, unsw_nump_test.shape[1])
print(f"UNSW Testing Sample Shape: {unsw_test.shape}")
print(f"UNSW Training Sample Shape: {unsw_train.shape}")

         proto service   state attack_cat
count   175341  175341  175341     175341
unique     133      13       9         10
top        tcp       -     INT     Normal
freq     79946   94168   82275      56000
Memory usage of properties dataframe is : 58.860931396484375  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  20.902398109436035  MB
This is  35.51149737784218 % of the initial size
Fully duplicate rows to drop: 0
UNSW Dataframe Shape After cleaning: (175341, 44)
UNSW Testing Sample Shape: (43836, 1, 58)
UNSW Training Sample Shape: (84058, 1, 58)


This is the data cleaning we have to do for the CICIDS dataset. To begin we convert the Label feature into a numerical column with 0 being the benign label and 1 being the malicious label and adding those values to a new column: `mal_or_not`. We then remove the Label feature and scale the data for the entire set. We do our train/test split with a 75/25 ratio of data for training and testing. Then we convert to a numpy array so we can accurately gauge the shape of the dataset to pass to the model.

In [4]:
print(f"CICIDS Dataframe Shape Before: {cicids_df.shape}")
cicids_df.loc[cicids_df[' Label'] != "BENIGN", 'mal_or_not'] = 1
cicids_df.loc[cicids_df[' Label'] == "BENIGN", 'mal_or_not'] = 0
# pop the label off, then scale as it is a categorical column
attack_labels = cicids_df.pop(" Label")
drop_columns = [ # this list includes all spellings across CIC NIDS datasets
    "Flow ID",
    'Fwd Header Length.1',
    "Source IP", "Src IP",
    "Source Port", "Src Port",
    "Destination IP", "Dst IP",
    "Destination Port", "Dst Port",
    "Timestamp",
]
temp = cicids_df
temp.columns = temp.columns.str.strip() # deleting the trailing whitespaces if there are any
temp.drop(columns=drop_columns, inplace=True, errors='ignore')
temp_df, NAList = reduce_mem_usage(temp)
temp_df.replace([np.inf, -np.inf], np.nan, inplace=True)
temp_df.dropna(inplace=True)
print(f"fully duplicate rows to remove: {temp_df.duplicated().sum()}")
temp_df.drop_duplicates(inplace=True)
temp_df.reset_index(drop=True, inplace=True)
cicids_df = temp_df
# scaling the dataframe using our scaling() function
cicids_s_df = scaling(cicids_df, cicids_df.columns)
# dropping the label column in order to undersample
cicids_unlabled_df = cicids_s_df.drop('mal_or_not', axis=1)
# setting the labels to a var so we don't lose them
lables = cicids_s_df['mal_or_not']
print(f"CICIDS Dataframe Shape After cleaning: {cicids_df.shape}")
# splitting the dataframe into testing and training with a 75/25 split
cicids_unlabled_df_train, cicids_unlabeled_df_test, cicids_lables_train, cicids_lables_test = train_test_split(cicids_unlabled_df, lables, train_size=0.75, random_state=42)
# under-sampling the dataset using SMOTE
under = RandomUnderSampler(sampling_strategy=1)
cicids_train_smote, cicids_label_train_smote = under.fit_resample(cicids_unlabled_df_train, cicids_lables_train)
# adding the labels back to the dataframe
cicids_train_df_smote = pd.concat([cicids_train_smote, cicids_label_train_smote], axis=1)
# converting to a numpy array so we can reshape the input for our models
cicids_nump_train = cicids_train_smote.to_numpy()
cicids_nump_test = cicids_unlabeled_df_test.to_numpy()
cicids_train = cicids_nump_train.reshape(cicids_nump_train.shape[0], 1, cicids_nump_train.shape[1])
cicids_test = cicids_nump_test.reshape(cicids_nump_test.shape[0], 1, cicids_nump_test.shape[1])
print(f"CICIDS Testing Sample Shape: {cicids_test.shape}")
print(f"CICIDS Training Sample Shape: {cicids_train.shape}")

CICIDS Dataframe Shape Before: (1859563, 79)
Memory usage of properties dataframe is : 1092.4252853393555  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  425.6203155517578  MB
This is  38.96104578168395 % of the initial size
fully duplicate rows to remove: 300645
CICIDS Dataframe Shape After cleaning: (1556798, 77)
CICIDS Testing Sample Shape: (389200, 1, 76)
CICIDS Training Sample Shape: (306398, 1, 76)


This is our CICIDS model using the LSTM architecture.

In [5]:
cicids_model = keras.Sequential([
    keras.layers.LSTM(units=256, input_shape=(cicids_train.shape[1], cicids_train.shape[2]), return_sequences=True),
    keras.layers.Dropout(0.5),
    keras.layers.LSTM(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')
])

In [6]:
cicids_model.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

In [7]:
cicids_results = cicids_model.fit(cicids_train, cicids_label_train_smote, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
clean_preds = make_predictions(cicids_model, cicids_test, cicids_lables_test)
calculate_metrics(cicids_lables_test, clean_preds)

Accuracy: 0.98
Precision: 0.85
Recall: 0.99
F1 Score: 0.91


This is our UNSW Model using the LSTM architecture.

In [9]:
unsw_model = keras.Sequential([
    keras.layers.LSTM(units=256, input_shape=(unsw_train.shape[1], unsw_train.shape[2]), return_sequences=True),
    keras.layers.Dropout(0.5),
    keras.layers.LSTM(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')
])

In [10]:
unsw_model.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

In [11]:
unsw_results = unsw_model.fit(unsw_train, unsw_label_train_smote, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
unsw_clean_preds = make_predictions(unsw_model, unsw_test, unsw_lables_test)
calculate_metrics(unsw_lables_test, unsw_clean_preds)

Accuracy: 0.95
Precision: 0.99
Recall: 0.93
F1 Score: 0.96


Generating the adversarial examples

---



CICIDS

In [13]:
cicids_test = tf.convert_to_tensor(cicids_test)
cicids_lables = tf.reshape(cicids_lables_test, (cicids_lables_test.shape[0], 1))
c_adv_x = create_adversarial_examples(cicids_test, cicids_lables, cicids_model) + cicids_test
c_adv_preds = make_predictions(cicids_model, c_adv_x, cicids_lables_test)
calculate_metrics(cicids_lables_test, c_adv_preds)

Accuracy: 0.09
Precision: 0.02
Recall: 0.14
F1 Score: 0.04


UNSW

In [14]:
# Convert the test set and labels to tensors
unsw_test = tf.convert_to_tensor(unsw_test)
unsw_labels = tf.reshape(unsw_lables_test, (unsw_lables_test.shape[0], 1))

# Create adversarial examples
u_adv_x = create_adversarial_examples(unsw_test, unsw_labels, unsw_model) + unsw_test

# Predict on the adversarial examples
u_adv_preds = make_predictions(unsw_model, u_adv_x, unsw_lables_test)
calculate_metrics(unsw_lables_test, u_adv_preds)

Accuracy: 0.15
Precision: 0.18
Recall: 0.07
F1 Score: 0.10


Visualizing Things

In [15]:
!pip install graphviz



In [16]:
print("Fig. 1, the summarization of the LSTM Model Trained on the UNSW Dataset")
print(unsw_model.summary())
print("Fig. 2, the summarization of the LSTM Model Trained on the CICIDS Dataset")
print(cicids_model.summary())

Fig. 1, the summarization of the LSTM Model Trained on the UNSW Dataset
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 1, 256)            322560    
                                                                 
 dropout_3 (Dropout)         (None, 1, 256)            0         
                                                                 
 lstm_3 (LSTM)               (None, 128)               197120    
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dropout_5 (Dropout)         (None, 32)                0         
                                                

In [17]:
plot_model(cicids_model, to_file='cicids_model_plot.png', show_shapes=True, show_layer_names=True)

NameError: name 'plot_model' is not defined

In [None]:
def plot_cm(labels, preds, title):
  conf_matrix = confusion_matrix(labels, preds)

# Step 5: Plot the confusion matrix using seaborn for better visualization
  plt.figure(figsize=(8, 6))
  sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Negative', 'Predicted Positive'], yticklabels=['Actual Negative', 'Actual Positive'])
  plt.xlabel('Predicted Labels')
  plt.ylabel('True Labels')
  plt.title(title)
  plt.show()


In [None]:
print(len(cicids_lables_test))
print(len(clean_preds))
print(len(unsw_lables_test))
print(len(unsw_clean_preds))
print(len(c_adv_preds))
print(len(u_adv_preds))

In [None]:
flat_preds = clean_preds.flatten()
flat_unsw_preds = unsw_clean_preds.flatten()
flat_c_adv_preds = c_adv_preds.flatten()
flat_u_adv_preds = u_adv_preds.flatten()
print(flat_preds.shape)
print(flat_unsw_preds.shape)
print(flat_c_adv_preds.shape)
print(flat_u_adv_preds.shape)

In [None]:
print(cicids_lables_test.shape)
print(flat_preds.shape)
print(unsw_lables_test.shape)
print(unsw_clean_preds.shape)
print(c_adv_preds.shape)
print(u_adv_preds.shape)

In [None]:
binary_lables = np.where(cicids_lables_test >= 0.5, 1, 0)
binary_unsw_lables = np.where(unsw_lables_test >= 0.5, 1, 0)
flat_lables = binary_lables.flatten()
flat_unsw_lables = binary_unsw_lables.flatten()

In [42]:
confusion_matrix(flat_lables, flat_preds)
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Negative', 'Predicted Positive'], yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('CICIDS Clean Predictions')
plt.show()

ValueError: Must pass 2-d input. shape=()

<Figure size 800x600 with 0 Axes>