In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount(
    '/datasets/'
)

from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from imblearn.over_sampling import SMOTE
import imblearn
import pandas as pd
import os
np.random.seed(0)
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

Mounted at /datasets/


In [2]:
unsw_testing_df = pd.read_csv('/datasets/MyDrive/datasets/UNSW_NB15_testing-set.csv')
unsw_train_df = pd.read_csv('/datasets/MyDrive/datasets/UNSW_NB15_training-set.csv')
cicids_files = ['/datasets/MyDrive/datasets/Wednesday-workingHours.pcap_ISCX.csv',
                '/datasets/MyDrive/datasets/Tuesday-WorkingHours.pcap_ISCX.csv',
                '/datasets/MyDrive/datasets/Friday-WorkingHours-Morning.pcap_ISCX.csv',
                '/datasets/MyDrive/datasets/Monday-WorkingHours.pcap_ISCX.csv'
                ]
cicids_df = pd.concat((pd.read_csv(f) for f in cicids_files), ignore_index=True)

In [3]:
def one_hot(df):
    categorical_cols = ['proto','service','state']
    for col in categorical_cols:
        dummies = pd.get_dummies(df[col].astype({col: 'str'}),prefix=col, dtype=int)
        df = pd.concat([df,dummies],axis=1)
        df = df.drop(col,axis=1)

    return df

def scaling(df, df_columns):
    """
        This will be used to scale the data in the df to [0,1].

        Will be done using the Min-max feature scaling technique
        to bring all the values into the range [0,1]
    """
    new_normalized_df = df.copy()
    for column in df_columns:
        max_value = df[column].max()
        min_value = df[column].min()
        if max_value > min_value:
            new_normalized_df[column] = (new_normalized_df[column] - min_value) / (max_value - min_value)

    return new_normalized_df

def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings

            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()

            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all():
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)

            # test if column can be converted to an integer
            asint = props[col].fillna(0).replace([np.inf, -np.inf], 0).astype(np.int64) # Replace inf values with 0
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True


            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)

            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [4]:
df_cat = unsw_train_df.select_dtypes(exclude=[np.number])
print(df_cat.describe(include='all'))
DEBUG = 0
## reducing the amount of uniques in each feature
for feature in df_cat.columns:
    if DEBUG == 1:
        print(feature)
        print('nunique = '+str(df_cat[feature].nunique()))
        print(df_cat[feature].nunique()>7)
        print(sum(unsw_train_df[feature].isin(unsw_train_df[feature].value_counts().head().index)))
        print('----------------------------------------------------')

    if df_cat[feature].nunique()>8:
        unsw_train_df[feature] = np.where(unsw_train_df[feature].isin(unsw_train_df[feature].value_counts().head().index), unsw_train_df[feature], 'Combined')
# taking out the attack category
unsw_attack_cat = unsw_train_df.pop('attack_cat')

# going to reduce the amount of memory the dataset takes
unsw_train_df.columns = unsw_train_df.columns.str.strip()
unsw_df, NAList = reduce_mem_usage(unsw_train_df)
# replacing infinity, and neg infinity values with NaN then dropping them
unsw_df.replace([np.inf, -np.inf], np.nan, inplace=True)
unsw_df.dropna(inplace=True)
# dropping duplicates
print(f"Fully duplicate rows to drop: {unsw_df.duplicated().sum()}")
unsw_df.drop_duplicates(inplace=True)
unsw_df.reset_index(drop=True, inplace=True)
# one hot encoding the entire dataframe
unsw_e_df = one_hot(unsw_df)
# scaling the dataframe using our scaling() function
unsw_s_df = scaling(unsw_e_df, unsw_e_df.columns)
# dropping the label column in order to undersample
unsw_unlabled_df = unsw_s_df.drop('label', axis=1)
# setting the labels to a var so we don't lose them
lables = unsw_s_df['label']
print(f"UNSW Dataframe Shape After cleaning: {unsw_df.shape}")
# splitting the dataframe into testing and training with a 75/25 split
unsw_unlabled_df_train, unsw_unlabeled_df_test, unsw_lables_train, unsw_lables_test = train_test_split(unsw_unlabled_df, lables, train_size=0.75, random_state=42)
# under-sampling the dataset using SMOTE
under = RandomUnderSampler(sampling_strategy=1)
unsw_train_smote, unsw_label_train_smote = under.fit_resample(unsw_unlabled_df_train, unsw_lables_train)
# adding the labels back to the dataframe
unsw_train_df_smote = pd.concat([unsw_train_smote, unsw_label_train_smote], axis=1)
# converting to a numpy array so we can reshape the input for our models
unsw_nump_train = unsw_train_smote.to_numpy()
unsw_nump_test = unsw_unlabeled_df_test.to_numpy()
unsw_train = unsw_nump_train.reshape(unsw_nump_train.shape[0], 1, unsw_nump_train.shape[1])
unsw_test = unsw_nump_test.reshape(unsw_nump_test.shape[0], 1, unsw_nump_test.shape[1])
print(f"UNSW Testing Sample Shape: {unsw_test.shape}")
print(f"UNSW Training Sample Shape: {unsw_train.shape}")

         proto service   state attack_cat
count   175341  175341  175341     175341
unique     133      13       9         10
top        tcp       -     INT     Normal
freq     79946   94168   82275      56000
Memory usage of properties dataframe is : 58.860931396484375  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  20.902398109436035  MB
This is  35.51149737784218 % of the initial size
Fully duplicate rows to drop: 0
UNSW Dataframe Shape After cleaning: (175341, 44)
UNSW Testing Sample Shape: (43836, 1, 58)
UNSW Training Sample Shape: (84058, 1, 58)


Preliminary Work on CICIDS Dataset

  To simplify the modeling, I am going to create a mal_or_not column. 0 is for benign and 1 is for malicious. Then I will just drop the Label column

In [5]:
print(f"CICIDS Dataframe Shape Before: {cicids_df.shape}")
cicids_df.loc[cicids_df[' Label'] != "BENIGN", 'mal_or_not'] = 1
cicids_df.loc[cicids_df[' Label'] == "BENIGN", 'mal_or_not'] = 0
# pop the label off, then scale as it is a categorical column
attack_labels = cicids_df.pop(" Label")
drop_columns = [ # this list includes all spellings across CIC NIDS datasets
    "Flow ID",
    'Fwd Header Length.1',
    "Source IP", "Src IP",
    "Source Port", "Src Port",
    "Destination IP", "Dst IP",
    "Destination Port", "Dst Port",
    "Timestamp",
]
temp = cicids_df
temp.columns = temp.columns.str.strip() # deleting the trailing whitespaces if there are any
temp.drop(columns=drop_columns, inplace=True, errors='ignore')
temp_df, NAList = reduce_mem_usage(temp)
temp_df.replace([np.inf, -np.inf], np.nan, inplace=True)
temp_df.dropna(inplace=True)
print(f"fully duplicate rows to remove: {temp_df.duplicated().sum()}")
temp_df.drop_duplicates(inplace=True)
temp_df.reset_index(drop=True, inplace=True)
cicids_df = temp_df
# scaling the dataframe using our scaling() function
cicids_s_df = scaling(cicids_df, cicids_df.columns)
# dropping the label column in order to undersample
cicids_unlabled_df = cicids_s_df.drop('mal_or_not', axis=1)
# setting the labels to a var so we don't lose them
lables = cicids_s_df['mal_or_not']
print(f"CICIDS Dataframe Shape After cleaning: {cicids_df.shape}")
# splitting the dataframe into testing and training with a 75/25 split
cicids_unlabled_df_train, cicids_unlabeled_df_test, cicids_lables_train, cicids_lables_test = train_test_split(cicids_unlabled_df, lables, train_size=0.75, random_state=42)
# under-sampling the dataset using SMOTE
under = RandomUnderSampler(sampling_strategy=1)
cicids_train_smote, cicids_label_train_smote = under.fit_resample(cicids_unlabled_df_train, cicids_lables_train)
# adding the labels back to the dataframe
cicids_train_df_smote = pd.concat([cicids_train_smote, cicids_label_train_smote], axis=1)
# converting to a numpy array so we can reshape the input for our models
cicids_nump_train = cicids_train_smote.to_numpy()
cicids_nump_test = cicids_unlabeled_df_test.to_numpy()
cicids_train = cicids_nump_train.reshape(cicids_nump_train.shape[0], 1, cicids_nump_train.shape[1])
cicids_test = cicids_nump_test.reshape(cicids_nump_test.shape[0], 1, cicids_nump_test.shape[1])
print(f"CICIDS Testing Sample Shape: {cicids_test.shape}")
print(f"CICIDS Training Sample Shape: {cicids_train.shape}")

CICIDS Dataframe Shape Before: (1859563, 79)
Memory usage of properties dataframe is : 1092.4252853393555  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  425.6203155517578  MB
This is  38.96104578168395 % of the initial size
fully duplicate rows to remove: 300645
CICIDS Dataframe Shape After cleaning: (1556798, 77)
CICIDS Testing Sample Shape: (389200, 1, 76)
CICIDS Training Sample Shape: (306398, 1, 76)


In [6]:
def list_different_columns(df1, df2):
    """
    List the different columns between two pandas DataFrames.

    Parameters:
    - df1: pandas DataFrame, first DataFrame.
    - df2: pandas DataFrame, second DataFrame.

    Returns:
    - A dictionary with keys 'unique_to_df1' and 'unique_to_df2' containing the column names unique to each DataFrame.
    """
    # Extract column names
    columns_df1 = set(df1.columns)
    columns_df2 = set(df2.columns)

    # Find columns unique to each DataFrame
    unique_to_df1 = columns_df1 - columns_df2
    unique_to_df2 = columns_df2 - columns_df1

    return {
        'unique_to_df1': list(unique_to_df1),
        'unique_to_df2': list(unique_to_df2)
    }
different_columns = list_different_columns(cicids_unlabeled_df_test, cicids_train_smote)

print("Columns unique to test:", different_columns['unique_to_df1'])
print("Columns unique to train:", different_columns['unique_to_df2'])

Columns unique to test: []
Columns unique to train: []


Now to build the MLP Deep Learning Network

In [7]:
cicids_model = keras.Sequential([
    keras.layers.Dense(64, input_shape=(cicids_train.shape[1], cicids_train.shape[2])),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [8]:
cicids_model.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

In [9]:
cicids_res = cicids_model.fit(cicids_train, cicids_label_train_smote, epochs=10, validation_split=0.1, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Displaying the metrics for the current model

In [10]:
def calculate_metrics(lables, predictions):
  true_lables = lables.values.flatten()
  predicted_labels = predictions.flatten()

  accuracy = accuracy_score(true_lables, predicted_labels)
  precision = precision_score(true_lables, predicted_labels)
  recall = recall_score(true_lables, predicted_labels)
  f1 = f1_score(true_lables, predicted_labels)
  print(f'Accuracy: {accuracy:.2f}')
  print(f'Precision: {precision:.2f}')
  print(f'Recall: {recall:.2f}')
  print(f'F1 Score: {f1:.2f}')

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
def make_predictions(model, data, labels):
    predictions = model.predict(data)
    binary_predictions = np.where(predictions >= 0.5, 1, 0)
    return binary_predictions

clean_preds = make_predictions(cicids_model, cicids_test, cicids_lables_test)



In [12]:
calculate_metrics(cicids_lables_test, clean_preds)

Accuracy: 0.98
Precision: 0.85
Recall: 0.99
F1 Score: 0.92


Building and Modeling the same model with the UNSW dataset

In [13]:
unsw_model = keras.Sequential([
    keras.layers.Dense(64, input_shape=(unsw_train.shape[1], unsw_train.shape[2])),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [14]:
unsw_model.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

In [15]:
unsw_res = unsw_model.fit(unsw_train, unsw_label_train_smote, epochs=10, validation_split=0.1, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
unsw_clean_preds = make_predictions(unsw_model, unsw_test, unsw_lables_test)
calculate_metrics(unsw_lables_test, unsw_clean_preds)

Accuracy: 0.96
Precision: 0.99
Recall: 0.95
F1 Score: 0.97


Generating and fitting adversarial examples for each model:


---



In [18]:
loss_obj = keras.losses.BinaryCrossentropy()
def create_adversarial_examples(input, labels, model):
  with tf.GradientTape() as tape:
    tape.watch(input)
    predictions = model(input)
    loss = loss_obj(labels, predictions)
  gradient = tape.gradient(loss, input)
  signed_grad = tf.sign(gradient)
  return signed_grad

CICIDS

---



In [19]:
cicids_test = tf.convert_to_tensor(cicids_test)
cicids_lables = tf.reshape(cicids_lables_test, (cicids_lables_test.shape[0], 1))
c_adv_x = create_adversarial_examples(cicids_test, cicids_lables, cicids_model) + cicids_test
c_adv_preds = make_predictions(cicids_model, c_adv_x, cicids_lables_test)



In [20]:
calculate_metrics(cicids_lables_test, c_adv_preds)

Accuracy: 0.77
Precision: 0.33
Recall: 0.76
F1 Score: 0.46


UNSW

---



In [21]:

# Assuming create_adversarial_examples function is defined earlier

# Convert the test set and labels to tensors
unsw_test = tf.convert_to_tensor(unsw_test)
unsw_labels = tf.reshape(unsw_lables_test, (unsw_lables_test.shape[0], 1))

# Create adversarial examples
u_adv_x = create_adversarial_examples(unsw_test, unsw_labels, unsw_model) + unsw_test

# Predict on the adversarial examples
u_adv_preds = make_predictions(unsw_model, u_adv_x, unsw_lables_test)
calculate_metrics(unsw_lables_test, u_adv_preds)

Accuracy: 0.57
Precision: 0.83
Recall: 0.47
F1 Score: 0.60


Visualize Things

In [22]:
print("Fig. 3, the summarization of the MLP Model Trained on the UNSW Dataset")
print(unsw_model.summary())
print("Fig. 4, the summarization of the MLP Model Trained on the CICIDS Dataset")
print(cicids_model.summary())

Fig. 3, the summarization of the MLP Model Trained on the UNSW Dataset
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 1, 64)             3776      
                                                                 
 dense_6 (Dense)             (None, 1, 64)             4160      
                                                                 
 dense_7 (Dense)             (None, 1, 32)             2080      
                                                                 
 dense_8 (Dense)             (None, 1, 16)             528       
                                                                 
 dense_9 (Dense)             (None, 1, 1)              17        
                                                                 
Total params: 10561 (41.25 KB)
Trainable params: 10561 (41.25 KB)
Non-trainable params: 0 (0.00 Byte)
_____________

In [25]:
conf_matrix1 = confusion_matrix(cicids_lables_test, clean_preds)
conf_matrix2 = confusion_matrix(unsw_lables_test, unsw_clean_preds)
conf_matrix3 = confusion_matrix(cicids_lables_test, c_adv_preds)
conf_matrix4 = confusion_matrix(unsw_lables_test, u_adv_preds)

# Step 5: Plot the confusion matrix using seaborn for better visualization
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Negative', 'Predicted Positive'], yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

ValueError: Classification metrics can't handle a mix of binary and unknown targets