# Importing libraries

In [None]:
%pip install pandas seaborn scikit-learn scikit-plot
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

# Loading and Checking the dataset

We chose the UNSW_NB15 dataset for this IDS project.

The dataset is available on Kaggle: [UNSW_NB15 dataset](https://www.kaggle.com/datasets/mrwellsdavid/unsw-nb15).

In the downloaded files, the names of the training and testing sets are swapped, so we renamed the CSV files before loading them.

In [None]:
# Read the two UNSW_NB15 splits from their CSV files, in (training, testing) order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./UNSW_NB15/UNSW_NB15_training-set.csv",
        "./UNSW_NB15/UNSW_NB15_testing-set.csv",
    )
)
# Report how many rows each split contains.
print("Length of training set: ", df_train.shape[0])
print("Length of testing set: ", df_test.shape[0])

To keep the training and testing sets balanced and to avoid running the preprocessing twice, we concatenate them into one DataFrame and re-split it later with a different ratio using *sklearn.model_selection.train_test_split()*.

In [None]:
# Stack the two splits into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# split keeps its own row labels, so labels repeat in the combined frame and
# label-based selection (e.g. df.loc[i]) can match more than one row.
df = pd.concat([df_train, df_test], ignore_index=True)
# information about the dataset
df.info()

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric columns
df.describe(include="all")

In [None]:
# Preview the first rows of the combined frame
df.head()

In [None]:
# Remove unnecessary features
# Rebind `df` rather than dropping in place, and ignore columns that are
# already gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=["proto", "service", "state"], errors="ignore")

In [None]:
# Bar chart of how many records fall into each attack category.
# Title and axis labels let the figure stand on its own; the trailing
# semicolon keeps the Axes repr out of the cell output.
df['attack_cat'].value_counts().plot.bar(
    title="Records per attack category",
    xlabel="attack_cat",
    ylabel="Number of records",
);

Replace the attack category names with integer codes using the following mapping:

In [None]:
# Attack category names, ordered so that each name's position is its integer code.
attack_categories = [
    'Normal', 'Generic', 'Exploits', 'Fuzzers', 'DoS',
    'Reconnaissance', 'Analysis', 'Backdoor', 'Shellcode', 'Worms',
]
# The codes are written as strings first ('0' .. '9'); the column is cast to int afterwards.
category_codes = [str(position) for position in range(len(attack_categories))]
df['attack_cat'] = df['attack_cat'].replace(attack_categories, category_codes).astype(int)
# Show the column dtypes after the conversion.
df.info()

# Upsample classes for poisoning detection

In [None]:
# Separate the features and labels
y = df['attack_cat']
# NOTE(review): every remaining column other than the two targets is used as a
# feature; if the CSVs carry a row-identifier column it is still in X — confirm.
X = df.drop(['attack_cat', 'label'], axis=1)
# Target number of samples per class after resampling (class code -> count).
# Classes that are not listed here are left at their original size.
target_samples = {0: 400000, 1: 100000}
# SMOTE generates synthetic samples at random; fixing random_state makes the
# resampled dataset identical on every run of the notebook.
smote = SMOTE(sampling_strategy=target_samples, random_state=42)
# Upsample the dataset
X_resampled, y_resampled = smote.fit_resample(X, y)
# Create a new DataFrame with the resampled features and the label column side by side
df_resampled = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled, columns=['attack_cat'])], axis=1)
# Verify the class distribution after upsampling
print(df_resampled['attack_cat'].value_counts())

In [None]:
# Show the resampled frame as the cell's output
df_resampled

# Save the dataset to files in the expected format

In [None]:
# Convert the resampled labels and features to NumPy arrays
y = y_resampled.to_numpy()
X = X_resampled.to_numpy()
# Write the label array to Y_attack.npy, then the feature array to X.npy
np.save("Y_attack.npy", y)
np.save("X.npy", X)

## Checking for duplicates

In [None]:
# Count the rows that repeat an earlier row in every column, then print the count
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

There are no duplicate records.

## Checking for missing values

In [None]:
# Number of missing entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

There are no missing values.

## Checking the balance between benign and attack data

In [None]:
# Bar chart of how many records carry each value of the binary `label` column.
# Title and axis labels let the figure stand on its own; the trailing
# semicolon keeps the Axes repr out of the cell output.
df['label'].value_counts().plot.bar(
    title="Records per label",
    xlabel="label",
    ylabel="Number of records",
);

In [None]:
# Share of records per `label` value (normalize=True returns fractions instead of counts)
df['label'].value_counts(normalize=True)

The attack and normal classes are not equally represented, but the imbalance is only slight.
Therefore, we will not apply a resampling fix here.