In [1]:
%pip install pandas numpy matplotlib scipy scikit-learn scikit-plot

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import time

In [3]:
train_df = pd.read_csv('Data/UNSW_NB15_training-set.csv')
test_df = pd.read_csv('Data/UNSW_NB15_testing-set.csv')

In [4]:
print(f"Training set shape: {train_df.shape}")
print(f"Testing set shape: {test_df.shape}")

Training set shape: (175341, 45)
Testing set shape: (82332, 45)


In [5]:
# 1 Pre-process data

In [6]:
#1.1: drop "id" column
train_df.drop(columns=["id"], inplace=True, errors='ignore')
test_df.drop(columns=["id"], inplace=True, errors='ignore')

In [7]:
#1.2: Fill null values in "service" col
train_df['service'] = train_df['service'].replace('-', 'other').fillna('other')
test_df['service'] = test_df['service'].replace('-', 'other').fillna('other')

In [8]:
#1.3: One-hot encoding proto, service, state
nominal_features = ['proto', 'service', 'state']
train_df = pd.get_dummies(train_df, columns=nominal_features)
test_df = pd.get_dummies(test_df, columns=nominal_features)
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

print(train_df.shape)
print(test_df.shape)

(175341, 196)
(82332, 196)


In [9]:
from sklearn.preprocessing import LabelEncoder

y_train_binary = train_df['label']
y_test_binary = test_df['label']

y_train_multiclass = train_df['attack_cat']
y_test_multiclass = test_df['attack_cat']

le = LabelEncoder()
y_train_multiclass = le.fit_transform(y_train_multiclass)
y_test_multiclass = le.transform(y_test_multiclass)

X_train_base = train_df.drop(columns = ['label', 'attack_cat'])
X_test_base = test_df.drop(columns = ['label', 'attack_cat'])


print(X_train_base.shape)

(175341, 194)


In [10]:
#2 Feature reduction

k=4 #number of features to keep

In [11]:
# 2.1: Feature Selection (correlation matrix) 
start_select_time_train = time.time()                  #to calculate timeFR_train
correlation_matrix = X_train_base.corr()
C_i = correlation_matrix.mean()
selected_features = C_i.sort_values(ascending=False).head(k).index.tolist()
X_train_sel = X_train_base[selected_features]
FS_train_time = time.time() - start_select_time_train  #feature selection train time

start_select_time_test = time.time()                   #to calculate timeFR_test
X_test_sel = X_test_base[selected_features]
FS_test_time = time.time() - start_select_time_test    #feature selection time

print(selected_features)
print(X_train_sel)

['dpkts', 'spkts', 'dbytes', 'dloss']
        dpkts  spkts  dbytes  dloss
0           4      6     172      0
1          38     14   42014     17
2          16      8   13186      6
3          12     12     770      3
4           6     10     268      1
...       ...    ...     ...    ...
175336      0      2       0      0
175337      8     10     354      1
175338      0      2       0      0
175339      0      2       0      0
175340      0      2       0      0

[175341 rows x 4 columns]


In [12]:
# 2.2: Feature Extraction (PCA) 
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA 

scaler = MinMaxScaler()
pca= PCA(n_components=k)

start_extract_time_train = time.time(); #to calculate timeFR_train
X_train_scaled = scaler.fit_transform(X_train_base)
X_train_ext = pca.fit_transform(X_train_scaled)
FE_train_time = time.time() - start_extract_time_train 

start_extract_time_test = time.time(); #to calculate timeFR_test
X_test_scaled = scaler.transform(X_test_base) 
X_test_ext = pca.transform(X_test_scaled)
FE_test_time = time.time() - start_extract_time_test 

print(X_train_ext)

[[ 1.38882863  0.28883011  0.40652493 -0.39708978]
 [ 1.5398475   0.08039593 -0.05993712 -0.41949462]
 [ 1.59565277  0.13570403 -0.08215488 -0.35166054]
 ...
 [-1.44790597 -0.51825894  0.09849325  0.06423195]
 [-1.632304   -0.80224102  0.34373997 -0.20259839]
 [-1.63504559 -0.80642334  0.34753581 -0.20509896]]


In [13]:
#3: Attact classifiers  
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import precision_score, recall_score, f1_score

models = {
    "Decision Tree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(max_depth=5),
    "KNeighbors": KNeighborsClassifier(n_neighbors=5),
    "MLP": MLPClassifier(max_iter=100, hidden_layer_sizes=200),
    "Naive Bayes": BernoulliNB()
}

In [14]:
#4: Evaluate metrics 

def get_metrics(model, X_train, y_train, X_test, y_test, FR_train_time, FR_test_time):
    
    #training time = fit time + feature reduction train time
    start_train=time.time()
    model.fit(X_train, y_train)
    total_train_time= time.time() - start_train + FR_train_time

    #inference time = predict time + feature reduction test time
    start_test = time.time()
    y_pred = model.predict(X_test)
    total_infer_time = time.time() - start_test + FR_test_time
    #convert to Microseconds per Sample
    n_samples = len(y_test)
    inference_us_per_sample = (total_infer_time / n_samples) * 1_000_000 
    
    #precision, recall, f1-score
    p = precision_score(y_test, y_pred, average='weighted') * 100
    r = recall_score(y_test, y_pred, average='weighted') * 100
    f1 = f1_score(y_test, y_pred, average='weighted') * 100

    return [p, r, f1, total_train_time, inference_us_per_sample]

In [15]:
#5.1: Display binary classification results
results_data = []

for name, model in models.items():
    print(f"Evaluating {name}...")
    
    # Feature Extraction results 
    from sklearn.base import clone
    model_ext = clone(model)
    ext_metrics = get_metrics(model_ext, X_train_ext, y_train_binary, X_test_ext, y_test_binary, FE_train_time, FE_test_time)
    
    # Feature Selection results 
    model_sel = clone(model)
    sel_metrics = get_metrics(model_sel, X_train_sel, y_train_binary, X_test_sel, y_test_binary, FS_train_time, FS_test_time)
    
    row = [name] + ext_metrics + sel_metrics
    results_data.append(row)

# Format dataframe
columns = pd.MultiIndex.from_product(
    [["Feature Extraction", "Feature Selection"], 
     ["P", "R", "F1", "Training (s)", "Inference (us)"]]
)

df_results = pd.DataFrame(
    [r[1:] for r in results_data], 
    index=[r[0] for r in results_data], 
    columns=columns
)

display(df_results.round(2))
print("4 selected/extracted features and binary classification results")

Evaluating Decision Tree...
Evaluating RandomForest...
Evaluating KNeighbors...
Evaluating MLP...




Evaluating Naive Bayes...


Unnamed: 0_level_0,Feature Extraction,Feature Extraction,Feature Extraction,Feature Extraction,Feature Extraction,Feature Selection,Feature Selection,Feature Selection,Feature Selection,Feature Selection
Unnamed: 0_level_1,P,R,F1,Training (s),Inference (us),P,R,F1,Training (s),Inference (us)
Decision Tree,85.76,84.67,84.37,2.67,5.24,84.09,79.89,78.75,17.28,0.12
RandomForest,85.75,81.35,80.31,20.39,9.5,79.14,75.88,74.52,22.15,5.02
KNeighbors,86.24,84.7,84.32,1.77,11.08,52.56,48.06,42.89,17.35,1190.14
MLP,85.81,81.87,80.95,466.41,6.18,76.79,75.3,74.42,131.65,2.19
Naive Bayes,72.55,71.9,71.12,1.58,5.21,75.48,73.63,73.59,17.19,0.2


4 selected/extracted features and binary classification results


In [None]:
#5.2: Display multi-class classification results
results_data_1 = []

for name, model in models.items():
    print(f"Evaluating {name}...")
    
    # Feature Extraction results 
    from sklearn.base import clone
    model_ext = clone(model)
    ext_metrics = get_metrics(model_ext, X_train_ext, y_train_multiclass, X_test_ext, y_test_multiclass, FE_train_time, FE_test_time)
    
    # Feature Selection results 
    model_sel = clone(model)
    sel_metrics = get_metrics(model_sel, X_train_sel, y_train_multiclass, X_test_sel, y_test_multiclass, FS_train_time, FS_test_time)
    
    row = [name] + ext_metrics + sel_metrics
    results_data_1.append(row)

# Format dataframe
columns = pd.MultiIndex.from_product(
    [["Feature Extraction", "Feature Selection"], 
     ["P", "R", "F1", "Training (s)", "Inference (us)"]]
)

df_results = pd.DataFrame(
    [r[1:] for r in results_data_1], 
    index=[r[0] for r in results_data_1], 
    columns=columns
)

display(df_results.round(2))
print("4 selected/extracted features and multi-class classification results")

Evaluating Decision Tree...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Evaluating RandomForest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Evaluating KNeighbors...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Evaluating MLP...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Evaluating Naive Bayes...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0_level_0,Feature Extraction,Feature Extraction,Feature Extraction,Feature Extraction,Feature Extraction,Feature Selection,Feature Selection,Feature Selection,Feature Selection,Feature Selection
Unnamed: 0_level_1,P,R,F1,Training (s),Inference (us),P,R,F1,Training (s),Inference (us)
Decision Tree,76.16,67.39,70.97,3.1,5.33,69.48,61.26,61.18,17.33,0.22
RandomForest,77.64,64.18,65.78,24.84,13.24,55.78,58.85,55.43,22.68,7.6
KNeighbors,77.93,69.41,72.64,1.77,11.07,49.79,45.48,35.47,17.34,447.9
MLP,78.81,68.58,69.63,1071.54,6.64,58.18,62.44,54.93,233.96,6.24
Naive Bayes,62.75,50.8,53.74,1.65,5.38,41.55,59.68,48.54,17.4,0.44


4 selected/extracted features and multi-class classification results
