#### Let's first install and import the necessary libraries

In [4]:
import os
import pandas as pd
import numpy as np
import zipfile

from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix
from codecarbon import EmissionsTracker


In [5]:
# Unpack the training archive, then the testing archive, into the current
# working directory.
extract_path = './'
for zip_path in ('UNSW_NB15_training-set.zip', 'UNSW_NB15_testing-set.zip'):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

#### Let's load our data

First, load the train and test datasets.

In [2]:
# Read each CSV split into its own DataFrame (comma is read_csv's default separator).
X_train = pd.read_csv("UNSW_NB15_training-set.csv")
X_test = pd.read_csv("UNSW_NB15_testing-set.csv")

In [3]:
# Pull the "label" column out of each split to use as the prediction target.
y_train, y_test = X_train["label"], X_test["label"]

Let's keep only the feature columns (X) for this problem: remove the "id" and "attack_cat" columns, as well as the "label" column, which has already been extracted as the target above.

In [4]:
# Remove the identifier, the target and the attack category in a single call.
X_train = X_train.drop(columns=["id", "label", "attack_cat"])

In [5]:
# Remove the identifier, the target and the attack category in a single call.
X_test = X_test.drop(columns=["id", "label", "attack_cat"])

In [6]:
# Fit the Khiops classifier on the training split.
# NOTE(review): n_trees=0 presumably turns off tree-based feature
# construction — confirm against the Khiops documentation.
kc_model = KhiopsClassifier(n_trees=0)
kc_model.fit(X_train, y_train)

# Score the fitted model on the training split first, then on the test split.
# After the loop, y_pred and accuracy hold the test-split values.
khiops_reports = (
    ("Train Accuracy with  Khiops: {}", X_train, y_train),
    ("Test Accuracy with Khiops: {}", X_test, y_test),
)
for report_template, split_features, split_target in khiops_reports:
    y_pred = kc_model.predict(split_features)
    accuracy = accuracy_score(split_target, y_pred)
    print(report_template.format(accuracy))

# Optional: confusion matrix on the test split.
#cm = confusion_matrix(y_test, y_pred)
#print("test confusion_matrix:")
#print(cm)

Train Accuracy with  Khiops: 0.9215979206140991
Test Accuracy with Khiops: 0.8989226706817002


In [7]:
# Summarise how many features Khiops evaluated and how many it kept.
print(f"Features evaluated: {kc_model.n_features_evaluated_}")
print(f"Features selected : {kc_model.n_features_used_}")

# Walk the selected feature names together with their importance rows;
# the value at position 2 of each row is the one reported as "Importance".
selected_features = zip(kc_model.feature_used_names_, kc_model.feature_used_importances_)
for feature, importance_row in selected_features:
    print(f"{feature} - Importance: {importance_row[2]}")
print("---")

Features evaluated: 42
Features selected : 14
sbytes - Importance: 0.610312
sload - Importance: 0.391522
sttl - Importance: 0.366805
dbytes - Importance: 0.304552
smean - Importance: 0.302211
synack - Importance: 0.29232
service - Importance: 0.278198
ct_srv_dst - Importance: 0.205425
response_body_len - Importance: 0.19539
tcprtt - Importance: 0.182536
ct_dst_sport_ltm - Importance: 0.120696
is_sm_ips_ports - Importance: 0.113279
ct_flw_http_mthd - Importance: 0.0184623
ct_ftp_cmd - Importance: 0.00544889
---


In [8]:
# Integer-encode the categorical columns, overwriting them in X_train and X_test.
# NOTE(review): `tracker` is not created in the visible cells — presumably an
# EmissionsTracker instance; confirm before uncommenting.
#tracker.start()
categorical_cols = ['proto', 'service', 'state']
label_encoders = {}

for col in categorical_cols:
    # Cast each split to text once so it can be reused for fitting and transforming.
    train_text = X_train[col].astype(str)
    test_text = X_test[col].astype(str)

    # Fit on the union of train and test values so that no category seen in
    # either split is unknown to the encoder at transform time.
    all_values = pd.concat([train_text, test_text]).unique()
    le = LabelEncoder().fit(all_values)

    # Replace the text column with its integer codes in both splits.
    X_train[col] = le.transform(train_text)
    X_test[col] = le.transform(test_text)

    # Keep the fitted encoder, keyed by column name.
    label_encoders[col] = le
#tracker.stop()

In [9]:
# Fit a 100-tree random forest with a fixed seed on the training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
#tracker.start()
rf_model.fit(X_train, y_train)
#tracker.stop()

# Score the fitted model on the training split first, then on the test split.
# After the loop, y_pred and accuracy hold the test-split values.
rf_reports = (
    ("Train Accuracy with RF: {}", X_train, y_train),
    ("Test Accuracy with RF: {}", X_test, y_test),
)
for report_template, split_features, split_target in rf_reports:
    y_pred = rf_model.predict(split_features)
    accuracy = accuracy_score(split_target, y_pred)
    print(report_template.format(accuracy))

# Optional: confusion matrix on the test split.
#cm = confusion_matrix(y_test, y_pred)
#print("test confusion_matrix:")
#print(cm)

Train Accuracy with RF: 0.9999271243259
Test Accuracy with RF: 0.9005594812394135


In [10]:
# Pair every training column with the forest's importance score for it.
feature_names = X_train.columns
importances = rf_model.feature_importances_
importance_df = pd.DataFrame({'Variable': feature_names, 'Importance': importances})

# Rank from most to least important before printing.
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)

             Variable  Importance
35     ct_dst_src_ltm    0.093034
31       ct_state_ttl    0.067158
11              sload    0.064964
9                sttl    0.061773
6              sbytes    0.061100
8                rate    0.059888
26              smean    0.051731
40         ct_srv_dst    0.049678
34   ct_dst_sport_ltm    0.045929
7              dbytes    0.035314
0                 dur    0.033947
24             synack    0.030805
30         ct_srv_src    0.028719
27              dmean    0.023499
12              dload    0.022250
23             tcprtt    0.022057
16             dinpkt    0.021957
10               dttl    0.021896
3               state    0.019971
25             ackdat    0.018446
15             sinpkt    0.018091
5               dpkts    0.014595
2             service    0.012771
14              dloss    0.012344
13              sloss    0.012303
18               djit    0.010727
32         ct_dst_ltm    0.010697
4               spkts    0.010469
17            

In [11]:
# Fit a CatBoost classifier silently (verbose=0) with a fixed seed.
cb_model = CatBoostClassifier(verbose=0, random_seed=42)
#tracker.start()
cb_model.fit(X_train, y_train)
#tracker.stop()

# Score the fitted model on the training split first, then on the test split.
# After the loop, y_pred and accuracy hold the test-split values.
cb_reports = (
    ("Train Accuracy with CB : {}", X_train, y_train),
    ("Test Accuracy with CB: {}", X_test, y_test),
)
for report_template, split_features, split_target in cb_reports:
    y_pred = cb_model.predict(split_features)
    accuracy = accuracy_score(split_target, y_pred)
    print(report_template.format(accuracy))

# Optional: confusion matrix on the test split.
#cm = confusion_matrix(y_test, y_pred)
#print("test confusion_matrix:")
#print(cm)

Train Accuracy with CB : 0.9871617354127192
Test Accuracy with CB: 0.9021791822791019


In [12]:
# Pair every training column with CatBoost's importance score for it.
feature_names = X_train.columns
importances = cb_model.feature_importances_
importance_df = pd.DataFrame({'Variable': feature_names, 'Importance': importances})

# Rank from most to least important before printing.
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)

             Variable  Importance
9                sttl   26.339356
35     ct_dst_src_ltm    9.592733
26              smean    8.187397
1               proto    6.895191
6              sbytes    4.353356
2             service    4.351086
7              dbytes    4.293626
40         ct_srv_dst    4.136167
34   ct_dst_sport_ltm    3.689800
10               dttl    2.862665
30         ct_srv_src    2.794099
27              dmean    2.609508
12              dload    2.540846
24             synack    2.278605
31       ct_state_ttl    1.974784
23             tcprtt    1.332395
39         ct_src_ltm    1.313366
8                rate    1.280522
32         ct_dst_ltm    1.268434
29  response_body_len    0.977694
16             dinpkt    0.776985
25             ackdat    0.667737
33   ct_src_dport_ltm    0.666819
17               sjit    0.630457
0                 dur    0.625438
11              sload    0.598604
15             sinpkt    0.377701
5               dpkts    0.367868
18            