In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, hinge_loss
import matplotlib.pyplot as plt
from sklearn.utils import resample
import matplotlib.colors
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.decomposition import PCA

In [2]:
%%time
df_main = pd.read_csv('../../Dataset/IDS 2018 Intrusion CSVs (CSE-CIC-IDS2018)/dataset_dos.csv')

CPU times: total: 3.81 s
Wall time: 3.9 s


In [3]:
# Total number of rows loaded from the CSV.
df_main.shape[0]

1015430

In [4]:
# Number of rows whose Label is 0.
int((df_main['Label'] == 0).sum())

507715

In [5]:
# Number of rows whose Label is 1.
int((df_main['Label'] == 1).sum())

507715

In [6]:
# Partition the rows by class label: Label 0 -> df_normal, Label 1 -> df_attack.
df_normal = df_main.loc[df_main['Label'].eq(0)]
df_attack = df_main.loc[df_main['Label'].eq(1)]

In [7]:
# Draw 5000 Label-0 rows without replacement; the fixed seed makes the draw repeatable.
df_normal_downsampled = resample(
    df_normal,
    n_samples=5000,
    replace=False,
    random_state=42,
)
df_normal_downsampled.shape[0]

5000

In [8]:
# Draw 5000 Label-1 rows without replacement; the fixed seed makes the draw repeatable.
df_attack_downsampled = resample(
    df_attack,
    n_samples=5000,
    replace=False,
    random_state=42,
)
df_attack_downsampled.shape[0]

5000

In [9]:
# Stack the two 5000-row samples into one frame (Label-0 rows first, then Label-1 rows).
df_downsample = pd.concat(
    objs=[df_normal_downsampled, df_attack_downsampled],
    axis=0,
)
df_downsample.shape[0]

10000

In [10]:
# Columns retained for modelling: 35 flow features followed by the 'Label' target.
features_to_keep = [
    'Dst Port', 'Flow Duration',
    'Tot Fwd Pkts', 'Tot Bwd Pkts',
    'TotLen Fwd Pkts', 'TotLen Bwd Pkts',
    'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
    'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std',
    'Flow Byts/s', 'Flow Pkts/s',
    'Flow IAT Std', 'Flow IAT Max',
    'Fwd IAT Tot', 'Fwd IAT Std', 'Bwd IAT Std',
    'Fwd Header Len', 'Bwd Header Len',
    'Bwd Pkts/s', 'Pkt Len Max',
    'FIN Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
    'Down/Up Ratio', 'Bwd Seg Size Avg', 'Subflow Bwd Byts',
    'Init Fwd Win Byts', 'Fwd Seg Size Min',
    'Active Max',
    'Label',
]

# Keep only the columns listed in features_to_keep, in that order.
df_selected = df_downsample.loc[:, features_to_keep]

# Show which columns remain in the DataFrame.
print(df_selected.columns)

Index(['Dst Port', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Std',
       'Flow IAT Max', 'Fwd IAT Tot', 'Fwd IAT Std', 'Bwd IAT Std',
       'Fwd Header Len', 'Bwd Header Len', 'Bwd Pkts/s', 'Pkt Len Max',
       'FIN Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'Down/Up Ratio', 'Bwd Seg Size Avg', 'Subflow Bwd Byts',
       'Init Fwd Win Byts', 'Fwd Seg Size Min', 'Active Max', 'Label'],
      dtype='object')


In [11]:
# Disabled alternative to the split on df_selected: builds X from every
# non-label column of df_downsample instead of the selected feature subset.
# X = df_downsample.drop(columns='Label')
# y = df_downsample['Label']

# # Split the data into a training set and a test set
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Separate the target column from the predictor columns.
y = df_selected['Label']
X = df_selected.drop(columns=['Label'])

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Learn the standardisation statistics from the training rows only,
# then apply that same fitted transform to both the training and test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
%%time
# Định nghĩa các siêu tham số cần tìm kiếm
param_grid = {
    'C': [0.5, 1, 10, 100],
    'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'sigmoid']
}

# Tạo scorer dựa trên Hinge Loss
hinge_scorer = make_scorer(hinge_loss, greater_is_better=False)

# Khởi tạo GridSearchCV với SVM
optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=3
)

# Huấn luyện GridSearchCV
optimal_params.fit(X_train_scaled, y_train)

# In ra các siêu tham số tốt nhất
print(optimal_params.best_params_)
print(f'Best score: {optimal_params.best_score_}')

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.999 total time=   0.0s
[CV 2/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.996 total time=   0.1s
[CV 3/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.996 total time=   0.1s
[CV 4/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.998 total time=   0.1s
[CV 5/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.995 total time=   0.1s
[CV 1/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.889 total time=   0.1s
[CV 2/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.916 total time=   0.1s
[CV 3/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.921 total time=   0.1s
[CV 4/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.897 total time=   0.1s
[CV 5/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.904 total time=   0.1s
[CV 1/5] END ........C=0.5, gamma=1, kernel=rbf;, score=0.996 total time=   0.2s
[CV 2/5] END ........C=0.5, gamma=1, kernel=rbf

In [18]:
%%time
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)

train_pc1_coords = X_train_pca[:, 0]
train_pc2_coords = X_train_pca[:, 1]
pca_train_scaled = StandardScaler().fit_transform(np.column_stack((train_pc1_coords, train_pc2_coords)))

param_grid = {
    'C': [0.1, 0.5, 1, 10, 100],
    'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'sigmoid']
}

optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=3
)

optimal_params.fit(pca_train_scaled, y_train)
print(optimal_params.best_params_)
print(f'Best score: {optimal_params.best_score_}')

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.863 total time=   0.5s
[CV 2/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.858 total time=   0.7s
[CV 3/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.870 total time=   0.7s
[CV 4/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.857 total time=   0.6s
[CV 5/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.860 total time=   0.6s
[CV 1/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.827 total time=   0.5s
[CV 2/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.744 total time=   0.7s
[CV 3/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.763 total time=   0.7s
[CV 4/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.731 total time=   0.9s
[CV 5/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.711 total time=   0.9s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.863 total time=   0.5s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf

In [19]:
%%time
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_scaled)

train_pc1_coords = X_train_pca[:, 0]
train_pc2_coords = X_train_pca[:, 1]
train_pc3_coords = X_train_pca[:, 2]
pca_train_scaled = StandardScaler().fit_transform(np.column_stack((train_pc1_coords, train_pc2_coords, train_pc3_coords)))

param_grid = {
    'C': [0.5, 1, 10, 100],
    'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'sigmoid']
}

optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=3
)

optimal_params.fit(pca_train_scaled, y_train)
print(optimal_params.best_params_)
print(f'Best score: {optimal_params.best_score_}')

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.917 total time=   0.5s
[CV 2/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.885 total time=   0.5s
[CV 3/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.895 total time=   0.6s
[CV 4/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.871 total time=   0.6s
[CV 5/5] END ....C=0.5, gamma=scale, kernel=rbf;, score=0.882 total time=   0.5s
[CV 1/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.783 total time=   0.6s
[CV 2/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.820 total time=   0.2s
[CV 3/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.834 total time=   0.2s
[CV 4/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.806 total time=   0.2s
[CV 5/5] END C=0.5, gamma=scale, kernel=sigmoid;, score=0.794 total time=   0.2s
[CV 1/5] END ........C=0.5, gamma=1, kernel=rbf;, score=0.940 total time=   0.4s
[CV 2/5] END ........C=0.5, gamma=1, kernel=rbf