In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, hinge_loss
import matplotlib.pyplot as plt
from sklearn.utils import resample
import matplotlib.colors
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.decomposition import PCA

In [2]:
%%time
df_main = pd.read_csv('../../Dataset/IDS 2018 Intrusion CSVs (CSE-CIC-IDS2018)/dataset_dos.csv')

CPU times: total: 422 ms
Wall time: 612 ms


In [3]:
len(df_main)

393636

In [4]:
len(df_main[df_main['Label'] == 0])

196818

In [5]:
len(df_main[df_main['Label'] == 1])

196818

In [6]:
df_normal = df_main[df_main['Label'] == 0]
df_attack = df_main[df_main['Label'] == 1]

In [7]:
df_normal_downsampled = resample(df_normal, replace=False, n_samples=5000, random_state=42)
len(df_normal_downsampled)

5000

In [8]:
df_attack_downsampled = resample(df_attack, replace=False, n_samples=5000, random_state=42)
len(df_attack_downsampled)

5000

In [9]:
df_downsample = pd.concat([df_normal_downsampled, df_attack_downsampled])
len(df_downsample)

10000

In [10]:
X = df_downsample.drop(columns='Label')
y = df_downsample['Label']

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
scaler = StandardScaler()
X_train_scaled  = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
%%time
# Định nghĩa các siêu tham số cần tìm kiếm
param_grid = {
    'C': [0.5, 1, 10, 100],
    'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf']
}

# Tạo scorer dựa trên Hinge Loss
hinge_scorer = make_scorer(hinge_loss, greater_is_better=False)

# Khởi tạo GridSearchCV với SVM
optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring=hinge_scorer,
    verbose=3
)

# Huấn luyện GridSearchCV
optimal_params.fit(X_train_scaled, y_train)

# In ra các siêu tham số tốt nhất
print(optimal_params.best_params_)
print(f'Best score: {optimal_params.best_score_}')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.570 total time=   0.5s
[CV 2/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.562 total time=   0.6s
[CV 3/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.571 total time=   0.6s
[CV 4/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.578 total time=   0.5s
[CV 5/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.571 total time=   0.5s
[CV 1/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.521 total time=   0.4s
[CV 2/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.524 total time=   0.3s
[CV 3/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.530 total time=   0.4s
[CV 4/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.531 total time=   0.4s
[CV 5/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.524 total time=   0.4s
[CV 1/5] END .....C=0.5, gamma=0.1, kernel=rbf;, score=-0.531 total time=   0.5s
[CV 2/5] END .....C=0.5, gamma=0.1, kernel=rbf;

In [13]:
%%time
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)

train_pc1_coords = X_train_pca[:, 0]
train_pc2_coords = X_train_pca[:, 1]
pca_train_scaled = StandardScaler().fit_transform(np.column_stack((train_pc1_coords, train_pc2_coords)))

param_grid = {
    'C': [0.1, 0.5, 1, 10, 100],
    'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf']
}

# Tạo scorer dựa trên Hinge Loss
hinge_scorer = make_scorer(hinge_loss, greater_is_better=False)

optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring=hinge_scorer,
    verbose=3
)

optimal_params.fit(pca_train_scaled, y_train)
print(optimal_params.best_params_)
print(f'Best score: {optimal_params.best_score_}')

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ...C=0.1, gamma=scale, kernel=rbf;, score=-0.651 total time=   1.1s
[CV 2/5] END ...C=0.1, gamma=scale, kernel=rbf;, score=-0.652 total time=   1.2s
[CV 3/5] END ...C=0.1, gamma=scale, kernel=rbf;, score=-0.644 total time=   1.2s
[CV 4/5] END ...C=0.1, gamma=scale, kernel=rbf;, score=-0.646 total time=   1.1s
[CV 5/5] END ...C=0.1, gamma=scale, kernel=rbf;, score=-0.649 total time=   1.0s
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.651 total time=   1.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.648 total time=   1.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.644 total time=   0.9s
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.647 total time=   1.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.644 total time=   1.0s
[CV 1/5] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.661 total time=   1.6s
[CV 2/5] END .....C=0.1, gamma=0.1, kernel=rbf;

In [14]:
%%time
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_scaled)

train_pc1_coords = X_train_pca[:, 0]
train_pc2_coords = X_train_pca[:, 1]
train_pc3_coords = X_train_pca[:, 2]
pca_train_scaled = StandardScaler().fit_transform(np.column_stack((train_pc1_coords, train_pc2_coords, train_pc3_coords)))

param_grid = {
    'C': [0.5, 1, 10, 100],
    'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf']
}

optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring=hinge_scorer,
    verbose=3
)

optimal_params.fit(pca_train_scaled, y_train)
print(optimal_params.best_params_)
print(f'Best score: {optimal_params.best_score_}')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.601 total time=   1.1s
[CV 2/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.604 total time=   1.0s
[CV 3/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.593 total time=   1.0s
[CV 4/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.601 total time=   0.9s
[CV 5/5] END ...C=0.5, gamma=scale, kernel=rbf;, score=-0.600 total time=   1.0s
[CV 1/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.541 total time=   0.5s
[CV 2/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.534 total time=   0.6s
[CV 3/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.542 total time=   0.5s
[CV 4/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.546 total time=   0.5s
[CV 5/5] END .......C=0.5, gamma=1, kernel=rbf;, score=-0.540 total time=   0.5s
[CV 1/5] END .....C=0.5, gamma=0.1, kernel=rbf;, score=-0.659 total time=   0.9s
[CV 2/5] END .....C=0.5, gamma=0.1, kernel=rbf;