<h1> Phần 1: Xây dựng Model </h1>

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

In [3]:
# Load the dataset from the CSV file in the working directory.
# low_memory=False makes pandas parse the file in a single pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
data = pd.read_csv("ddos.csv",low_memory=False)

In [4]:
# Clean the 'Flow Bytes/s' and ' Flow Packets/s' columns: every value that is
# not finite (NaN, +inf, -inf) is replaced by the sentinel -999 (the sentinel
# value itself is an arbitrary choice).
# This is done with one boolean mask per column instead of a Python loop over
# every row with data.loc[i, col]: the result is the same, but it is far faster
# on a large frame and no longer depends on the index being exactly 0..n-1.
# Note: the leading space in ' Flow Packets/s' is part of the column name.
for rate_col in ('Flow Bytes/s', ' Flow Packets/s'):
    non_finite = ~np.isfinite(data[rate_col])
    data.loc[non_finite, rate_col] = -999

# Replace every remaining missing value anywhere in the frame with -999.
data = data.fillna(-999)

In [5]:
# Label-encode every column whose dtype is object, replacing its values with
# integer codes.
# Each fitted encoder is kept in `label_encoders`, keyed by column name, so the
# integer -> original value mapping can be recovered later via
# `label_encoders[col].classes_` or `.inverse_transform(...)`. Previously the
# encoder was thrown away after each column, leaving the codes undecodable.
from sklearn import preprocessing

label_encoders = {}
for col in data.columns:
    if data[col].dtype == 'object':
        encoder = preprocessing.LabelEncoder()
        # fit_transform learns the classes and encodes in a single step.
        data[col] = encoder.fit_transform(list(data[col].values))
        label_encoders[col] = encoder

In [6]:
# Show the first 5 rows of the dataset
data.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0


In [7]:
# Select the columns of `data` used as features for detection. They are picked
# by position: column 0, columns 14-15 and columns 38-42 (8 columns in total).
feature_parts = [
    data.iloc[:, 0],
    data.iloc[:, 14:16],
    data.iloc[:, 38:43],
]
X = pd.concat(feature_parts, axis=1)

# The label of each record is taken from the last column.
Y = data.iloc[:, -1]

In [8]:
# Show the first 5 rows of the feature set X
X.head()

Unnamed: 0,Destination Port,Flow Bytes/s,Flow Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance
0,54865,4000000.0,666666.6667,6,6,6.0,0.0,0.0
1,55054,110091.7,18348.62385,6,6,6.0,0.0,0.0
2,55055,230769.2,38461.53846,6,6,6.0,0.0,0.0
3,46236,352941.2,58823.52941,6,6,6.0,0.0,0.0
4,54863,4000000.0,666666.6667,6,6,6.0,0.0,0.0


In [9]:
# Split the data into train and test sets: test_size=0.3 holds out 30% of the
# rows for testing and leaves 70% for training. random_state fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=7,
)

In [10]:
# Print the shapes of X_train, y_train, X_test and y_test (in that order)
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

(158021, 8)
(158021,)
(67724, 8)
(67724,)


In [11]:
# Train the model: a random forest of 100 trees, each limited to depth 5,
# with a fixed random_state; n_jobs=-1 lets it use all available cores.
forest_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "random_state": 0,
    "n_jobs": -1,
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, n_jobs=-1, random_state=0)

In [12]:
# Save the trained model to disk so that later detection runs can load it
# directly instead of training again.
filename = 'ddos_detect_model.sav'
# The with-block guarantees the file is flushed and closed even if dump()
# raises; previously the handle returned by open() was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [13]:
# Load the model that was just saved.
# SECURITY NOTE: unpickling can execute arbitrary code, so only load model
# files that come from a trusted source (here: the file written above).
# The with-block closes the file handle, which was previously left open.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [14]:
# Predict labels for the test set using the reloaded model
y_pred = loaded_model.predict(X_test)

In [15]:
# Evaluate the model on the test set: accuracy, F1 score, confusion matrix,
# precision, recall and the number of misclassified samples.

# --- compute every metric first ---
accuracy = accuracy_score(y_test, y_pred)
f1score = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
pr = precision_score(y_test, y_pred)
rs = recall_score(y_test, y_pred)

# Rows of X_test whose predicted label differs from the true label.
misclassified_samples = X_test[y_test != y_pred]
mc = len(misclassified_samples)

# --- then report them, in the same order as before ---
print("accuracy:", accuracy)
print("f1-acore:", f1score)
print("confusion matrix:\n", cm)
print("Precision:", pr)
print("Recall_score:", rs)
print("Misclassified :", mc)

accuracy: 0.9952454078317878
f1-acore: 0.9957940385067531
confusion matrix:
 [[29284   255]
 [   67 38118]]
Precision: 0.9933547025252131
Recall_score: 0.998245384313212
Misclassified : 322


In [16]:
# Show the features ranked by importance, most important first.
# One row per training column, with a single 'importance' column holding the
# model's feature_importances_ values.
importance_table = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=X_train.columns,
)
feature_importances = importance_table.sort_values(by='importance', ascending=False)
feature_importances

Unnamed: 0,importance
Destination Port,0.365409
Packet Length Mean,0.136189
Packet Length Variance,0.115982
Packet Length Std,0.101446
Max Packet Length,0.082114
Min Packet Length,0.07428
Flow Bytes/s,0.064659
Flow Packets/s,0.059922
