### 학습 데이터 전처리

In [1]:
import pandas as pd

# Load the training data: the CSV that combines the real-time traffic capture
# with the previously collected traffic data.
n_merged = pd.read_csv(
    filepath_or_buffer='실시간 트래픽 + 기존 전체 트래픽 데이터.csv',
)

In [2]:
# Bind a shorter working name to the loaded frame. This is a second name for
# the same DataFrame object (no copy is made), so later in-place changes made
# through `df` are also visible through `n_merged`.
df = n_merged
# Bare expression: shows the frame as the notebook cell's output.
df

Unnamed: 0,Source IP,Destination IP,Protocol,Source Port,Destination Port,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Length,IAT,Label
0,34.185.6.135,192.168.100.50,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
1,46.50.95.169,192.168.100.50,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
2,59.2.126.190,192.168.100.50,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
3,136.245.66.222,192.168.100.50,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
4,254.175.194.181,192.168.100.50,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4323726,192.168.0.3,224.0.0.251,17,5353,5353,0,0,0,0,0,0,0,0,411.0,1.446743e-01,0
4323727,192.168.0.3,224.0.0.251,17,5353,5353,0,0,0,0,0,0,0,0,255.0,7.399356e-01,0
4323728,192.168.0.3,224.0.0.251,17,5353,5353,0,0,0,0,0,0,0,0,411.0,1.483896e-01,0
4323729,192.168.0.31,64.233.189.188,6,64066,5228,0,1,0,0,1,0,0,0,41.0,4.507677e+01,0


In [3]:
# Preprocess Source IP / Destination IP for training: encode each dotted-quad
# IPv4 string as its 32-bit integer value (e.g. '192.168.100.50' -> 3232261170).
import ipaddress


def _ipv4_to_int(address):
    """Return the 32-bit integer value of a dotted-quad IPv4 string.

    Raises ``ipaddress.AddressValueError`` (a ``ValueError`` subclass) when
    *address* is not a well-formed IPv4 address, instead of silently encoding
    out-of-range or missing octets into a wrong number.
    """
    return int(ipaddress.IPv4Address(address))


for column in ('Source IP', 'Destination IP'):
    # A column that is already integer-typed has been converted by an earlier
    # run of this cell; skipping it keeps the cell safe to re-execute.
    if pd.api.types.is_integer_dtype(df[column]):
        continue
    df[column] = df[column].map(_ipv4_to_int)

# Show the converted data as the cell output.
df

Unnamed: 0,Source IP,Destination IP,Protocol,Source Port,Destination Port,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Length,IAT,Label
0,582551175,3232261170,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
1,775053225,3232261170,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
2,990019262,3232261170,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
3,2297774814,3232261170,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
4,4272931509,3232261170,1,0,0,0,0,0,0,0,0,0,0,1428.0,1.474943e+09,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4323726,3232235523,3758096635,17,5353,5353,0,0,0,0,0,0,0,0,411.0,1.446743e-01,0
4323727,3232235523,3758096635,17,5353,5353,0,0,0,0,0,0,0,0,255.0,7.399356e-01,0
4323728,3232235523,3758096635,17,5353,5353,0,0,0,0,0,0,0,0,411.0,1.483896e-01,0
4323729,3232235551,1089060284,6,64066,5228,0,1,0,0,1,0,0,0,41.0,4.507677e+01,0


In [4]:
# Print the data type of every column in the frame.
print(df.dtypes)

Source IP             int64
Destination IP        int64
Protocol              int64
Source Port           int64
Destination Port      int64
FIN Flag Count        int64
SYN Flag Count        int64
RST Flag Count        int64
PSH Flag Count        int64
ACK Flag Count        int64
URG Flag Count        int64
CWE Flag Count        int64
ECE Flag Count        int64
Length              float64
IAT                 float64
Label                 int64
dtype: object


### IDS 학습 

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [6]:
# Split the frame into the target vector (the 'Label' column) and the feature
# matrix (every other column).
y = df['Label']
X = df.drop(columns=['Label'])

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create a 100-tree random forest with a fixed seed, then fit it on the
# training split.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Predict the held-out rows and compare the predictions with the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the overall accuracy on the test set.
print('Accuracy:', accuracy)

Accuracy: 0.999925989913813


In [7]:
# Check the label distribution: count how many rows of the test split carry
# each label value.
# NOTE(review): this tallies the true labels in y_test, not the model's
# predictions (y_pred) -- confirm that this is the intended check.
from collections import Counter

my_data_counts = Counter()
my_data_counts.update(y_test)
print(my_data_counts)

Counter({0: 752178, 7: 45863, 2: 31788, 1: 25825, 8: 2018, 12: 1692, 5: 1574, 9: 1190, 6: 1125, 10: 1104, 3: 384, 4: 5, 11: 1})


### 모델 저장

In [8]:
import joblib

# Persist the trained IDS model to disk; the call returns the list of written
# file names, which the notebook shows as the cell output.
# NOTE: joblib files are pickle-based, so only load this file from a trusted
# source.
joblib.dump(value=model, filename='IDS_model.pkl')

['IDS_model.pkl']