In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

## 데이터 전처리


In [3]:
col_names = np.array(["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","labels"])

attack_dict = {'normal': 'normal','back': 'DoS','land': 'DoS','neptune': 'DoS','pod': 'DoS',
               'smurf': 'DoS','teardrop': 'DoS','mailbomb': 'DoS','apache2': 'DoS','processtable': 'DoS',
               'udpstorm': 'DoS',

               'ipsweep': 'Probe','nmap': 'Probe','portsweep': 'Probe', 'satan': 'Probe', 'mscan': 'Probe',
               'saint': 'Probe',

               'ftp_write': 'R2L','guess_passwd': 'R2L','imap': 'R2L','multihop': 'R2L', 'phf': 'R2L','spy': 'R2L',
                'warezclient': 'R2L','warezmaster': 'R2L','sendmail': 'R2L', 'named': 'R2L', 'snmpgetattack': 'R2L',
               'snmpguess': 'R2L','xlock': 'R2L','xsnoop': 'R2L','worm': 'R2L',

               'buffer_overflow': 'U2R','loadmodule': 'U2R', 'perl': 'U2R', 'rootkit': 'U2R','httptunnel': 'U2R',
               'ps': 'U2R','sqlattack': 'U2R','xterm': 'U2R'
}

FILE_PATH = './kdd/'
dummy_list = ['protocol_type','service','flag']
log_scaling_list = ['duration','src_bytes','dst_bytes']
binary_dict = {'DoS' : 1,  'normal':0, 'Probe':1 ,'R2L':1, 'U2R':1} # 이진분류를 위해 정상과 나머지(비정상)을 비교 예측하기

### 데이터 전처리를 위한 일련의 과정

In [4]:


def _data_load():
    '''
    '''
    cnt = 0
    file_list = ['KDDTrain+','KDDTest+', 'KDDTest-21']
    for name in file_list:
        if cnt < 1:
            result = pd.read_csv(FILE_PATH+name+'.txt', header=None)
            cnt += 1
        else:
            result.append(pd.read_csv(FILE_PATH+name+'.txt', header=None))
    result = result.iloc[:, 0:42]  # 끝행 지우기
    return result

def _concat_dummy(df, var):
    
    '''
    Type : {'A','B','C','D'} ----> Type_A : [0,1] , Type_B : [0,1] ... 이런식으로 더미변수 처리시켜주는 함수.
    '''

    for name in var:
        df = pd.concat([df,pd.get_dummies(df[name])], axis=1)
    return df.drop(var , axis=1)  # 더미변수 처리하였으니 쓸모없어진 본래의 애트리뷰트는 삭제한다.


def _min_max_scalier(df, binary_classify=True):
    '''
    scikit-learn 패키지를 활용한 min-max scaling으로 0 ~ 1에 해당하는 값으로 변환
    '''

    df = df.drop(['labels'],axis=1)
    scaler = MinMaxScaler()
    return scaler.fit_transform(df)

def one_hot_encoder(df, var):
    '''
    labels : {'normal','U2R' .... 'R2U'} --> [0,1, ...1 ]
    '''
    df_y = df['labels'].map(var)         #Normal : 0 Anomal : 1의 형태로 변형
    y_ = np.array(pd.get_dummies(df_y))  #원-핫 형태로 변형

    return y_

In [5]:
df = _data_load()  # 데이터 로드
df.columns = col_names  # 헤더값 정의
df['labels'] = df['labels'].map(attack_dict)  # 위협 별 종류 맵핑  ex) ps : 'U2R'

### 가공 전 데이터의 형태

In [6]:
df.head()  

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,labels
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,DoS
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


### 수치화 해야할 변수
#### 'protocol_type' , 'service' ,'flag' ---> 더미변수처리하여 딥러닝에 학습할 수 있게끔 수치화해야함.

In [7]:
df = _concat_dummy(df, dummy_list) # 더미변수처리

#### 아래와 같이 특정 컬럼의 한 종류였던 값들이 더미변수 처리 되어 0또는 1의 값을 가지는 컬럼이 되었음.

In [9]:
print(df.iloc[:,39:-1].head())

   icmp  tcp  udp  IRC  X11  Z39_50  aol  auth  bgp  courier ...  OTH  REJ  \
0     0    1    0    0    0       0    0     0    0        0 ...    0    0   
1     0    0    1    0    0       0    0     0    0        0 ...    0    0   
2     0    1    0    0    0       0    0     0    0        0 ...    0    0   
3     0    1    0    0    0       0    0     0    0        0 ...    0    0   
4     0    1    0    0    0       0    0     0    0        0 ...    0    0   

   RSTO  RSTOS0  RSTR  S0  S1  S2  S3  SF  
0     0       0     0   0   0   0   0   1  
1     0       0     0   0   0   0   0   1  
2     0       0     0   1   0   0   0   0  
3     0       0     0   0   0   0   0   1  
4     0       0     0   0   0   0   0   1  

[5 rows x 83 columns]


### 로그스케일 해야할 변수
#### 'duration','src_bytes','dst_bytes' ----> 해당 값은 최소값과 최대값의 차이가 커서 노말라이제이션을 했을 때 
#### 변수들의 표현력이 떨어질 것을 우려하여 로그스케일처리부터 먼저 하려고 함.


In [10]:
print(df[log_scaling_list].describe(),) #로그스케일 전

            duration     src_bytes     dst_bytes
count  148517.000000  1.485170e+05  1.485170e+05
mean      276.779305  4.022795e+04  1.708885e+04
std      2460.683131  5.409612e+06  3.703525e+06
min         0.000000  0.000000e+00  0.000000e+00
25%         0.000000  0.000000e+00  0.000000e+00
50%         0.000000  4.400000e+01  0.000000e+00
75%         0.000000  2.780000e+02  5.710000e+02
max     57715.000000  1.379964e+09  1.309937e+09


In [12]:
df[log_scaling_list] = df[log_scaling_list].apply(lambda x:np.log(x+0.1)) #로그스케일 처리

In [14]:
print(df[log_scaling_list].describe()) #로그스케일 후

            duration      src_bytes      dst_bytes
count  148517.000000  148517.000000  148517.000000
mean       -1.735748       2.391284       1.960738
std         2.071255       4.024348       4.626609
min        -2.302585      -2.302585      -2.302585
25%        -2.302585      -2.302585      -2.302585
50%        -2.302585       3.786460      -2.302585
75%        -2.302585       5.627981       6.347564
max        10.963274      21.045323      20.993245


In [16]:
new_df = _min_max_scalier(df)  # min-max 스케일링 적용
y_ = one_hot_encoder(df, binary_dict)  # 타겟데이터의 원-핫 인코딩

#### 훈련데이터와 시험데이터 나누기 작업

In [18]:
X_train, X_test, y_train, y_test = train_test_split(new_df, y_, test_size = 0.33, random_state = 42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(99506, 122) (49011, 122) (99506, 2) (49011, 2)


### 가공이 모두 끝난 후의 데이터 상태

In [22]:
print('input_data\n',X_train[0:10,:])
print('label_data\n', y_train[0:10,:])

input_data
 [[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.26079616  0.30781854 ...,  0.          1.          0.        ]
 [ 0.          0.18821597  0.         ...,  0.          1.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.32826695  0.34660013 ...,  0.          1.          0.        ]
 [ 0.          0.38205229  0.         ...,  0.          1.          0.        ]]
label_data
 [[0 1]
 [1 0]
 [0 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [0 1]
 [1 0]
 [1 0]]


## 간단한 3층 MLP 를 Keras로 구현

In [24]:
n_in = len(X_train[0])
n_hidden = 200
n_out = len(y_train[0])
print('n_in : {}, n_hidden : {}, n_out : {}'.format(n_in,n_hidden,n_out))

n_in : 122, n_hidden : 200, n_out : 2


# 모델 아키텍처 구성
#### 은닉 2층 / learning rate = 0.01 / 최적화 = SGD / 활성화함수 = 'ReLU' / 배치사이즈 200 / Epoch = 20

In [26]:
model = Sequential()

model.add(Dense(n_hidden, input_dim=n_in ))   # 122 x 200
model.add(Activation('relu'))

model.add(Dense(400) )   # 200 x 100
model.add(Activation('relu'))

model.add(Dense(n_out))
model.add(Activation('softmax'))


epochs = 20
batch_size = 200

model.compile(loss='binary_crossentropy',
              optimizer=SGD(),
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=2)

loss_and_metrics = model.evaluate(X_test, y_test)


y_pred = model.predict_classes(X_test)

Epoch 1/20
6s - loss: 0.2827 - acc: 0.9184
Epoch 2/20
5s - loss: 0.1801 - acc: 0.9376
Epoch 3/20
4s - loss: 0.1590 - acc: 0.9400
Epoch 4/20
5s - loss: 0.1453 - acc: 0.9441
Epoch 5/20
5s - loss: 0.1353 - acc: 0.9465
Epoch 6/20
5s - loss: 0.1276 - acc: 0.9516
Epoch 7/20
5s - loss: 0.1214 - acc: 0.9549
Epoch 8/20
5s - loss: 0.1162 - acc: 0.9576
Epoch 9/20
4s - loss: 0.1116 - acc: 0.9601
Epoch 10/20
5s - loss: 0.1076 - acc: 0.9614
Epoch 11/20
5s - loss: 0.1039 - acc: 0.9624
Epoch 12/20
5s - loss: 0.1006 - acc: 0.9638
Epoch 13/20
5s - loss: 0.0974 - acc: 0.9645
Epoch 14/20
5s - loss: 0.0945 - acc: 0.9650
Epoch 15/20
4s - loss: 0.0918 - acc: 0.9656
Epoch 16/20
4s - loss: 0.0893 - acc: 0.9664
Epoch 17/20
5s - loss: 0.0869 - acc: 0.9668
Epoch 18/20
5s - loss: 0.0847 - acc: 0.9679
Epoch 19/20
5s - loss: 0.0826 - acc: 0.9686
Epoch 20/20
5s - loss: 0.0806 - acc: 0.9692

In [31]:
print('loss : {} , test_acc : {}'.format(loss_and_metrics[0],loss_and_metrics[1]))

loss : 0.08109689612145357 , test_acc : 0.9699863295994776
