In [1]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from sklearn.pipeline import make_pipeline
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [2]:
# Load the encoded dataset.
# Both dataset paths are registered here; only the CICI file is read below.
files = dict(
    CICI='/home/irteam/jiwlgus048-dcloud-dir/MLAC/data/encoded_ConcatedCICI.csv',
    UNSW='/home/irteam/jiwlgus048-dcloud-dir/MLAC/data/encoded_ConcatedUNSW.csv',
)

data = pd.read_csv(files['CICI'])
# Keep only the rows in which every value is finite
# (rows containing NaN or +/-infinity are dropped).
data = data.loc[np.isfinite(data).all(axis=1)]

In [3]:
# Keep the multiclass target aside, then remove every label column so that
# `data` holds feature columns only.
multiclass_labels = data['new_attack_category']
data = data.drop(columns=['label', 'attack_category', 'new_attack_category'])

In [4]:
# Display how many rows fall into each value of the multiclass target.
multiclass_labels.value_counts()

0    5759318
2     403181
7     175828
9      59167
3      25621
4      18100
1      13835
6       7006
8       2180
5         11
Name: new_attack_category, dtype: int64

In [5]:
# Display the columns that remain in `data` at this point.
data.columns

Index(['protocol', 'flow_duration', 'tot_fwd_pkts', 'tot_bwd_pkts',
       'tot_len_fwd_pkts', 'tot_len_bwd_pkts', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_mean', 'fwd_pkt_len_std',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean',
       'bwd_pkt_len_std', 'flow_byts_s', 'flow_pkts_s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_tot',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags',
       'bwd_urg_flags', 'fwd_header_len', 'bwd_header_len', 'fwd_pkts_s',
       'bwd_pkts_s', 'pkt_len_min', 'pkt_len_max', 'pkt_len_mean',
       'pkt_len_std', 'pkt_len_var', 'fin_flag_cnt', 'syn_flag_cnt',
       'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt',
       'cwe_flag_count', 'ece_flag_cnt', 'down_up_ratio', 'pkt_size_avg',
       'fwd_seg_size_avg', 'b

In [6]:
# Remove the selected features from the feature table, in place.
# (Original author's note: "delete features with high importance".)
for dropped_feature in (
    'fwd_psh_flags',
    'bwd_psh_flags',
    'fwd_urg_flags',
    'bwd_urg_flags',
    'rst_flag_cnt',
    'urg_flag_cnt',
    'cwe_flag_count',
    'ece_flag_cnt',
    'fwd_byts_b_avg',
    'init_fwd_win_byts',
    'fwd_seg_size_min',
):
    del data[dropped_feature]

In [7]:
# Define Models
# Each entry is a (short name, unfitted estimator) pair; the short name is used
# later for log lines, the result table and the confusion-matrix file name.
# Estimators that use randomness are seeded so that repeated runs produce the
# same scores; the seed value is the same one the train/test split uses.
MODEL_SEED = 34
models = [
    ('RF', RandomForestClassifier(max_depth=5, n_estimators=5, max_features=3, random_state=MODEL_SEED)),
    ('CART', DecisionTreeClassifier(max_depth=5, random_state=MODEL_SEED)),
    ('NB', GaussianNB()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('QDA', QuadraticDiscriminantAnalysis()),
    ('LR', LogisticRegression(solver='lbfgs', max_iter=200)),
    ('ABoost', AdaBoostClassifier(random_state=MODEL_SEED)),
    ('KNN', KNeighborsClassifier()),
    ('MLP', MLPClassifier(random_state=MODEL_SEED)),
]

In [8]:
# Result table: one row per model. Besides the model name and accuracy it has
# micro (_mi), macro (_ma) and weighted (_we) columns for F1, recall and precision.
df = pd.DataFrame(columns=[
    'name', 'acc',
    'f1_mi', 'f1_ma', 'f1_we',
    'recall_mi', 'recall_ma', 'recall_we',
    'precision_mi', 'precision_ma', 'precision_we',
])
eval_path = '/home/irteam/jiwlgus048-dcloud-dir/MLAC/evaluation'

confusion_path = '/home/irteam/jiwlgus048-dcloud-dir/MLAC/confusion_matrix/cici_definition'
# Create both output directories up front, including missing parents, so a
# long training run cannot fail at the very end on a missing folder.
# exist_ok=True makes re-running this cell harmless.
os.makedirs(eval_path, exist_ok=True)
os.makedirs(confusion_path, exist_ok=True)
# Row position in `df` at which the next model's scores are stored.
cnt = 0

In [9]:
# Hold out 30% of the rows for testing. The split is shuffled, stratified on
# the multiclass target, and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    multiclass_labels,
    test_size=0.3,
    shuffle=True,
    stratify=multiclass_labels,
    random_state=34,
)

In [10]:
# confusion matrix plot
def plot_confusion_matrix(con_mat, labels, title: str, cmap='Blues', normalize=False, save_dir=None):
    """Draw a confusion matrix as an annotated heat map and save it as a PNG.

    Parameters
    ----------
    con_mat : 2-D array-like
        Square confusion matrix; row i / column j are matched to ``labels[i]``
        / ``labels[j]``.
    labels : sequence
        One display name per class, in matrix order.
    title : str
        Figure title; also used as the PNG file name (``<title>.png``).
    cmap : str or Colormap, default 'Blues'
        Colormap passed to ``imshow``.
    normalize : bool, default False
        If True, each cell is annotated with its share of its own row total,
        in percent; otherwise with the raw cell value.
    save_dir : str or None, default None
        Directory the PNG is written to. ``None`` falls back to the
        notebook-level ``confusion_path``.

    Raises
    ------
    ValueError
        If ``con_mat`` is not a square matrix with one row per label.
    """
    con_mat = np.asarray(con_mat)
    n_classes = len(labels)
    if con_mat.ndim != 2 or con_mat.shape != (n_classes, n_classes):
        raise ValueError(
            'con_mat has shape {0} but {1} labels were given'.format(con_mat.shape, n_classes)
        )
    if save_dir is None:
        save_dir = confusion_path

    # Row totals serve both the "(n=...)" tick labels and the normalisation.
    row_totals = con_mat.sum(axis=1)
    tick_positions = np.arange(n_classes)
    row_labels = ['{0}(n={1})'.format(label, total) for label, total in zip(labels, row_totals)]

    fig, ax = plt.subplots()
    image = ax.imshow(con_mat, interpolation='nearest', cmap=cmap)
    ax.set_title(title)
    fig.colorbar(image, ax=ax)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels, rotation=45)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(row_labels)

    # Cells above half of the largest value get white text, the rest black.
    thresh = con_mat.max() / 2.
    for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
        if normalize:
            # Divide by this cell's own row total; a row without samples shows 0%.
            share = con_mat[i, j] * 100 / row_totals[i] if row_totals[i] else 0.0
            cell_text = '{0:.1f}%'.format(share)
        else:
            cell_text = str(con_mat[i, j])
        ax.text(j, i, cell_text, horizontalalignment="center",
                color="white" if con_mat[i, j] > thresh else "black")

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    # Lay out only after all labels exist so they are included in the layout.
    fig.tight_layout()
    # Save the image.
    fig.savefig(os.path.join(save_dir, title + '.png'), facecolor='#eeeeee', edgecolor='blue', pad_inches=0.5)
    # Close the figure so figures do not pile up when called in a loop.
    plt.close(fig)

In [11]:
# Train and evaluate every candidate model on the same split. For each model:
# time fit and predict, compute the scores, store them as a row of `df`, save
# the confusion-matrix plot, and rewrite the score CSV.
class_names = ['Benign', 'Brute Force', 'Dos', 'Fuzzers',
               'Generic', 'Heartbleed', 'Malware', 'Reconnaissance', 'Web Attack', 'unauthorized access']
# NOTE(review): class ids are assumed to be 0..9 in the same order as
# class_names -- confirm against the label encoding.
class_ids = list(range(len(class_names)))

# Make sure the target folder exists before any long training starts.
os.makedirs(eval_path, exist_ok=True)
result_csv = os.path.join(eval_path, 'new_cici.csv')

for name, model in models:

    model = make_pipeline(model)
    print('training start...'+name)
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = time.time()

    print("모델 training 소요 시간: {:.2f}초".format(end_time - start_time))

    # evaluation
    print('evaluation start...')
    start_time = time.time()
    y_pred = model.predict(X_test)
    end_time = time.time()
    print("모델 test 소요 시간: {:.2f}초".format(end_time - start_time))

    # evaluation result: accuracy, then F1 / recall / precision, each with
    # micro, macro and weighted averaging -- the same order as the columns of `df`.
    scores = [accuracy_score(y_test, y_pred)]
    for score_fn in (f1_score, recall_score, precision_score):
        for average in ('micro', 'macro', 'weighted'):
            scores.append(score_fn(y_test, y_pred, average=average))
    (acc,
     f1_mi, f1_ma, f1_we,
     recall_mi, recall_ma, recall_we,
     precision_mi, precision_ma, precision_we) = scores
    model_eval = [name] + scores

    # confusion matrix: pass the class ids explicitly so the matrix always has
    # one row/column per entry of class_names, even if a class does not occur
    # in y_test or y_pred.
    confusion = metrics.confusion_matrix(y_test, y_pred, labels=class_ids)
    plot_confusion_matrix(confusion, labels=class_names, title=name)

    print(f'name:{name},acc:{acc},f1_score:{f1_mi},{f1_ma},{f1_we},recall:{recall_mi},{recall_ma},{recall_we},precision:{precision_mi},{precision_ma},{precision_we}')
    df.loc[cnt] = model_eval
    cnt += 1

    # Rewrite the CSV after every model so finished results survive a failure
    # or interruption in a later, slower model.
    df.to_csv(result_csv, index=False)

training start...RF
모델 training 소요 시간: 30.02초
evaluation start...
모델 test 소요 시간: 1.22초
name:RF,acc:0.9634368514006523,f1_score:0.9634368514006523,0.38409897193502907,0.9584911347791916,recall:0.9634368514006523,0.36095962990364266,0.9634368514006523,precision:0.9634368514006523,0.43172733338673774,0.9580212618819397
training start...CART
모델 training 소요 시간: 116.88초
evaluation start...
모델 test 소요 시간: 0.40초
name:CART,acc:0.9661693158525738,f1_score:0.9661693158525738,0.43472864132497097,0.9577700599436256,recall:0.9661693158525738,0.40631587215345794,0.9661693158525738,precision:0.9661693158525738,0.47474782237675833,0.9498613876025518
training start...NB
모델 training 소요 시간: 4.98초
evaluation start...
모델 test 소요 시간: 9.44초
name:NB,acc:0.7629825579146846,f1_score:0.7629825579146846,0.35620300465208543,0.8319653368165587,recall:0.7629825579146846,0.6269643669718175,0.7629825579146846,precision:0.7629825579146846,0.3473753675053436,0.9302741606287654
training start...LDA
모델 training 소요 시간: 41.3