In [1]:
import math
import numpy as np
import pandas as pd
import glob

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import preprocessing

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

### training dataset ###
train_path = r'./training'
train_all_files = glob.glob(train_path + "/*.csv")
train_list = []

for filename in train_all_files:
    reader = pd.read_csv(filename, index_col=None, header=0,nrows=10000,low_memory=False)
    train_list.append(reader)

train_df = pd.concat(train_list, axis=0, ignore_index=True)

train_df[' Label'] = train_df[' Label'].replace({'BENIGN':0,'DrDoS_NTP':1,'TFTP':1,'Syn':2,'DrDoS_DNS':1,'DrDoS_LDAP':1,'DrDoS_NetBIOS':1,'DrDoS_SNMP':1,'UDP-lag':2,\
                                                'DrDoS_SSDP':1,'DrDoS_MSSQL':1,'DrDoS_UDP':2,'WebDDoS':1})
### testing dataset ###
test_path = r'./testing'
test_all_files = glob.glob(test_path + "/*.csv")
test_list = []

for filename in test_all_files:
    reader = pd.read_csv(filename, index_col=None, header=0,nrows=10000,low_memory=False)
    test_list.append(reader)

test_df = pd.concat(test_list, axis=0, ignore_index=True)
                             
test_df[' Label'] = test_df[' Label'].replace({'BENIGN':0,'Syn':2,'LDAP':1,'NetBIOS':1,'Portmap':1,\
                                                'MSSQL':1,'UDP':2})

### training cluster dataset ###
train_cluster_path = r'./training_clustering'
train_cluster_files = glob.glob(train_cluster_path + "/*.csv")
train_cluster_list = []

for filename in train_cluster_files:
    reader = pd.read_csv(filename, index_col=None, header=0,nrows=10000,low_memory=False)
    train_cluster_list.append(reader)
    
train_cluster_df = pd.concat(train_cluster_list, axis=0, ignore_index=True)

train_cluster_df[' Label'] = train_cluster_df[' Label'].replace({'BENIGN':0,'DrDoS_NTP':1,'TFTP':1,'Syn':2,'DrDoS_DNS':1,'DrDoS_LDAP':1,'DrDoS_NetBIOS':1,'DrDoS_SNMP':1,'UDP-lag':2,\
                                                'DrDoS_SSDP':1,'DrDoS_MSSQL':1,'DrDoS_UDP':2,'WebDDoS':1})


In [28]:
def hybrid(clf, fuzzy_threshold, preproc):
    ### Classification with all data
    X_train = train_df[[' ACK Flag Count',
                        ' Fwd Packet Length Std',
                        ' Packet Length Std',
                        'Fwd Packets/s',
                        ' Protocol',
                        ' Flow Duration']].values
    y_train = train_df[' Label'].values

    X_test = test_df[[' ACK Flag Count',
                      ' Fwd Packet Length Std',
                      ' Packet Length Std',
                      'Fwd Packets/s',
                      ' Protocol',
                      ' Flow Duration']].values
    y_test = test_df[' Label'].values
    
    if preproc:
        X_train = preprocessing.scale(X_train)
        X_test = preprocessing.scale(X_test)
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc_scr = metrics.accuracy_score(y_pred ,y_test)
    cfs_mat = metrics.confusion_matrix(y_pred, y_test)
    print("# Classification with all Dataset result")
    print("Accuracy:", acc_scr)
    print("Confusion Metrix:\n", cfs_mat)
    print()
    
    ### filter fuzzy data with probability
    print("Hybrid")
    print("="*20)
    y_pred_proba = clf.predict_proba(X_test)
    L1_prob = pd.DataFrame({'y_true':y_test,
                            'y_pred':y_pred,
                            'y_P(c0)':y_pred_proba[:,0],
                            'y_P(c1)':y_pred_proba[:,1],
                            'y_P(c2)':y_pred_proba[:,2]})
    
    # old fuzzy
    #     fuzzy_idx = L1_prob[((abs(L1_prob['y_P(c0)'] - 0.5) < 0.1) & (abs(L1_prob['y_P(c1)'] - 0.5) < 0.1)) |
    #                         ((abs(L1_prob['y_P(c0)'] - 0.5) < 0.1) & (abs(L1_prob['y_P(c2)'] - 0.5) < 0.1)) |
    #                         ((abs(L1_prob['y_P(c1)'] - 0.5) < 0.1) & (abs(L1_prob['y_P(c2)'] - 0.5) < 0.1))].index
    
    fuzzy_idx = L1_prob[(L1_prob['y_P(c0)'] < fuzzy_threshold) &
                        (L1_prob['y_P(c1)'] < fuzzy_threshold) &
                        (L1_prob['y_P(c2)'] < fuzzy_threshold)].index
    # display(fuzzy_idx)
    
    ###### Layer 1 Classification ######
    L1_test_df = test_df[[' ACK Flag Count',
                           ' Fwd Packet Length Std',
                           ' Packet Length Std',
                           'Fwd Packets/s',
                           ' Protocol',
                           ' Flow Duration',
                           ' Label']].drop(fuzzy_idx)
    # display(L1_test_df)


    X_test_L1 = L1_test_df[[' ACK Flag Count',
                            ' Fwd Packet Length Std',
                            ' Packet Length Std',
                            'Fwd Packets/s',
                            ' Protocol',
                            ' Flow Duration']]
    y_test_L1 = L1_test_df[' Label']
    
    if preproc:
        X_test_L1 = preprocessing.scale(X_test_L1)
        
    # display(X_L1_test)

    y_pred_L1 = clf.predict(X_test_L1)
    # display(y_L1_test, y_L1_pred)
    
    acc_scr_L1 = metrics.accuracy_score(y_pred_L1, y_test_L1)
    cfs_mat_L1 = metrics.confusion_matrix(y_pred_L1, y_test_L1)
    
    print('# Fuzzy Dataset Size: ', len(fuzzy_idx))
    print("# Fuzzy Data Division in Confusion Matrix")
    print(cfs_mat - cfs_mat_L1)
    print()
    
    print("Hybrid Layer 1: Classification without Fuzzy Dataset Result")
    print("="*20)
    print("Accuracy:", acc_scr_L1)
    print("Confusion Metrix:\n", cfs_mat_L1)
    print()
    
    
    ###### Layer 2 Clustering ######
    ## Train
    X_train_L2 = train_cluster_df[[' Packet Length Mean',
                                   ' Flow IAT Mean',
                                   ' ACK Flag Count',
                                   ' Fwd Packet Length Std',
                                   'Fwd Packets/s',
                                   ' Protocol',
                                   ' Flow Duration']]
    X_train_L2 = preprocessing.scale(X_train_L2)
    y_train_L2 = train_cluster_df[' Label']

    # display(X_train_df)
    # display(y_train_df)
    
    ### Test
#     L2_test_df = test_df
    # display(L2_test_df)
    
    X_test_L2 = test_df[[' Packet Length Mean',
                             ' Flow IAT Mean',
                             ' ACK Flag Count',
                             ' Fwd Packet Length Std',
                             'Fwd Packets/s',
                             ' Protocol',
                             ' Flow Duration']]
    X_test_L2 = pd.DataFrame(preprocessing.scale(X_test_L2)).iloc[fuzzy_idx]
    y_test_L2 = test_df[' Label'].iloc[fuzzy_idx]

        
    # display(X_test_df)
    # display(y_test_df)

    ### Cluster 200
    cluster_num = 200;
    kmeans = KMeans(n_clusters=cluster_num, random_state=0, n_jobs=-1).fit(X_train_L2)
    y_pred_L2 = kmeans.predict(X_test_L2)

    kmeans_out_df = pd.DataFrame({'cluster_labels':kmeans.labels_,'y':y_train_L2})
    y_pred_L2_df = pd.DataFrame({'y_pred':y_pred_L2})

    pure = 0
    for i in range(0,cluster_num):
        tmp_df_0 = kmeans_out_df[(kmeans_out_df['cluster_labels']==i) & (kmeans_out_df['y']==0)]
        tmp_df_1 = kmeans_out_df[(kmeans_out_df['cluster_labels']==i) & (kmeans_out_df['y']==1)]
        tmp_df_2 = kmeans_out_df[(kmeans_out_df['cluster_labels']==i) & (kmeans_out_df['y']==2)]

        most_class = max(len(tmp_df_0),len(tmp_df_1),len(tmp_df_2))
        pure += most_class

        if most_class == len(tmp_df_0):
#             display("These are class 0:", y_pred_L2_df[y_pred_L2_df['y_pred'] == i])
            y_pred_L2_df['y_pred'] = y_pred_L2_df['y_pred'].replace({i:0})
        elif most_class == len(tmp_df_1):
#             display("These are class 1:", y_pred_L2_df[y_pred_L2_df['y_pred'] == i])
            y_pred_L2_df['y_pred'] = y_pred_L2_df['y_pred'].replace({i:1})
        else:
#             display("These are class 2:", y_pred_L2_df[y_pred_L2_df['y_pred'] == i])
            y_pred_L2_df['y_pred'] = y_pred_L2_df['y_pred'].replace({i:2})

#         print('cluster',i ,' class 0 size: ',len(tmp_df_0),' class 1 size: ',len(tmp_df_1),' class 2 size: ',len(tmp_df_2))

#     print(pure, len(y_train_L2))
    print("Hybrid Layer 2: Clustering with Fuzzy Dataset Result")
    print("="*20)
    print('Pure: ', pure/len(y_train_L2))
    
    acc_scr_L2 = metrics.accuracy_score(y_pred_L2_df['y_pred'], y_test_L2)
    cfs_mat_L2 = metrics.confusion_matrix(y_pred_L2_df['y_pred'], y_test_L2)
    print("Accuracy:", acc_scr_L2)
    print("Confusion Metrix:\n", cfs_mat_L2)
    print()
    
    ### Hybrid Accuracy
    hybrid_acc = (acc_scr_L1 * len(y_test_L1) + acc_scr_L2 * len(y_test_L2)) / len(test_df)
    print("Hybrid Result")
    print("="*20)
    print("Accuracy:", hybrid_acc)
    print("Confusion Matrix:\n", cfs_mat_L1 + cfs_mat_L2)

In [29]:
### ID3 + Kmeans

print("="*20)
print("ID3")
print("="*20)

clf_ID3 = DecisionTreeClassifier(criterion='entropy')
fuzzy_data_threshold = 0.6

hybrid(clf_ID3, fuzzy_data_threshold, False)

ID3
# Classification with all Dataset result
Accuracy: 0.7987142857142857
Confusion Metrix:
 [[ 4544    93   372]
 [  360 44896 13158]
 [    3   104  6470]]

Hybrid
# Fuzzy Dataset Size:  5648
# Fuzzy Data Division in Confusion Matrix
[[  80    0    0]
 [  43  106 5063]
 [   0   27  329]]

Hybrid Layer 1: Classification without Fuzzy Dataset Result
Accuracy: 0.8608124067628046
Confusion Metrix:
 [[ 4464    93   372]
 [  317 44790  8095]
 [    3    77  6141]]

Hybrid Layer 2: Clustering with Fuzzy Dataset Result
Pure:  0.9820833333333333
Accuracy: 0.972556657223796
Confusion Metrix:
 [[ 100    0    0]
 [  10    1    0]
 [  13  132 5392]]

Hybrid Result
Accuracy: 0.8698285714285714
Confusion Matrix:
 [[ 4564    93   372]
 [  327 44791  8095]
 [   16   209 11533]]


In [30]:
### Naive Bayes + Kmeans

print("="*20)
print("Naive Bayes")
print("="*20)

clf_NB = GaussianNB()
fuzzy_data_threshold = 0.6

hybrid(clf_NB, fuzzy_data_threshold, True)

Naive Bayes
# Classification with all Dataset result
Accuracy: 0.7906428571428571
Confusion Metrix:
 [[  880    44     0]
 [ 3605 44469 10004]
 [  422   580  9996]]

Hybrid
# Fuzzy Dataset Size:  2543
# Fuzzy Data Division in Confusion Matrix
[[   3    0    0]
 [   3    0    0]
 [   9    0 2528]]

Hybrid Layer 1: Classification without Fuzzy Dataset Result
Accuracy: 0.78292838400759
Confusion Metrix:
 [[  877    44     0]
 [ 3602 44469 10004]
 [  413   580  7468]]

Hybrid Layer 2: Clustering with Fuzzy Dataset Result
Pure:  0.9820833333333333
Accuracy: 0.9941014549744397
Confusion Metrix:
 [[   0    0]
 [  15 2528]]

Hybrid Result
Accuracy: 0.7906


ValueError: operands could not be broadcast together with shapes (3,3) (2,2) 

In [31]:
### Logistric Regression + Kmeans
print("="*20)
print("Logistic Regression")
print("="*20)

clf_LR=LogisticRegression(fit_intercept=True,C=1e15)
fuzzy_data_threshold = 0.6

hybrid(clf_LR, fuzzy_data_threshold, True)

Logistic Regression
# Classification with all Dataset result
Accuracy: 0.7218285714285714
Confusion Metrix:
 [[ 2356   168     1]
 [ 2324 44790 16617]
 [  227   135  3382]]

Hybrid
# Fuzzy Dataset Size:  9885
# Fuzzy Data Division in Confusion Matrix
[[  105     2     0]
 [  384   399 11536]
 [   93    26 -2660]]

Hybrid Layer 1: Classification without Fuzzy Dataset Result
Accuracy: 0.8763869250603011
Confusion Metrix:
 [[ 2251   166     1]
 [ 1940 44391  5081]
 [  134   109  6042]]

Hybrid Layer 2: Clustering with Fuzzy Dataset Result
Pure:  0.9820833333333333
Accuracy: 0.9134041476985332
Confusion Metrix:
 [[ 212    0   60]
 [ 199    1    0]
 [ 171  426 8816]]

Hybrid Result
Accuracy: 0.8816142857142857
Confusion Matrix:
 [[ 2463   166    61]
 [ 2139 44392  5081]
 [  305   535 14858]]
