In [1]:
import math
import numpy as np
import pandas as pd
import glob

### training dataset ###
train_path = r'./training'
train_all_files = glob.glob(train_path + "/*.csv")
train_list = []

for filename in train_all_files:
    reader = pd.read_csv(filename, index_col=None, header=0,nrows=10000,low_memory=False)
    train_list.append(reader)

train_df = pd.concat(train_list, axis=0, ignore_index=True)

train_df[' Label'] = train_df[' Label'].replace({'BENIGN':0,'DrDoS_NTP':1,'TFTP':1,'Syn':2,'DrDoS_DNS':1,'DrDoS_LDAP':1,'DrDoS_NetBIOS':1,'DrDoS_SNMP':1,'UDP-lag':2,\
                                                'DrDoS_SSDP':1,'DrDoS_MSSQL':1,'DrDoS_UDP':2,'WebDDoS':1})
### testing dataset ###
test_path = r'./testing'
test_all_files = glob.glob(test_path + "/*.csv")
test_list = []

for filename in test_all_files:
    reader = pd.read_csv(filename, index_col=None, header=0,nrows=10000,low_memory=False)
    test_list.append(reader)

test_df = pd.concat(test_list, axis=0, ignore_index=True)
                             
test_df[' Label'] = test_df[' Label'].replace({'BENIGN':0,'Syn':2,'LDAP':1,'NetBIOS':1,'Portmap':1,\
                                                'MSSQL':1,'UDP':2})

### training cluster dataset ###
train_cluster_path = r'./training_clustering'
train_cluster_files = glob.glob(train_cluster_path + "/*.csv")
train_cluster_list = []

for filename in train_cluster_files:
    reader = pd.read_csv(filename, index_col=None, header=0,nrows=10000,low_memory=False)
    train_cluster_list.append(reader)
    
train_cluster_df = pd.concat(train_cluster_list, axis=0, ignore_index=True)

train_cluster_df[' Label'] = train_cluster_df[' Label'].replace({'BENIGN':0,'DrDoS_NTP':1,'TFTP':1,'Syn':2,'DrDoS_DNS':1,'DrDoS_LDAP':1,'DrDoS_NetBIOS':1,'DrDoS_SNMP':1,'UDP-lag':2,\
                                                'DrDoS_SSDP':1,'DrDoS_MSSQL':1,'DrDoS_UDP':2,'WebDDoS':1})


In [3]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import preprocessing

### Data preprocessing
X_train = train_df[[' ACK Flag Count',' Fwd Packet Length Std',' Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration']].values
y_train = train_df.loc[:,' Label'].values

X_test = test_df[[' ACK Flag Count',' Fwd Packet Length Std',' Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration']].values
y_test = test_df.loc[:,' Label'].values

### ID3 only

clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_pred,y_test))
print("Confusion Metrix:\n",metrics.confusion_matrix(y_pred,y_test))

y_pred_proba = clf.predict_proba(X_test)
L1_df=pd.DataFrame({'y_ture':y_test,'y_pred':y_pred, 'y_P(c0)':y_pred_proba[:,0], 'y_P(c1)':y_pred_proba[:,1], 'y_P(c2)':y_pred_proba[:,2]})

display(L1_df)

Accuracy: 0.7985428571428571
Confusion Metrix:
 [[ 4540    93   372]
 [  365 44893 13163]
 [    2   107  6465]]


Unnamed: 0,y_ture,y_pred,y_P(c0),y_P(c1),y_P(c2)
0,1,1,0.0,0.795676,0.204324
1,1,1,0.0,0.699402,0.300598
2,1,1,0.0,0.795676,0.204324
3,1,1,0.0,0.795676,0.204324
4,1,1,0.0,0.795676,0.204324
...,...,...,...,...,...
69995,2,1,0.0,0.726974,0.273026
69996,2,1,0.0,0.699402,0.300598
69997,2,1,0.0,0.795676,0.204324
69998,2,2,0.0,0.000000,1.000000


In [5]:
### Clustering only with all training set
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score

### Data preprocessing

X_train = train_df[[' ACK Flag Count',' Fwd Packet Length Std',' Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration']].values
X_train = preprocessing.scale(X_train)
y_train = train_df.loc[:,' Label'].values

X_test = test_df[[' ACK Flag Count',' Fwd Packet Length Std',' Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration']].values
X_test = preprocessing.scale(X_test)
y_test = test_df.loc[:,' Label'].values

### Clustering only with all training setfrom sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=0, n_jobs=-1).fit(X_train)



y_pred = kmeans.predict(X_test)
print("Accuracy:", adjusted_mutual_info_score(y_pred, y_test))

Accuracy: 0.3057308681104789


In [7]:
### Clustering only with specified training set

### Data preprocessing
X_train = train_cluster_df[[' Packet Length Mean',' Flow IAT Mean',' ACK Flag Count',' Fwd Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration']].values
X_train = preprocessing.scale(X_train)
y_train = train_cluster_df[' Label'].values

X_test = test_df[[' Packet Length Mean',' Flow IAT Mean',' ACK Flag Count',' Fwd Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration']].values
X_test = preprocessing.scale(X_test)
y_test = test_df.loc[:,' Label'].values

### Clustering only with specified training set

kmeans = KMeans(n_clusters=3, random_state=0, n_jobs=-1).fit(X_train)

y_pred = kmeans.predict(X_test)

y_pred = pd.DataFrame({'y':y_pred})
y_train = pd.DataFrame({'y':y_train})

y_pred_tmp = y_pred

idx = [0, 1, 2]

for i in range(3):
    tmp_df_0 = y_pred_tmp[(y_pred_tmp['y'] == i) & (y_train['y'] == 0)]
    tmp_df_1 = y_pred_tmp[(y_pred_tmp['y'] == i) & (y_train['y'] == 1)]
    tmp_df_2 = y_pred_tmp[(y_pred_tmp['y'] == i) & (y_train['y'] == 2)]    
    
    most_class = max(len(tmp_df_0),max(len(tmp_df_1),len(tmp_df_2)))
    
    if most_class == len(tmp_df_0):
        idx[i] = 0
    elif most_class == len(tmp_df_1):
        idx[i] = 1
    else:
        idx[i] = 2

    print('cluster',i ,' class 0 size: ',len(tmp_df_0),' class 1 size: ',len(tmp_df_1),' class 2 size: ',len(tmp_df_2))
    
y_pred = y_pred.replace([0, 1, 2], [idx[0], idx[1], idx[2]])
print("Accuracy:", metrics.accuracy_score(y_pred,y_test))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_pred,y_test))

cluster 0  class 0 size:  0  class 1 size:  15  class 2 size:  747
cluster 1  class 0 size:  2  class 1 size:  213  class 2 size:  11138
cluster 2  class 0 size:  34  class 1 size:  29738  class 2 size:  18113
Accuracy: 0.8482428571428572
Confusion Matrix:
 [[    0     0     0]
 [ 3421 44459  5082]
 [ 1486   634 14918]]


In [179]:
### Hybrid
fuzzy_df = L1_df[((L1_df['y_P(c0)'] > 0.4) & (L1_df['y_P(c0)'] < 0.6) & (L1_df['y_P(c1)'] > 0.4) & (L1_df['y_P(c1)'] < 0.6))\
              | ((L1_df['y_P(c0)'] > 0.4) & (L1_df['y_P(c0)'] < 0.6) & (L1_df['y_P(c2)'] > 0.4) & (L1_df['y_P(c2)'] < 0.6))\
              | ((L1_df['y_P(c2)'] > 0.4) & (L1_df['y_P(c2)'] < 0.6) & (L1_df['y_P(c1)'] > 0.4) & (L1_df['y_P(c1)'] < 0.6))]

display(fuzzy_df)

Unnamed: 0,y_ture,y_pred,y_P(c0),y_P(c1),y_P(c2)
5289,0,1,0.0,0.578756,0.421244
5290,0,1,0.0,0.594120,0.405880
5292,0,1,0.0,0.594120,0.405880
5293,0,1,0.0,0.578756,0.421244
5294,0,1,0.0,0.578756,0.421244
...,...,...,...,...,...
49989,2,1,0.0,0.597403,0.402597
49991,2,1,0.0,0.594120,0.405880
49992,2,1,0.0,0.571429,0.428571
49994,2,1,0.0,0.578756,0.421244


In [180]:
print(fuzzy_df.index)

Int64Index([ 5289,  5290,  5292,  5293,  5294,  5327,  5328,  5329,  5330,
             5331,
            ...
            49980, 49982, 49983, 49984, 49987, 49989, 49991, 49992, 49994,
            49999],
           dtype='int64', length=5650)


In [181]:
test_df_tmp = test_df[[' ACK Flag Count',' Fwd Packet Length Std',' Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration',' Label']]
display(test_df_tmp)

test_df_fuzzy_remove = test_df_tmp.drop(X_test_fuzzy_remove_tmp.index[fuzzy_df.index])
display(test_df_fuzzy_remove)

X_test_fuzzy_remove = test_df_fuzzy_remove[[' ACK Flag Count',' Fwd Packet Length Std',' Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration']].values
y_test_fuzzy_remove = test_df_fuzzy_remove[' Label'].values

y_pred_fuzzy_remove = clf.predict(X_test_fuzzy_remove)

print("Accuracy:",metrics.accuracy_score(y_pred_fuzzy_remove,y_test_fuzzy_remove))
print("Confusion Metrix:\n",metrics.confusion_matrix(y_pred_fuzzy_remove,y_test_fuzzy_remove))


Unnamed: 0,ACK Flag Count,Fwd Packet Length Std,Packet Length Std,Fwd Packets/s,Protocol,Flow Duration,Label
0,0,0.00000,0.00000,2.000000e+06,17,1,1
1,0,0.00000,0.00000,4.166667e+04,17,48,1
2,0,0.00000,0.00000,2.000000e+06,17,1,1
3,0,0.00000,0.00000,2.000000e+06,17,1,1
4,0,0.00000,0.00000,2.000000e+06,17,1,1
...,...,...,...,...,...,...,...
69995,0,0.00000,0.00000,0.000000e+00,17,0,2
69996,0,0.00000,0.00000,4.166667e+04,17,48,2
69997,0,0.00000,0.00000,2.000000e+06,17,1,2
69998,0,22.51666,21.36118,3.720342e+01,17,107517,2


Unnamed: 0,ACK Flag Count,Fwd Packet Length Std,Packet Length Std,Fwd Packets/s,Protocol,Flow Duration,Label
0,0,0.00000,0.00000,2.000000e+06,17,1,1
1,0,0.00000,0.00000,4.166667e+04,17,48,1
2,0,0.00000,0.00000,2.000000e+06,17,1,1
3,0,0.00000,0.00000,2.000000e+06,17,1,1
4,0,0.00000,0.00000,2.000000e+06,17,1,1
...,...,...,...,...,...,...,...
69995,0,0.00000,0.00000,0.000000e+00,17,0,2
69996,0,0.00000,0.00000,4.166667e+04,17,48,2
69997,0,0.00000,0.00000,2.000000e+06,17,1,2
69998,0,22.51666,21.36118,3.720342e+01,17,107517,2


Accuracy: 0.8608547008547008
Confusion Metrix:
 [[ 4464    93   372]
 [  315 44787  8091]
 [    3    80  6145]]


In [224]:
from sklearn import preprocessing 
import glob

path = r'./training_clustering'

train_files_cluster = glob.glob(path + "/*.csv")
train_list_cluster = []

for filename in train_files_cluster:
    reader = pd.read_csv(filename, index_col=None, header=0,nrows=10000,low_memory=False)
    train_list_cluster.append(reader)
    
train_df_cluster = pd.concat(train_list_cluster, axis=0, ignore_index=True)

train_df_cluster[' Label'] = train_df_cluster[' Label'].replace({'BENIGN':0,'DrDoS_NTP':1,'TFTP':1,'Syn':2,'DrDoS_DNS':1,'DrDoS_LDAP':1,'DrDoS_NetBIOS':1,'DrDoS_SNMP':1,'UDP-lag':2,\
                                                'DrDoS_SSDP':1,'DrDoS_MSSQL':1,'DrDoS_UDP':2,'WebDDoS':1})


L2_train_df = train_df_cluster[[' Packet Length Mean',' Flow IAT Mean',' ACK Flag Count',' Fwd Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration']]


L2_test_df = test_df[[' Packet Length Mean',' Flow IAT Mean',' ACK Flag Count',' Fwd Packet Length Std','Fwd Packets/s',' Protocol',' Flow Duration']].values
L2_test_df = preprocessing.scale(L2_test_df)
L2_test_df = pd.DataFrame(L2_test_df)

X_train_L2 = preprocessing.scale(L2_train_df)
y_train_L2 = train_df_cluster[' Label'].values


X_test_L2 = L2_test_df.iloc[fuzzy_df.index,:]
X_test_L2 = X_test_L2.values
y_test_L2= test_df[[' Label']].iloc[fuzzy_df.index,:]
y_test_L2 = y_test_L2.values


In [226]:
display(L2_train_df)
display(y_train_L2)

Unnamed: 0,Packet Length Mean,Flow IAT Mean,ACK Flag Count,Fwd Packet Length Std,Fwd Packets/s,Protocol,Flow Duration
0,0.0,106.395910,0,0.00000,9.395904e+03,0,9141643
1,1472.0,1.000000,0,0.00000,2.000000e+06,17,1
2,1472.0,2.000000,0,0.00000,1.000000e+06,17,2
3,1472.0,1.000000,0,0.00000,2.000000e+06,17,1
4,1472.0,2.000000,0,0.00000,1.000000e+06,17,2
...,...,...,...,...,...,...,...
59995,345.6,35347.000000,0,22.51666,3.772126e+01,17,106041
59996,383.0,1.000000,0,0.00000,2.000000e+06,17,1
59997,401.0,1.000000,0,0.00000,2.000000e+06,17,1
59998,345.6,35760.333333,0,22.51666,3.728526e+01,17,107281


array([1, 1, 1, ..., 2, 2, 2], dtype=int64)

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=0m n_jobs=-1).fit(X_train_L2)
y_pred_L2 = kmeans.predict(X_test_L2)

kmeans_out_df = pd.DataFrame({'cluster_labels':kmeans.labels_,'y':y_train_L2})
y_pred_L2_df = pd.DataFrame({'y_pred':y_pred_L2})

In [227]:
from sklearn.cluster import KMeans
"""
X_train_L2 = L2_train_df[[' Flow IAT Mean',' ACK Flag Count',' Packet Length Mean','Fwd Packets/s',' Protocol',' Flow Duration']].values
X_train_L2 = preprocessing.scale(X_train_L2)
y_train_L2 = L2_train_df[' Label'].values
"""

cluster_num = 200;
kmeans = KMeans(n_clusters=cluster_num, random_state=0, n_jobs=-1).fit(X_train_L2)
y_pred_L2 = kmeans.predict(X_test_L2)

kmeans_out_df = pd.DataFrame({'cluster_labels':kmeans.labels_,'y':y_train_L2})
y_pred_L2_df = pd.DataFrame({'y_pred':y_pred_L2})



#tmp_df = kmeans_out_df.iloc[fuzzy_df.index,:]
#tmp_df2 = y_pred_L2_df

#tmp_df2 = pd.DataFrame({'y_pred_cluster':y_pred_L2,'kmeans_cluster':tmp_df['cluster_labels']})
#tmp_df2.to_csv('tmp_out.csv', index=False)

#y_pred_L2_df.to_csv('kmeans_y_pred.csv', index=False)



pure = 0
for i in range(0,cluster_num):
    tmp_df_0 = kmeans_out_df[(kmeans_out_df['cluster_labels']==i) & (kmeans_out_df['y']==0)]
    tmp_df_1 = kmeans_out_df[(kmeans_out_df['cluster_labels']==i) & (kmeans_out_df['y']==1)]
    tmp_df_2 = kmeans_out_df[(kmeans_out_df['cluster_labels']==i) & (kmeans_out_df['y']==2)]
    
    
    most_class = max(len(tmp_df_0),max(len(tmp_df_1),len(tmp_df_2)))
    pure+= most_class
    
    if most_class == len(tmp_df_0):
        #kmeans_out_df['cluster_labels'] = kmeans_out_df['cluster_labels'].replace({i:0})
        y_pred_L2_df['y_pred'] = y_pred_L2_df['y_pred'].replace({i:0})
    elif most_class == len(tmp_df_1):
        #kmeans_out_df['cluster_labels'] = kmeans_out_df['cluster_labels'].replace({i:1})
        y_pred_L2_df['y_pred'] = y_pred_L2_df['y_pred'].replace({i:1})
    else:
        #kmeans_out_df['cluster_labels'] = kmeans_out_df['cluster_labels'].replace({i:2})
        y_pred_L2_df['y_pred'] = y_pred_L2_df['y_pred'].replace({i:2})
    

 
    print('cluster',i ,' class 0 size: ',len(tmp_df_0),' class 1 size: ',len(tmp_df_1),' class 2 size: ',len(tmp_df_2))

print('precision: ',pure/len(X_train_L2))
#kmeans_out_df.to_csv('kmeans_out.csv', index=False)
#y_pred_L2_df.to_csv('kmeans_y_pred_postprocess.csv', index=False)


cluster 0  class 0 size:  0  class 1 size:  11889  class 2 size:  0
cluster 1  class 0 size:  0  class 1 size:  0  class 2 size:  54
cluster 2  class 0 size:  0  class 1 size:  0  class 2 size:  3080
cluster 3  class 0 size:  0  class 1 size:  0  class 2 size:  3704
cluster 4  class 0 size:  0  class 1 size:  654  class 2 size:  2604
cluster 5  class 0 size:  0  class 1 size:  5160  class 2 size:  0
cluster 6  class 0 size:  0  class 1 size:  0  class 2 size:  1310
cluster 7  class 0 size:  0  class 1 size:  198  class 2 size:  382
cluster 8  class 0 size:  0  class 1 size:  0  class 2 size:  3183
cluster 9  class 0 size:  0  class 1 size:  0  class 2 size:  96
cluster 10  class 0 size:  0  class 1 size:  1263  class 2 size:  0
cluster 11  class 0 size:  0  class 1 size:  0  class 2 size:  642
cluster 12  class 0 size:  1  class 1 size:  0  class 2 size:  0
cluster 13  class 0 size:  0  class 1 size:  0  class 2 size:  18
cluster 14  class 0 size:  0  class 1 size:  131  class 2 size: 

cluster 184  class 0 size:  0  class 1 size:  0  class 2 size:  79
cluster 185  class 0 size:  0  class 1 size:  0  class 2 size:  4
cluster 186  class 0 size:  0  class 1 size:  0  class 2 size:  179
cluster 187  class 0 size:  0  class 1 size:  0  class 2 size:  6
cluster 188  class 0 size:  0  class 1 size:  57  class 2 size:  0
cluster 189  class 0 size:  0  class 1 size:  0  class 2 size:  102
cluster 190  class 0 size:  0  class 1 size:  330  class 2 size:  1
cluster 191  class 0 size:  0  class 1 size:  4  class 2 size:  0
cluster 192  class 0 size:  0  class 1 size:  99  class 2 size:  321
cluster 193  class 0 size:  0  class 1 size:  1  class 2 size:  0
cluster 194  class 0 size:  0  class 1 size:  0  class 2 size:  89
cluster 195  class 0 size:  0  class 1 size:  777  class 2 size:  11
cluster 196  class 0 size:  0  class 1 size:  0  class 2 size:  101
cluster 197  class 0 size:  0  class 1 size:  27  class 2 size:  0
cluster 198  class 0 size:  0  class 1 size:  0  class 2 s

In [230]:
print("Accuracy:",metrics.accuracy_score(y_pred_L2_df['y_pred'],y_test_L2))0

Accuracy: 0.9725663716814159


In [215]:
display(tmp_df)
display(tmp_df2)

Unnamed: 0,cluster_labels,y
5289,8,0
5290,2,0
5292,2,0
5293,8,0
5294,8,0
...,...,...
49989,2,2
49991,2,2
49992,2,2
49994,8,2


Unnamed: 0,y_pred
0,8
1,2
2,2
3,8
4,8
...,...
5645,2
5646,2
5647,2
5648,8


In [152]:

display(tmp_df)
display(y_pred_L2)


Unnamed: 0,cluster_labels,y
5289,2,1
5290,2,1
5292,2,1
5293,2,1
5294,2,1
...,...,...
49989,7,0
49991,69,1
49992,69,1
49994,7,1


array([164, 142, 142, ..., 143, 366, 366])

In [201]:
from sklearn.cluster import KMeans
import numpy as np
X = np.array([[1, 2], [1, 4], [1, 0],[10, 2], [10, 4], [10, 0]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
display(kmeans.labels_)
display(kmeans.predict([[1, 0], [10, 2]]))


array([1, 1, 1, 0, 0, 0])

array([1, 0])