In [1]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io
from sklearn import preprocessing
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from numpy import array
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [2]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess2017(data):
    data.dropna(how='any', inplace=True)
#    print(len(data))
    data.reset_index(inplace=True, drop=True)
    labels = data[' Label']
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # 删除数据中可能出现的非数值型数据列
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0']
    
    # 如果有list中的columns则删除
    data.drop(data.columns[data.columns.isin(dropset+drop_nega+drop_ifexist)], axis = 1, inplace=True)
    

    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值 (需要连续执行两次，不知道原因，但连续执行后就可删除数据中的inf)
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
   # print(idx)
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))

    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    return data,labels

def preprocess2019(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data

# 数据切割 
def split_train_test(data,test_ratio):
    #设置随机数种子，保证每次生成的结果都是一样的
    np.random.seed(42)
    #permutation随机生成0-len(data)随机序列
    shuffled_indices = np.random.permutation(len(data))
    #test_ratio为测试集所占的百分比
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    #iloc选择参数序列中所对应的行
    return data.iloc[train_indices],data.iloc[test_indices]

def meanScaler(data):
    cols = data.columns
    means = data.mean()
    loc = np.where(means == 0.0)[0]
    data = data.drop(cols[loc],axis=1)
    means = means.drop(cols[loc])
    # print(len(allX), len(means))
    # 均值归一化
    data = data / means
    return data

## 数据载入与预处理
载入训练集与测试集，训练集数据规模为200000，测试集数据规模为50000

In [3]:
train_set_dir = '/mnt/hgfs/linuxfile/train_set/'
test_set_dir = '/mnt/hgfs/linuxfile/test_set/'

In [4]:
trainfiles = os.listdir(train_set_dir)
testfiles = os.listdir(test_set_dir)

In [5]:
# 数据过大，使用较小的数据用于学习

#训练集使用每个标签各200000行的demo用于训练
train_set_demo = pd.DataFrame()
numfiles = len(trainfiles)
for i in range(numfiles):
    tdata = pd.read_csv(train_set_dir+trainfiles[i], nrows=200000)
    print('from ' + trainfiles[i]+' get '+str(len(tdata)) + ' rows data')
    train_set_demo = train_set_demo.append(tdata)
    train_set_demo.reset_index(inplace=True, drop=True)

# 测试集使用每个标签各50000行的demo测试
test_set_demo = pd.DataFrame()
numfiles = len(testfiles)
for i in range(numfiles):
    tdata = pd.read_csv(test_set_dir+testfiles[i], nrows=50000)
    print('from ' + testfiles[i]+' get '+str(len(tdata)) + ' rows data')
    test_set_demo = test_set_demo.append(tdata)
    test_set_demo.reset_index(inplace=True, drop=True)

from train_BENIGN_data.csv get 42527 rows data
from train_DrDoS_DNS.csv get 200000 rows data
from train_DrDoS_LDAP.csv get 200000 rows data
from train_DrDoS_MSSQL.csv get 200000 rows data
from train_DrDoS_NetBIOS.csv get 200000 rows data
from train_DrDoS_NTP.csv get 200000 rows data
from train_DrDoS_SNMP.csv get 200000 rows data
from train_DrDoS_SSDP.csv get 200000 rows data
from train_DrDoS_UDP.csv get 200000 rows data
from train_Syn.csv get 200000 rows data
from train_TFTP.csv get 200000 rows data
from test_BENIGN_data.csv get 10631 rows data
from test_DrDoS_DNS.csv get 50000 rows data
from test_DrDoS_LDAP.csv get 50000 rows data
from test_DrDoS_MSSQL.csv get 50000 rows data
from test_DrDoS_NetBIOS.csv get 50000 rows data
from test_DrDoS_NTP.csv get 50000 rows data
from test_DrDoS_SNMP.csv get 50000 rows data
from test_DrDoS_SSDP.csv get 50000 rows data
from test_DrDoS_UDP.csv get 50000 rows data
from test_Syn.csv get 50000 rows data
from test_TFTP.csv get 50000 rows data


In [6]:
## 20features
ExtraTreeFeatures = [' Total Fwd Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Max', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', ' Subflow Fwd Bytes']

LogisticRegressionFeatures =['Total Length of Fwd Packets', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', 'Fwd IAT Total', ' Fwd IAT Std', 'Fwd Packets/s', ' Min Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes']
# ## 30 features
# ExtraTreeFeatures=[' Flow Duration', ' Total Fwd Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes']
# LogisticRegressionFeatures=[' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Std', ' Bwd Header Length', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' PSH Flag Count', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes', ' Active Min']
allcols = list(set(ExtraTreeFeatures).union(set(LogisticRegressionFeatures)))
allcols.append(' Label')
numcols = len(allcols)

In [None]:
train_features, train_labels = preprocess2017(train_set_demo)
test_features, test_labels = preprocess2017(test_set_demo)
del train_set_demo, test_set_demo

In [None]:
## 数据处理
# labels 转换成用数表示其分类结果
labelset = set(train_labels)
label_dict = {}
i = 0
for l in labelset:
    label_dict[l] = i
    train_labels.replace(to_replace=l, value=i, inplace=True)
    test_labels.replace(to_replace=l, value=i, inplace=True)
    i = i + 1

* 'DrDoS_SNMP': 0,
* 'BENIGN': 1,
* 'DrDoS_DNS': 2,
* 'DrDoS_SSDP': 3,
* 'Syn': 4,
* 'WebDDoS': 5,
* 'DrDoS_UDP': 6,
* 'DrDoS_NTP': 7,
* 'DrDoS_MSSQL': 8,
* 'DrDoS_LDAP': 9,
* 'UDP-lag': 10,
* 'DrDoS_NetBIOS': 11,
* 'TFTP': 12

In [None]:
# X,Y数据提取
train_X = meanScaler(train_features).values
train_Y = train_labels.values

test_X = meanScaler(test_features).values
test_Y = test_labels.values
del train_features, train_labels, test_features, test_labels

In [None]:
len(test_X), len(test_Y)

In [None]:
## 数据无量纲化处理
# 标准化
# transScaler = preprocessing.StandardScaler().fit(X)
# X_ = transScaler.transform(X)
# # min_max归一化
# transScaler = preprocessing.MinMaxScaler().fit(X)
# X_ = transScaler.transform(X)
# X_ = X
## mean归一化


## 训练，使用多种模型尝试

### 极限树分类

In [None]:
# 极限树分类
Extra_model = ExtraTreesClassifier(random_state = 1).fit(train_X,train_Y)

test_Y_hat = Extra_model.predict(test_X)



In [None]:
print(accuracy_score(test_Y, test_Y_hat))
print(precision_score(test_Y, test_Y_hat,average='micro'))
print(recall_score(test_Y, test_Y_hat,average='micro'))
print(f1_score(test_Y, test_Y_hat,average='micro'))
print(classification_report(test_Y, test_Y_hat))

#### extraTree 30 features
0.7399039497848786
0.7399039497848786
0.7399039497848786
0.7399039497848786

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     51199
           1       0.48      0.35      0.40     49221
           2       1.00      0.99      1.00     48560
           3       0.59      0.98      0.74     43546
           4       0.50      0.64      0.56     49063
           5       0.49      0.65      0.55     49340
           6       0.32      0.28      0.30        53
           7       0.90      0.21      0.34     44564
           8       0.61      0.87      0.72     49897
           9       0.93      0.95      0.94     48595
          10       0.95      0.99      0.97     48395
          11       0.84      0.22      0.35     48389
          12       0.99      1.00      1.00     49083

    accuracy                           0.74    579905
   macro avg       0.74      0.70      0.68    579905
weighted avg       0.77      0.74      0.72    579905



### 随机森林分类

In [None]:
RandomForest_model = RandomForestClassifier().fit(train_X,train_Y)

test_Y_hat = RandomForest_model.predict(test_X)

print(accuracy_score(test_Y, test_Y_hat))
print(precision_score(test_Y, test_Y_hat,average='micro'))
print(recall_score(test_Y, test_Y_hat,average='micro'))
print(f1_score(test_Y, test_Y_hat,average='micro'))
print(classification_report(test_Y, test_Y_hat))

#### randomforest 30 features

0.7465533147670739

0.7465533147670739

0.7465533147670739

0.7465533147670738

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     51199
           1       0.48      0.27      0.34     49221
           2       1.00      0.99      1.00     48560
           3       0.59      0.98      0.74     43546
           4       0.51      0.68      0.58     49063
           5       0.48      0.73      0.58     49340
           6       0.36      0.19      0.25        53
           7       0.90      0.21      0.34     44564
           8       0.67      0.85      0.75     49897
           9       0.93      0.95      0.94     48595
          10       0.95      0.99      0.97     48395
          11       0.79      0.28      0.41     48389
          12       0.99      1.00      1.00     49083

    accuracy                           0.75    579905
   macro avg       0.74      0.70      0.68    579905
weighted avg       0.78      0.75      0.72    579905

### SVM分类

In [None]:
# from sklearn.svm import SVC

In [None]:
# svm_model = SVC(kernel='linear', C = 1.0, random_state=1).fit(train_X, train_Y)

# test_Y_hat = svm_model.predict(test_X)

# print(accuracy_score(test_Y, test_Y_hat))
# print(precision_score(test_Y, test_Y_hat,average='micro'))
# print(recall_score(test_Y, test_Y_hat,average='micro'))
# print(f1_score(test_Y, test_Y_hat,average='micro'))
# # print(classification_report(test_Y, test_Y_hat))

### KNN分类

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import cross_val_score

In [None]:
# ## 交叉验证选择合适的K值
# k_range = range(1,20)
# k_error = []
# for k in k_range:
#     print('第' + str(k) + '次检验')
#     knn = KNeighborsClassifier(n_neighbors=k)
#     scores = cross_val_score(knn, test_X, test_Y, cv = 6, scoring='accuracy')
#     k_error.append(1-scores.mean())

### GBDT 分类

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

GBDT_model = GradientBoostingClassifier(n_estimators=200).fit(train_X, train_Y)
test_Y_hat = GBDT_model.predict(test_X)

print(accuracy_score(test_Y, test_Y_hat))
print(precision_score(test_Y, test_Y_hat,average='micro'))
print(recall_score(test_Y, test_Y_hat,average='micro'))
print(f1_score(test_Y, test_Y_hat,average='micro'))
print(classification_report(test_Y, test_Y_hat))

### GDBT 30 features
0.7754756401586644

0.7754756401586644

0.7754756401586644

0.7754756401586644

            precision    recall  f1-score   support

           0       0.52      0.76      0.62     49340
           1       0.74      0.27      0.40     48389
           2       0.92      0.95      0.93     48595
           3       0.53      0.25      0.34     49221
           4       0.99      1.00      1.00     43546
           5       0.50      0.66      0.57     49063
           6       0.95      0.99      0.97     48395
           7       0.67      0.85      0.75     49897
           8       1.00      0.99      1.00     48560
           9       0.99      0.99      0.99     11296
          10       0.99      1.00      1.00     49083

    accuracy                           0.78    495385
   macro avg       0.80      0.79      0.78    495385
weighted avg       0.78      0.78      0.76    495385



### AdaBoost 分类

In [None]:
from sklearn.ensemble import AdaBoostClassifier

AdaBoost_model = AdaBoostClassifier().fit(train_X, train_Y)
test_Y_hat = AdaBoost_model.predict(test_X)

print(accuracy_score(test_Y, test_Y_hat))
print(precision_score(test_Y, test_Y_hat,average='micro'))
print(recall_score(test_Y, test_Y_hat,average='micro'))
print(f1_score(test_Y, test_Y_hat,average='micro'))
print(classification_report(test_Y, test_Y_hat))

### AdaBoost 30 features
0.5558747236997487

0.5558747236997487

0.5558747236997487

0.5558747236997487

              precision    recall  f1-score   support

           0       0.32      0.08      0.12     49340
           1       0.12      0.13      0.12     48389
           2       0.37      0.87      0.52     48595
           3       0.44      0.35      0.39     49221
           4       0.98      1.00      0.99     43546
           5       0.40      0.73      0.52     49063
           6       0.93      0.99      0.96     48395
           7       0.33      0.00      0.00     49897
           8       0.99      0.54      0.70     48560
           9       0.93      0.94      0.93     11296
          10       0.80      0.86      0.83     49083

    accuracy                           0.56    495385
   macro avg       0.60      0.59      0.55    495385
weighted avg       0.57      0.56      0.52    495385



In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr_model = LogisticRegression(penalty='l2').fit(train_X, train_Y)
test_Y_hat = lr_model.predict(test_X)

In [None]:
print(accuracy_score(test_Y, test_Y_hat))
print(precision_score(test_Y, test_Y_hat,average='micro'))
print(recall_score(test_Y, test_Y_hat,average='micro'))
print(f1_score(test_Y, test_Y_hat,average='micro'))
# print(classification_report(test_Y, test_Y_hat))