In [4]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io
from sklearn import preprocessing
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from numpy import array
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [5]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess2017(data):
    data.dropna(how='any', inplace=True)
#    print(len(data))
    data.reset_index(inplace=True, drop=True)
    labels = data[' Label']
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # 删除数据中可能出现的非数值型数据列
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0']
    
    # 如果有list中的columns则删除
    data.drop(data.columns[data.columns.isin(dropset+drop_nega+drop_ifexist)], axis = 1, inplace=True)
    

    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值 (需要连续执行两次，不知道原因，但连续执行后就可删除数据中的inf)
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
   # print(idx)
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))

    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    return data,labels

def preprocess2019(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data

# 数据切割 
def split_train_test(data,test_ratio):
    #设置随机数种子，保证每次生成的结果都是一样的
    np.random.seed(42)
    #permutation随机生成0-len(data)随机序列
    shuffled_indices = np.random.permutation(len(data))
    #test_ratio为测试集所占的百分比
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    #iloc选择参数序列中所对应的行
    return data.iloc[train_indices],data.iloc[test_indices]

def meanScaler(data):
    cols = data.columns
    means = data.mean()
    loc = np.where(means == 0.0)[0]
    data = data.drop(cols[loc],axis=1)
    means = means.drop(cols[loc])
    # print(len(allX), len(means))
    # 均值归一化
    data = data / means
    return data

## 数据载入与预处理
载入训练集与测试集，训练集数据规模为200000，测试集数据规模为50000

In [6]:
train_set_dir = '/mnt/hgfs/linuxfile/train_set/'
test_set_dir = '/mnt/hgfs/linuxfile/test_set/'

In [7]:
trainfiles = os.listdir(train_set_dir)
testfiles = os.listdir(test_set_dir)

In [8]:
# 数据过大，使用较小的数据用于学习

#训练集使用每个标签各200000行的demo用于训练
train_set_demo = pd.DataFrame()
numfiles = len(trainfiles)
for i in range(numfiles):
    tdata = pd.read_csv(train_set_dir+trainfiles[i], nrows=200000)
    train_set_demo = train_set_demo.append(tdata)
    train_set_demo.reset_index(inplace=True, drop=True)

# 测试集使用每个标签各50000行的demo测试
test_set_demo = pd.DataFrame()
numfiles = len(testfiles)
for i in range(numfiles):
    tdata = pd.read_csv(test_set_dir+testfiles[i], nrows=50000)
    print('from ' + testfiles[i]+' get '+str(len(tdata)) + ' rows data')
    test_set_demo = test_set_demo.append(tdata)
    test_set_demo.reset_index(inplace=True, drop=True)

from test_BENIGN_data.csv get 10631 rows data
from test_DrDoS_DNS.csv get 50000 rows data
from test_DrDoS_LDAP.csv get 50000 rows data
from test_DrDoS_MSSQL.csv get 50000 rows data
from test_DrDoS_NetBIOS.csv get 50000 rows data
from test_DrDoS_NTP.csv get 50000 rows data
from test_DrDoS_SNMP.csv get 50000 rows data
from test_DrDoS_SSDP.csv get 50000 rows data
from test_DrDoS_UDP.csv get 50000 rows data
from test_Syn.csv get 50000 rows data
from test_TFTP.csv get 50000 rows data


In [9]:
ExtraTreeFeatures=[' Flow Duration', ' Total Fwd Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes']
LogisticRegressionFeatures=[' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Std', ' Bwd Header Length', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' PSH Flag Count', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes', ' Active Min']
allcols = list(set(ExtraTreeFeatures).union(set(LogisticRegressionFeatures)))
allcols.append(' Label')
numcols = len(allcols)

In [10]:
train_features, train_labels = preprocess2017(train_set_demo)
test_features, test_labels = preprocess2017(test_set_demo)
del train_set_demo, test_set_demo

In [11]:
## 数据处理
# labels 转换成用数表示其分类结果
labelset = set(train_labels)
label_dict = {}
i = 0
for l in labelset:
    label_dict[l] = i
    train_labels.replace(to_replace=l, value=i, inplace=True)
    test_labels.replace(to_replace=l, value=i, inplace=True)
    i = i + 1

* 'DrDoS_SNMP': 0,
* 'BENIGN': 1,
* 'DrDoS_DNS': 2,
* 'DrDoS_SSDP': 3,
* 'Syn': 4,
* 'WebDDoS': 5,
* 'DrDoS_UDP': 6,
* 'DrDoS_NTP': 7,
* 'DrDoS_MSSQL': 8,
* 'DrDoS_LDAP': 9,
* 'UDP-lag': 10,
* 'DrDoS_NetBIOS': 11,
* 'TFTP': 12

In [12]:
# X,Y数据提取
train_X = meanScaler(train_features).values
train_Y = train_labels.values

test_X = meanScaler(test_features).values
test_Y = test_labels.values
del train_features, train_labels, test_features, test_labels

In [13]:
len(test_X), len(test_Y)

(495385, 495385)

### 将处理好的数据给保存成mat形式，用于MATLAB处理

In [14]:
## train_set
train_X_path = '/mnt/hgfs/linuxfile/mat/train_X.mat'
io.savemat(train_X_path,{'train_X':train_X})

train_Y_path = '/mnt/hgfs/linuxfile/mat/train_Y.mat'
io.savemat(train_Y_path,{'train_Y':train_Y})
## test_set
test_X_path = '/mnt/hgfs/linuxfile/mat/test_X.mat'
io.savemat(test_X_path,{'test_X':test_X})

test_Y_path = '/mnt/hgfs/linuxfile/mat/test_Y.mat'
io.savemat(test_Y_path,{'test_Y':test_Y})

### 数据无量纲化处理

In [15]:

# 标准化
# transScaler = preprocessing.StandardScaler().fit(X)
# X_ = transScaler.transform(X)
# # min_max归一化
# transScaler = preprocessing.MinMaxScaler().fit(X)
# X_ = transScaler.transform(X)
# X_ = X
## mean归一化


## 训练，使用多种模型尝试

### 极限树分类

In [16]:
# # 极限树分类
# Extra_model = ExtraTreesClassifier(random_state = 1).fit(train_X,train_Y)

# test_Y_hat = Extra_model.predict(test_X)



In [17]:
# print(accuracy_score(test_Y, test_Y_hat))
# print(precision_score(test_Y, test_Y_hat,average='micro'))
# print(recall_score(test_Y, test_Y_hat,average='micro'))
# print(f1_score(test_Y, test_Y_hat,average='micro'))
# print(classification_report(test_Y, test_Y_hat))

#### extraTree
0.7399039497848786
0.7399039497848786
0.7399039497848786
0.7399039497848786

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     51199
           1       0.48      0.35      0.40     49221
           2       1.00      0.99      1.00     48560
           3       0.59      0.98      0.74     43546
           4       0.50      0.64      0.56     49063
           5       0.49      0.65      0.55     49340
           6       0.32      0.28      0.30        53
           7       0.90      0.21      0.34     44564
           8       0.61      0.87      0.72     49897
           9       0.93      0.95      0.94     48595
          10       0.95      0.99      0.97     48395
          11       0.84      0.22      0.35     48389
          12       0.99      1.00      1.00     49083

    accuracy                           0.74    579905
   macro avg       0.74      0.70      0.68    579905
weighted avg       0.77      0.74      0.72    579905



### 随机森林分类

In [18]:
# RandomForest_model = RandomForestClassifier().fit(train_X,train_Y)

# test_Y_hat = RandomForest_model.predict(test_X)

# print(accuracy_score(test_Y, test_Y_hat))
# print(precision_score(test_Y, test_Y_hat,average='micro'))
# print(recall_score(test_Y, test_Y_hat,average='micro'))
# print(f1_score(test_Y, test_Y_hat,average='micro'))
# print(classification_report(test_Y, test_Y_hat))

#### randomforest

0.7465533147670739

0.7465533147670739

0.7465533147670739

0.7465533147670738

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     51199
           1       0.48      0.27      0.34     49221
           2       1.00      0.99      1.00     48560
           3       0.59      0.98      0.74     43546
           4       0.51      0.68      0.58     49063
           5       0.48      0.73      0.58     49340
           6       0.36      0.19      0.25        53
           7       0.90      0.21      0.34     44564
           8       0.67      0.85      0.75     49897
           9       0.93      0.95      0.94     48595
          10       0.95      0.99      0.97     48395
          11       0.79      0.28      0.41     48389
          12       0.99      1.00      1.00     49083

    accuracy                           0.75    579905
   macro avg       0.74      0.70      0.68    579905
weighted avg       0.78      0.75      0.72    579905

### SVM分类

In [19]:
# from sklearn import svm

In [20]:
# svm_model = SVC(probability=True).fit(train_X, train_Y)

# test_Y_hat = svm_model.predict(test_X)

# print(accuracy_score(test_Y, test_Y_hat))
# print(precision_score(test_Y, test_Y_hat,average='micro'))
# print(recall_score(test_Y, test_Y_hat,average='micro'))
# print(f1_score(test_Y, test_Y_hat,average='micro'))
# print(classification_report(test_Y, test_Y_hat))

### LogisticRegression 分类

In [21]:
from sklearn.linear_model import LogisticRegression

In [23]:
lr_model = LogisticRegression(penalty='l2').fit(train_X, train_Y)
test_Y_hat = lr_model.predict(test_X)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [24]:
print(accuracy_score(test_Y, test_Y_hat))
print(precision_score(test_Y, test_Y_hat,average='micro'))
print(recall_score(test_Y, test_Y_hat,average='micro'))
print(f1_score(test_Y, test_Y_hat,average='micro'))
print(classification_report(test_Y, test_Y_hat))

0.6105513893234555
0.6105513893234555
0.6105513893234555
0.6105513893234555
              precision    recall  f1-score   support

           0       0.38      0.91      0.53     48595
           1       0.98      0.95      0.97     11296
           2       0.05      0.01      0.01     49221
           3       0.50      0.47      0.49     49340
           4       0.90      0.85      0.87     48395
           5       0.93      0.92      0.92     49083
           6       0.51      0.73      0.60     49897
           7       0.99      0.54      0.70     48560
           8       0.46      0.61      0.52     49063
           9       0.99      1.00      0.99     43546
          10       0.14      0.03      0.05     48389

    accuracy                           0.61    495385
   macro avg       0.62      0.64      0.61    495385
weighted avg       0.59      0.61      0.57    495385



### LogisticRegression 30 feature
0.6105513893234555

0.6105513893234555

0.6105513893234555

0.6105513893234555

            precision    recall  f1-score   support

           0       0.38      0.91      0.53     48595
           1       0.98      0.95      0.97     11296
           2       0.05      0.01      0.01     49221
           3       0.50      0.47      0.49     49340
           4       0.90      0.85      0.87     48395
           5       0.93      0.92      0.92     49083
           6       0.51      0.73      0.60     49897
           7       0.99      0.54      0.70     48560
           8       0.46      0.61      0.52     49063
           9       0.99      1.00      0.99     43546
          10       0.14      0.03      0.05     48389

    accuracy                           0.61    495385
   macro avg       0.62      0.64      0.61    495385
weighted avg       0.59      0.61      0.57    495385