In [1]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io
from sklearn import preprocessing
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from numpy import array
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
import math
from scipy.linalg import expm,logm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [2]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess2017(data):
    
    data.dropna(how='any', inplace=True)
#    print(len(data))
    data.reset_index(inplace=True, drop=True)
    
    labels = pd.DataFrame(data[' Label'])
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # 删除数据中可能出现的非数值型数据列
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0']
    
    # 如果有list中的columns则删除
    data.drop(data.columns[data.columns.isin(dropset+drop_nega+drop_ifexist)], axis = 1, inplace=True)
    

    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值 (需要连续执行两次，不知道原因，但连续执行后就可删除数据中的inf)
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
   # print(idx)
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))

    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    return data,labels

def preprocess2019(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data

def create_spd_matrix(X,Y, nums4group):
    nums, nfeatures = X.shape
    step = nums4group // 1
    n = nums - (nums-(nums4group)) % step
    nmatrix = int((n - (nums4group - step)) / step)
    X_matrix = np.zeros(shape = (nums4group, nfeatures, nmatrix))
    Y_ = np.zeros(shape=(nmatrix, 1))
    for i in range(0,int(n - (nums4group - step)), step):
        X_matrix[:,:, i // step] = X[i:i+nums4group,:]
        
    X_spd_matrix = np.zeros(shape = (nmatrix, nfeatures, nfeatures))
    for i in range(nmatrix):
        cov_matrix = np.cov(X_matrix[:,:,i].T)
        X_spd_matrix[i,:,:] = cov_matrix + 0.01 * (np.trace(cov_matrix) + 1e-8) * np.eye(nfeatures, nfeatures)
    train_spd_X = X_spd_matrix
    for i in range(0,int(n - (nums4group - step)), step):
        Y_[i // step] = np.argmax(np.bincount(Y[i:i+nums4group]))
    train_spd_Y = Y_
    return train_spd_X, train_spd_Y  
## 将协方差矩阵转成上三角矩阵
def dealSPD(A):
    n = A.shape[0]
    B = np.zeros(shape=(1, n*(n+1)//2))
    j = 0
    for i in range(0,n):
        B[:,j:j+n-i] = A[i,i:n]
        j = j + n -i
    return B

In [3]:
selcols = [' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Flow IAT Mean', 'Fwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Average Packet Size', ' Avg Fwd Segment Size']
## 20features
ExtraTreeFeatures = [' Total Fwd Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Max', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', ' Subflow Fwd Bytes']

LogisticRegressionFeatures =['Total Length of Fwd Packets', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', 'Fwd IAT Total', ' Fwd IAT Std', 'Fwd Packets/s', ' Min Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes']
# ## 30 features
# ExtraTreeFeatures=[' Flow Duration', ' Total Fwd Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes']
# LogisticRegressionFeatures=[' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Std', ' Bwd Header Length', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' PSH Flag Count', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes', ' Active Min']
allcols = list(set(ExtraTreeFeatures).union(set(LogisticRegressionFeatures)))
# allcols.append(' Label')
numcols = len(allcols)

## 计算基线

In [6]:
# 从文件夹'/mnt/hgfs/linuxfile/2017/'中读取数据文件
filedir = '/mnt/hgfs/linuxfile/2017/'
files = os.listdir(filedir)
# 读取星期一的数据作为安全基线
benign = pd.read_csv(filedir+files[4])

alldata, labels = preprocess2017(benign)
mean = alldata.mean()
cols = alldata.columns

# 将平均值为0的数舍去，因为平均值为0说明所有值都是0
loc = np.where(mean == 0.0)[0] # 定位
mean = mean.drop(cols[loc])    # 删除
benigndata = alldata.drop(cols[loc],axis=1) # 删除
benigndata = benigndata / mean

# 选择反应数据风险值的特征
selcols = [' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Flow IAT Mean', 'Fwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Average Packet Size', ' Avg Fwd Segment Size']
# select_cols = [' Fwd Packet Length Min', ' Min Packet Length', ' Packet Length Mean', ' Average Packet Size', ' Subflow Fwd Bytes']

In [7]:
files

['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
 'Friday-WorkingHours-Morning.pcap_ISCX.csv',
 'Monday-WorkingHours.pcap_ISCX.csv',
 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
 'Tuesday-WorkingHours.pcap_ISCX.csv',
 'Wednesday-workingHours.pcap_ISCX.csv']

In [9]:
benigndata

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Subflow Bwd Packets,Subflow Bwd Bytes,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,0.000019,0.176777,0.183519,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.183519,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
1,0.000009,0.176777,0.367038,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.367038,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
2,0.011190,0.176777,0.183519,0.086397,0.010184,0.341841,2.791598,1.151725,0.0,0.148443,...,0.183519,0.010184,0.00000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
3,0.004145,0.176777,0.183519,0.086397,0.010184,0.341841,2.791598,1.151725,0.0,0.148443,...,0.183519,0.010184,0.00000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
4,12.446044,25.986216,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,28.96055,51.63469,36.841744,0.002533,3.642448,22.01967,5.056822,2.304609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288037,0.066163,0.353554,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
288038,0.133012,0.353554,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
288039,0.000001,0.176777,1.651673,0.010800,0.011457,0.042730,0.348950,0.143966,0.0,0.018555,...,1.651673,0.011457,0.00000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
288040,0.000002,1.767770,0.000000,4.265864,0.000000,1.687841,13.783517,5.686642,0.0,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000


In [12]:
benigndata = benigndata[allcols]
benign_data = benigndata.values

In [6]:
bmatpath = '/mnt/hgfs/linuxfile/benign.mat'
benign_data = benigndata.values
io.savemat(bmatpath,{'benign':benign_data})

## 计算风险数据

In [4]:
# 读取自己拼接的riskfile文件
riskfile = '/mnt/hgfs/linuxfile/2019riskdata2.csv'

In [5]:
riskdata = pd.read_csv(riskfile)

In [6]:
alldata, labels = preprocess2017(riskdata)
mean = alldata.mean()
cols = alldata.columns


# 将平均值为0的数舍去，因为平均值为0说明所有值都是0
loc = np.where(mean == 0.0)[0] # 定位
mean = mean.drop(cols[loc])    # 删除
riskdata = alldata.drop(cols[loc],axis=1) # 删除
riskdata = riskdata / mean

In [7]:
riskdata = riskdata[allcols]

In [12]:
set(labels[' Label'])

{'BENIGN',
 'DrDoS_DNS',
 'DrDoS_LDAP',
 'DrDoS_MSSQL',
 'DrDoS_NTP',
 'DrDoS_NetBIOS',
 'DrDoS_SNMP',
 'DrDoS_SSDP',
 'DrDoS_UDP',
 'Syn',
 'TFTP'}

In [13]:
label_dict = {
'DrDoS_SSDP': 1,
'DrDoS_LDAP': 2,
'DrDoS_SNMP': 3,
'BENIGN': 4,
'Syn': 5,
'TFTP': 6,
'DrDoS_NTP': 7,
'DrDoS_UDP': 8,
'DrDoS_NetBIOS': 9,
'DrDoS_MSSQL': 10,
'DrDoS_DNS': 11,
}

In [14]:
'BENIGN' in label_dict

True

In [15]:
for l in label_dict:
    labels[labels == l] = label_dict[l]

In [17]:
riskmatpath = '/mnt/hgfs/linuxfile/risk.mat'
risk_data = riskdata.values
io.savemat(riskmatpath,{'risk':risk_data})

labelsmatpath = '/mnt/hgfs/linuxfile/labels.mat'
labels_data = labels.values
io.savemat(labelsmatpath,{'labels':labels_data})

In [18]:
len(labels_data)

431758

In [19]:
labels_data = labels[' Label'].values.astype('int64')-1
test_spd_X, test_spd_Y = create_spd_matrix(risk_data, labels_data, nums4group=200)

In [20]:
test_spd_X.shape

(2158, 26, 26)

In [21]:
labels_data

array([3, 3, 3, ..., 2, 2, 2])

In [22]:
# maintain = []
# for i in range(len(selcols)):
#     maintain.append(allcols.index(selcols[i]))

In [23]:
train_spd_X_path = '/mnt/hgfs/linuxfile/spd_train_npy_set/train_spd_X.npy'
train_spd_Y_path = '/mnt/hgfs/linuxfile/spd_train_npy_set/train_spd_Y.npy'

train_spd_X = np.load(train_spd_X_path)
train_spd_Y = np.load(train_spd_Y_path)

# train_spd_X = train_spd_X[:,:,maintain]
# train_spd_X = train_spd_X[:,maintain,:]

trainshape = train_spd_X.shape
testshape = test_spd_X.shape

train_spd_X_ = np.zeros(shape=(trainshape[0],trainshape[1] * (trainshape[2]+1) // 2))
test_spd_X_ = np.zeros(shape=(testshape[0],testshape[1] * (testshape[2]+1) // 2))

for i in range(trainshape[0]):
    train_spd_X_[i] = dealSPD(train_spd_X[i])
for i in range(testshape[0]):
    test_spd_X_[i] = dealSPD(test_spd_X[i])
    
train_X = train_spd_X_
train_Y = train_spd_Y
test_X = test_spd_X_
test_Y = test_spd_Y

In [24]:
Extra_model = ExtraTreesClassifier(random_state = 1).fit(train_X,train_Y)
# test_X = train_X
test_Y_hat = Extra_model.predict(test_X)
# test_Y = train_Y
print(accuracy_score(test_Y, test_Y_hat))
print(precision_score(test_Y, test_Y_hat,average='micro'))
print(recall_score(test_Y, test_Y_hat,average='micro'))
print(f1_score(test_Y, test_Y_hat,average='micro'))
print(classification_report(test_Y, test_Y_hat))

  """Entry point for launching an IPython kernel.


0.05560704355885079
0.05560704355885079
0.05560704355885079
0.05560704355885079
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       120
         1.0       0.00      0.00      0.00       118
         2.0       0.00      0.00      0.00       117
         3.0       0.00      0.00      0.00      1000
         4.0       0.00      0.00      0.00       109
         5.0       0.00      0.00      0.00       105
         6.0       0.00      0.01      0.01       119
         7.0       0.00      0.00      0.00       118
         8.0       0.08      1.00      0.15       116
         9.0       0.00      0.00      0.00       118
        10.0       0.07      0.03      0.04       118

    accuracy                           0.06      2158
   macro avg       0.01      0.09      0.02      2158
weighted avg       0.01      0.06      0.01      2158



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
