In [1]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io

In [21]:
# def function

# Get the column names of a CSV file
def getcols(filename):
    """Return the column labels of the CSV file at ``filename``.

    Only the first two data rows are parsed, so the call stays cheap even for
    very large files.
    """
    header_sample = pd.read_csv(filename, nrows=2)
    return header_sample.columns

# Get the number of data rows in a CSV file
def getrows(filename):
    """Return the number of data rows (header excluded) in the CSV file.

    Only the first column is loaded, selected by position, so the file is
    parsed a single time and no separate header lookup is needed.
    """
    first_column = pd.read_csv(filename, usecols=[0])
    return first_column.shape[0]

# Data preprocessing - step 1: drop the columns that cannot be used in the
# computation and the rows that contain null, negative or inf values
def preprocess2017(data):
    """Clean a flow DataFrame and split it into numeric features and labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw flow records. Must contain a ' Label' column. The frame is NOT
        modified; all work happens on copies.

    Returns
    -------
    (features, labels) : tuple of pandas.DataFrame
        ``features`` holds the remaining columns cast to float, restricted to
        rows in which every value is finite and non-negative. ``labels`` is a
        one-column frame (' Label') with the labels of exactly those rows.
        Both are re-indexed 0..n-1 and stay row-aligned.

    Raises
    ------
    KeyError
        If ``data`` has no ' Label' column.
    ValueError
        If a remaining column cannot be converted to float.
    """
    # dropna returns a new frame, so the caller's DataFrame is left untouched.
    cleaned = data.dropna(how='any').reset_index(drop=True)

    labels = cleaned[[' Label']].copy()

    # Drop non-numeric data; it is not used for clustering for now, which does
    # not mean the data is useless
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # Drop the columns that contain many negative values
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # Drop non-numeric columns that may or may not be present in the data
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0']

    # Only the listed columns that actually exist are removed
    unwanted = cleaned.columns.isin(dropset + drop_nega + drop_ifexist)

    # Convert every remaining column to float
    features = cleaned.loc[:, ~unwanted].astype('float')

    # One boolean mask replaces the repeated positional drop passes: a row is
    # kept only when none of its values is +/-inf and none is negative.
    values = features.to_numpy()
    keep = ~np.isinf(values).any(axis=1) & (values >= 0).all(axis=1)

    features = features.loc[keep].reset_index(drop=True)
    labels = labels.loc[keep].reset_index(drop=True)
    return features, labels

def preprocess2019(data, cols, labelnum, label):
    """Clean a flow DataFrame and return it as an all-float frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw flow records. The frame is NOT modified; work happens on copies.
    cols : sequence of column labels
        Column names of ``data``; the first two entries are dropped.
        (Presumably index-like columns written by an earlier export — verify
        against the caller.)
    labelnum, label
        Passed to ``DataFrame.replace`` as ``value`` and ``to_replace``:
        every occurrence of ``label`` in the remaining columns becomes
        ``labelnum`` so the frame can be cast to float.

    Returns
    -------
    pandas.DataFrame
        The remaining columns as float, restricted to rows in which every
        value is finite and non-negative, re-indexed 0..n-1.

    Raises
    ------
    KeyError
        If any column that must be dropped is missing from ``data``.
    ValueError
        If a remaining column cannot be converted to float.
    """
    cleaned = data.dropna(how='any')

    cleaned = cleaned.drop(cols[0:2], axis=1)

    # Drop non-numeric data; it is not used for clustering for now, which does
    # not mean the data is useless
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']

    # Drop the columns that contain many negative values
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    cleaned = cleaned.drop(dropset + drop_nega, axis=1)

    # Replace the label text with its number
    cleaned = cleaned.replace(to_replace=label, value=labelnum)

    # Convert every column to float
    cleaned = cleaned.astype('float')

    # Select rows with a boolean mask instead of feeding positional indices to
    # drop(index=...): after dropna the index labels no longer match the row
    # positions, so the positional approach removed the wrong rows.
    values = cleaned.to_numpy()
    keep = ~np.isinf(values).any(axis=1) & (values >= 0).all(axis=1)

    return cleaned.loc[keep].reset_index(drop=True)

def create_spd_matrix(X, Y, nums4group, step=None):
    """Turn windows of rows into regularized covariance matrices.

    Parameters
    ----------
    X : array-like, shape (nums, nfeatures)
        Feature rows.
    Y : array-like of non-negative ints, one entry per row of ``X``
        Row labels (``np.bincount`` requires non-negative integers).
    nums4group : int
        Number of consecutive rows in one window.
    step : int, optional
        Offset between the starts of consecutive windows. Defaults to
        ``nums4group`` (non-overlapping windows, the original behavior).

    Returns
    -------
    (spd, labels) : tuple of numpy.ndarray
        ``spd`` has shape (nmatrix, nfeatures, nfeatures); entry k is
        ``C + 0.01 * (trace(C) + 1e-8) * I`` where ``C`` is the covariance of
        window k. ``labels`` has shape (nmatrix, 1) and holds the most
        frequent label of each window. Trailing rows that do not fill a whole
        window are ignored; fewer rows than one window yields empty arrays.

    Raises
    ------
    ValueError
        If ``nums4group`` or ``step`` is not a positive integer.
    """
    if nums4group <= 0:
        raise ValueError('nums4group must be a positive integer')
    if step is None:
        step = nums4group
    if step <= 0:
        raise ValueError('step must be a positive integer')

    X = np.asarray(X)
    Y = np.asarray(Y).ravel()
    nums, nfeatures = X.shape

    # Number of complete windows; never negative when nums < nums4group.
    nmatrix = max(0, (nums - nums4group) // step + 1)

    X_spd_matrix = np.zeros(shape=(nmatrix, nfeatures, nfeatures))
    Y_ = np.zeros(shape=(nmatrix, 1))
    identity = np.eye(nfeatures, nfeatures)

    for k in range(nmatrix):
        start = k * step
        stop = start + nums4group
        # atleast_2d keeps the single-feature case a 1x1 matrix (np.cov
        # returns a 0-d array there, which np.trace cannot handle).
        cov_matrix = np.atleast_2d(np.cov(X[start:stop, :], rowvar=False))
        # Add a small multiple of the identity, scaled by the trace, to the
        # covariance so the diagonal is shifted upwards.
        X_spd_matrix[k, :, :] = cov_matrix + 0.01 * (np.trace(cov_matrix) + 1e-8) * identity
        # Majority label of the window.
        Y_[k] = np.argmax(np.bincount(Y[start:stop]))

    return X_spd_matrix, Y_
## Flatten the upper triangle of a covariance matrix into a row vector
def dealSPD(A):
    """Return the upper triangle of square matrix ``A`` as a (1, m) float row.

    The entries are taken row by row, diagonal included, so for an n x n
    input the result has m = n * (n + 1) // 2 columns.
    """
    size = A.shape[0]
    row_idx, col_idx = np.triu_indices(size)
    flat = np.zeros(shape=(1, size * (size + 1) // 2))
    flat[0, :] = A[row_idx, col_idx]
    return flat

## 计算基线

In [3]:
# Read the data files from the folder '/mnt/hgfs/linuxfile/2017/'
filedir = '/mnt/hgfs/linuxfile/2017/'
# os.listdir returns entries in arbitrary order; sort for a stable listing.
files = sorted(os.listdir(filedir))
# Use Monday's capture as the security baseline. The file is selected by name
# because a positional index depends on the directory listing: in this
# directory's listing index 4 is the Thursday afternoon capture, not Monday.
baseline_file = 'Monday-WorkingHours.pcap_ISCX.csv'
benign = pd.read_csv(os.path.join(filedir, baseline_file))

alldata, labels = preprocess2017(benign)
mean = alldata.mean()
cols = alldata.columns

# Drop the columns whose mean is 0: the values are non-negative after
# preprocessing, so a mean of 0 means every value in the column is 0
# (and dividing by it below would be undefined).
loc = np.where(mean == 0.0)[0]  # positions of the all-zero columns
mean = mean.drop(cols[loc])     # remove them from the mean vector
benigndata = alldata.drop(cols[loc],axis=1)  # and from the data
# Normalize every remaining column by its own mean.
benigndata = benigndata / mean

# Features chosen to reflect the risk value of the data
selcols = [' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Flow IAT Mean', 'Fwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Average Packet Size', ' Avg Fwd Segment Size']
# select_cols = [' Fwd Packet Length Min', ' Min Packet Length', ' Packet Length Mean', ' Average Packet Size', ' Subflow Fwd Bytes']

In [4]:
# Display the directory listing to check which file sits at which index.
files

['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
 'Friday-WorkingHours-Morning.pcap_ISCX.csv',
 'Monday-WorkingHours.pcap_ISCX.csv',
 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
 'Tuesday-WorkingHours.pcap_ISCX.csv',
 'Wednesday-workingHours.pcap_ISCX.csv']

In [5]:
# Keep only the columns listed in selcols. This raises KeyError if one of them
# was removed earlier (for example as an all-zero column).
benigndata = benigndata[selcols]

In [6]:
# Save the normalized baseline features to a MATLAB .mat file under the key
# 'benign'.
bmatpath = '/mnt/hgfs/linuxfile/benign.mat'
benign_data = benigndata.values
io.savemat(bmatpath,{'benign':benign_data})

## 计算风险数据

In [7]:
# Path of the risk data file that was concatenated by hand
riskfile = '/mnt/hgfs/linuxfile/2019riskdata2.csv'

In [8]:
# Load the risk CSV into a DataFrame.
riskdata = pd.read_csv(riskfile)

In [9]:
# Clean the risk data with the same routine used for the baseline, then
# normalize every remaining column by its own mean.
# NOTE(review): this overwrites `alldata`, `labels`, `mean`, `cols` and `loc`
# from the baseline cell, and rebinds `riskdata` from the raw frame to the
# normalized one (which has no ' Label' column), so the cell cannot be re-run
# without reloading the CSV first.
# NOTE(review): the risk data is scaled by its own column means, not by the
# baseline means — confirm this is intended.
alldata, labels = preprocess2017(riskdata)
mean = alldata.mean()
cols = alldata.columns


# Drop the columns whose mean is 0, because a mean of 0 means every value is 0
loc = np.where(mean == 0.0)[0] # locate
mean = mean.drop(cols[loc])    # drop
riskdata = alldata.drop(cols[loc],axis=1) # drop
riskdata = riskdata / mean

In [16]:
# Keep only the columns listed in selcols. This raises KeyError if one of them
# was removed earlier (for example as an all-zero column).
riskdata = riskdata[selcols]

In [17]:
# Inspect the distinct values in the ' Label' column of the risk labels.
set(labels[' Label'])

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}

In [12]:
# Integer id for each traffic label name; used to turn the ' Label' column of
# `labels` into numbers.
label_dict = {
'DrDoS_SSDP': 1,
'DrDoS_LDAP': 2,
'DrDoS_SNMP': 3,
'BENIGN': 4,
'Syn': 5,
'TFTP': 6,
'DrDoS_NTP': 7,
'DrDoS_UDP': 8,
'DrDoS_NetBIOS': 9,
'DrDoS_MSSQL': 10,
'DrDoS_DNS': 11,
}

In [13]:
# Sanity check: the benign class has an id in label_dict.
'BENIGN' in label_dict

True

In [14]:
# Replace each label name in `labels` with its integer id from label_dict.
# Values that already are ids are accepted, so the cell can be re-run safely;
# anything else is reported here instead of failing later in an int64 cast.
known = set(label_dict) | set(label_dict.values())
unknown = set(labels[' Label']) - known
if unknown:
    raise ValueError('labels missing from label_dict: %r' % sorted(unknown, key=str))
# The explicit cast gives the column a numeric dtype instead of leaving
# Python ints in an object column.
labels[' Label'] = labels[' Label'].map(lambda v: label_dict.get(v, v)).astype('int64')

In [18]:
# Save the normalized risk features and their labels to MATLAB .mat files
# under the keys 'risk' and 'labels'.
riskmatpath = '/mnt/hgfs/linuxfile/risk.mat'
risk_data = riskdata.values
io.savemat(riskmatpath,{'risk':risk_data})

labelsmatpath = '/mnt/hgfs/linuxfile/labels.mat'
labels_data = labels.values
io.savemat(labelsmatpath,{'labels':labels_data})

In [19]:
# Number of label rows held in labels_data.
len(labels_data)

432819

In [50]:
# Rebind labels_data to a 1-D int64 array of the ' Label' column, then build
# one regularized covariance matrix and one majority label for every group of
# 200 consecutive rows of the risk data.
labels_data = labels[' Label'].values.astype('int64')
test_spd_X, test_spd_Y = create_spd_matrix(risk_data, labels_data, nums4group=200)

In [51]:
# Load the stored training matrices/labels and flatten both the training and
# the test matrices into feature rows with dealSPD.
# NOTE(review): dealSPD must be defined in the running kernel before this cell
# is executed; the recorded run of this cell failed with a NameError.
train_spd_X_path = '/mnt/hgfs/linuxfile/spd_train_npy_set/train_spd_X.npy'
train_spd_Y_path = '/mnt/hgfs/linuxfile/spd_train_npy_set/train_spd_Y.npy'

train_spd_X = np.load(train_spd_X_path)
train_spd_Y = np.load(train_spd_Y_path)

# assumes train_spd_X is (n_samples, d, d) like test_spd_X — TODO confirm
trainshape = train_spd_X.shape
testshape = test_spd_X.shape

# One row per matrix; d * (d + 1) // 2 columns hold the upper triangle
# including the diagonal.
train_spd_X_ = np.zeros(shape=(trainshape[0],trainshape[1] * (trainshape[2]+1) // 2))
test_spd_X_ = np.zeros(shape=(testshape[0],testshape[1] * (testshape[2]+1) // 2))

# dealSPD returns a (1, m) row, which is written into row i of the 2-D buffer.
for i in range(trainshape[0]):
    train_spd_X_[i] = dealSPD(train_spd_X[i])
for i in range(testshape[0]):
    test_spd_X_[i] = dealSPD(test_spd_X[i])
    
# Short aliases used by the model cell.
train_X = train_spd_X_
train_Y = train_spd_Y
test_X = test_spd_X_
test_Y = test_spd_Y

NameError: name 'dealSPD' is not defined

In [None]:
# Fit a classifier on the flattened training matrices, predict the flattened
# risk-data matrices and print the evaluation metrics.
# NOTE(review): ExtraTreesClassifier and the metric functions used below are
# not imported in any cell visible here (they look like scikit-learn names) —
# confirm an import exists before running on a fresh kernel.
# NOTE(review): test_Y comes from create_spd_matrix with shape (n, 1);
# presumably train_Y has the same layout — verify the estimator accepts it.
Extra_model = ExtraTreesClassifier(random_state = 1).fit(train_X,train_Y)

test_Y_hat = Extra_model.predict(test_X)

print(accuracy_score(test_Y, test_Y_hat))
print(precision_score(test_Y, test_Y_hat,average='micro'))
print(recall_score(test_Y, test_Y_hat,average='micro'))
print(f1_score(test_Y, test_Y_hat,average='micro'))
print(classification_report(test_Y, test_Y_hat))