In [1]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io

In [2]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess2017(data):
    data.dropna(how='any', inplace=True)
#    print(len(data))
    data.reset_index(inplace=True, drop=True)
    labels = data[' Label']
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # 删除数据中可能出现的非数值型数据列
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0']
    
    # 如果有list中的columns则删除
    data.drop(data.columns[data.columns.isin(dropset+drop_nega+drop_ifexist)], axis = 1, inplace=True)
    

    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值 (需要连续执行两次，不知道原因，但连续执行后就可删除数据中的inf)
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
   # print(idx)
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))

    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    return data,labels

def preprocess2019(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data

# 数据切割 
def split_train_test(data,test_ratio):
    #设置随机数种子，保证每次生成的结果都是一样的
    np.random.seed(42)
    #permutation随机生成0-len(data)随机序列
    shuffled_indices = np.random.permutation(len(data))
    #test_ratio为测试集所占的百分比
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    #iloc选择参数序列中所对应的行
    return data.iloc[train_indices],data.iloc[test_indices]

def meanScaler(data):
    cols = data.columns
    means = data.mean()
    loc = np.where(means == 0.0)[0]
    data = data.drop(cols[loc],axis=1)
    means = means.drop(cols[loc])
    # print(len(allX), len(means))
    # 均值归一化
    data = data / means
    return data

def create_spd_matrix(X,Y, nums4group):
    nums, nfeatures = X.shape
    step = nums4group // 1
    n = nums - (nums-(nums4group)) % step
    nmatrix = int((n - (nums4group - step)) / step)
    X_matrix = np.zeros(shape = (nums4group, nfeatures, nmatrix))
    Y_ = np.zeros(shape=(nmatrix, 1))
    for i in range(0,int(n - (nums4group - step)), step):
        X_matrix[:,:, i // step] = X[i:i+nums4group,:]
        
    X_spd_matrix = np.zeros(shape = (nmatrix, nfeatures, nfeatures))
    for i in range(nmatrix):
        cov_matrix = np.cov(X_matrix[:,:,i].T)
        X_spd_matrix[i,:,:] = cov_matrix + 0.01 * (np.trace(cov_matrix) + 1e-8) * np.eye(nfeatures, nfeatures)
    train_spd_X = X_spd_matrix
    for i in range(0,int(n - (nums4group - step)), step):
        Y_[i // step] = np.argmax(np.bincount(Y[i:i+nums4group]))
    train_spd_Y = Y_
    return train_spd_X, train_spd_Y

## create benign data

In [3]:

benign_path = '/mnt/hgfs/linuxfile/Monday-WorkingHours.pcap_ISCX.csv'


In [4]:
## 20features
ExtraTreeFeatures = [' Total Fwd Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Max', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', ' Subflow Fwd Bytes']

LogisticRegressionFeatures =['Total Length of Fwd Packets', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', 'Fwd IAT Total', ' Fwd IAT Std', 'Fwd Packets/s', ' Min Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes']
# ## 30 features
# ExtraTreeFeatures=[' Flow Duration', ' Total Fwd Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes']
# LogisticRegressionFeatures=[' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Std', ' Bwd Header Length', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' PSH Flag Count', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes', ' Active Min']
allcols = list(set(ExtraTreeFeatures).union(set(LogisticRegressionFeatures)))
allcols.append(' Label')
numcols = len(allcols)

In [5]:
benign_data = pd.read_csv(benign_path,usecols=allcols)
benign_features, benign_labels = preprocess2017(benign_data)

In [6]:
## 数据处理
# labels 转换成用数表示其分类结果
labelset = set(benign_labels)
label_dict = {}
i = 0
for l in labelset:
    label_dict[l] = i
    benign_labels.replace(to_replace=l, value=i, inplace=True)
    i = i + 1
    print(l,i)

BENIGN 1


In [8]:
# X,Y数据提取
benign_X = meanScaler(benign_features).values

In [9]:
bmatpath = '/mnt/hgfs/linuxfile/mat/benign.mat'
benign_data = benign_X
io.savemat(bmatpath,{'benign':benign_data})

## create riskdata

In [19]:
riskfile = '/mnt/hgfs/linuxfile/2019riskdata.csv'
riskdata = pd.read_csv(riskfile,usecols=allcols)

In [20]:
riskdata

Unnamed: 0,Total Fwd Packets,Total Length of Fwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Min,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,...,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,ACK Flag Count,Average Packet Size,Avg Fwd Segment Size,Subflow Fwd Packets,Subflow Fwd Bytes,Label
0,48.0,9668.0,403.0,0.0,201.416667,203.548293,316.0,1.740123e+02,0.636630,1.592894e+06,...,923.0,275.109589,233.833248,54677.98782,1.0,278.930556,201.416667,48.0,9668.0,BENIGN
1,68.0,11364.0,403.0,0.0,167.117647,171.919413,126.0,2.122254e+02,0.951762,1.060502e+06,...,1139.0,224.633027,198.999612,39600.84557,1.0,226.712963,167.117647,68.0,11364.0,BENIGN
2,150.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000e+00,1.250568,8.050035e+05,...,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,150.0,0.0,BENIGN
3,9.0,2330.0,1093.0,0.0,258.888889,409.702161,0.0,1.087088e+02,0.265508,4.017462e+06,...,1460.0,385.352941,532.094087,283124.11760,0.0,409.437500,258.888889,9.0,2330.0,BENIGN
4,2.0,102.0,51.0,51.0,51.000000,0.000000,161.0,1.576208e+06,14869.888480,8.966667e+01,...,161.0,95.000000,60.249481,3630.00000,0.0,118.750000,51.000000,2.0,102.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406949,12.0,6192.0,516.0,516.0,516.000000,0.000000,0.0,4.028846e+02,0.780784,1.397197e+06,...,516.0,516.000000,0.000000,0.00000,0.0,559.000000,516.000000,12.0,6192.0,
406950,2.0,1032.0,516.0,516.0,516.000000,0.000000,0.0,3.440000e+08,666666.666667,3.000000e+00,...,516.0,516.000000,0.000000,0.00000,0.0,774.000000,516.000000,2.0,1032.0,
406951,6.0,3096.0,516.0,516.0,516.000000,0.000000,0.0,5.160161e+02,1.000031,1.199963e+06,...,516.0,516.000000,0.000000,0.00000,0.0,602.000000,516.000000,6.0,3096.0,
406952,14.0,7224.0,516.0,516.0,516.000000,0.000000,0.0,4.013313e+02,0.777774,1.384622e+06,...,516.0,516.000000,0.000000,0.00000,0.0,552.857143,516.000000,14.0,7224.0,


In [15]:
alldata, labels = preprocess2017(riskdata)
mean = alldata.mean()
cols = alldata.columns

# 将平均值为0的数舍去，因为平均值为0说明所有值都是0
loc = np.where(mean == 0.0)[0] # 定位
mean = mean.drop(cols[loc])    # 删除
riskdata = alldata.drop(cols[loc],axis=1) # 删除
riskdata = riskdata / mean
labels[labels == 'BENIGN'] = 0
labels[labels != 0] = 1

riskdata = riskdata[allcols]

KeyError: "[' Label'] not in index"

In [None]:
riskmatpath = '/mnt/hgfs/linuxfile/riskdata.mat'
risk_data = riskdata.values
io.savemat(riskmatpath,{'riskdata':risk_data})

labelsmatpath = '/mnt/hgfs/linuxfile/labels.mat'
labels_data = labels.values
io.savemat(labelsmatpath,{'risklabel':labels_data})

In [11]:
# test_X_path = '/mnt/hgfs/linuxfile/test_npy_set/test_X.npy'
# test_Y_path = '/mnt/hgfs/linuxfile/test_npy_set/test_Y.npy'
# test_X = np.load(test_X_path)
# test_Y = np.load(test_Y_path)

In [13]:
# riskdatamatpath = '/mnt/hgfs/linuxfile/mat/riskdata.mat'
# risk_data = test_X
# io.savemat(riskdatamatpath,{'riskdata':risk_data})
# risklabelmatpath = '/mnt/hgfs/linuxfile/mat/risklabel.mat'
# risk_label = test_Y
# io.savemat(risklabelmatpath,{'risklabel':risk_label})