In [1]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io

In [2]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess2017(data):
    data.dropna(how='any', inplace=True)
#    print(len(data))
    data.reset_index(inplace=True, drop=True)
    labels = data[' Label']
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # 删除数据中可能出现的非数值型数据列
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0']
    
    # 如果有list中的columns则删除
    data.drop(data.columns[data.columns.isin(dropset+drop_nega+drop_ifexist)], axis = 1, inplace=True)
    

    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值 (需要连续执行两次，不知道原因，但连续执行后就可删除数据中的inf)
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
   # print(idx)
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))

    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    return data,labels

def preprocess2019(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data

# 数据切割 
def split_train_test(data,test_ratio):
    #设置随机数种子，保证每次生成的结果都是一样的
    np.random.seed(42)
    #permutation随机生成0-len(data)随机序列
    shuffled_indices = np.random.permutation(len(data))
    #test_ratio为测试集所占的百分比
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    #iloc选择参数序列中所对应的行
    return data.iloc[train_indices],data.iloc[test_indices]

def meanScaler(data):
    cols = data.columns
    means = data.mean()
    loc = np.where(means == 0.0)[0]
    print(cols[loc])
    data = data.drop(cols[loc],axis=1)
    means = means.drop(cols[loc])
    # print(len(allX), len(means))
    # 均值归一化
    data = data / means
    return data

def create_spd_matrix(X,Y, nums4group):
    nums, nfeatures = X.shape
    step = nums4group // 1
    n = nums - (nums-(nums4group)) % step
    nmatrix = int((n - (nums4group - step)) / step)
    X_matrix = np.zeros(shape = (nums4group, nfeatures, nmatrix))
    Y_ = np.zeros(shape=(nmatrix, 1))
    for i in range(0,int(n - (nums4group - step)), step):
        X_matrix[:,:, i // step] = X[i:i+nums4group,:]
        
    X_spd_matrix = np.zeros(shape = (nmatrix, nfeatures, nfeatures))
    for i in range(nmatrix):
        cov_matrix = np.cov(X_matrix[:,:,i].T)
        X_spd_matrix[i,:,:] = cov_matrix + 0.01 * (np.trace(cov_matrix) + 1e-8) * np.eye(nfeatures, nfeatures)
    train_spd_X = X_spd_matrix
    for i in range(0,int(n - (nums4group - step)), step):
        Y_[i // step] = np.argmax(np.bincount(Y[i:i+nums4group]))
    train_spd_Y = Y_
    return train_spd_X, train_spd_Y

In [8]:
ExtraTreeFeatures = [' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Flow IAT Mean', 'Fwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Average Packet Size', ' Avg Fwd Segment Size']

# ## 20features
# ExtraTreeFeatures = [' Total Fwd Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Max', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', ' Subflow Fwd Bytes']

# LogisticRegressionFeatures =['Total Length of Fwd Packets', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', 'Fwd IAT Total', ' Fwd IAT Std', 'Fwd Packets/s', ' Min Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes']
# ## 30 features
# ExtraTreeFeatures=[' Flow Duration', ' Total Fwd Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes']
# LogisticRegressionFeatures=[' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Std', ' Bwd Header Length', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' PSH Flag Count', ' ACK Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', 'Subflow Fwd Packets', ' Subflow Fwd Bytes', ' Active Min']
# allcols = list(set(ExtraTreeFeatures).union(set(LogisticRegressionFeatures)))
allcols = ExtraTreeFeatures
allcols.append(' Label')
numcols = len(allcols)

In [10]:
## create benign data
benign_path = '/mnt/hgfs/linuxfile/Monday-WorkingHours.pcap_ISCX.csv'
benign_data = pd.read_csv(benign_path,usecols=allcols)



In [12]:
benign_features, benign_labels = preprocess2017(benign_data)
benign_X = meanScaler(benign_features).values
bmatpath = '/mnt/hgfs/linuxfile/mat/benign.mat'
benign_data = benign_X
io.savemat(bmatpath,{'benign':benign_data})

KeyError: ' Label'

In [11]:
benign_data

Unnamed: 0,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Flow IAT Mean,Fwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Average Packet Size,Avg Fwd Segment Size,Label
0,6.0,6.0,6.0,4.000000,5.000000e+05,6.0,6.0,6.0,9.0,6.0,BENIGN
1,6.0,6.0,6.0,1.000000,2.000000e+06,6.0,6.0,6.0,9.0,6.0,BENIGN
2,6.0,6.0,6.0,1.000000,2.000000e+06,6.0,6.0,6.0,9.0,6.0,BENIGN
3,6.0,6.0,6.0,1.000000,2.000000e+06,6.0,6.0,6.0,9.0,6.0,BENIGN
4,6.0,6.0,6.0,3.000000,6.666667e+05,6.0,6.0,6.0,9.0,6.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...
529913,6.0,6.0,6.0,18738.000000,5.336749e+01,6.0,6.0,6.0,9.0,6.0,BENIGN
529914,40.0,40.0,40.0,20265.666667,3.289636e+01,40.0,78.0,55.2,69.0,40.0,BENIGN
529915,32.0,32.0,32.0,51.333333,1.298701e+04,32.0,48.0,38.4,48.0,32.0,BENIGN
529916,40.0,40.0,40.0,51.666667,1.290323e+04,40.0,72.0,52.8,66.0,40.0,BENIGN


In [6]:
benign_features, benign_labels = preprocess2017(benign_data)

KeyError: ' Label'

In [None]:
## create risk data
# 从文件夹'/mnt/hgfs/linuxfile/victim2019/'中读取数据文件
filedir = '/mnt/hgfs/linuxfile/victim2019/'
files = os.listdir(filedir)


nrows = 50000 # 每个文件中读取nrows行数据
start = 0
alldata = pd.DataFrame()
allX = np.zeros(shape=(1,20))
allY = np.zeros(0)
label_dict = {
'DrDoS_SSDP': 1,
'DrDoS_LDAP': 2,
'DrDoS_SNMP': 3,
'BENIGN': 4,
'Syn': 5,
'TFTP': 6,
'DrDoS_NTP': 7,
'DrDoS_UDP': 8,
'DrDoS_NetBIOS': 9,
'DrDoS_MSSQL': 10,
'DrDoS_DNS': 11,
}

for f in files: # 循环读入数据
    label = f[:-9]
    print('starting to deal with ' + f)
    allrows = getrows(filedir+f)
    skiprow = allrows // (nrows + int(0.2 * nrows));
    if skiprow == 0:
        print('data is too small in ' + f)
        continue
    cols = getcols(filedir+f)
    
    data = pd.read_csv(filedir+f, usecols=allcols, skiprows=lambda x:  (x % skiprow) > 0, nrows = nrows)
    tdata,labels = preprocess2017(data)  
    t_X = meanScaler(tdata).values
    allX = np.vstack((allX,t_X))
    for l in label_dict:
        labels.replace(to_replace=l, value=label_dict[l]-1, inplace=True)
    t_Y = labels.values
    allY = np.hstack((allY,t_Y))
    allX = np.vstack((allX,benign_data[start:start+nrows-1]))
    allY = np.hstack((allY,3+np.zeros(nrows)))
    start += nrows
    

In [None]:
a = np.zeros(shape=(1,3))
b = np.zeros(shape=(1,3))

In [None]:
3+np.zeros(nrows)

In [None]:
benign_data.shape

In [None]:
benign_data[start:start+nrows-1].shape

In [None]:
allY.shape

In [None]:
allX = np.empty

In [None]:
allX = np.vstack((a,b))

In [None]:
data = pd.read_csv(filedir+f, usecols=allcols, skiprows=lambda x:  (x % skiprow) > 0, nrows = nrows)
tdata,labels = preprocess2017(data)  

In [None]:
label_dict = {
'DrDoS_SSDP': 1,
'DrDoS_LDAP': 2,
'DrDoS_SNMP': 3,
'BENIGN': 4,
'Syn': 5,
'TFTP': 6,
'DrDoS_NTP': 7,
'DrDoS_UDP': 8,
'DrDoS_NetBIOS': 9,
'DrDoS_MSSQL': 10,
'DrDoS_DNS': 11,
}

In [None]:
data

In [None]:
data = pd.read_csv(filedir+f, usecols=allcols, skiprows=lambda x:  (x % skiprow) > 0, nrows = nrows)

In [None]:
data