# 将数据预处理并转换成matlab可以处理的数据格式

In [1]:
# import libs
import numpy as np
import pandas as pd
import os

In [2]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data
    

In [3]:
# 从文件夹'newdata/'中读取数据文件
filedir = '/mnt/hgfs/vmfiles/newdata/'
files = os.listdir(filedir)

In [4]:
nrows = 10000 # 每个文件中读取nrows行数据
rows4matrix = 10 # 每个矩阵所需要的数据的条数
labelnum = -1
meanofalldata = [0] * len(files)
allX = pd.DataFrame()
allY = pd.DataFrame()
for f in files: # 循环读入数据
    labelnum += 1
    label = f[:-9]
    print('starting to deal with ' + f)
    allrows = getrows(filedir+f)
    skiprow = allrows // (nrows + int(0.2 * nrows));
    if skiprow < rows4matrix:
        print('data is too small in ' + f)
        continue
    alldata = pd.DataFrame()
    cols = getcols(filedir+f)
    
    tdata = pd.read_csv(filedir+f, names=cols, skiprows=lambda x: (x % skiprow) > 0, nrows = nrows + int(0.2 * nrows))
    tdata = preprocess(tdata, cols, labelnum, label)

    
    X = tdata.drop([' Label'], axis=1)
    Y = tdata[' Label']
    X = X[0:10000]
    Y = Y[0:10000]
    meanofalldata[labelnum] = X.mean()
    allX = allX.append(X, ignore_index=True)
    allY = allY.append(Y, ignore_index=True)



starting to deal with BENIGN_data.csv
data is too small in BENIGN_data.csv
starting to deal with DrDoS_DNS_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_LDAP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_MSSQL_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_NetBIOS_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_NTP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_SNMP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_SSDP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_UDP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with Syn_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with TFTP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# 计算总体mean，用于后续计算
means = np.array(meanofalldata).mean()
print(means)
# 将平均值为0的数舍去，因为平均值为0说明所有值都是0
loc = np.where(means == 0.0)[0]
allX = allX.drop(allX.columns[loc],axis=1)
means = np.delete(means,loc)
print(len(allX), len(means))
# 均值归一化
allX = allX / means
# 记录此时的列名
newcols = allX.columns

[1.06722567e+06 8.76096364e+00 3.55363636e-02 3.85639842e+03
 2.49836364e-01 6.16367018e+02 6.03137191e+02 6.12488259e+02
 5.14718737e+00 6.53363636e-02 0.00000000e+00 1.65589348e-02
 2.63484131e-02 7.78574110e+08 1.05195788e+06 1.21227101e+05
 1.82599399e+05 4.08725147e+05 3.40427000e+01 1.06714719e+06
 1.27616667e+05 1.85012026e+05 4.08804008e+05 3.40917455e+01
 6.17872717e+04 1.81300461e+04 3.08650446e+04 5.57355525e+04
 7.38818182e-02 1.81818182e-05 0.00000000e+00 0.00000000e+00
 0.00000000e+00 7.17090909e-01 1.05169745e+06 2.60429050e+02
 6.03136100e+02 6.16399891e+02 6.12182969e+02 4.96585247e+00
 2.92669765e+02 0.00000000e+00 9.09090909e-06 1.81818182e-05
 0.00000000e+00 9.17363636e-02 3.00000000e-04 2.63636364e-04
 0.00000000e+00 1.03727273e-02 8.85644093e+02 6.12488259e+02
 1.65589348e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 8.76096364e+00
 3.85639842e+03 3.55363636e-02 2.49836364e-01 3.86508566e+02
 5.47814003e+02 1.260198

  


In [6]:
newcols = allX.columns
newcols

Index([' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd Header Length',
       'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length',
       ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std',
       ' Packet Length Variance', ' SYN Flag Count', ' RST Flag Count',
       ' ACK Flag Count', ' URG Flag Count', ' CWE Flag Count',
       ' Down/Up Ratio', ' Average P

In [7]:
X_ = allX.values
Y_ = allY.values

In [8]:
Y_ = Y_.reshape(1,m*n)[0]

NameError: name 'm' is not defined

In [None]:
Y_.shape

In [None]:
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest
from numpy import array
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
n_features = 5
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=n_features)
X_new = rfe.fit_transform(X_, Y_)
print(rfe.ranking_)

In [None]:
X_new

In [None]:
X_new.shape

In [None]:
import scipy.io as io
xmatpath = '/mnt/hgfs/vmfiles/mat/X.mat'
ymatpath = '/mnt/hgfs/vmfiles/mat/Y.mat'
xmat = X_new

ymat = Y_
io.savemat(xmatpath,{'X':xmat})
io.savemat(ymatpath,{'Y':ymat})

In [None]:
print(rfe.ranking_ == 1)

In [None]:
newcols[rfe.ranking_ == 1]

In [None]:
X_new[8]

In [None]:
np.max(X_new)

In [None]:
X_new.max()

In [None]:
Y_

In [None]:
help(RFE)