# 将数据预处理并转换成matlab可以处理的数据格式

In [1]:
# import libs
import numpy as np
import pandas as pd
import os

In [2]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess(data, cols, labelnum, label):
    # print(data.columns)
    
        # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1']
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # 删除数据中可能出现的非数值型数据列
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0', 'Unnamed: 0.1']
    
    # 如果有list中的columns则删除
    data.drop(data.columns[data.columns.isin(dropset+drop_nega+drop_ifexist)], axis = 1, inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    data.dropna(how='any', inplace=True)
#    print(len(data))
    data.reset_index(inplace=True, drop=True)
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值 (需要连续执行两次，不知道原因，但连续执行后就可删除数据中的inf)
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
   # print(idx)
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)

   # print(len(data))
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)

   # print(len(data))

    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data
    

In [3]:
# 从文件夹'newdata/'中读取数据文件
filedir = '/mnt/hgfs/linuxfile/newdata/'
files = os.listdir(filedir)

In [4]:
nrows = 10000 # 每个文件中读取nrows行数据
rows4matrix = 10 # 每个矩阵所需要的数据的条数
labelnum = -1
meanofalldata = [0] * len(files)
allX = pd.DataFrame()
allY = pd.DataFrame()
for f in files: # 循环读入数据
    labelnum += 1
    label = f[:-9]
    print('starting to deal with ' + f)
    allrows = getrows(filedir+f)
    skiprow = allrows // (nrows + int(0.2 * nrows));
    if skiprow < rows4matrix:
        print('data is too small in ' + f)
        continue
    alldata = pd.DataFrame()
    cols = getcols(filedir+f)
    
    tdata = pd.read_csv(filedir+f, names=cols, skiprows=lambda x: (x==0 or (x % skiprow) > 0), nrows = nrows + int(0.2 * nrows))
    tdata = preprocess(tdata, cols, labelnum, label)

    
    X = tdata.drop([' Label'], axis=1)
    Y = tdata[' Label']
    X = X[0:10000]
    Y = Y[0:10000]
    meanofalldata[labelnum] = X.mean()
    allX = allX.append(X, ignore_index=True)
    allY = allY.append(Y, ignore_index=True)



starting to deal with BENIGN_data.csv
starting to deal with DrDoS_DNS_data.csv
starting to deal with DrDoS_LDAP_data.csv
starting to deal with DrDoS_MSSQL_data.csv
starting to deal with DrDoS_NetBIOS_data.csv
starting to deal with DrDoS_NTP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_SNMP_data.csv
starting to deal with DrDoS_SSDP_data.csv
starting to deal with DrDoS_UDP_data.csv
starting to deal with Syn_data.csv
starting to deal with TFTP_data.csv


In [5]:
# 计算总体mean，用于后续计算
means = pd.DataFrame(meanofalldata).mean()
# print(means)
# 将平均值为0的数舍去，因为平均值为0说明所有值都是0
cols = allX.columns
loc = np.where(means == 0.0)[0]
allX = allX.drop(cols[loc],axis=1)
means = means.drop(cols[loc])
# print(len(allX), len(means))
# 均值归一化
allX = allX / means
# 记录此时的列名
newcols = allX.columns

In [6]:
means.min()

1.8181818181818182e-05

In [7]:
allX

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Subflow Bwd Packets,Subflow Bwd Bytes,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,3.560803e-02,2.043860,20.852362,0.873581,20.092346,2.033000,0.000000,0.290820,35.478510,72.649824,...,20.852362,20.092346,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,4.611484e-07,0.215143,0.000000,0.003072,0.000000,0.009434,0.009920,0.009717,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,4.948579e-01,9.466298,191.462595,0.165909,619.441537,0.345909,0.000000,0.011925,3.494397,102.831813,...,191.462595,619.441537,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,3.463568e+01,0.860573,13.269685,0.098316,1.014582,0.075471,0.079361,0.077734,0.000000,1.136263,...,13.269685,1.014582,744.734245,1506.53277,681.595745,26.796141,59.117974,443.185837,90.187323,12.635306
4,1.277407e+01,3.334719,68.244093,0.609868,20.998223,0.641504,0.000000,0.124437,9.916548,23.104017,...,68.244093,20.998223,125.167669,0.00000,57.914661,204.833785,47.670036,0.000000,40.002826,55.964067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109995,1.383510e+00,0.430286,0.000000,0.528450,0.000000,0.811313,0.853131,0.835640,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
109996,1.383614e+00,0.430286,0.000000,0.528450,0.000000,0.811313,0.853131,0.835640,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
109997,1.385292e+00,0.430286,0.000000,0.528450,0.000000,0.811313,0.853131,0.835640,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
109998,1.384466e+00,0.430286,0.000000,0.528450,0.000000,0.811313,0.853131,0.835640,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [8]:
allX.loc[28878:28899]

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Subflow Bwd Packets,Subflow Bwd Bytes,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
28878,4.611484e-07,0.215143,0.0,0.753758,0.0,2.314444,2.433739,2.383841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28879,9.222968e-07,0.215143,0.0,0.753758,0.0,2.314444,2.433739,2.383841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28880,1.383445e-06,0.215143,0.0,0.753758,0.0,2.314444,2.433739,2.383841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28881,2.397972e-05,0.215143,0.0,0.753758,0.0,2.314444,2.433739,2.383841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28882,2.213512e-05,0.215143,0.0,0.749661,0.0,2.301866,2.420512,2.370885,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28883,4.611484e-07,0.215143,0.0,0.753758,0.0,2.314444,2.433739,2.383841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28884,4.611484e-07,0.215143,0.0,0.753758,0.0,2.314444,2.433739,2.383841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28885,4.611484e-07,0.215143,0.0,0.753758,0.0,2.314444,2.433739,2.383841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28886,4.611484e-07,0.215143,0.0,0.753758,0.0,2.314444,2.433739,2.383841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28887,4.611484e-07,0.215143,0.0,0.753758,0.0,2.314444,2.433739,2.383841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
newcols = allX.columns
newcols

Index([' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count',
       ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count',
       ' ACK Flag Count', ' UR

In [10]:
X_ = allX.values
Y_ = allY.values

In [11]:
X_.shape

(110000, 62)

In [12]:
X_

array([[3.56080350e-02, 2.04385986e+00, 2.08523618e+01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.61148402e-07, 2.15143144e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.94857889e-01, 9.46629832e+00, 1.91462595e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.38529210e+00, 4.30286287e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.38446573e+00, 4.30286287e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.38253351e+00, 4.30286287e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [13]:
Y_ = Y_.reshape(1,110000)[0]

In [14]:
Y_.shape

(110000,)

In [15]:
Y_

array([ 0.,  0.,  0., ..., 10., 10., 10.])

In [16]:
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from numpy import array
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
n_features = 20
# ## REF
# # LogisticRegression
# rfe = RFE(estimator=LogisticRegression(), n_features_to_select=n_features)

# # SVC
# svc = SVC(kernel="linear", C=1)
# rfe = RFE(estimator=svc, n_features_to_select=n_features)

## RandomForest

## SelectKBest

# 卡方检验
SKB = SelectKBest(chi2, k=n_features)
X_new = SKB.fit_transform(X_, Y_)
# ## SelectFromModel
# # GBDT
# X_new = SelectFromModel(GradientBoostingClassifier()).fit_transform(X_, Y_)

## Extra_tree
Extra_model = ExtraTreesClassifier(random_state = 1)
rfe = RFE(estimator=Extra_model, n_features_to_select=n_features)
rfe.fit(X_, Y_)

# print(rfe.ranking_)

RFE(estimator=ExtraTreesClassifier(random_state=1), n_features_to_select=20)

In [17]:
X_new = rfe.fit_transform(X_, Y_)

In [18]:
Extra_model.feature_importances_

NotFittedError: This ExtraTreesClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [19]:
X_new.shape, X_.shape

((110000, 20), (110000, 62))

In [20]:
import scipy.io as io
xmatpath = '/mnt/hgfs/linuxfile/mat/X.mat'
ymatpath = '/mnt/hgfs/linuxfile/mat/Y.mat'
xmat = X_new

ymat = Y_
io.savemat(xmatpath,{'X':xmat})
io.savemat(ymatpath,{'Y':ymat})

In [21]:
print(rfe.ranking_ == 1)

[False  True False  True False  True  True  True False False False False
 False  True  True  True False  True False  True  True False  True False
 False False False False False False False  True False  True  True  True
 False False False False False False  True False False False False  True
  True False False  True False False False False False False False False
 False False]


In [22]:
newcols[rfe.ranking_ == 1]

Index([' Total Fwd Packets', 'Total Length of Fwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Mean',
       ' Fwd IAT Max', 'Fwd Packets/s', ' Min Packet Length',
       ' Max Packet Length', ' Packet Length Mean', ' ACK Flag Count',
       ' Average Packet Size', ' Avg Fwd Segment Size', ' Subflow Fwd Bytes'],
      dtype='object')

### 1. LogisticRegression
selcols = [' Fwd Packet Length Std', ' Packet Length Mean',
       ' Packet Length Variance', ' Average Packet Size',
       ' Avg Fwd Segment Size']
       
selcols = [' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Flow Bytes/s', ' Fwd IAT Std',
       ' Min Packet Length', ' Packet Length Mean', ' Packet Length Variance',
       ' Average Packet Size', ' Avg Fwd Segment Size']
### 2.  ExtraTreesClassifier
selcols = [' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Flow IAT Mean', 'Fwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Average Packet Size', ' Avg Fwd Segment Size']
       
 selcols = [' Total Fwd Packets', 'Total Length of Fwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Mean',
       ' Fwd IAT Max', 'Fwd Packets/s', ' Min Packet Length',
       ' Max Packet Length', ' Packet Length Mean', ' ACK Flag Count',
       ' Average Packet Size', ' Avg Fwd Segment Size', ' Subflow Fwd Bytes']

In [None]:
X_new[8]