# 将数据预处理并转换成matlab可以处理的数据格式

In [1]:
# import libs
import numpy as np
import pandas as pd
import os

In [2]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data
    

In [3]:
# 从文件夹'newdata/'中读取数据文件
filedir = '/mnt/hgfs/vmfiles/newdata/'
files = os.listdir(filedir)

In [5]:
nrows = 10000 # 每个文件中读取nrows行数据
rows4matrix = 10 # 每个矩阵所需要的数据的条数
labelnum = -1
meanofalldata = [0] * len(files)
allX = pd.DataFrame()
allY = pd.DataFrame()
for f in files: # 循环读入数据
    labelnum += 1
    label = f[:-9]
    print('starting to deal with ' + f)
    allrows = getrows(filedir+f)
    skiprow = allrows // (nrows + int(0.2 * nrows));
    if skiprow < rows4matrix:
        print('data is too small in ' + f)
        continue
    alldata = pd.DataFrame()
    cols = getcols(filedir+f)
    
    tdata = pd.read_csv(filedir+f, names=cols, skiprows=lambda x: (x % skiprow) > 0, nrows = nrows + int(0.2 * nrows))
#    print(len(tdata))
    tdata = preprocess(tdata, cols, labelnum, label)
    
#    idx_tuple = np.where(tdata.max(axis=1).values == np.inf)
#    idx = list(idx_tuple[0])
#    print(len(idx))
    
    X = tdata.drop([' Label'], axis=1)
    Y = tdata[' Label']
    X = X[0:10000]
    Y = Y[0:10000]
    meanofalldata[labelnum] = X.mean()
#    print(len(X))
    allX = allX.append(X, ignore_index=True)
    allY = allY.append(Y, ignore_index=True)
#    idx_tuple = np.where(allX.max(axis=1).values == np.inf)
#    idx_tuple
#    idx = list(idx_tuple[0])
#    print(len(idx))
# 计算总体mean，用于后续计算
means = np.array(meanofalldata).mean()

# 将平均值为0的数舍去，因为平均值为0说明所有值都是0
loc = np.where(means == 0.0)[0]
allX = allX.drop(allX.columns[loc],axis=1)

# 均值归一化
allX = allX / means

starting to deal with BENIGN_data.csv
data is too small in BENIGN_data.csv
starting to deal with DrDoS_DNS_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
379
357
0
10000
0
starting to deal with DrDoS_LDAP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
224
213
0
10000
0
starting to deal with DrDoS_MSSQL_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
334
322
0
10000
0
starting to deal with DrDoS_NetBIOS_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
377
342
0
10000
0
starting to deal with DrDoS_NTP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
71
71
0
10000
0
starting to deal with DrDoS_SNMP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
18
17
0
10000
0
starting to deal with DrDoS_SSDP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
195
189
0
10000
0
starting to deal with DrDoS_UDP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
166
160
0
10000
0
starting to deal with Syn_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
0
0
0
10000
0
starting to deal with TFTP_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


12000
333
315
0
10000
0




In [43]:
allX.loc[0].values

array([2.80521738e-02, 2.28285390e+01, 0.00000000e+00, 2.28192190e+01,
       0.00000000e+00, 7.13860390e-01, 7.29518933e-01, 7.18381118e-01,
       0.00000000e+00, 0.00000000e+00,            nan, 0.00000000e+00,
       0.00000000e+00, 3.77537356e-03, 6.35051373e-03, 1.24099488e-03,
       9.18076096e-04, 2.22888170e-03, 2.93748733e-02, 2.80542368e-02,
       1.17886021e-03, 9.06104035e-04, 2.22845174e-03, 2.93326137e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00,            nan,            nan,
                  nan, 0.00000000e+00, 6.35208629e-03, 0.00000000e+00,
       7.29520253e-01, 7.13822320e-01, 7.18739368e-01, 0.00000000e+00,
       0.00000000e+00,            nan, 0.00000000e+00, 0.00000000e+00,
                  nan, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
                  nan, 0.00000000e+00, 4.99297634e-01, 7.18381118e-01,
       0.00000000e+00,            nan,            nan,            nan,
      

In [76]:
x = allY[9999]
x = x[0:5]
x

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
Name: 9999, dtype: float64

In [66]:
X_ = allX.values
Y_ = allY.values

In [51]:
loc = np.where(means == 0.0)[0]
loc

array([10, 30, 31, 32, 41, 44, 48, 53, 54, 55, 56, 57, 58])

In [50]:
means

array([1.06722567e+06, 8.76096364e+00, 3.55363636e-02, 3.85639842e+03,
       2.49836364e-01, 6.16367018e+02, 6.03137191e+02, 6.12488259e+02,
       5.14718737e+00, 6.53363636e-02, 0.00000000e+00, 1.65589348e-02,
       2.63484131e-02, 7.78574110e+08, 1.05195788e+06, 1.21227101e+05,
       1.82599399e+05, 4.08725147e+05, 3.40427000e+01, 1.06714719e+06,
       1.27616667e+05, 1.85012026e+05, 4.08804008e+05, 3.40917455e+01,
       6.17872717e+04, 1.81300461e+04, 3.08650446e+04, 5.57355525e+04,
       7.38818182e-02, 1.81818182e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 7.17090909e-01, 1.05169745e+06, 2.60429050e+02,
       6.03136100e+02, 6.16399891e+02, 6.12182969e+02, 4.96585247e+00,
       2.92669765e+02, 0.00000000e+00, 9.09090909e-06, 1.81818182e-05,
       0.00000000e+00, 9.17363636e-02, 3.00000000e-04, 2.63636364e-04,
       0.00000000e+00, 1.03727273e-02, 8.85644093e+02, 6.12488259e+02,
       1.65589348e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [54]:
np.delete(means, loc)

array([1.06722567e+06, 8.76096364e+00, 3.55363636e-02, 3.85639842e+03,
       2.49836364e-01, 6.16367018e+02, 6.03137191e+02, 6.12488259e+02,
       5.14718737e+00, 6.53363636e-02, 1.65589348e-02, 2.63484131e-02,
       7.78574110e+08, 1.05195788e+06, 1.21227101e+05, 1.82599399e+05,
       4.08725147e+05, 3.40427000e+01, 1.06714719e+06, 1.27616667e+05,
       1.85012026e+05, 4.08804008e+05, 3.40917455e+01, 6.17872717e+04,
       1.81300461e+04, 3.08650446e+04, 5.57355525e+04, 7.38818182e-02,
       1.81818182e-05, 7.17090909e-01, 1.05169745e+06, 2.60429050e+02,
       6.03136100e+02, 6.16399891e+02, 6.12182969e+02, 4.96585247e+00,
       2.92669765e+02, 9.09090909e-06, 1.81818182e-05, 9.17363636e-02,
       3.00000000e-04, 2.63636364e-04, 1.03727273e-02, 8.85644093e+02,
       6.12488259e+02, 1.65589348e-02, 8.76096364e+00, 3.85639842e+03,
       3.55363636e-02, 2.49836364e-01, 3.86508566e+02, 5.47814003e+02,
       1.26019895e+03, 1.16909382e+02, 1.72427382e+05, 5.81272301e+04,
      

In [59]:
help(allX.drop)
allX = allX.drop(allX.columns[loc],axis=1)

Help on method drop in module pandas.core.frame:

drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise') method of pandas.core.frame.DataFrame instance
    Drop specified labels from rows or columns.
    
    Remove rows or columns by specifying label names and corresponding
    axis, or by specifying directly index or column names. When using a
    multi-index, labels on different levels can be removed by specifying
    the level.
    
    Parameters
    ----------
    labels : single label or list-like
        Index or column labels to drop.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Whether to drop labels from the index (0 or 'index') or
        columns (1 or 'columns').
    index : single label or list-like
        Alternative to specifying axis (``labels, axis=0``
        is equivalent to ``index=labels``).
    columns : single label or list-like
        Alternative to specifying axis (``labels, axis=1``
        is equivale

In [61]:
allX

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Subflow Bwd Packets,Subflow Bwd Bytes,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,2.805217e-02,22.828539,0.0,22.819219,0.0,0.713860,0.729519,0.718381,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.811027e-06,0.228285,0.0,0.763407,0.0,2.388187,2.440572,2.403311,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.687904e-02,22.828539,0.0,22.819219,0.0,0.713860,0.729519,0.718381,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.370089e-07,0.228285,0.0,0.763407,0.0,2.388187,2.440572,2.403311,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.502657e-02,18.034546,0.0,17.952502,0.0,0.713860,0.490767,0.715405,3.137556,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2.811163e+00,0.456571,0.0,0.535214,0.0,0.837164,0.855527,0.842465,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,9.370089e-07,0.228285,0.0,0.267607,0.0,0.837164,0.855527,0.842465,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,2.807936e+00,0.456571,0.0,0.535214,0.0,0.837164,0.855527,0.842465,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,2.812139e+00,0.456571,0.0,0.535214,0.0,0.837164,0.855527,0.842465,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
allX.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
Flow Duration,0.028052,3e-06,0.026879,9.370089e-07,0.025027,0.242703,0.024348,0.020069,0.037853,3e-06,...,9.370089e-07,2.811018,2.810872,2.812052,2.810813,2.811163,9.370089e-07,2.807936,2.812139,2.811474
Total Fwd Packets,22.828539,0.228285,22.828539,0.2282854,18.034546,18.034546,18.034546,22.828539,22.828539,0.228285,...,0.2282854,0.4565708,0.4565708,0.4565708,0.4565708,0.4565708,0.2282854,0.4565708,0.4565708,0.4565708
Total Backward Packets,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Total Length of Fwd Packets,22.819219,0.763407,22.819219,0.7634066,17.952502,17.952502,17.952502,22.819219,22.819219,0.763407,...,0.2676072,0.5352144,0.5352144,0.5352144,0.5352144,0.5352144,0.2676072,0.5352144,0.5352144,0.5352144
Total Length of Bwd Packets,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fwd Packet Length Max,0.71386,2.388187,0.71386,2.388187,0.71386,0.71386,0.71386,0.71386,0.71386,2.388187,...,0.8371635,0.8371635,0.8371635,0.8371635,0.8371635,0.8371635,0.8371635,0.8371635,0.8371635,0.8371635
Fwd Packet Length Min,0.729519,2.440572,0.729519,2.440572,0.490767,0.490767,0.490767,0.729519,0.729519,2.440572,...,0.8555267,0.8555267,0.8555267,0.8555267,0.8555267,0.8555267,0.8555267,0.8555267,0.8555267,0.8555267
Fwd Packet Length Mean,0.718381,2.403311,0.718381,2.403311,0.715405,0.715405,0.715405,0.718381,0.718381,2.403311,...,0.8424651,0.8424651,0.8424651,0.8424651,0.8424651,0.8424651,0.8424651,0.8424651,0.8424651,0.8424651
Fwd Packet Length Std,0.0,0.0,0.0,0.0,3.137556,3.137556,3.137556,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bwd Packet Length Max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
list(allX.isna().any().values)

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

In [69]:
len(Y_)

10

In [67]:
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest
from numpy import array
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
X_new = RFE(estimator=LogisticRegression(), n_features_to_select=10).fit_transform(X_, Y_)

ValueError: Found input variables with inconsistent numbers of samples: [100000, 10]

In [34]:
help(X.dropna)

Help on method dropna in module pandas.core.frame:

dropna(axis=0, how='any', thresh=None, subset=None, inplace=False) method of pandas.core.frame.DataFrame instance
    Remove missing values.
    
    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.
    
        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    
        .. versionchanged:: 1.0.0
    
           Pass tuple or list to drop on multiple axes.
           Only a single axis is allowed.
    
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.
    
        * 'any' : If any NA values are present, dro