In [2]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io

In [3]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess2017(data):
    labels = data[' Label']
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # 删除数据中可能出现的非数值型数据列
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0', 'Unnamed: 0.1']
    
    # 如果有list中的columns则删除
    data.drop(data.columns[data.columns.isin(dropset+drop_nega+drop_ifexist)], axis = 1, inplace=True)
    
    data.dropna(how='any', inplace=True)
#    print(len(data))
    data.reset_index(inplace=True, drop=True)
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值 (需要连续执行两次，不知道原因，但连续执行后就可删除数据中的inf)
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
   # print(idx)
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))

    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    return data,labels

def preprocess2019(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data
    

In [4]:
# 从文件夹'/mnt/hgfs/linuxfile/victim2019/'中读取数据文件
filedir = '/mnt/hgfs/linuxfile/victim2019/'
files = os.listdir(filedir)

In [4]:
filedir2 = '/mnt/hgfs/linuxfile/2017/' 
benign_file = os.listdir(filedir2)[4]
benign_data = pd.read_csv(filedir2+benign_file, usecols=)
tdata,labels = preprocess2017(benign_data)
benign = tdata.join(labels)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
nrows = 20000 # 每个文件中读取nrows行数据
start = 0
alldata = pd.DataFrame()
for f in files: # 循环读入数据
    label = f[:-9]
    print('starting to deal with ' + f)
    allrows = getrows(filedir+f)
    skiprow = allrows // (nrows + int(0.2 * nrows));
    if skiprow == 0:
        print('data is too small in ' + f)
        continue
    cols = getcols(filedir+f)
    
    data = pd.read_csv(filedir+f, names=cols, skiprows=lambda x: (x==0 or (x % skiprow) > 0), nrows = nrows + int(0.2 * nrows))
    tdata,labels = preprocess2017(data)
    tdata = data.join(labels)
    alldata = alldata.append(benign.loc[start:start+nrows-1])
    alldata = alldata.append(tdata)
#    print(len(alldata))
    start += nrows
    
alldata.reset_index(inplace=True, drop=True)
alldata.to_csv('/mnt/hgfs/linuxfile/2019riskdata.csv', mode='w+',index=False)
    

starting to deal with DrDoS_DNS_victim_data.csv
starting to deal with DrDoS_LDAP_victim_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_MSSQL_victim_data.csv
starting to deal with DrDoS_NetBIOS_victim_data.csv
starting to deal with DrDoS_NTP_victim_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_SNMP_victim_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with DrDoS_SSDP_victim_data.csv
starting to deal with DrDoS_UDP_victim_data.csv


  interactivity=interactivity, compiler=compiler, result=result)


starting to deal with Syn_victim_data.csv
starting to deal with TFTP_victim_data.csv


In [6]:
# nrows = 20000 # 每个文件中读取nrows行数据
# start = 0
# for f in files: # 循环读入数据
#     label = f[:-9]
#     print('starting to deal with ' + f)
#     allrows = getrows(filedir+f)
#     skiprow = allrows // (nrows + int(0.2 * nrows));
#     if skiprow == 0:
#         print('data is too small in ' + f)
#         continue
#     cols = getcols(filedir+f)
    
#     tdata = pd.read_csv(filedir+f, names=cols, skiprows=lambda x: (x==0 or (x % skiprow) > 0), nrows = nrows + int(0.2 * nrows))
#     data,labels = preprocess2017(tdata)
#     alldata = data.join(labels)
#     if alldata.empty == False:
#         if os.path.exists(filedir + 'riskdata.csv') == False:
#             alldata.to_csv(filedir +'riskdata.csv', mode='a+')
#         else:
#             alldata.to_csv(filedir +'riskdata.csv', mode='a+', header = False)
    

In [7]:
alldata

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Subflow Bwd Bytes,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,113095465.0,48.0,24.0,9668.0,10012.0,403.0,0.0,201.416667,203.548293,923.0,...,10012.0,203985.500,5.758373e+05,1629110.0,379.0,13800000.0,4.277541e+06,16500000.0,6737603.0,BENIGN
1,113473706.0,68.0,40.0,11364.0,12718.0,403.0,0.0,167.117647,171.919413,1139.0,...,12718.0,178326.875,5.034269e+05,1424245.0,325.0,13800000.0,4.229413e+06,16500000.0,6945512.0,BENIGN
2,119945515.0,150.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,6909777.333,1.170000e+07,20400000.0,6.0,24400000.0,2.430000e+07,60100000.0,5702188.0,BENIGN
3,60261928.0,9.0,7.0,2330.0,4221.0,1093.0,0.0,258.888889,409.702161,1460.0,...,4221.0,0.000,0.000000e+00,0.0,0.0,0.0,0.000000e+00,0.0,0.0,BENIGN
4,269.0,2.0,2.0,102.0,322.0,51.0,51.0,51.000000,0.000000,161.0,...,322.0,0.000,0.000000e+00,0.0,0.0,0.0,0.000000e+00,0.0,0.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406949,15369167.0,12.0,0.0,6192.0,0.0,516.0,516.0,516.000000,0.000000,0.0,...,0.0,0.000,0.000000e+00,0.0,0.0,0.0,0.000000e+00,0.0,0.0,
406950,3.0,2.0,0.0,1032.0,0.0,516.0,516.0,516.000000,0.000000,0.0,...,0.0,0.000,0.000000e+00,0.0,0.0,0.0,0.000000e+00,0.0,0.0,
406951,5999813.0,6.0,0.0,3096.0,0.0,516.0,516.0,516.000000,0.000000,0.0,...,0.0,0.000,0.000000e+00,0.0,0.0,0.0,0.000000e+00,0.0,0.0,
406952,18000090.0,14.0,0.0,7224.0,0.0,516.0,516.0,516.000000,0.000000,0.0,...,0.0,0.000,0.000000e+00,0.0,0.0,0.0,0.000000e+00,0.0,0.0,


In [8]:
data = pd.read_csv('/mnt/hgfs/linuxfile/2019riskdata.csv')

In [14]:
set(data[' Label'])

{'BENIGN',
 'DrDoS_DNS',
 'DrDoS_LDAP',
 'DrDoS_MSSQL',
 'DrDoS_NTP',
 'DrDoS_NetBIOS',
 'DrDoS_SNMP',
 'DrDoS_SSDP',
 'DrDoS_UDP',
 'Syn',
 'TFTP',
 'Web Attack � Brute Force',
 'Web Attack � Sql Injection',
 'Web Attack � XSS',
 nan}

In [10]:
help(pd.DataFrame.to_csv)

Help on function to_csv in module pandas.core.generic:

to_csv(self, path_or_buf: 'Optional[FilePathOrBuffer]' = None, sep: 'str' = ',', na_rep: 'str' = '', float_format: 'Optional[str]' = None, columns: 'Optional[Sequence[Label]]' = None, header: 'Union[bool_t, List[str]]' = True, index: 'bool_t' = True, index_label: 'Optional[IndexLabel]' = None, mode: 'str' = 'w', encoding: 'Optional[str]' = None, compression: 'CompressionOptions' = 'infer', quoting: 'Optional[int]' = None, quotechar: 'str' = '"', line_terminator: 'Optional[str]' = None, chunksize: 'Optional[int]' = None, date_format: 'Optional[str]' = None, doublequote: 'bool_t' = True, escapechar: 'Optional[str]' = None, decimal: 'str' = '.', errors: 'str' = 'strict', storage_options: 'StorageOptions' = None) -> 'Optional[str]'
    Write object to a comma-separated values (csv) file.
    
    .. versionchanged:: 0.24.0
        The order of arguments for Series was changed.
    
    Parameters
    ----------
    path_or_buf : str o

In [1]:
os.listdir(filedir2)[4]

NameError: name 'os' is not defined