# 抽取受害者流量，用于风险计算

In [1]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io

In [2]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess2017(data):
    labels = data[' Label']
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # 删除数据中可能出现的非数值型数据列
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0']
    
    # 如果有list中的columns则删除
    data.drop(data.columns[data.columns.isin(dropset+drop_nega+drop_ifexist)], axis = 1, inplace=True)
    
    data.dropna(how='any', inplace=True)
    print(len(data))
    data.reset_index(inplace=True, drop=True)
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值 (需要连续执行两次，不知道原因，但连续执行后就可删除数据中的inf)
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
   # print(idx)
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))

    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    return data,labels

def preprocess2019(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data

In [3]:
# 从文件夹'/mnt/hgfs/linuxfile/2017/'中读取数据文件
filedir = '/mnt/hgfs/linuxfile/2017/'
files = os.listdir(filedir)
# 选择反应数据风险值的特征
select_cols = [' Fwd Packet Length Min', ' Min Packet Length', ' Packet Length Mean', ' Average Packet Size', ' Subflow Fwd Bytes']

In [4]:
# 读取攻击数据文件
attackfilesdir2019 = '/mnt/hgfs/linuxfile/2019/'
attackfilesdir2017 = '/mnt/hgfs/linuxfile/2017/'
attackfiles2019 = os.listdir(attackfilesdir2019)
attackfiles2017 = os.listdir(attackfilesdir2017)
attackfiles2019, attackfiles2017

(['DrDoS_DNS.csv',
  'DrDoS_LDAP.csv',
  'DrDoS_MSSQL.csv',
  'DrDoS_NetBIOS.csv',
  'DrDoS_NTP.csv',
  'DrDoS_SNMP.csv',
  'DrDoS_SSDP.csv',
  'DrDoS_UDP.csv',
  'Syn.csv',
  'TFTP.csv',
  'UDPLag.csv'],
 ['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
  'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
  'Friday-WorkingHours-Morning.pcap_ISCX.csv',
  'Monday-WorkingHours.pcap_ISCX.csv',
  'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
  'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
  'Tuesday-WorkingHours.pcap_ISCX.csv',
  'Wednesday-workingHours.pcap_ISCX.csv'])

In [5]:
attack = pd.read_csv(attackfilesdir2017+attackfiles2017[0])

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
victimIP = '192.168.10.50'
attack = attack[attack[' Destination IP'] == victimIP]

In [9]:
attack

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Subflow Bwd Packets,Subflow Bwd Bytes,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,126188,20,25,176,440,43,0,8.800000,12.680694,52,...,25,440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8003,15,6,19897,0,2896,0,1326.466667,1136.570208,0,...,6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2,0,617,0,617,0,308.500000,436.284884,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,50,1,2,0,0,0,0,0.000000,0.000000,0,...,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1333970,41,42,2664,6954,456,0,64.975610,109.864573,976,...,42,6954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128828,79,1,1,6,6,6,6,6.000000,0.000000,6,...,1,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128829,1259121,41,44,2664,6954,456,0,64.975610,109.864573,976,...,44,6954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128830,1238384,41,42,2728,6634,456,0,66.536585,110.129945,976,...,42,6634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128831,108,1,1,0,0,0,0,0.000000,0.000000,0,...,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
alldata, labels = preprocess2017(attack)
mean = alldata.mean()
cols = alldata.columns

# 将平均值为0的数舍去，因为平均值为0说明所有值都是0
loc = np.where(mean == 0.0)[0] # 定位
mean = mean.drop(cols[loc])    # 删除
alldata = alldata.drop(cols[loc],axis=1) # 删除
alldata = alldata / mean
alldata = alldata[select_cols]

labels[labels != 'BENIGN'] = 1
labels[labels == 'BENIGN'] = 0

128833


In [8]:
# 输出到attack.mat
# 输出到labels.mat
xmatpath = '/mnt/hgfs/linuxfile/'+'attack.mat'
ymatpath = '/mnt/hgfs/linuxfile/'+'labels.mat'
attack_data = alldata.values
label_data = labels.values.astype(int)
io.savemat(xmatpath,{'atk':attack_data})
io.savemat(ymatpath,{'labels':label_data})