# 使用黎曼度量计算风险值大小
## 1. 数据预处理
### 1.1. 数据清洗
* 删除列： 删除非数值型的列，删除含有大量负数、inf、NA值的列。
* 删除行： 删除个别在其他列仍含有负数、inf、NA值的行。

### 1.2. 数据均一化
采用均值归一化的方法，计算每一列的均值，用于归一化处理。

x = x / x.mean。

ps, 在此过程中删除均值为0的列。（均值为0说明所有值均为0）
### 1.3. 特征选择
选择最能反应数据特点的n个特征，对数据进行降维。
## 2. 构建基线。
计算正常情况下数据的黎曼均值作为风险基线。并且该值可根据最新的安全状态的流量情况进行更新。
## 3. 风险计算。
计算测试数据与基线之间的黎曼距离，根据距离的大小反应风险的高低。

In [1]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io

In [2]:
# def function

# 获取csv文件的列名
def getcols(filename):
    data = pd.read_csv(filename, nrows = 2)
    #print(data.columns)
    return data.columns

# 获取文件行数
def getrows(filename):
    cols = getcols(filename)
    col = cols[0]
    data = pd.read_csv(filename, usecols=[col])
    return data.shape[0]

# 数据预处理- step1: 删除无法用于计算的列与含空值、负值、inf值的行
def preprocess2017(data):
    labels = data[' Label']
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # 删除数据中可能出现的非数值型数据列
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0']
    
    # 如果有list中的columns则删除
    data.drop(data.columns[data.columns.isin(dropset+drop_nega+drop_ifexist)], axis = 1, inplace=True)
    
    data.dropna(how='any', inplace=True)
    print(len(data))
    data.reset_index(inplace=True, drop=True)
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值 (需要连续执行两次，不知道原因，但连续执行后就可删除数据中的inf)
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
   # print(idx)
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))

    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    labels.drop(axis=0, index = idx, inplace = True)
    labels.reset_index(inplace=True, drop=True)
   # print(len(data))
    return data,labels

def preprocess2019(data, cols, labelnum, label):
    data.dropna(how='any', inplace=True)
    
    # print(data.columns)
    data.drop(cols[0:2],axis = 1, inplace = True)
    
    # 删除非数值型数据，这些数据暂时不用于聚类处理，但不代表这些数据没用
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
    
    # 删除含有大量负数的列
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    data.drop(dropset, axis=1,inplace=True)
    data.drop(drop_nega, axis=1,inplace=True)
    
    # 将标签替换为数字
    data.replace(to_replace=label, value=labelnum, inplace=True)
    
    # 将所有数据类型转成float型
    data = data.astype('float')
    
    # 删除数据中inf的值
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    idx_tuple = np.where(data.max(axis=1).values == np.inf)
    idx = list(idx_tuple[0])
#    print(len(idx))
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    # 删除数据中含负数的数据条数
    idx_tuple = np.where(data.min(axis=1).values < 0)
    idx = list(idx_tuple[0])
    data.drop(axis=0, index = idx, inplace = True)
    data.reset_index(inplace=True, drop=True)
    
    return data
    

In [3]:
# 从文件夹'/mnt/hgfs/linuxfile/2017/'中读取数据文件
filedir = '/mnt/hgfs/linuxfile/2017/'
files = os.listdir(filedir)
# 选择反应数据风险值的特征
select_cols = [' Fwd Packet Length Min', ' Min Packet Length', ' Packet Length Mean', ' Average Packet Size', ' Subflow Fwd Bytes']

# 计算基线

In [4]:
# 读取星期一的数据作为安全基线
benign = pd.read_csv(filedir+files[4])


In [5]:
alldata, labels = preprocess2017(benign)
mean = alldata.mean()
cols = alldata.columns

# 将平均值为0的数舍去，因为平均值为0说明所有值都是0
loc = np.where(mean == 0.0)[0] # 定位
mean = mean.drop(cols[loc])    # 删除
alldata = alldata.drop(cols[loc],axis=1) # 删除
alldata = alldata / mean

288584


In [6]:
newcols = alldata.columns

In [7]:
# 选取经过特征选择后的列
alldata = alldata[select_cols]

In [8]:
bmatpath = '/mnt/hgfs/linuxfile/benign.mat'
benign_data = alldata.values
io.savemat(bmatpath,{'benign':benign_data})

# 攻击数据处理

In [9]:
# 读取攻击数据文件
attackfilesdir2019 = '/mnt/hgfs/linuxfile/2019/'
attackfilesdir2017 = '/mnt/hgfs/linuxfile/2017/'
attackfiles2019 = os.listdir(attackfilesdir2019)
attackfiles2017 = os.listdir(attackfilesdir2017)
attackfiles2019, attackfiles2017

(['DrDoS_DNS.csv',
  'DrDoS_LDAP.csv',
  'DrDoS_MSSQL.csv',
  'DrDoS_NetBIOS.csv',
  'DrDoS_NTP.csv',
  'DrDoS_SNMP.csv',
  'DrDoS_SSDP.csv',
  'DrDoS_UDP.csv',
  'Syn.csv',
  'TFTP.csv',
  'UDPLag.csv'],
 ['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
  'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
  'Friday-WorkingHours-Morning.pcap_ISCX.csv',
  'Monday-WorkingHours.pcap_ISCX.csv',
  'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
  'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
  'Tuesday-WorkingHours.pcap_ISCX.csv',
  'Wednesday-workingHours.pcap_ISCX.csv'])

In [10]:
# 读取攻击数据
#attack = pd.read_csv(attackfilesdir2017+attackfiles2017[0])
#len(attack)
attackfile2019 = attackfilesdir2019+attackfiles2019[5]
attack = pd.read_csv(attackfile2019, nrows = 500000)

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
alldata, labels = preprocess2017(attack)
mean = alldata.mean()
cols = alldata.columns

# 将平均值为0的数舍去，因为平均值为0说明所有值都是0
loc = np.where(mean == 0.0)[0] # 定位
mean = mean.drop(cols[loc])    # 删除
alldata = alldata.drop(cols[loc],axis=1) # 删除
alldata = alldata / mean
alldata = alldata[select_cols]

labels[labels != 'BENIGN'] = 1
labels[labels == 'BENIGN'] = 0

499979


In [12]:
# 输出到attack.mat
# 输出到labels.mat
xmatpath = '/mnt/hgfs/linuxfile/'+'attack.mat'
ymatpath = '/mnt/hgfs/linuxfile/'+'labels.mat'
attack_data = alldata.values
label_data = labels.values.astype(int)
io.savemat(xmatpath,{'atk':attack_data})
io.savemat(ymatpath,{'labels':label_data})