In [2]:
# import libs
import numpy as np
import pandas as pd
import os
import scipy.io as io

In [3]:
# def function

def getcols(filename):
    """Return the column labels of a CSV file.

    Only the header and at most two data rows are parsed (``nrows=2``), so
    the whole file is never loaded just to learn its column names.

    Parameters
    ----------
    filename
        CSV source, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.Index
        The column labels as parsed by pandas.
    """
    preview = pd.read_csv(filename, nrows=2)
    return preview.columns

def getrows(filename):
    """Return the number of data rows in a CSV file (the header is not counted).

    Only the first column is loaded, selected by position (``usecols=[0]``),
    so the file is parsed once and no separate header read is needed.

    Parameters
    ----------
    filename
        CSV source, passed straight to ``pandas.read_csv``.

    Returns
    -------
    int
        Number of rows pandas parses after the header line.
    """
    first_column = pd.read_csv(filename, usecols=[0])
    return first_column.shape[0]

def preprocess2017(data):
    """Preprocessing step 1 for the 2017 flow data.

    Removes the columns that cannot be used for numeric computation, then
    removes every row that contains a missing, non-finite (inf/NaN) or
    negative value. The label column is filtered with exactly the same row
    masks as the features, so the two returned objects stay aligned
    row for row.

    The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Flow records; must contain a ``' Label'`` column. Any of the columns
        listed in ``dropset``, ``drop_nega`` and ``drop_ifexist`` below are
        removed when present; missing ones are ignored.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The float-typed feature frame and the matching labels, both with a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``data`` has no ``' Label'`` column.
    ValueError
        If a remaining column cannot be converted to float.
    """
    labels = data[' Label']
    # Non-numeric columns: not used for clustering for now, which does not
    # mean they carry no information.
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', ' Label']
    # Columns that contain a large number of negative values.
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']
    # Non-numeric columns that only appear in some files.
    drop_ifexist = ['External IP','SimillarHTTP', ' Inbound','Unnamed: 0']

    # Drop whichever of the listed columns are actually present.
    unwanted = dropset + drop_nega + drop_ifexist
    features = data.drop(columns=data.columns[data.columns.isin(unwanted)])

    # Remove rows with missing values from the features AND the labels.
    # Boolean (positional) masks are used so that a non-default or duplicated
    # index on the input cannot break the alignment.
    complete = features.notna().all(axis=1).to_numpy()
    features = features.loc[complete]
    labels = labels.loc[complete]
    print(len(features))

    # Convert every remaining column to float.
    features = features.astype('float')

    # One mask covers +/-inf, NaN produced by the conversion, and negatives,
    # replacing the repeated positional drop passes.
    values = features.to_numpy()
    usable = np.isfinite(values).all(axis=1) & (values >= 0).all(axis=1)

    features = features.loc[usable].reset_index(drop=True)
    labels = labels.loc[usable].reset_index(drop=True)
    return features, labels

def preprocess2019(data, cols, labelnum, label):
    """Preprocess one 2019 flow file into an all-float frame.

    Steps, in order: drop rows with any missing value, drop the first two
    columns named in ``cols``, drop the non-numeric and mostly-negative
    columns, replace the textual label by a number, convert everything to
    float, and finally drop rows holding a non-finite or negative value.

    The input frame is not modified, so the function can be called again on
    the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Flow records. Must contain every column in ``dropset`` and
        ``drop_nega`` below.
    cols : sequence of column labels
        Only ``cols[0:2]`` is used: those two columns are dropped.
        NOTE(review): presumably these are leftover index columns written by
        an earlier ``to_csv`` - confirm against the caller.
    labelnum : number
        Value substituted for ``label``.
    label : scalar
        Value to replace. The replacement is applied to the whole frame, not
        just to one column.

    Returns
    -------
    pandas.DataFrame
        Float-typed frame with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the columns to drop is missing.
    ValueError
        If a remaining column cannot be converted to float.
    """
    # Non-numeric columns: not used for clustering for now, which does not
    # mean they carry no information.
    dropset = ['Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']

    # Columns that contain a large number of negative values.
    drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
           ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward']

    # dropna runs first on purpose: a missing value in a column that is
    # dropped afterwards still removes the row.
    features = (
        data.dropna(how='any')
        .drop(columns=cols[0:2])
        .drop(columns=dropset)
        .drop(columns=drop_nega)
        .replace(to_replace=label, value=labelnum)  # label text -> number
        .astype('float')
    )

    # One mask covers +/-inf, NaN produced by the conversion, and negatives.
    values = features.to_numpy()
    usable = np.isfinite(values).all(axis=1) & (values >= 0).all(axis=1)
    return features.loc[usable].reset_index(drop=True)

In [3]:
# Read the data file names from the folder '/mnt/hgfs/linuxfile/2017/'.
# os.listdir() returns entries in arbitrary order, but the cells below pair
# files[i] with ips[i] and victims[i] by position, so sort the names to get a
# stable, reproducible order.
filedir = '/mnt/hgfs/linuxfile/2017/'
files = sorted(os.listdir(filedir))

In [4]:
# Display the file names found in `filedir`; later cells index this list by position.
files

['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
 'Friday-WorkingHours-Morning.pcap_ISCX.csv',
 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
 'Tuesday-WorkingHours.pcap_ISCX.csv',
 'Wednesday-workingHours.pcap_ISCX.csv']

In [5]:
# Victim IP addresses, one list per capture file.
# NOTE(review): the trailing numbers are kept from the original; they are
# presumably the number of matching rows per capture - confirm.
DDos_victim_IP = ['192.168.10.50']  # 128834
PortScan_victim_IP = ['192.168.10.50']  # 160716
Botnet_victim_IP = [
    '192.168.10.15',
    '192.168.10.9',
    '192.168.10.14',
    '192.168.10.5',
    '192.168.10.8',
]  # 16029
WebAttack_victim_IP = ['192.168.10.50']  # 4230
Infiltration_victim_IP = ['192.168.10.8', '192.168.10.25']  # 15943
BruteForce_victim_IP = ['192.168.10.50']  # 4188
DDos2_victim_IP = ['192.168.10.50', '205.174.165.66', '192.168.10.51']  # 23926

# Order matters: ips[i] is applied to files[i] by the extraction loop.
ips = [
    DDos_victim_IP,
    PortScan_victim_IP,
    Botnet_victim_IP,
    Infiltration_victim_IP,
    WebAttack_victim_IP,
    BruteForce_victim_IP,
    DDos2_victim_IP,
]

In [6]:
# Output-file prefixes, one per entry of `ips` and `files`, in the SAME order:
# position 3 is the Infiltration capture and position 4 the WebAttack capture.
# (They were previously swapped, which stored each subset under the other's name.)
victims = ['DDos_victim', 'PortScan_victim', 'Botnet_victim', 'Infiltration_victim', 'WebAttack_victim', 'BruteForce_victim', 'DDos2_victim']

In [7]:
# For every capture, keep only the flows whose destination is one of that
# capture's victim IPs and store the subset as '<victim>_data.csv'.
# files[i], ips[i] and victims[i] are matched by position.
ips_len = len(ips)
for i in range(ips_len):
    source_path = filedir + files[i]
    target_path = '/mnt/hgfs/linuxfile/victim2017/' + victims[i] + '_data.csv'

    data = pd.read_csv(source_path)
    to_victim = data[' Destination IP'].isin(ips[i])
    data = data[to_victim].reset_index(drop=True)

    # The default index column is written as well.
    data.to_csv(target_path, mode='w+')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Victim IP used to filter the 2019 captures.
victim2019_IP = ['192.168.50.1']

# Read the data file names from the folder '/mnt/hgfs/linuxfile/2019/0112/'.
filedir = '/mnt/hgfs/linuxfile/2019/0112/'
files = os.listdir(filedir)

In [5]:
files_num = len(files)

# One output prefix per input file: '<name>.csv' -> '<name>_victim'.
victims = [os.path.splitext(name)[0] + '_victim' for name in files]

# Rows parsed per chunk; the files are read piecewise instead of all at once.
chunksize = 300000

for i in range(files_num):
    outpath = '/mnt/hgfs/linuxfile/victim2019/' + victims[i] + '_data.csv'
    # Tracks whether this run has already started the output file. The first
    # matching chunk overwrites any file left by an earlier run (so re-running
    # the cell does not append duplicate rows); later chunks append without
    # repeating the header.
    wrote_header = False

    # The context manager closes the underlying file handle even on error.
    with pd.read_csv(filedir + files[i], chunksize=chunksize) as reader:
        for index, chunk in enumerate(reader):
            # Keep only the flows addressed to the victim.
            tdata = chunk[chunk[' Destination IP'].isin(victim2019_IP)]
            if not tdata.empty:
                if wrote_header:
                    tdata.to_csv(outpath, mode='a', header=False)
                else:
                    tdata.to_csv(outpath, mode='w')
                    wrote_header = True
                print("add data to " + outpath)
            print(files[i] + '\' ' + str(index) + "th loop")
    print("Interation is stopped")

  coro.send(None)


add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 0th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 1th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 2th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 3th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 4th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 5th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 6th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 7th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 8th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
DrDoS_DNS.csv' 9th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_DNS_victim_data.csv
D

add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_SSDP_victim_data.csv
DrDoS_SSDP.csv' 6th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_SSDP_victim_data.csv
DrDoS_SSDP.csv' 7th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_SSDP_victim_data.csv
DrDoS_SSDP.csv' 8th loop
Interation is stopped
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_UDP_victim_data.csv
DrDoS_UDP.csv' 0th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_UDP_victim_data.csv
DrDoS_UDP.csv' 1th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_UDP_victim_data.csv
DrDoS_UDP.csv' 2th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_UDP_victim_data.csv
DrDoS_UDP.csv' 3th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_UDP_victim_data.csv
DrDoS_UDP.csv' 4th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_UDP_victim_data.csv
DrDoS_UDP.csv' 5th loop
add data to /mnt/hgfs/linuxfile/victim2019/DrDoS_UDP_victim_data.csv
DrDoS_UDP.csv' 6th loop
add data to /mnt/hgfs/linuxfile/victim2019

In [None]:
# Scratch check: the first file name with its last four characters removed.
files[0][:-4]

In [10]:
# Reference lookup: print the signature and docstring of pandas.read_csv.
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers:

read_csv(filepath_or_buffer: Union[ForwardRef('PathLike[str]'), str, IO[~T], io.RawIOBase, io.BufferedIOBase, io.TextIOBase, _io.TextIOWrapper, mmap.mmap], sep=<object object at 0x7f30580bb060>, delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal: str = '.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, dialect=None, error_bad_lines=True, warn_bad_lines=True, delim_whit

In [None]:
# Display the current list of output-file prefixes.
victims