In [3]:
import pandas as pd
import numpy as np
import os
import datetime
import binascii
import cv2
from tqdm.notebook import tqdm

In [4]:
def extract_divided_groups(sorted_dataframe: pd.DataFrame, file_num_list: list) -> list: # return with grouped groups as one list, and get their labels
    filename_list = []
    itr = 0
    _filname_list = np.array(sorted_dataframe.loc[:, ('SrcAddr', 'Sport', 'DstAddr', 'Dport', 'StartTime')]).tolist()
    for count in file_num_list:
        _filelist = _filname_list[itr:itr + count]
        filename_list.append(_filelist)
        itr += count
    return filename_list

def generate_filelist_toread(dir_of_one_proto):
    _filelist = os.listdir(dir_of_one_proto)
#             filelist = []
    ###### get a dataframe of all split filename for further processing
    filelist = [os.path.splitext(filename)[0] for filename in _filelist]
    split_filename = [filename.split('_', 4) for filename in filelist]
    filename_df = pd.DataFrame(split_filename)
    filename_df.rename(columns={0: 'SrcAddr', 1: 'Sport',
                                2: 'DstAddr', 3: 'Dport',
                                4:'StartTime'}, inplace=True)
    
    # temp GroupByObject of data grouped by ['SrcAddr', 'Sport', 'DstAddr', 'Dport']
    divided_group_object = filename_df.groupby(['SrcAddr', 'Sport', 'DstAddr', 'Dport'], sort = False)

    # sort data in each group by 'StartTime'
    processed_dataflow = divided_group_object.apply(lambda x:x.sort_values('StartTime', ascending = True))                                                .reset_index(drop = True)
#     processed_dataflow =  divided_group_object.apply(lambda x: x).reset_index(drop = True)

    # data number in different groups, used to extract ordered groups from processed_dataflow
    processed_group_num = np.array(divided_group_object['SrcAddr'].count()).tolist()
    split_filenamelist_in_list = extract_divided_groups(processed_dataflow, processed_group_num)
    
    ###### pack all filenames with same ['SrcAddr', 'Sport', 'DstAddr', 'Dport'] in one list
    ###### each list was sorted by 'StartTime' inside, and pack all filename lists in one list
    filenamelist_in_list = []
    for split_str_list in tqdm(split_filenamelist_in_list, desc='Progress'):
        filename_list = [] # filename in one flow
        for split_str in split_str_list:
            filename = '_'.join(split_str)
            filename_list.append(dir_of_one_proto + filename + '.pcap')
        filenamelist_in_list.append(filename_list)
    
    return filenamelist_in_list


def pcap2hex(filename) -> bytes :
    if os.path.exists(filename):
        with open(filename, 'rb') as reader:
            reader.seek(24, 1) # jump over pcap head
            binflow = reader.read()
            hexstr = binascii.b2a_hex(binflow)
    # no such file
    else: return b'' # this is a bytes type string!!!!!!!
    return hexstr

def readfile_bylist(one_flow_file_list: list): # find pcap file by generated filename
    picstr = b'' # bytes string to be written in picture
    for filename in one_flow_file_list:
        hexstr = pcap2hex(filename)
        
        if hexstr != b'' and len(picstr) < 2048: # picstr length is more than 1024B
            picstr += hexstr
        elif hexstr == b'' and len(picstr) < 2048:
            continue # no such file matches with generated filename
        else: break
            
    # picstr length >= 2048
    if len(picstr) >= 2048:
        picstr = picstr[:2048]
    else:
        picstr = picstr.ljust(2048, b'0')
        
    return picstr

def hex2pic(picstr):
    pic_array = np.array([int(picstr[i:i+2], 16) for i in range(0, 2048, 2)]).reshape(32, 32)
#     img = Image.fromarray(np.uint8(pic_array))
    return pic_array

'''
NOTICE:
Actually, packets from or to '172.16.2.11' and '172.16.2.12' contains EITHER malicious data OR NON-malicious data,
but I can't really tell their difference for the way I process 'ISOT_Botnet_DataSet_2010.pcap' file into separate pcap files (with pkt2flow)
droped some details during its process. Take the file name as an instance, ports marked in hex code(0x12) will be renamed only as a singel '0':

e.g: 10.0.0.254_**0x12**_172.16.0.11_**0x56**_1286402882 -> 10.0.0.254_**0**_172.16.0.11_**0**_1286402882 after processing

As you see, flows with this hex code, like flows using ICMP protocol, can't be classified by its name,
so I just simply dropped these data, and only flows using TCP/UDP protocol are used.

Preprocessing on IP addresses here is alike, because flows using MAC address 'CC:CC:CC:DD:DD:DD' are all dismissed,
flows with uncertain IP addresses are marked as 'botnet' to avoid bad influences (and we actually have enough data to train the model).
Oh, and yes, we only use 'normal' data so all uncertainty will be eliminated :D.
'''

def get_isot_label_bypath(file_path):
    if '172.16.0.2' in file_path or '172.16.0.11' in file_path or '172.16.0.12' in file_path         or '172.16.2.11' in file_path or '172.16.2.12' in file_path or 'ff' in file_path:
        return 'botnet'
    else:
        return 'normal'

In [5]:
# dirpath = '/mnt/e/1IDM/CTU-13-Backup/'
dirpath = '/root/ISOT-2010/isot/'
        
# dataset_list = ['CTU-13.Dataset-42', 'CTU-13.Dataset-43', 'CTU-13.Dataset-44', 
#                     'CTU-13.Dataset-45', 'CTU-13.Dataset-46', 'CTU-13.Dataset-47', 
#                     'CTU-13.Dataset-48', 'CTU-13.Dataset-49', 'CTU-13.Dataset-50',
#                     'CTU-13.Dataset-51', 'CTU-13.Dataset-52', 'CTU-13.Dataset-53'] # 'CTU-13.Dataset-51' is skipped
# proto_list = ['/botnet/udp/', '/botnet/tcp_nosyn/', '/botnet/tcp_syn/'] # skip 'icmp' or other protos
proto_list = ['udp/', 'tcp_nosyn/', 'tcp_syn/'] # skip 'icmp' or other protos

all_filename_lists = []
        
for proto in proto_list:
    if os.path.exists(dirpath + proto):
        print('Processing directory: ' + dirpath + proto)
        filename_toread = generate_filelist_toread(dirpath + proto)
        
#         if get_isot_label_bypath(filename_toread[0][0])
        all_filename_lists.extend(filename_toread)
#         print('Processed directory: ' + dirpath + proto)
    else: print('no such directory: ' + dirpath + proto)

Processing directory: /root/ISOT-2010/isot/udp/


HBox(children=(FloatProgress(value=0.0, description='Progress', max=530253.0, style=ProgressStyle(description_…


Processing directory: /root/ISOT-2010/isot/tcp_nosyn/


HBox(children=(FloatProgress(value=0.0, description='Progress', max=31122.0, style=ProgressStyle(description_w…


Processing directory: /root/ISOT-2010/isot/tcp_syn/


HBox(children=(FloatProgress(value=0.0, description='Progress', max=489214.0, style=ProgressStyle(description_…




In [7]:
# imagelist = []
labellist = []
# image_path = '/mnt/e/1IDM/CTU-13-Backup/grayscale/'
# image_path = '/root/ISOT-2010/isot/trial/'
image_path = '/root/ISOT-2010/isot/grayscale/'
if os.path.exists(image_path) == False:
    os.mkdir(image_path)
        
image_num = len(all_filename_lists)
skip_flag = 0

for filename_list in tqdm(all_filename_lists, desc='Converting ISOT-2010 pcap to images'):
    # skip this file list if it's all about 'botnet'
    if get_isot_label_bypath(filename_list[0]) == 'botnet':
        skip_flag += 1
        continue   
    # skip this file list if there's a file with the same name
    im_name = os.path.splitext(os.path.basename(filename_list[0]))[0]
    file_name = image_path + im_name + '.bmp'
    if os.path.exists(file_name) == True:
        skip_flag += 1
        continue
    
    # process pcap into picture if not 'botnet'
    picture_str = readfile_bylist(filename_list)
    im = hex2pic(picture_str)
#     imagelist.append(im)
#     im_name = os.path.splitext(os.path.basename(filename_list[0]))[0]
#         im.save(image_path + im_name + '.bmp')
    cv2.imwrite(file_name, im)
    label = get_isot_label_bypath(filename_list[0])
    labellist.append([im_name, label])
       

HBox(children=(FloatProgress(value=0.0, description='Converting ISOT-2010 pcap to images', max=1050589.0, styl…




In [8]:
 
#         if cnt % 10000 == 0:
#             print('Processed %s pictures' % cnt)
#             print(datetime.datetime.now())
    #     if picture_str != b''.ljust(2048, b'0'):
    #         im = hex2pic(picture_str)
    #         imagelist.append(im)
    #         labellist.append(all_data_label[i])

assert len(os.listdir(image_path)) == len(labellist)
print('Processed {0} images at {1}'.format(image_num, datetime.datetime.now()))
print('Skipped {0} botnet images'.format(skip_flag))

label_df = pd.DataFrame(labellist, columns = ['PicName', 'Label'])
# label_df.to_csv('/mnt/e/1IDM/CTU-13-Backup/label.csv', encoding = 'utf-8')
label_df.to_csv('/root/ISOT-2010/isot/isot.csv', index = 0, encoding = 'utf-8')

Processed 1050589 images at 2020-04-09 13:09:45.491153
Skipped 63881 botnet images


In [7]:
import time
from tqdm.notebook import tqdm

for i in tqdm(range(1200), desc='test'):
#     time.sleep(0.01)
    continue

HBox(children=(FloatProgress(value=0.0, description='test', max=1200.0, style=ProgressStyle(description_width=…


