In [1]:
# from pandas.core.frame import DataFrame
import pandas as pd
import numpy as np
import os
import datetime
import binascii
from PIL import Image

In [2]:
def extract_divided_groups(sorted_dataframe: pd.DataFrame, file_num_list: list) -> list:
    """Cut the rows of ``sorted_dataframe`` into consecutive chunks.

    Args:
        sorted_dataframe: frame holding the columns 'SrcAddr', 'Sport',
            'DstAddr', 'Dport' and 'StartTime'; rows are consumed in their
            existing order.
        file_num_list: chunk sizes; the i-th chunk takes the next
            ``file_num_list[i]`` rows.

    Returns:
        A list with one entry per chunk. Each entry is a list of rows and
        each row is the list [SrcAddr, Sport, DstAddr, Dport, StartTime].
    """
    key_columns = ('SrcAddr', 'Sport', 'DstAddr', 'Dport', 'StartTime')
    all_rows = np.array(sorted_dataframe.loc[:, key_columns]).tolist()

    grouped_rows = []
    start = 0
    for group_size in file_num_list:
        stop = start + group_size
        grouped_rows.append(all_rows[start:stop])
        start = stop
    return grouped_rows

In [3]:
def generate_filelist_toread(dir_of_one_proto):
    """List the pcap paths of one protocol directory, grouped per flow.

    Every directory entry is stripped of its extension and split on '_'
    into (SrcAddr, Sport, DstAddr, Dport, StartTime); anything after the
    fourth underscore stays part of StartTime. Entries sharing the first
    four fields form one flow.

    Args:
        dir_of_one_proto: directory to scan. A trailing path separator is
            accepted but no longer required.

    Returns:
        A list of flows in order of first appearance in ``os.listdir``.
        Each flow is a list of paths ``<dir>/<name>.pcap`` sorted ascending
        by StartTime. StartTime is compared as a string, i.e.
        lexicographically. An empty directory yields ``[]``.

    Notes:
        * Entries are not filtered by extension: whatever extension an
          entry had, the returned path ends in '.pcap'.
        * Names with fewer than five '_'-separated fields are skipped
          instead of raising.
    """
    # (SrcAddr, Sport, DstAddr, Dport) -> [(StartTime, name without extension), ...]
    # A dict keeps insertion order, so flows come out in first-seen order.
    flows = {}
    for entry in os.listdir(dir_of_one_proto):
        stem = os.path.splitext(entry)[0]
        fields = stem.split('_', 4)
        if len(fields) != 5:
            # Not a <src>_<sport>_<dst>_<dport>_<start> name: nothing to group on.
            continue
        flows.setdefault(tuple(fields[:4]), []).append((fields[4], stem))

    filenamelist_in_list = []
    for members in flows.values():
        # Sort on StartTime only; list.sort is stable, so ties keep listdir order.
        members.sort(key=lambda member: member[0])
        filenamelist_in_list.append(
            [os.path.join(dir_of_one_proto, stem + '.pcap') for _, stem in members]
        )

    return filenamelist_in_list

In [4]:
def pcap2hex(filename) -> bytes:
    """Return the content of a pcap file, minus its header, as hex digits.

    The first 24 bytes (the pcap file header) are skipped and everything
    after them is hex-encoded.

    Args:
        filename: path of the pcap file.

    Returns:
        A ``bytes`` object (not ``str``) of lowercase hex digits, two per
        byte read. ``b''`` when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        return b''  # missing file: empty *bytes*, so callers can concatenate

    with open(filename, 'rb') as pcap_file:
        pcap_file.seek(24, os.SEEK_CUR)  # step over the pcap file header
        payload = pcap_file.read()

    return binascii.hexlify(payload)

def readfile_bylist(one_flow_file_list: list, hex_length: int = 2048):
    """Build the fixed-length hex string for one flow.

    The files are read in list order via ``pcap2hex`` and their hex digits
    are concatenated until at least ``hex_length`` digits are collected.
    Files for which ``pcap2hex`` returns ``b''`` contribute nothing.

    Args:
        one_flow_file_list: pcap paths belonging to one flow, in the order
            they should be concatenated.
        hex_length: number of hex digits to return. The default, 2048
            digits, encodes 1024 bytes.

    Returns:
        ``bytes`` of exactly ``hex_length`` hex digits: the collected
        digits truncated to ``hex_length``, or right-padded with ``b'0'``
        when the flow is shorter.
    """
    picstr = b''  # hex digits collected so far
    for filename in one_flow_file_list:
        # Test the length *before* touching the next file, so no file is
        # read and hex-encoded only to be thrown away.
        if len(picstr) >= hex_length:
            break
        picstr += pcap2hex(filename)  # b'' (no such file) adds nothing

    # Truncate when too long, zero-pad when too short.
    return picstr[:hex_length].ljust(hex_length, b'0')

def hex2pic(picstr):
    """Turn 2048 hex digits into a 32x32 8-bit image.

    Args:
        picstr: sequence of hex digits; the first 2048 are used, two per
            pixel.

    Returns:
        The image built by ``Image.fromarray`` from the 32x32 uint8 array,
        pixels filled row by row.
    """
    pixel_values = []
    for offset in range(0, 2048, 2):
        # Two hex digits -> one pixel value in 0..255.
        pixel_values.append(int(picstr[offset:offset + 2], 16))

    pixel_grid = np.array(pixel_values).reshape(32, 32)
    return Image.fromarray(np.uint8(pixel_grid))

In [5]:
def get_label_bypath(file_path):
    """Map a file path to a label from the dataset name it contains.

    Args:
        file_path: path string; it is searched for 'CTU-13.Dataset-NN'
            substrings.

    Returns:
        The label of the first matching entry in the table below, or
        ``None`` when the path contains none of the known dataset names.
    """
    # Checked top to bottom; the first label with a matching dataset wins.
    labels_by_dataset = (
        ('neris', ('CTU-13.Dataset-42', 'CTU-13.Dataset-43', 'CTU-13.Dataset-50')),
        ('rbot', ('CTU-13.Dataset-44', 'CTU-13.Dataset-45',
                  'CTU-13.Dataset-51', 'CTU-13.Dataset-52')),
        ('virut', ('CTU-13.Dataset-46', 'CTU-13.Dataset-54')),
        ('menti', ('CTU-13.Dataset-47',)),
        ('sogou', ('CTU-13.Dataset-48',)),
        ('murlo', ('CTU-13.Dataset-49',)),
        ('nsis.ay', ('CTU-13.Dataset-53',)),
    )
    for label, dataset_names in labels_by_dataset:
        if any(dataset_name in file_path for dataset_name in dataset_names):
            return label
    return None

In [6]:
# Root of the dataset backup (Windows path).
# WSL equivalent: dirpath = '/mnt/e/1IDM/CTU-13-Backup/'
dirpath = 'E:\\1IDM\\CTU-13-Backup\\'

# 'CTU-13.Dataset-42' ... 'CTU-13.Dataset-53'; Dataset-54 is not in the list.
dataset_list = ['CTU-13.Dataset-%d' % number for number in range(42, 54)]

# Only these three sub-directories are read; 'icmp' and other protos are skipped.
# WSL equivalent: ['/botnet/udp/', '/botnet/tcp_nosyn/', '/botnet/tcp_syn/']
proto_list = ['\\botnet\\' + proto_name + '\\'
              for proto_name in ('udp', 'tcp_nosyn', 'tcp_syn')]

# One entry per flow; each entry is that flow's list of pcap paths.
all_filename_lists = []

for dataset in dataset_list:
    for proto in proto_list:
        proto_dir = dirpath + dataset + proto
        if not os.path.exists(proto_dir):
            print('no such directory: ' + proto_dir)
            continue
        filename_toread = generate_filelist_toread(proto_dir)
        all_filename_lists.extend(filename_toread)

no such directory: E:\1IDM\CTU-13-Backup\CTU-13.Dataset-48\botnet\tcp_nosyn\
no such directory: E:\1IDM\CTU-13-Backup\CTU-13.Dataset-51\botnet\udp\
no such directory: E:\1IDM\CTU-13-Backup\CTU-13.Dataset-51\botnet\tcp_nosyn\
no such directory: E:\1IDM\CTU-13-Backup\CTU-13.Dataset-51\botnet\tcp_syn\
no such directory: E:\1IDM\CTU-13-Backup\CTU-13.Dataset-52\botnet\tcp_nosyn\
no such directory: E:\1IDM\CTU-13-Backup\CTU-13.Dataset-53\botnet\tcp_nosyn\


In [7]:
# Rows of [picture name, label], one per saved image.
labellist = []

# Output directory for the images.
# WSL equivalent: image_path = '/mnt/e/1IDM/CTU-13-Backup/grayscale/'
image_path = 'E:\\1IDM\\CTU-13-Backup\\grayscale\\'
if not os.path.exists(image_path):
    os.mkdir(image_path)

# The output directory must start out empty, otherwise the file count
# check after the loop cannot hold.
assert not os.listdir(image_path)

image_num = len(all_filename_lists)

for cnt, filename_list in enumerate(all_filename_lists):
    # Progress line plus timestamp every 5000 flows.
    if not cnt % 5000:
        print('Processed %s pictures' % cnt)
        print(datetime.datetime.now())

    # One image per flow, built from the flow's concatenated pcap bytes.
    picture_str = readfile_bylist(filename_list)
    im = hex2pic(picture_str)

    # The image is named after the first file of the flow, extension removed.
    im_name = os.path.splitext(os.path.basename(filename_list[0]))[0]
    im.save(image_path + im_name + '.bmp')

    label = get_label_bypath(filename_list[0])
    labellist.append([im_name, label])

# Every flow must have produced its own file; a mismatch means two flows
# ended up with the same picture name.
assert len(os.listdir(image_path)) == len(labellist)
print('Processed {0} pictures at {1}'.format(image_num, datetime.datetime.now()))

label_df = pd.DataFrame(labellist, columns=['PicName', 'Label'])
# WSL equivalent target: '/mnt/e/1IDM/CTU-13-Backup/label.csv'
label_df.to_csv('E:\\1IDM\\CTU-13-Backup\\label.csv', index=False, encoding='utf-8')

Processed 0 pictures
2020-03-30 10:44:46.095266
Processed 5000 pictures
2020-03-30 10:45:41.847674
Processed 10000 pictures
2020-03-30 10:46:24.802273
Processed 15000 pictures
2020-03-30 10:47:18.288596
Processed 20000 pictures
2020-03-30 10:47:58.899513
Processed 25000 pictures
2020-03-30 10:48:38.549273
Processed 30000 pictures
2020-03-30 10:49:24.386333
Processed 35000 pictures
2020-03-30 10:50:10.232309
Processed 40000 pictures
2020-03-30 10:50:27.870307
Processed 45000 pictures
2020-03-30 10:50:45.806343
Processed 50000 pictures
2020-03-30 10:51:07.906655
Processed 55000 pictures
2020-03-30 10:51:32.113678
Processed 60000 pictures
2020-03-30 10:51:54.164654
Processed 65000 pictures
2020-03-30 10:53:11.077475
Processed 70000 pictures
2020-03-30 10:54:16.475888
Processed 75000 pictures
2020-03-30 10:55:07.196539
Processed 80000 pictures
2020-03-30 10:56:02.150497
Processed 85000 pictures
2020-03-30 11:00:08.106812
Processed 90000 pictures
2020-03-30 11:03:06.892853
Processed 95000 p