## Preprocessing of EBSNN

1. Temporarily implemented a fast version, just to make training possible
    1. features, labels (for train, valid, and test set)
    2. use pickle for file storage (h5py is unnecessary when there is enough memory)
    3. segmentation is not done here
2. Then implement detailed preprocessing
    1. handle errors
    2. remove features such as IP addresses, as described in the paper
3. I want the preprocessing to handle all of the details, so that nothing special needs to be done in the dataset.

In [7]:
import os
import pickle
import json
import dpkt
import h5py
import numpy as np

from sklearn.model_selection import train_test_split


# def calculate_alpha(counter, mode='normal'):
#     if mode == 'normal':
#         alpha = torch.tensor(counter, dtype=torch.float32)
#         alpha = alpha / alpha.sum(0).expand_as(alpha)
#     elif mode == 'invert':
#         alpha = torch.tensor(counter, dtype=torch.float32)
#         alpha_sum = alpha.sum(0)
#         alpha_sum_expand = alpha_sum.expand_as(alpha)
#         alpha = (alpha_sum - alpha) / alpha_sum_expand
#     # fill all zeros to ones
#     alpha[alpha==0.] = 1.
#     return alpha


def process_buffer(buffer, max_length=1500):
    """
    Extract the IP layer of one captured frame as raw bytes.

    TODO: detailed processing of packet data (read the paper)

    DPKT docs: https://kbandla.github.io/dpkt/

    Args:
        buffer: raw bytes of one captured frame; it is parsed as Ethernet.
        max_length: currently unused (no truncation or padding is done
            here); kept so existing callers keep working.

    Returns:
        ``bytes(ip)`` for an Ethernet/IP/TCP frame, or ``None`` when the
        frame is not IP-over-Ethernet, is not TCP, or could not be parsed.
    """
    try:
        eth = dpkt.ethernet.Ethernet(buffer)
        ip = eth.data
        if not isinstance(ip, dpkt.ip.IP):
            return None
        if not isinstance(ip.data, dpkt.tcp.TCP):
            return None
    except Exception as e:
        # Best-effort parsing: report the problem and skip this packet.
        # Returning here is essential -- falling through would reach the
        # final `return` with `ip` unbound and raise UnboundLocalError.
        print("[error] {}".format(e))
        return None

    # redundant if do padding here
    return bytes(ip)   # debug


def read_class(class_name, data_dir, max_files=1):
    """Read the packets of one class from the pcap files in its directory.

    Args:
        class_name: name of the sub-directory of ``data_dir`` holding the
            pcap files of this class.
        data_dir: dataset root directory (e.g. ``'../data/d1'``).
        max_files: stop after this many files have been opened successfully
            as pcap; ``None`` reads every file. Defaults to 1, which keeps
            the previous debugging behaviour of reading a single file.
            FIXME: data size not consistent with paper (weibo 80k vs. 50k).

    Returns:
        A list with one entry per packet for which ``process_buffer``
        returned a non-``None`` value.
    """
    class_dir = os.path.join(data_dir, class_name)
    features = []
    failed_files = []
    files_read = 0
    # NOTE: os.listdir order is arbitrary, so which file(s) are read when
    # max_files is smaller than the directory size is not deterministic.
    for file in os.listdir(class_dir):
        if max_files is not None and files_read >= max_files:
            break
        # Open from the directory that was listed instead of a hard-coded
        # '../data/d1' path, so that `data_dir` is actually honoured.
        with open(os.path.join(class_dir, file), 'rb') as f:
            try:
                pcap = dpkt.pcap.Reader(f)
            except Exception:
                # Not readable as pcap: remember it and try the next file
                # (it does not count towards max_files).
                failed_files.append(file)
                continue

            for timestamp, buffer in pcap:
                processed_data = process_buffer(buffer)
                if processed_data is not None:  # TODO: better handling
                    features.append(processed_data)
        files_read += 1

    # NOTE by zian: do flows need extra processing?
    print(f"class {class_name} total {len(features)} packets")
    print("failed files:", failed_files)
    return features


def read_dataset(data_dir):
    """Read every class of a dataset (`d1` or `d2`).

    Each sub-directory of ``data_dir`` is treated as one class named after
    the directory. Entries that are not directories are skipped, and class
    names are sorted so that label ids do not depend on the arbitrary order
    returned by ``os.listdir``.

    Args:
        data_dir: dataset root directory containing one sub-directory per
            class.

    Returns:
        A tuple ``(features, labels, label2id, id2label)``: ``features`` is
        the concatenation of what ``read_class`` returns for each class,
        ``labels`` holds the integer class id of each feature, and the two
        dicts map class name -> id and id -> class name.
    """
    features = []
    labels = []
    label2id = {}
    id2label = {}

    class_names = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    )
    for i, class_name in enumerate(class_names):
        label2id[class_name] = i
        id2label[i] = class_name
        class_features = read_class(class_name, data_dir)
        features += class_features
        labels += [i] * len(class_features)

    return features, labels, label2id, id2label


def main():
    """Build the `d1` train/val/test splits and pickle each one to disk.

    Every dump file contains four consecutive pickles, in this order:
    features, labels, label2id, id2label.
    """
    X, y, label2id, id2label = read_dataset('../data/d1')
    print(y[:5])

    # Hold out 20% for test, then 25% of the remainder for validation
    # (0.25 x 0.8 = 0.2), i.e. a 60/20/20 split overall.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.25, random_state=1)

    dumps = (
        ('../data/d1_train_dump.pkl', X_train, y_train),
        ('../data/d1_val_dump.pkl', X_val, y_val),
        ('../data/d1_test_dump.pkl', X_test, y_test),
    )
    for path, split_features, split_labels in dumps:
        with open(path, 'wb') as out:
            for obj in (split_features, split_labels, label2id, id2label):
                pickle.dump(obj, out)

# Run the preprocessing only when executed as a script or notebook cell
# (where __name__ == "__main__"), not when this module is imported.
if __name__ == "__main__":
    main()



class weibo total 16980 packets
failed files: []
class kugou total 2427 packets
failed files: []
class cloudmusic total 55548 packets
failed files: []
class pplive total 2856 packets
failed files: []
class itunes total 8666 packets
failed files: []
class facebook total 0 packets
failed files: ['facebook_video2b.pcap']
class spotify total 3145 packets
failed files: []
class tudou total 37437 packets
failed files: []
class youtube total 7689 packets
failed files: []
class skype total 0 packets
failed files: ['unprocessed_skype_video1b.pcap', 'extra_skype_audio3.pcap']
class sohu total 13976 packets
failed files: []
class voipbuster total 1485 packets
failed files: []
class MS-Exchange total 289 packets
failed files: []
class aimchat total 1139 packets
failed files: []
class vimeo total 2991 packets
failed files: []
class yahoomail total 1700 packets
failed files: []
class gmail total 0 packets
failed files: ['extra_gmailchat2.pcap', 'extra_gmailchat3.pcap', 'gmailchat1.pcap']
class netfl