In [4]:
# Sliding-window hyperparameters. The values mirror the defaults declared by
# Sliding_window (window_size=20, step_size=4) further down in this notebook.
windows_size = 20  # rows grouped into one window
step_size = 4      # row offset between the starts of two consecutive windows

In [5]:
import pandas as pd

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, shm_size_mb=4096*4)

from transformers import BertTokenizer

import warnings
warnings.filterwarnings('ignore')

import numpy as np

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [7]:
# Load the structured Thunderbird log table from Feather. The path is relative,
# so it is resolved against the kernel's current working directory.
thunderbird_df = pd.read_feather('./Thunderbird.log_structured.feather')

In [8]:
# Keep only the columns used downstream. The explicit .copy() makes this an
# independent frame: later cells add/overwrite columns on it, and without the
# copy pandas treats those writes as chained assignment on a slice of
# thunderbird_df (SettingWithCopyWarning, silenced by the global warnings filter).
thunderbird_df_part = thunderbird_df[['Label', 'Date', 'EventTemplate']].copy()
thunderbird_df_part

Unnamed: 0,Label,Date,EventTemplate
0,-,2005.11.09,tftp: client does not accept options
1,-,2005.11.09,warning: unable to look up public/pickup: No s...
2,-,2005.11.09,warning: unable to look up public/pickup: No s...
3,-,2005.11.09,warning: unable to look up public/pickup: No s...
4,-,2005.11.09,warning: unable to look up public/pickup: No s...
...,...,...,...
4999995,-,2005.11.18,<*> on <*> to <*> via <*>
4999996,-,2005.11.18,DHCPDISCOVER from <*> via <*>
4999997,-,2005.11.18,DHCPDISCOVER from <*> via <*>
4999998,-,2005.11.18,<*> on <*> to <*> via <*>


In [5]:
# def minimum_length_drop(df: pd.DataFrame, feature: str, minimum_length: int = 35):
#
#         df = df.loc[df[feature].str.len() > minimum_length].reset_index(drop=True)
#
#         return df
#
# thunderbird_df_part = minimum_length_drop(thunderbird_df_part, feature='EventTemplate')
# thunderbird_df_part

Unnamed: 0,Label,Date,EventTemplate
0,-,2005.11.09,tftp: client does not accept options
1,-,2005.11.09,warning: unable to look up public/pickup: No s...
2,-,2005.11.09,warning: unable to look up public/pickup: No s...
3,-,2005.11.09,warning: unable to look up public/pickup: No s...
4,-,2005.11.09,warning: unable to look up public/pickup: No s...
...,...,...,...
3212578,-,2005.11.18,DHCPDISCOVER from <*> via eth1: network <*> no...
3212579,-,2005.11.18,DHCPREQUEST for <*> (<*>) from <*> via eth1: u...
3212580,-,2005.11.18,Instrumentation Service EventID: 1052 Temperat...
3212581,-,2005.11.18,Instrumentation Service EventID: 1052 Temperat...


In [10]:
# Tokenizer loaded from the local directory '../bert_tiny_checkpoint' (relative
# to the working directory); get_sen_token below reads this module-level name.
bert_tokenizer = BertTokenizer.from_pretrained('../bert_tiny_checkpoint')

def get_sen_token(sequence: str):
    """Tokenize a single string with the notebook-level ``bert_tokenizer``.

    The text is truncated to at most 512 tokens and is not padded; the
    tokenizer is asked for PyTorch tensors (``return_tensors='pt'``).

    Args:
        sequence: Text to encode.

    Returns:
        Whatever ``bert_tokenizer`` returns for these options; later cells
        index it with 'input_ids', 'token_type_ids' and 'attention_mask'.
    """
    encoding = bert_tokenizer(
        sequence,
        return_tensors='pt',
        padding=False,
        truncation=True,
        max_length=512,
    )
    return encoding

In [11]:
# Distinct event templates, ordered as value_counts() returns them.
template_index = list(thunderbird_df['EventTemplate'].value_counts().index)

In [12]:
# Tokenize every distinct template exactly once: template string -> encoding.
template_dict = dict(zip(template_index, map(get_sen_token, template_index)))

In [13]:
# Binarise the label column: the raw marker '-' becomes 0, any other value 1.
# An already-converted 0 is also treated as 0, so re-running this cell is a
# no-op (the previous `x != '-'` lambda turned every label into 1 on a second
# run). The vectorised isin() also avoids a per-row Python call.
thunderbird_df_part['Label'] = (~thunderbird_df_part['Label'].isin(['-', 0])).astype('int64')

In [14]:
# Row count per label value after binarisation.
thunderbird_df_part['Label'].value_counts()

Label
0    4773247
1     226753
Name: count, dtype: int64

In [15]:
# Attach the encoding to each row by looking its template up in template_dict;
# rows sharing a template reference the same pre-computed encoding object.
thunderbird_df_part['Token_dict'] = thunderbird_df_part['EventTemplate'].map(template_dict)

In [16]:
# Split each encoding into one column per field, dropping the leading
# dimension of every tensor with squeeze(0).
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    thunderbird_df_part[field] = thunderbird_df_part['Token_dict'].apply(
        lambda encoding, field=field: encoding[field].squeeze(0)
    )

In [17]:
# Inspect the frame with the token columns added by the previous cells.
thunderbird_df_part

Unnamed: 0,Label,Date,EventTemplate,Token_dict,input_ids,token_type_ids,attention_mask
0,0,2005.11.09,tftp: client does not accept options,"[input_ids, token_type_ids, attention_mask]","[tensor(101), tensor(1056), tensor(6199), tens...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
1,0,2005.11.09,warning: unable to look up public/pickup: No s...,"[input_ids, token_type_ids, attention_mask]","[tensor(101), tensor(5432), tensor(1024), tens...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
2,0,2005.11.09,warning: unable to look up public/pickup: No s...,"[input_ids, token_type_ids, attention_mask]","[tensor(101), tensor(5432), tensor(1024), tens...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
3,0,2005.11.09,warning: unable to look up public/pickup: No s...,"[input_ids, token_type_ids, attention_mask]","[tensor(101), tensor(5432), tensor(1024), tens...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
4,0,2005.11.09,warning: unable to look up public/pickup: No s...,"[input_ids, token_type_ids, attention_mask]","[tensor(101), tensor(5432), tensor(1024), tens...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
...,...,...,...,...,...,...,...
4999995,0,2005.11.18,<*> on <*> to <*> via <*>,"[input_ids, token_type_ids, attention_mask]","[tensor(101), tensor(1026), tensor(1008), tens...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
4999996,0,2005.11.18,DHCPDISCOVER from <*> via <*>,"[input_ids, token_type_ids, attention_mask]","[tensor(101), tensor(28144), tensor(21906), te...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
4999997,0,2005.11.18,DHCPDISCOVER from <*> via <*>,"[input_ids, token_type_ids, attention_mask]","[tensor(101), tensor(28144), tensor(21906), te...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
4999998,0,2005.11.18,<*> on <*> to <*> via <*>,"[input_ids, token_type_ids, attention_mask]","[tensor(101), tensor(1026), tensor(1008), tens...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."


In [18]:
# Number of rows that the sliding window will iterate over.
thunderbird_df_part.shape[0]

5000000

In [19]:
import torch

def _pack_window(pieces, lead_value, max_length: int, pad_value=0):
    """Join per-row 1-D tensors into one fixed-length NumPy array.

    The first element of every tensor in ``pieces`` is dropped, the remainders
    are concatenated behind a single ``lead_value``, and the result is cut to
    ``max_length`` or right-padded with ``pad_value`` up to ``max_length``.
    """
    body = torch.cat([piece[1:] for piece in pieces], dim=0)
    # torch.Tensor([...]) is float32, so the concatenation is promoted to
    # float32 -- the same dtype the original implementation produced.
    packed = torch.cat([torch.Tensor([lead_value]), body], dim=0)[:max_length]
    shortfall = max_length - packed.shape[0]
    if shortfall > 0:
        filler = torch.full((shortfall,), pad_value, dtype=packed.dtype)
        packed = torch.cat([packed, filler], dim=0)
    return packed.numpy()


def Sliding_window(df: pd.DataFrame, window_size: int = 20, step_size: int = 4,
                   max_length: int = 512, cls_token_id: int = 101,
                   pad_token_id: int = 0) -> dict:
    """Slide a fixed-size window over the rows of ``df`` and pack each window.

    A window starts at every ``step_size``-th row and covers up to
    ``window_size`` rows; windows near the end of the frame are shorter rather
    than dropped. For each window the per-row token tensors are merged into
    one sequence: the first element of each row's tensor is removed
    (presumably that row's own [CLS] id -- confirm against the tokenizer), a
    single ``cls_token_id`` is prepended, and the sequence is truncated or
    padded to ``max_length``.

    Args:
        df: Frame with columns 'Date', 'Label', 'input_ids' and
            'attention_mask'; the last two hold one 1-D torch tensor per row.
        window_size: Maximum number of rows per window (>= 1).
        step_size: Row offset between consecutive window starts (>= 1).
        max_length: Length of every returned sequence (>= 1).
        cls_token_id: Id placed at position 0 of every packed ``input_ids``.
        pad_token_id: Id used to right-pad ``input_ids``.

    Returns:
        Dict of four equally long lists, one entry per window:
        'date' (the 'Date' of the window's first row), 'input_ids' and
        'attention_mask' (NumPy arrays of length ``max_length``), and 'label'
        (the maximum 'Label' inside the window).

    Raises:
        ValueError: If ``window_size``, ``step_size`` or ``max_length`` is
            smaller than 1.
    """
    if window_size < 1 or step_size < 1 or max_length < 1:
        raise ValueError('window_size, step_size and max_length must all be >= 1')

    # Pull the column arrays out once instead of on every iteration.
    dates = df['Date'].values
    labels = df['Label'].values
    input_ids = df['input_ids'].values
    attention_masks = df['attention_mask'].values

    date_list = []
    input_ids_list = []
    attention_mask_list = []
    label_list = []

    # Iterate over the rows of the frame that was passed in. The previous
    # version used the length of a notebook global here and hid the resulting
    # out-of-range reads behind `except IndexError: pass`.
    for start in range(0, len(df), step_size):
        stop = start + window_size

        date_list.append(dates[start])
        input_ids_list.append(
            _pack_window(input_ids[start:stop], cls_token_id, max_length, pad_token_id))
        # The mask gets a 1 for the prepended token and 0 for padding.
        attention_mask_list.append(
            _pack_window(attention_masks[start:stop], 1, max_length, 0))
        label_list.append(labels[start:stop].max())

    # NOTE(review): the packed arrays are float32 (kept for compatibility with
    # existing consumers); cast to an integer dtype before an embedding lookup.
    return {'date': date_list,
            'input_ids': input_ids_list,
            'attention_mask': attention_mask_list,
            'label': label_list}

In [20]:
import torch

# Pass the notebook-level window settings explicitly so that editing them in
# the first cell actually changes the windows produced here (previously the
# call silently fell back to the function defaults).
data_dict = Sliding_window(df=thunderbird_df_part, window_size=windows_size, step_size=step_size)

In [21]:
# The dict of equally long lists becomes a frame with one row per window.
data_df = pd.DataFrame(data_dict)

In [22]:
# Window-level label distribution; a window's label is the maximum of the
# labels of the rows it covers.
data_df['label'].value_counts()

label
0    876863
1    373137
Name: count, dtype: int64

In [23]:
# Persist the windowed dataset as Feather in the current working directory.
data_df.to_feather('Thunderbird.log_structured_slided.feather')