In [49]:
import math
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
torch.cuda.is_available()

True

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [51]:
import numpy as np
import torchdata.datapipes as dp

In [52]:
def my_mapper(row):
    return {
        "label": np.array( 0 if row[-1] == 'BENIGN' else 1, np.float32),
        "data": np.array(row[:-2], dtype=np.float32)
    }

In [53]:
def build_datapipes(root_dir="."):
    datapipe = dp.iter.FileLister(root_dir)
    datapipe = datapipe.filter(filter_fn=lambda filename: filename.endswith(".csv"))
    datapipe = dp.iter.FileOpener(datapipe, mode='rt')
    datapipe = datapipe.parse_csv(delimiter=",", skip_lines=1)
    datapipe = datapipe.map(my_mapper)
    return datapipe

In [54]:
from torch.utils.data import DataLoader

datapipe = build_datapipes(root_dir="../../../datasets/CIC-IDS-2017/")
dl = DataLoader(dataset=datapipe, batch_size=50, shuffle=True)
first = next(iter(dl))



In [55]:
labels, features = first['label'], first['data']
print(f"Labels batch shape: {labels.size()}")
print(f"Feature batch shape: {features.size()}")

Labels batch shape: torch.Size([50])
Feature batch shape: torch.Size([50, 77])


In [56]:
features[0]

tensor([ 5.4865e+04,  3.0000e+00,  2.0000e+00,  0.0000e+00,  1.2000e+01,
         0.0000e+00,  6.0000e+00,  6.0000e+00,  6.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  4.0000e+06,
         6.6667e+05,  3.0000e+00,  0.0000e+00,  3.0000e+00,  3.0000e+00,
         3.0000e+00,  3.0000e+00,  0.0000e+00,  3.0000e+00,  3.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  4.0000e+01,
         0.0000e+00,  6.6667e+05,  0.0000e+00,  6.0000e+00,  6.0000e+00,
         6.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  9.0000e+00,  6.0000e+00,  0.0000e+00,
         4.0000e+01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  2.0000e+00,  1.2000e+01,  0.0000e+00,
         0.0000e+00,  3.3000e+01, -1.0000e+00,  1.0

In [57]:
class TransformerIDS(torch.nn.Module):
    def __init__(self, ntoken: int, d_model: int, nhead: int = 8, d_hid: int = 512,
                 nlayers: int = 6, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        # self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()
        
    def init_weights(self) -> None:
        initrange = 0.1
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)
        
    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        # src = self.embedding(src) * math.sqrt(self.d_model)
        # src = self.pos_encoder(src)
        # if src_mask is None:
        #     """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
        #     Unmasked positions are filled with float(0.0).
        #     """
        #     src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
        output = self.transformer_encoder(src)
        output = self.linear(output)
        return output

In [58]:
# Hyperparameters:
batch_size = 1 
epochs = 20
lr = 3e-5
gamma = 0.7
num_classes = 2

In [80]:
import pandas as pd

class CIC_IDS_2017:
    def __init__(self, path, transform=None, target_transform=None) -> None:
        self.read = pd.read_csv(path)
        self.read = self.read.to_numpy()
        self.data = self.read[:,:-2]
        self.labels = self.read[:,-1]
        self.labels = np.array([0 if x == 'BENIGN' else 1 for x in self.labels])
        
        self.transform = transform
        self.target_transform = target_transform
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        data, label = self.data[idx], self.labels[idx]
        if self.transform:
            data = self.transform(data)
        if self.target_transform:
            label = self.target_transform(label)
        return np.array(data, dtype=np.float32), np.array(label, dtype=np.float32)

In [81]:
train = CIC_IDS_2017("../../../datasets/CIC-IDS-2017/to_use/train.csv")
val = CIC_IDS_2017("../../../datasets/CIC-IDS-2017/to_use/val.csv")
test = CIC_IDS_2017("../../../datasets/CIC-IDS-2017/to_use/test.csv")

In [82]:
first = next(iter(train))
first

(array([ 5.84898e+05,  5.32600e+04,  4.00000e+00,  2.00000e+00,
         0.00000e+00,  1.20000e+01,  0.00000e+00,  6.00000e+00,
         6.00000e+00,  6.00000e+00,  0.00000e+00,  0.00000e+00,
         0.00000e+00,  0.00000e+00,  0.00000e+00,  3.00000e+06,
         5.00000e+05,  4.00000e+00,  0.00000e+00,  4.00000e+00,
         4.00000e+00,  4.00000e+00,  4.00000e+00,  0.00000e+00,
         4.00000e+00,  4.00000e+00,  0.00000e+00,  0.00000e+00,
         0.00000e+00,  0.00000e+00,  0.00000e+00,  0.00000e+00,
         0.00000e+00,  0.00000e+00,  0.00000e+00,  4.00000e+01,
         0.00000e+00,  5.00000e+05,  0.00000e+00,  6.00000e+00,
         6.00000e+00,  6.00000e+00,  0.00000e+00,  0.00000e+00,
         1.00000e+00,  0.00000e+00,  0.00000e+00,  0.00000e+00,
         0.00000e+00,  0.00000e+00,  0.00000e+00,  0.00000e+00,
         0.00000e+00,  9.00000e+00,  6.00000e+00,  0.00000e+00,
         4.00000e+01,  0.00000e+00,  0.00000e+00,  0.00000e+00,
         0.00000e+00,  0.00000e+00,  0.0

In [78]:
# train_datapipe = build_datapipes(root_dir="../../../datasets/CIC-IDS-2017/train/")
train_dl = DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
# test_datapipe = build_datapipes(root_dir="../../../datasets/CIC-IDS-2017/test/")
test_dl = DataLoader(dataset=test, batch_size=batch_size, shuffle=True)
# val_datapipe = build_datapipes(root_dir="../../../datasets/CIC-IDS-2017/val/")
val_dl = DataLoader(dataset=val, batch_size=batch_size, shuffle=True)

In [60]:
model = TransformerIDS(2, 78, 6, 512, 3)
model.to(device)



TransformerIDS(
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=78, out_features=78, bias=True)
        )
        (linear1): Linear(in_features=78, out_features=512, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=512, out_features=78, bias=True)
        (norm1): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (linear): Linear(in_features=78, out_features=2, bias=True)
)

In [83]:
total_len = 0
for entry in iter(train_dl):
    # print(entry['label'])
    # print(entry['data'])
    # print(entry['data'].shape)
    total_len += len(entry['data'])
    
print(total_len)

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object

In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm

# Training:
# loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)

# Learning Rate Scheduler for Optimizer:
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

for epoch in range(epochs):
    epoch_loss = 0
    epoch_accuracy = 0
    
    for entry in iter(train_dl):
        data = entry['data']
        label = entry['label']
        label = label.type(torch.LongTensor)
        data, label = data.to(device), label.to(device)
        
        print(data.shape)
        output = model(data)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = (output.argmax(dim=1) == label).float().mean()
        epoch_accuracy += acc / len(train_dl)
        epoch_loss += loss / len(train_dl)

        with torch.no_grad():
            epoch_val_accuracy = 0
            epoch_val_loss = 0
            
        for data, label in val_dl:
            
            data = data.to(device)
            label = label.to(device)

            val_output = model(data)
            val_loss = criterion(val_output, label)

            acc = (val_output.argmax(dim=1) == label).float().mean()
            epoch_val_accuracy += acc / len(val_dl)
            epoch_val_loss += val_loss / len(val_dl)

    print(
        f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )

torch.Size([1, 78])


TypeError: _IterDataPipeSerializationWrapper instance doesn't have valid length