In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import matplotlib.pyplot as plt
from rff.layers import GaussianEncoding

In [3]:
# df = pd.read_csv(r"C:\Users\smbm2\projects\research\csv_datasets\Scenario A1\TimeBasedFeatures-Dataset-15s-VPN.csv") # personal
df = pd.read_csv(r"C:\python\research\csv_datasets\Scenario A1\TimeBasedFeatures-Dataset-15s-VPN.csv") # for IR
df = df.drop(columns=['Unnamed: 0'])

le = LabelEncoder()
df['class1'] = le.fit_transform(df['class1'])

print(df['class1'].value_counts())
print(df.isna().any())

df.head()

1    9793
0    8965
Name: class1, dtype: int64
duration              False
total_fiat            False
total_biat            False
min_fiat              False
min_biat              False
max_fiat              False
max_biat              False
mean_fiat             False
mean_biat             False
flowPktsPerSecond     False
flowBytesPerSecond    False
min_flowiat           False
max_flowiat           False
mean_flowiat          False
std_flowiat           False
min_active            False
mean_active           False
max_active            False
std_active            False
min_idle              False
mean_idle             False
max_idle              False
std_idle              False
class1                False
dtype: bool


Unnamed: 0,duration,total_fiat,total_biat,min_fiat,min_biat,max_fiat,max_biat,mean_fiat,mean_biat,flowPktsPerSecond,...,std_flowiat,min_active,mean_active,max_active,std_active,min_idle,mean_idle,max_idle,std_idle,class1
0,9368711.0,16.0,4.0,1564818.0,1549373.0,190205.285714,203290.456522,389822.391917,370323.719754,10.353612,...,267600.198443,1871488.0,1983656.0,2195089.0,183219.7,1234883.0,1420565.0,1523088.0,161096.539275,0
1,7340238.0,18.0,4.0,1567554.0,1527893.0,165686.977273,186914.846154,317267.548742,304370.651301,11.580006,...,221462.862028,1491627.0,3572433.0,5653239.0,2942704.0,1131498.0,1324636.0,1517774.0,273138.379008,0
2,4644225.0,29.0,15.0,1270547.0,1079974.0,165865.178571,195302.130435,329473.126261,300492.588227,11.412022,...,217475.425246,1758922.0,1758922.0,1758922.0,0.0,1079974.0,1079974.0,1079974.0,0.0,0
3,4978735.0,19.0,8.0,2492050.0,2457286.0,239543.25,276596.388889,612435.304238,628339.573544,8.034169,...,436959.716436,1710925.0,2382905.0,3054885.0,950323.2,1346073.0,1894031.5,2441990.0,774930.342317,0
4,11838189.0,19.0,10.0,3094089.0,3093543.0,243766.5,295954.725,599721.781709,625632.703972,7.602514,...,436129.639296,1747431.0,2400446.0,3240696.0,623274.4,1394455.0,1983227.0,3042717.0,725987.829075,0


In [4]:
class vpnDataset(Dataset):
    def __init__(self, df, target = 'class1'):
        self.n = df.shape[0]
        
        self.y = df[target].astype(np.float32).values

        self.x = df.drop(columns=[target]).astype(np.float32).values

    def __len__(self):
        return self.n
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [5]:
vpn_dataset = vpnDataset(df)
train_size = int(0.7*len(vpn_dataset))
print(train_size)
test_size = len(vpn_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(vpn_dataset, [train_size, test_size])
print(len(train_dataset), len(test_dataset))

13130
13130 5628


In [6]:
batch_sz = 256
train_dataloader = DataLoader(train_dataset, batch_size=batch_sz, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_sz, shuffle=False)

### Preprocessing is Done. Model time.

In [87]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()

        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        assert(self.head_dim * heads == embed_size), "Embed size needs to be div by heads"
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys =nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads*self.head_dim, embed_size)

    def forward(self, values, keys, query):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, key_len, self.heads, self.head_dim)
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        # if mask is not None:
        #     energy = energy.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(energy / (self.embed_size ** (1/2)), dim=3)
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(N, query_len, self.heads*self.head_dim)
        out = self.fc_out(out)
        return out
    
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()

        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(nn.Linear(embed_size, forward_expansion*embed_size),
                                          nn.ReLU(),
                                          nn.Linear(forward_expansion*embed_size, embed_size)
                                          )
        self.dropout = nn.Dropout(dropout)

    def forward(self,value,key,query):
        attention = self.attention(value, key, query)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class Encoder(nn.Module):
    def __init__(self,
                 sigma,
                 embed_size,
                 input_size,
                 n_features,
                 num_layers,
                 heads,
                 forward_expansion,
                 dropout
    ):
        super(Encoder, self).__init__()

        self.embed_size = embed_size
        self.embeddings = GaussianEncoding(sigma = sigma, input_size=input_size, encoded_size=embed_size//2) 
        
        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion
                )
                for _ in range(num_layers)
            ]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # print("shape of x going in ", x.shape)
        N, n_features = x.shape
        x = x.unsqueeze(2)
        print("shape of x going in ", x.shape)
    
        x = self.embeddings(x)
            
        # print(x_i.shape)

        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, x, x)
        
        # x= torch.reshape(x, (-1, n_features*x.shape[2]))
        return x


In [88]:
model = Encoder(sigma = 4, embed_size=20, input_size=1, n_features=23, num_layers=1, heads=1, forward_expansion=4, dropout=0.5)

In [89]:
for input, label in train_dataloader:
    model(input)

shape of x going in  torch.Size([256, 23, 1])
encoder output shape torch.Size([256, 23, 20])
tensor([[[ 0.0000e+00, -4.9193e+00,  1.4977e-01,  ..., -0.0000e+00,
           1.1308e-01, -0.0000e+00],
         [ 1.6164e+00, -0.0000e+00, -0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., -0.0000e+00,
           1.3403e+00,  8.7776e-02],
         ...,
         [-2.2901e-01, -2.6453e-01,  0.0000e+00,  ..., -0.0000e+00,
          -2.4917e+00,  3.9011e-01],
         [-0.0000e+00,  0.0000e+00, -2.7723e-02,  ...,  0.0000e+00,
          -0.0000e+00,  2.2446e+00],
         [-0.0000e+00,  1.3799e-01,  0.0000e+00,  ..., -0.0000e+00,
          -8.5961e-01,  2.4876e-01]],

        [[-0.0000e+00, -0.0000e+00,  2.5517e+00,  ..., -0.0000e+00,
          -1.7768e+00,  4.2568e+00],
         [-5.2854e-03, -0.0000e+00,  2.8503e+00,  ...,  2.4009e-01,
          -0.0000e+00, -1.9952e+00],
         [-2.2255e+00,  0.0000e+00, -8.1048e-02,  .

In [None]:
class 