In [1]:
!pip install gdown
!gdown --no-check-certificate --folder https://drive.google.com/drive/folders/17g55PHmMWFo6aBNhmzjOxVDfwtSDFUl3?usp=drive_link

Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting beautifulsoup4 (from gdown)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->gdown)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Collecting PySocks!=1.5.7,>=1.5.6 (from requests[socks]->gdown)
  Using cached PySocks-1.7.1-py3-none-any.whl.metadata (13 kB)
Using cached gdown-5.2.0-py3-none-any.whl (18 kB)
Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Using cached PySocks-1.7.1-py3-none-any.whl (16 kB)
Using cached soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, PySocks, beautifulsoup4, gdown
Successfully installed PySocks-1.7.1 beautifulsoup4-4.12.3 gdown-5.2.0 soupsieve-2.6
Retrieving folder contents
Processing file 1J2eOhACVhee6fnrO0wA5ct255sVR5EQS BCNET_regular.csv
Processing file 1d4e4g7PNVkxO-H4ZknemlWZNnWOpJkLV Code_Red_I.csv
Processing file 1IleVfZkR-EQ0X6-

In [14]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

import os
import random

from TS_Transformer import TSTransformerEncoderClassiregressor
def set_all_seeds(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True

set_all_seeds(42)

In [13]:
# Constants
SEQUENCE_LENGTH = 10
FEATURES_START = 4
FEATURES_END = 41
LABEL_COLUMN = 41
BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 0.001

# Detect if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


# Custom dataset
class BGPDataset(Dataset):
    def __init__(self, data, labels, sequence_length=SEQUENCE_LENGTH):
        self.data = data
        self.labels = labels
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.data) - self.sequence_length + 1

    def __getitem__(self, idx):
        x = self.data[idx:idx + self.sequence_length]
        y = self.labels[idx + self.sequence_length - 1]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# Load and preprocess data
def load_data(file_paths):
    all_data = []
    for file_path in file_paths:
        df = pd.read_csv(file_path, header=None)
        features = df.iloc[:, FEATURES_START:FEATURES_END + 1].values
        labels = df.iloc[:, LABEL_COLUMN].replace(-1, 0).values
        all_data.append((features, labels))
    return all_data

# Normalize features
def normalize_data(train_features, test_features):
    scaler = MinMaxScaler()
    train_features = scaler.fit_transform(train_features)
    test_features = scaler.transform(test_features)
    return train_features, test_features

# Compute class weights for imbalanced data
def compute_weights(labels):
    classes = np.unique(labels)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=labels)
    return {i: weight for i, weight in zip(classes, class_weights)}

class TransformerModel(nn.Module):
    def __init__(self, input_size, max_seq_len, d_model, n_heads, num_layers, dim_feedforward, num_classes, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.transformer = TSTransformerEncoderClassiregressor(
            feat_dim=input_size,
            max_len=max_seq_len,
            d_model=d_model,
            n_heads=n_heads,
            num_layers=num_layers,
            dim_feedforward=dim_feedforward,
            num_classes=num_classes,
            dropout=dropout,
            pos_encoding='fixed',
            activation='gelu',
            norm='BatchNorm'
        )

    def forward(self, x):
        # No padding mask used in this dataset; provide a mask of ones
        batch_size, seq_len, _ = x.size()
        padding_mask = torch.ones(batch_size, seq_len, dtype=torch.bool, device=x.device)
        return self.transformer(x, padding_mask)

# Training function
def train_model(model, train_loader, criterion, optimizer, scheduler, epochs=EPOCHS):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step(total_loss / len(train_loader))
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.5f}")

# Evaluation function with zero_division parameter
def evaluate_model(model, test_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model(x_batch)
            preds = (outputs.squeeze() > 0.5).int()
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(y_batch.cpu().tolist())
    print(classification_report(all_labels, all_preds, digits=4, zero_division=1))

# Main script
def main(folder_path):
    # Get all CSV file paths in the folder
    file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".csv")]

    # Split files into training (BCNET and RIPE) and testing (others)
    train_files = [f for f in file_paths if 'Nimda' in f or 'Slammer' in f]
    test_files = [f for f in file_paths if f not in train_files]

    print("Training files:", train_files)
    print("Testing files:", test_files)

    # Load and preprocess data
    train_data = load_data(train_files)
    test_data = load_data(test_files)

    # Prepare training and testing datasets
    train_features = np.vstack([data[0] for data in train_data])
    train_labels = np.concatenate([data[1] for data in train_data])
    test_features = np.vstack([data[0] for data in test_data])
    test_labels = np.concatenate([data[1] for data in test_data])
    print("Unique labels in training data:", np.unique(train_labels))
    print("Unique labels in testing data:", np.unique(test_labels))

    #raise Exception("Stop here")

    # Normalize data
    train_features, test_features = normalize_data(train_features, test_features)

    # Compute class weights
    class_weights = compute_weights(train_labels)
    print("Class weights:", class_weights)
    class_weights_tensor = torch.tensor([class_weights[i] for i in range(2)], dtype=torch.float32).to(device)
    # Create Datasets and DataLoaders
    train_dataset = BGPDataset(train_features, train_labels)
    test_dataset = BGPDataset(test_features, test_labels)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Model Initialization
    input_size = FEATURES_END - FEATURES_START + 1
    max_seq_len = SEQUENCE_LENGTH
    d_model = 16
    n_heads = 2
    num_layers = 2
    dim_feedforward = 16
    num_classes = 1  # Binary classification
    dropout = 0.6

    model = TransformerModel(input_size, max_seq_len, d_model, n_heads, num_layers, dim_feedforward, num_classes, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1)

    # Train and Evaluate
    train_model(model, train_loader, criterion, optimizer, scheduler)
    evaluate_model(model, test_loader)

# Path to the folder containing the datasets
folder_path = "/home/tus4zw/computer_networks/BGP_RIPE_datasets"  # Update with your folder path
main(folder_path)


Using device: cuda
Training files: ['/home/tus4zw/computer_networks/BGP_RIPE_datasets/Nimda.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/Slammer.csv']
Testing files: ['/home/tus4zw/computer_networks/BGP_RIPE_datasets/RIPE_regular.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/WannaCrypt.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/Moscow_blackout.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/Code_Red_I.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/BCNET_regular.csv']
Unique labels in training data: [0 1]
Unique labels in testing data: [0 1]
Class weights: {np.int64(0): np.float64(0.5795512867512281), np.int64(1): np.float64(3.6426267281105993)}




Epoch 1/20, Loss: 10.04267
Epoch 2/20, Loss: 7.57379
Epoch 3/20, Loss: 7.20850
Epoch 4/20, Loss: 7.12644
Epoch 5/20, Loss: 7.13281
Epoch 6/20, Loss: 7.17530
Epoch 7/20, Loss: 7.09308
Epoch 8/20, Loss: 7.03200
Epoch 9/20, Loss: 7.10150
Epoch 10/20, Loss: 7.02978
Epoch 11/20, Loss: 7.05310
Epoch 12/20, Loss: 7.01297
Epoch 13/20, Loss: 7.02428
Epoch 14/20, Loss: 7.07429
Epoch 15/20, Loss: 6.99578
Epoch 16/20, Loss: 7.00583
Epoch 17/20, Loss: 7.07511
Epoch 18/20, Loss: 6.97625
Epoch 19/20, Loss: 6.99209
Epoch 20/20, Loss: 7.07184
              precision    recall  f1-score   support

         0.0     0.9997    0.9989    0.9993     27947
         1.0     0.9955    0.9988    0.9971      6600

    accuracy                         0.9989     34547
   macro avg     0.9976    0.9989    0.9982     34547
weighted avg     0.9989    0.9989    0.9989     34547



In [16]:
# Constants
SEQUENCE_LENGTH = 10  # Number of timesteps for LSTM
FEATURES_START = 4    # Start index of features
FEATURES_END = 41     # End index of features (inclusive)
LABEL_COLUMN = 41     # Label column index
BATCH_SIZE = 64
EPOCHS = 20
LEARNING_RATE = 0.001

# Custom dataset for LSTM
class BGPDataset(Dataset):
    def __init__(self, data, labels, sequence_length=SEQUENCE_LENGTH):
        self.data = data
        self.labels = labels
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.data) - self.sequence_length + 1

    def __getitem__(self, idx):
        x = self.data[idx:idx + self.sequence_length]
        y = self.labels[idx + self.sequence_length - 1]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# Load and preprocess data
def load_data(file_paths):
    all_data = []
    for file_path in file_paths:
        df = pd.read_csv(file_path, header=None)  # No headers in the file
        features = df.iloc[:, FEATURES_START:FEATURES_END + 1].values
        labels = df.iloc[:, LABEL_COLUMN].replace(-1, 0).values  # Convert -1 to 0
        all_data.append((features, labels))
    return all_data

# Normalize features on training data only
def normalize_data(train_features, test_features):
    scaler = MinMaxScaler()
    train_features = scaler.fit_transform(train_features)
    test_features = scaler.transform(test_features)
    return train_features, test_features

# Define LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Use the last time step's output
        return torch.sigmoid(out)

# Training Loop
def train_model(model, train_loader, criterion, optimizer, epochs=EPOCHS):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")

# Evaluation Function
def evaluate_model(model, test_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            outputs = model(x_batch)
            preds = (outputs.squeeze() > 0.5).int()
            all_preds.extend(preds.tolist())
            all_labels.extend(y_batch.tolist())
    print(classification_report(all_labels, all_preds, digits=4))

# Main script
def main():

    folder_path = "/home/tus4zw/computer_networks/BGP_RIPE_datasets"
    # Get all CSV file paths in the folder
    file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".csv")]

    # Split files into training (BCNET and RIPE) and testing (others)
    train_files = [f for f in file_paths if 'Nimda' in f or 'Slammer' in f]
    test_files = [f for f in file_paths if f not in train_files]

    print("Training files:", train_files)
    print("Testing files:", test_files)

    # Load and preprocess data
    train_data = load_data(train_files)
    test_data = load_data(test_files)

    # Prepare training and testing datasets
    train_features = np.vstack([data[0] for data in train_data])
    train_labels = np.concatenate([data[1] for data in train_data])
    test_features = np.vstack([data[0] for data in test_data])
    test_labels = np.concatenate([data[1] for data in test_data])

    # Normalize data
    train_features, test_features = normalize_data(train_features, test_features)

    # Create Datasets and DataLoaders
    train_dataset = BGPDataset(train_features, train_labels)
    test_dataset = BGPDataset(test_features, test_labels)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Model Initialization
    input_size = FEATURES_END - FEATURES_START + 1
    hidden_size = 64
    num_layers = 2
    output_size = 1  # Binary classification
    model = LSTMModel(input_size, hidden_size, num_layers, output_size)
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Train and Evaluate
    train_model(model, train_loader, criterion, optimizer)
    evaluate_model(model, test_loader)

main()

Training files: ['/home/tus4zw/computer_networks/BGP_RIPE_datasets/Nimda.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/Slammer.csv']
Testing files: ['/home/tus4zw/computer_networks/BGP_RIPE_datasets/RIPE_regular.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/WannaCrypt.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/Moscow_blackout.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/Code_Red_I.csv', '/home/tus4zw/computer_networks/BGP_RIPE_datasets/BCNET_regular.csv']


Epoch 1/20, Loss: 0.10312487151090108
Epoch 2/20, Loss: 0.007148597427871953
Epoch 3/20, Loss: 0.00427214672683167
Epoch 4/20, Loss: 0.0038857036808091837
Epoch 5/20, Loss: 0.00338605341302049
Epoch 6/20, Loss: 0.0020794493033065692
Epoch 7/20, Loss: 0.0014950096089378054
Epoch 8/20, Loss: 0.0012636703559110174
Epoch 9/20, Loss: 0.0008049043365217916
Epoch 10/20, Loss: 0.0008794138429377626
Epoch 11/20, Loss: 0.00023547446855872874
Epoch 12/20, Loss: 0.0010932277314251307
Epoch 13/20, Loss: 0.0004405384169292565
Epoch 14/20, Loss: 0.00016315606700138477
Epoch 15/20, Loss: 8.773484405712504e-05
Epoch 16/20, Loss: 5.038360266807424e-05
Epoch 17/20, Loss: 4.084580252112006e-05
Epoch 18/20, Loss: 0.00012967452521327682
Epoch 19/20, Loss: 8.368357237975442e-05
Epoch 20/20, Loss: 3.4993820228140726e-05
              precision    recall  f1-score   support

         0.0     0.9996    0.7887    0.8818     27947
         1.0     0.5275    0.9988    0.6904      6600

    accuracy                