# Autoencoder with CNN and LSTM layers  

### Imports

In [None]:
import hashlib
import torch
import numpy as np
import pandas as pd

from torch import optim, nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
import os

#### Access the environment variables and read in the dataset

In [None]:
# The dataset location is configured through two environment variables:
# the directory that holds the sequence data and the CSV file name inside it.
sequence_directory = os.getenv('SEQUENCE_DIRECTORY')
sequence_dataset = os.getenv('SEQUENCE_DATASET')

# os.getenv returns None for an unset variable, which would make os.path.join
# fail with an opaque TypeError; fail early with a message naming the settings.
if sequence_directory is None or sequence_dataset is None:
    raise EnvironmentError(
        'SEQUENCE_DIRECTORY and SEQUENCE_DATASET must both be set '
        'to locate the sequence dataset CSV'
    )

dataset_path = os.path.join(sequence_directory, sequence_dataset)
df = pd.read_csv(dataset_path)

In [None]:
def one_hot_encode_protocol(df, column_name='protocol'):
    """
    Replace a categorical column with one float indicator column per category.

    Parameters:
    - df: DataFrame, the input frame (not modified).
    - column_name: str, the column to expand; it is also used as the prefix
      of the generated indicator columns.

    Returns:
    - DataFrame with `column_name` removed and the indicator columns appended
      on the right.
    """
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name, dtype=float)
    remaining_columns = df.drop(columns=column_name)
    return pd.concat([remaining_columns, indicator_columns], axis=1)

def hash_ip(ip_address):
    """
    Map an IP address string to a deterministic integer below 10**10.

    Parameters:
    - ip_address: str, the address text; it is UTF-8 encoded before hashing.

    Returns:
    - int, the SHA-256 digest read as a big-endian integer, reduced modulo
      10**10 to keep the value small.
    """
    digest = hashlib.sha256(ip_address.encode('utf-8')).digest()
    # Reading the raw digest big-endian yields the same integer as parsing
    # the hexadecimal digest in base 16.
    return int.from_bytes(digest, 'big') % 10_000_000_000

# Fill missing values with 0 first, then order the rows by the `timing` column.
df.fillna(0, inplace=True)
df.sort_values(by='timing', inplace=True)

# Replace both address columns with the numeric hash of their string form.
for ip_column in ('src_ip', 'dst_ip'):
    df[ip_column] = df[ip_column].astype(str).apply(hash_ip)

# Expand the protocol column into one indicator column per protocol value.
df = one_hot_encode_protocol(df)

print(df.keys())

### Sequence Creation

In [None]:
# Function to create sequences from the DataFrame
def create_sequences(df, sequence_length=32):
    """
    Cut the rows of each label into consecutive, non-overlapping sequences.

    Rows are taken in the order they appear in `df` (no sorting happens here).
    For every distinct label, its rows are grouped into chunks of
    `sequence_length`; trailing rows that do not fill a whole chunk are dropped.

    Parameters:
    - df: DataFrame, the preprocessed DataFrame; must contain a 'label' column.
    - sequence_length: int, the number of rows in each sequence.

    Returns:
    - sequences: List of np.arrays, one (sequence_length, n_columns - 1) array
      per sequence, holding every column except 'label'.
    - sequence_labels: List, the label shared by the rows of each sequence.
    """
    sequences, sequence_labels = [], []

    for label in df['label'].unique():
        rows_for_label = df[df['label'] == label]

        # Only whole chunks are emitted; the remainder is discarded.
        full_count = len(rows_for_label) // sequence_length
        for start in range(0, full_count * sequence_length, sequence_length):
            window = rows_for_label.iloc[start:start + sequence_length]
            sequences.append(window.drop(columns=['label']).to_numpy())
            sequence_labels.append(label)

    return sequences, sequence_labels

# Build the per-label packet sequences from the preprocessed DataFrame,
# using create_sequences' default sequence length of 32 rows.
sequences, sequence_labels = create_sequences(df)

### Scaling the sequences

In [None]:
def scale_sequences(sequences):
    """
    Min-max scale the features of all sequences with one shared scaler.

    The sequences are stacked row-wise, a single MinMaxScaler is fitted on the
    stacked rows (so every feature column is rescaled using its minimum and
    maximum over the whole data set), and the scaled rows are cut back into
    one array per input sequence.

    Parameters:
    - sequences: List of np.arrays, the packet sequences, each of shape
      (n_rows, n_features). Sequences may have different numbers of rows.

    Returns:
    - scaled_sequences: List of np.arrays, the scaled packet sequences, in the
      same order and with the same shapes as the input. An empty input yields
      an empty list.
    """
    # Nothing to fit on; avoid failing inside concatenate/fit_transform.
    if len(sequences) == 0:
        return []

    # Initialize the MinMaxScaler
    scaler = MinMaxScaler()

    # Remember where each sequence ends so the split does not rely on all
    # sequences having the same length as the first one.
    row_counts = [len(sequence) for sequence in sequences]
    sequences_concatenated = np.concatenate(sequences, axis=0)

    # Fit and transform the data
    scaled_data = scaler.fit_transform(sequences_concatenated)

    # Cut the scaled rows back into the original per-sequence arrays
    boundaries = np.cumsum(row_counts)[:-1]
    scaled_sequences = np.split(scaled_data, boundaries)

    return scaled_sequences

# Rescale the features of every sequence with the shared scaler fitted in scale_sequences
scaled_sequences = scale_sequences(sequences)

### Converting the sequences to tensors

In [None]:
def sequences_to_tensors(sequences, sequence_labels):
    """
    Turn a list of equally shaped sequences and their labels into tensors.

    Parameters:
    - sequences: List of np.arrays, the packet sequences; they are stacked
      along a new leading axis, so all must share one shape.
    - sequence_labels: List of int, one integer label per sequence.

    Returns:
    - sequence_tensors: float32 PyTorch Tensor of shape (n_sequences, ...).
    - label_tensors: int64 (long) PyTorch Tensor with one entry per label.
    """
    # A single stacked array converts far faster than a list of arrays.
    stacked = np.stack(sequences)

    return (
        torch.tensor(stacked, dtype=torch.float32),
        torch.tensor(sequence_labels, dtype=torch.long),
    )

def binarize_labels(sequence_labels):
    """
    Map categorical sequence labels to 0/1.

    Parameters:
    - sequence_labels: List of str, the categorical labels for each sequence.

    Returns:
    - binary_labels: List of int, 0 where the label is exactly 'Benign'
      (case-sensitive) and 1 for every other value.
    """
    return [int(label != 'Benign') for label in sequence_labels]

# Map the categorical sequence labels to 0 ('Benign') / 1 (anything else)
binary_sequence_labels = binarize_labels(sequence_labels)

# Convert the scaled sequences and the binarized labels into PyTorch tensors
sequence_tensors, label_tensors = sequences_to_tensors(scaled_sequences, binary_sequence_labels)

### Create train and validation datasets and instantiate DataLoader objects

In [None]:
class SequenceDataset(Dataset):
    """Dataset that pairs each packet sequence with its label by position."""

    def __init__(self, sequences, labels):
        """
        sequences: An indexable collection of sequences (used here with a tensor
                   whose first dimension enumerates the sequences)
        labels: The labels for the sequences, indexed the same way
        """
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # The number of samples is the number of stored sequences.
        return len(self.sequences)

    def __getitem__(self, idx):
        # Hook point for any per-sequence preprocessing; none is applied.
        return self.sequences[idx], self.labels[idx]

# 1. Splitting dataset into class-specific subsets
sequences_0 = sequence_tensors[label_tensors == 0]
labels_0 = label_tensors[label_tensors == 0]

sequences_1 = sequence_tensors[label_tensors == 1]
labels_1 = label_tensors[label_tensors == 1]

# 2. Ensuring even class distribution for the validation set:
#    the same number of samples is taken from each class, 20% of the smaller class.
min_count = min(len(sequences_0), len(sequences_1))
val_count = int(0.2 * min_count)

# 3. Shuffle each class once and split the SAME permutation into a validation
#    part and a training part, so the two sets are disjoint by construction.
#    (Slicing the training set by position while sampling validation randomly
#    would leave validation samples inside the training data.)
perm_0 = torch.randperm(len(sequences_0))
perm_1 = torch.randperm(len(sequences_1))

perm_0_val, perm_0_train = perm_0[:val_count], perm_0[val_count:]
perm_1_val, perm_1_train = perm_1[:val_count], perm_1[val_count:]

# Creating balanced validation datasets
sequences_val_balanced = torch.cat((sequences_0[perm_0_val], sequences_1[perm_1_val]), dim=0)
labels_val_balanced = torch.cat((labels_0[perm_0_val], labels_1[perm_1_val]), dim=0)

# 4. Training data is every sample that was not selected for validation
sequences_0_train = sequences_0[perm_0_train]
labels_0_train = labels_0[perm_0_train]

sequences_1_train = sequences_1[perm_1_train]
labels_1_train = labels_1[perm_1_train]

# 5. Creating train and validation datasets & loaders
train_dataset = SequenceDataset(torch.cat((sequences_0_train, sequences_1_train), dim=0), torch.cat((labels_0_train, labels_1_train), dim=0))
val_dataset = SequenceDataset(sequences_val_balanced, labels_val_balanced)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# NOTE: the second number printed is the length of one (sequence, label) item, i.e. 2.
print(f'Shape of training data: {len(train_dataset)} {len(train_dataset[0])}')
print(f'Shape of testing data: {len(val_dataset)} {len(val_dataset[0])}')

### Define our model

In [None]:
class SequenceAutoencoder(nn.Module):
    """
    Convolutional + LSTM autoencoder over packet sequences.

    Input is a tensor of shape (batch, 32, length): the first Conv1d expects
    32 input channels and convolves along the last dimension.

    Encoder: Conv1d (32 -> 8 channels) -> BatchNorm -> ReLU -> MaxPool (halves
    the length) -> Dropout, followed by an LSTM over the pooled positions.
    Decoder: an LSTM over the encoded positions, followed by ConvTranspose1d
    (8 -> 32 channels, doubles the length) -> BatchNorm -> ReLU -> Dropout.

    For an input length L the output length is 2 * floor((L + 1) / 2), so the
    reconstruction has the same shape as the input only when L is even.
    """

    def __init__(self):
        super().__init__()
        self.encoder_conv = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=8, kernel_size=2, padding=1),
            nn.BatchNorm1d(8),
            nn.ReLU(True),
            nn.MaxPool1d(2, stride=2),  # Halves the length dimension
            nn.Dropout(p=0.25)
        )
        self.encoder_lstm = nn.LSTM(input_size=8, hidden_size=8, batch_first=True)

        self.decoder_lstm = nn.LSTM(input_size=8, hidden_size=8, batch_first=True)
        self.decoder_conv = nn.Sequential(
            nn.ConvTranspose1d(in_channels=8, out_channels=32, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(True),
            nn.Dropout(p=0.5)
        )

    def forward(self, x):
        """
        Reconstruct `x`.

        Parameters:
        - x: Tensor of shape (batch, 32, length).

        Returns:
        - Tensor of shape (batch, 32, 2 * floor((length + 1) / 2)).
        """
        x = self.encoder_conv(x)

        # Conv layers use (batch, channels, length); the batch_first LSTMs
        # expect (batch, steps, features), so swap the last two dimensions.
        x = x.permute(0, 2, 1)

        x, _ = self.encoder_lstm(x)

        # The decoder LSTM was registered but never applied; run it so the
        # decoder mirrors the encoder and its weights are actually trained.
        x, _ = self.decoder_lstm(x)

        # Back to (batch, channels, length) for the transposed convolution.
        x = self.decoder_conv(x.permute(0, 2, 1))

        return x


# Reconstruction objective: mean squared error between input and output.
criterion = nn.MSELoss()

model = SequenceAutoencoder()

# Adam with a small weight decay, which acts as L2 regularization.
optimizer = Adam(model.parameters(), weight_decay=1e-5, lr=0.001)
# Learning rate decay: multiply the rate by 0.1 every 10 scheduler steps.
scheduler = StepLR(optimizer, gamma=0.1, step_size=10)

### Train the model

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=5, scheduler=None, patience=5):
    """
    Train a reconstruction model with validation, checkpointing and early stopping.

    Every batch is fed through `model` and the output is scored against the
    input itself with `criterion`; the labels yielded by the loaders are ignored.
    Whenever the average validation loss improves, the model's state dict is
    written to 'best_model.pth' in the current working directory.

    Parameters:
    - model: nn.Module, the model to train (moved to `device`).
    - train_loader: DataLoader yielding (sequences, labels); must be non-empty.
    - val_loader: DataLoader yielding (sequences, labels); must be non-empty.
    - criterion: loss called as criterion(outputs, sequences).
    - optimizer: optimizer over the model's parameters.
    - device: torch.device on which to run.
    - epochs: int, maximum number of epochs.
    - scheduler: optional learning-rate scheduler. OneCycleLR and CyclicLR are
      stepped after every training batch, ReduceLROnPlateau is stepped once per
      epoch with the validation loss, and any other scheduler once per epoch.
    - patience: int, number of consecutive epochs without validation
      improvement after which training stops.

    Returns:
    - None. After an early stop the in-memory model holds the weights of the
      last epoch, not necessarily those of the saved checkpoint.
    """
    model.to(device)
    best_val_loss = float('inf')
    patience_counter = 0  # Epochs since the last validation improvement

    # Cyclical schedules are defined in optimizer steps, not epochs, so they
    # have to advance after every batch to follow their configured curve.
    per_batch_schedulers = (
        torch.optim.lr_scheduler.OneCycleLR,
        torch.optim.lr_scheduler.CyclicLR,
    )
    step_per_batch = isinstance(scheduler, per_batch_schedulers)

    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        total_train_loss = 0.0

        for sequences, _ in train_loader:
            sequences = sequences.to(device)
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, sequences)
            loss.backward()
            optimizer.step()
            if step_per_batch:
                scheduler.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation phase
        model.eval()  # Set the model to evaluation mode
        total_val_loss = 0.0
        with torch.no_grad():  # Disable gradient computation
            for sequences, _ in val_loader:
                sequences = sequences.to(device)
                outputs = model(sequences)
                loss = criterion(outputs, sequences)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f'Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

        # Checkpoint and early stopping logic
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            print('Model checkpoint saved')
            patience_counter = 0  # Reset patience counter
        else:
            patience_counter += 1  # Increment patience counter
            # Stop as soon as `patience` stale epochs have accumulated, which
            # is what the message below reports.
            if patience_counter >= patience:
                print(f'Stopping early at epoch {epoch+1}. No improvement in validation loss for {patience} epochs.')
                break

        # Update learning rate for schedulers that advance once per epoch
        if scheduler is not None and not step_per_batch:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(avg_val_loss)  # For ReduceLROnPlateau
            else:
                scheduler.step()  # For other types of schedulers

    print('Training complete')

# Configuration and Hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SequenceAutoencoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# A single epoch count drives both the length of the one-cycle schedule and
# the training loop, so the two cannot drift apart.
num_epochs = 5
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * num_epochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, total_steps=total_steps)

train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=num_epochs, scheduler=scheduler)


### Evaluate

In [None]:
import sys

# Make the sibling `evaluation` directory (next to the current working
# directory) importable before loading the analysis helper from it.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(os.path.join(parent_dir, 'evaluation'))

from evaluate import AnomalyDetection # type: ignore

# Run the analysis on the trained model using the training loader.
evaluation = AnomalyDetection(model, train_loader, device)
evaluation.run_SAE_analysis()