# Autoencoder with CNN

This notebook holds the current state of the 475 project, which builds an autoencoder from CNN layers.  
It currently runs on a custom dataset containing packet captures from documented malware and benign applications,  
and trains on sequences of flows from CICIDS2018.

### Imports

In [None]:
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from torch import nn
from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd
import collections

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

### Data loading

In [None]:
# Locations of the two CICFlowMeter CSV exports used by this notebook
dataset_path = 'C:/Users/theob/Code/COS-475-Project/Dataset/CSE-CIC-IDS2018/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv'
dataset_path2 = 'C:/Users/theob/Code/COS-475-Project/Dataset/CSE-CIC-IDS2018/Friday-16-02-2018_TrafficForML_CICFlowMeter.csv'

# Load both files; only the second one is read with low_memory=False
flows1 = pd.read_csv(dataset_path)
flows2 = pd.read_csv(dataset_path2, low_memory=False)

# Keep at most the first 300000 rows of each file and stack them under a fresh 0..N-1 index
flows = pd.concat([flows1.iloc[:300000], flows2.iloc[:300000]], ignore_index=True)

### Clean the flow dataframe

In [None]:

def clean_dataframe(df):
    """Return a cleaned, all-numeric copy of ``df``.

    Steps, in order:
      1. Coerce every ``object`` column to numeric; entries that cannot be
         parsed become NaN.
      2. Keep only numeric columns, then drop columns that contain no
         non-zero value.
      3. Drop rows containing ``+inf`` or ``-inf``.
      4. Keep only rows in which every value is ``>= 0``. NaN fails that
         comparison, so rows containing NaN are removed here as well.
      5. Renumber the surviving rows 0..N-1.

    The DataFrame passed in is left untouched. The column and row counts
    before and after cleaning are printed.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        A new DataFrame holding the surviving rows and columns.
    """
    initial_rows_count, initial_columns_count = df.shape

    # Work on a copy: the coercion below assigns whole columns, which would
    # otherwise rewrite the DataFrame object owned by the caller.
    cleaned = df.copy()
    for col in cleaned.columns:
        if cleaned[col].dtype == 'object':
            # errors='coerce' turns parsing errors into NaN instead of raising
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    numeric_df = cleaned.select_dtypes(include=[np.number])
    cleaned = numeric_df.loc[:, (numeric_df != 0).any(axis=0)]
    cleaned = cleaned[~cleaned.isin([np.inf, -np.inf]).any(axis=1)]
    cleaned = cleaned[(cleaned >= 0).all(axis=1)]

    # `cleaned` is a filtered selection here, so reset the index by
    # reassignment rather than in place.
    cleaned = cleaned.reset_index(drop=True)

    final_rows_count, final_columns_count = cleaned.shape

    print(f"Initial number of columns: {initial_columns_count}, final number of columns: {final_columns_count}")
    print(f"Initial number of rows: {initial_rows_count}, final number of rows: {final_rows_count}")

    return cleaned

def encode_labels(labels):
    """Map each label to an integer code and report the label distribution.

    Codes are assigned by the sorted order of the distinct labels, so the
    smallest label gets 0, the next one 1, and so on. The number of distinct
    labels and the count of each label are printed in that same order.

    Args:
        labels: iterable of sortable, hashable labels (iterated more than once).

    Returns:
        A list with one integer code per entry of ``labels``, in input order.
    """
    ordered = sorted(set(labels))
    code_of = dict(zip(ordered, range(len(ordered))))
    occurrences = collections.Counter(labels)

    print(f"Total number of unique labels: {len(ordered)}")
    print("Occurrences of each label:")
    for name in ordered:
        print(f"{name}: {occurrences[name]}")

    return [code_of[item] for item in labels]

# Replace the string labels with integer codes. The 'Label' column stays in
# `flows` for now so that row filtering in clean_dataframe keeps labels and
# features aligned.
flows['Label'] = encode_labels(flows['Label'])

# Drop the four columns listed below, then clean the remaining columns.
# NOTE(review): this comment previously said 5 columns are dropped to reach a
# 64-feature shape, but only 4 are listed -- confirm the resulting feature count.
flows_semi_reduced = clean_dataframe(flows.drop(['Timestamp', 'Active Std', 'Active Max',
       'Active Min',], axis=1))


# Split the cleaned frame into the label column and the feature columns
flow_labels = flows_semi_reduced['Label']
flows_reduced = flows_semi_reduced.drop(['Label'], axis = 1)


#### Apply scaler before matrix creation

In [None]:
from sklearn.preprocessing import StandardScaler

def apply_minmax_scaler(dataframe):
    """Return a standardized copy of ``dataframe``.

    NOTE: despite its name, this function uses scikit-learn's
    ``StandardScaler`` (per-column standardization), not ``MinMaxScaler``.
    The name is kept so existing callers keep working.

    Args:
        dataframe: numeric pandas DataFrame to scale.

    Returns:
        A new DataFrame with the scaled values and the same column names and
        row index as ``dataframe``.
    """
    scaler = StandardScaler()

    # Fit the scaler on the data and transform it in one step
    scaled_data = scaler.fit_transform(dataframe)

    # Rebuild a DataFrame from the scaled array. The original index is passed
    # through so the result stays aligned with anything indexed like the input.
    scaled_dataframe = pd.DataFrame(
        scaled_data,
        columns=dataframe.columns,
        index=dataframe.index,
    )

    return scaled_dataframe

# NOTE(review): scaling is bypassed here -- `flows_scaled` is the unscaled
# feature frame and apply_minmax_scaler is defined above but not called.
flows_scaled = flows_reduced

#### 2D-ify with Triangle Area Maps

In [None]:
def vectors_to_matrices(df):
    """Turn every row of ``df`` into the outer product of that row with itself.

    For a row vector ``v`` with F entries the result is the F x F float64
    matrix ``m[i, j] = v[i] * v[j]``. This is a plain outer product.

    Args:
        df: pandas DataFrame with one feature vector per row.

    Returns:
        A list with one 2D numpy array per row, in row order. If the frame's
        combined dtype is not numeric (for example it contains a string
        column), no row qualifies and an empty list is returned.
    """
    values = df.values

    # Every row shares the frame's combined dtype, so the numeric check is
    # done once up front instead of once per row.
    if not pd.api.types.is_numeric_dtype(values.dtype):
        return []

    # Iterate the underlying array directly: avoids building a pandas Series
    # for each row, which is what DataFrame.iterrows does.
    values = values.astype(float)
    return [np.outer(row, row) for row in values]

# Build one outer-product matrix per flow. With F feature columns this keeps
# len(flows_scaled) matrices of F x F floats in memory at once.
matrices = vectors_to_matrices(flows_scaled)

#### Converting the matrices to tensors

In [None]:
from sklearn.model_selection import train_test_split

# One float32 tensor per flow matrix
flow_tensors = [torch.tensor(matrix, dtype=torch.float32) for matrix in matrices]

# Split positions (not the data itself) into train and test sets.
# Assumes flow_tensors and flow_labels are aligned row-for-row -- TODO confirm
# whenever the cleaning or matrix-building steps change.
train_indices, test_indices = train_test_split(
    np.arange(len(flow_labels)),  # create an index array
    test_size=0.2,  # 20% of the data will be used for testing
    random_state=42,  # seed for reproducibility
    stratify=flow_labels  # preserve class distribution
)

In [None]:
class CustomDataset(Dataset):
    """In-memory dataset of 2D samples picked by index, with optional labels.

    Args:
        data: indexable collection of equally-shaped tensors.
        indices: positions in ``data`` (and ``labels``) to include.
        labels: optional indexable collection of integer labels; when omitted
            each item is returned as ``(sample, sample)``.
        augment: when True, samples whose label is 0 get noise injection and
            min-max normalization on every access.
    """

    def __init__(self, data, indices, labels=None, augment=False):
        # Selected samples are stacked into one tensor: (num_samples, H, W)
        self.data = torch.stack([data[i] for i in indices])
        self.labels = torch.tensor([labels[i] for i in indices]) if labels is not None else None
        self.augment = augment

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sample, label)``, or ``(sample, sample)`` without labels.

        The sample gets a leading channel dimension: (1, H, W).
        """
        data_item = self.data[idx].unsqueeze(0)  # adds the channel dimension
        if self.labels is None:
            return data_item, data_item

        label = self.labels[idx]
        if self.augment and label == 0:
            data_item = self.augment_data(data_item)
        return data_item, label

    def augment_data(self, data_item):
        """Return a noisy, min-max normalized copy of ``data_item``.

        The input tensor is not modified. This matters because ``__getitem__``
        passes in a view of ``self.data``: an in-place add would permanently
        alter the stored sample and pile up noise on every access.
        """
        # Noise injection (out of place, see docstring)
        noise = torch.randn_like(data_item) * 0.01  # Adjust noise level
        data_item = data_item + noise

        # Min-max normalization; skipped for constant tensors to avoid 0/0
        lowest, highest = torch.min(data_item), torch.max(data_item)
        if highest != lowest:
            data_item = (data_item - lowest) / (highest - lowest)
        return data_item


# NOTE(review): labels_array is not referenced again in the visible source;
# the datasets below index `flow_labels` directly.
labels_array = np.array(flow_labels, dtype=np.int32)

# Train and test datasets over the same tensors, selected by the split indices
# (augmentation is left at its default, off)
train_dataset = CustomDataset(flow_tensors, train_indices, labels=flow_labels)
test_dataset = CustomDataset(flow_tensors, test_indices, labels=flow_labels)

# Batches of 32; both loaders shuffle, so test batches come in a different
# order on each pass
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [None]:
# Peek at a single training batch: record the size of its last dimension as
# `input_width` and print the batch shapes, then stop.
for data, labels in train_loader:
    input_width = data.shape[3]
    print("Input batch shape from DataLoader:", data.shape)
    print("Labels batch shape from DataLoader:", labels.shape)
    break


### Ensure that cuda is available, else run on the CPU

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Print whether CUDA was detected (True/False)
print(torch.cuda.is_available())

### Define our model

In [None]:
import torch
import torch.nn as nn

class ConvSequenceAutoencoder(nn.Module):
    """Convolutional autoencoder for single-channel 2D inputs.

    Encoder: two 5x5 convolutions (1 -> 8 -> 16 channels, padding 1), each
    followed by a sigmoid; each one shrinks height and width by 2.
    Decoder: two 5x5 transposed convolutions (16 -> 8 -> 1 channels), each
    followed by a sigmoid; the first grows height and width by 4 and the
    second (padding 2) keeps them, so the output has the input's shape.
    Because of the final sigmoid, every output value lies in [0, 1].
    """

    def __init__(self):
        super().__init__()

        # Encoder
        encoder_layers = [
            nn.Conv2d(1, 8, kernel_size=5, padding=1),
            nn.Sigmoid(),
            nn.Conv2d(8, 16, kernel_size=5, padding=1),
            nn.Sigmoid(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder
        decoder_layers = [
            nn.ConvTranspose2d(16, 8, kernel_size=5),
            nn.Sigmoid(),
            nn.ConvTranspose2d(8, 1, kernel_size=5, padding=2),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and decode it back; returns the reconstruction."""
        return self.decoder(self.encoder(x))
    
# Instantiate the model on the selected device together with its optimizer and loss.
# NOTE(review): model, optimizer and criterion are created again right before
# training further down, and this StepLR scheduler is never stepped in the
# visible source (train() builds its own ReduceLROnPlateau).
model = ConvSequenceAutoencoder().to(device)
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # L2 regularization
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)  # Learning rate decay
criterion = nn.MSELoss()

#### Test an input

In [None]:
def test_autoencoder():
    """Smoke-test the autoencoder by printing input and output shapes.

    Pushes one random batch through a freshly initialised
    ConvSequenceAutoencoder on the CPU. Nothing is returned.
    """
    # Random batch laid out as (batch_size, channels, height, width):
    # 64 samples, 1 channel, 65x65 values each
    input_tensor = torch.randn(64, 1, 65, 65)
    print("Input shape:", input_tensor.shape)

    # Initialize the model
    model = ConvSequenceAutoencoder()

    # Only the shapes matter here, so skip building the autograd graph
    with torch.no_grad():
        output_tensor = model(input_tensor)
    print("Output shape:", output_tensor.shape)

# Run the shape smoke test (prints the input and output shapes)
test_autoencoder()

### Train the model

In [None]:
def train(model, train_loader, criterion, optimizer, epochs=5, patience=2):
    """Train ``model`` to reconstruct its own inputs.

    The model is moved to CUDA when available (CPU otherwise) and trained for
    up to ``epochs`` epochs. After each epoch the mean batch loss is printed
    and fed to a ReduceLROnPlateau scheduler (factor 0.1). Training stops
    early once the mean loss has failed to improve for ``patience``
    consecutive epochs.

    Args:
        model: module mapping a batch to a reconstruction of the same shape.
        train_loader: iterable of batches, either ``(inputs, labels)`` pairs
            or bare input tensors; labels are ignored.
        criterion: loss called as ``criterion(outputs, inputs)``.
        optimizer: optimizer over ``model``'s parameters.
        epochs: maximum number of epochs.
        patience: used both as the scheduler's patience and as the number of
            non-improving epochs that triggers early stopping.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    model.train()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # The `verbose` argument of LR schedulers is deprecated in PyTorch, so it
    # is not passed; learning-rate changes are reported manually below.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=patience)
    best_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(epochs):
        total_loss = 0.0
        batch_count = 0
        for data in train_loader:
            # NOTE: a bare tensor batch with exactly 2 samples is also
            # unpacked as a pair here (behavior kept from the original).
            inputs, _ = data if len(data) == 2 else (data, data)  # Handle data without labels
            inputs = inputs.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, inputs)  # Reconstruction loss only

            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batch_count += 1

        if batch_count == 0:
            # Previously surfaced as an opaque ZeroDivisionError
            raise ValueError("train_loader yielded no batches; nothing to train on")

        avg_loss = total_loss / batch_count
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(avg_loss)
        for group_idx, (old_lr, group) in enumerate(zip(lrs_before, optimizer.param_groups)):
            if group['lr'] != old_lr:
                print(f"Epoch {epoch+1}: reducing learning rate of group {group_idx} to {group['lr']:.4e}.")

        if avg_loss < best_loss:
            best_loss = avg_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        # NOTE(review): early stopping and the scheduler share `patience`, so
        # training usually stops before the scheduler lowers the learning
        # rate -- confirm this is intended.
        if epochs_no_improve == patience:
            print('Early stopping triggered')
            break

    print('Training complete')

# Example usage: build a fresh model, optimizer and loss, then train
model = ConvSequenceAutoencoder()  # fresh, untrained model; train() moves it to the device (plain reconstruction model, no sparsity loss)
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.MSELoss()

# Uses the train_loader built earlier; at most 3 epochs, default patience
train(model, train_loader, criterion, optimizer, epochs=3)

### Evaluate

In [None]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

def evaluate_model_and_plot(model, test_loader, threshold, device='cuda', benign_label=0):
    """Score reconstruction errors, threshold them, and plot per-class histograms.

    Each sample's error is the mean squared difference between the input and
    the model's reconstruction. A sample is predicted anomalous (1) when its
    error exceeds ``threshold``, otherwise benign (0).

    Accuracy is computed against a binary ground truth: ``benign_label`` maps
    to 0 and every other label to 1. Comparing raw multi-class labels with 0/1
    predictions would count every class with id >= 2 as wrong.

    For every non-benign label present, one histogram compares its errors with
    the benign errors. No histogram is drawn if no benign sample is present.

    Args:
        model: module returning a reconstruction with the input's shape.
        test_loader: iterable of ``(inputs, labels)`` batches; inputs are 4D.
        threshold: error above which a sample is predicted anomalous.
        device: device the inputs are moved to; must match the model's device.
        benign_label: label value treated as benign (default 0).

    Returns:
        ``(accuracy, error_dict)``: accuracy as a float in [0, 1] and a dict
        mapping each label present to the array of its reconstruction errors.

    Raises:
        ValueError: if ``test_loader`` yields no batches.
    """
    model.eval()  # Set the model to evaluation mode
    error_batches = []
    prediction_batches = []
    label_batches = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            reconstructed = model(inputs)  # the model returns the reconstruction only

            # Per-sample MSE: mean over the channel, height and width dimensions
            mse = ((reconstructed - inputs) ** 2).mean(dim=[1, 2, 3])

            error_batches.append(mse.cpu().numpy())
            # Threshold the error to get the 0/1 anomaly prediction
            prediction_batches.append((mse > threshold).int().cpu().numpy())
            label_batches.append(labels.cpu().numpy())

    if not error_batches:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    reconstruction_errors = np.concatenate(error_batches)
    predictions = np.concatenate(prediction_batches)
    true_labels = np.concatenate(label_batches)

    # Group the errors by true label
    unique_labels = np.unique(true_labels)
    error_dict = {label: reconstruction_errors[true_labels == label] for label in unique_labels}

    # One histogram per non-benign class, each against the benign errors
    benign_errors = error_dict.get(benign_label)
    if benign_errors is not None:
        for label in unique_labels:
            if label == benign_label:
                continue
            plt.figure(figsize=(8, 5))
            plot_histogram(
                error_dict[label], benign_errors, threshold,
                f'Label {label}', f'Benign',
                f'Label {label} vs Benign'
            )

    # Fraction of samples whose 0/1 prediction matches the binarized truth
    binary_truth = (true_labels != benign_label).astype(int)
    accuracy = float(np.mean(binary_truth == predictions))
    print(f'Accuracy: {accuracy*100:.2f}%')

    return accuracy, error_dict

def plot_histogram(class1_errors, class2_errors, threshold, class1_label, class2_label, title):
    """Draw two overlaid error histograms plus the threshold line, then show them.

    Both histograms use 20 bins over the same range, from 0 up to the largest
    of the two classes' maximum errors and the threshold. The first class is
    drawn in blue, the second in green, and the threshold as a dashed red
    vertical line. Draws on the current matplotlib figure.
    """
    upper = max(np.max(class1_errors), np.max(class2_errors), threshold)
    shared_range = (0, upper)

    series = (
        (class1_errors, class1_label, 'blue'),
        (class2_errors, class2_label, 'green'),
    )
    for errors, legend_name, colour in series:
        plt.hist(errors, bins=20, alpha=0.5, label=legend_name, color=colour, range=shared_range)

    plt.axvline(x=threshold, color='red', linestyle='--', label=f'Threshold = {threshold:.2f}')
    plt.xlabel('Reconstruction Error')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.legend()
    plt.xlim(left=0)
    plt.show()

# Evaluate the trained model on the test split. Pass the `device` selected
# earlier in the notebook instead of a hard-coded 'cuda', so this cell also
# runs on machines without a GPU (train() places the model the same way).
accuracy, error_dict = evaluate_model_and_plot(model, test_loader, threshold=int(1e17), device=device)

### Visualize sample inputs and reconstructed output side by side

In [None]:
def visualize_reconstructions2(model, test_loader_2d, num_images=100, device=None):
    """Show inputs and their reconstructions side by side, one figure per sample.

    Args:
        model: module returning a reconstruction with the input's shape.
        test_loader_2d: iterable of ``(inputs, labels)`` batches laid out as
            [batch_size, channels, height, width].
        num_images: maximum number of samples to display.
        device: device the inputs are moved to. When None, the device of the
            model's first parameter is used; inputs stay where they are if the
            model has no parameters.
    """
    model.eval()  # Set the model to evaluation mode

    if device is None:
        # Follow the model rather than relying on a module-level global
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    images_displayed = 0

    with torch.no_grad():
        for inputs, labels in test_loader_2d:
            if images_displayed >= num_images:
                break  # Stop after displaying the specified number of images
            if device is not None:
                inputs = inputs.to(device)
            outputs = model(inputs)

            for original, reconstructed, label in zip(inputs, outputs, labels):
                if images_displayed >= num_images:
                    break

                # Label 0 is the benign class; every other class id is an
                # attack. Testing `== 1` showed classes >= 2 as 'Benign'.
                label_text = 'Benign' if label.item() == 0 else 'Malware'

                # Remove the channel dimension for visualization
                original = original.squeeze().cpu().numpy()
                reconstructed = reconstructed.squeeze().cpu().numpy()

                plt.figure(figsize=(6, 3))

                # Plot the original image with label
                plt.subplot(1, 2, 1)
                plt.imshow(original, cmap='gray')
                plt.title(f'Original ({label_text})')
                plt.axis('off')

                # Plot the reconstructed image
                plt.subplot(1, 2, 2)
                plt.imshow(reconstructed, cmap='gray')
                plt.title(f'Reconstructed ({label_text})')
                plt.axis('off')

                plt.show()

                images_displayed += 1

import torch
from torchvision.utils import make_grid
import matplotlib.pyplot as plt

def visualize_reconstructions(model, test_loader, device='cuda', num_samples=5):
    """Show grids of original and reconstructed batches side by side.

    Collects up to ``num_samples`` batches from ``test_loader`` and draws one
    figure per collected batch: the inputs as an image grid on the left and
    their reconstructions on the right.

    Args:
        model: module returning a reconstruction with the input's shape.
        test_loader: iterable of ``(inputs, labels)`` batches; labels are unused.
        device: device the inputs are moved to; must match the model's device.
        num_samples: maximum number of batches to display.
    """
    model.eval()  # Set the model to evaluation mode
    sample_pairs = []

    with torch.no_grad():
        for batch_idx, (inputs, _labels) in enumerate(test_loader):
            if batch_idx >= num_samples:  # Only keep num_samples batches
                break
            inputs = inputs.to(device)
            # The model returns a single tensor; unpacking it as
            # `reconstructed, _` fails (or mis-splits a batch of two).
            reconstructed = model(inputs)
            sample_pairs.append((inputs.cpu(), reconstructed.cpu()))

    # Iterate over what was actually collected, so a loader with fewer than
    # num_samples batches does not raise IndexError.
    for originals, reconstructions in sample_pairs:
        plt.figure(figsize=(10, 4))

        # Original Images
        plt.subplot(1, 2, 1)
        plt.title('Original Images')
        original_images = make_grid(originals, nrow=5, padding=2, normalize=True)
        plt.imshow(original_images.permute(1, 2, 0))
        plt.axis('off')

        # Reconstructed Images
        plt.subplot(1, 2, 2)
        plt.title('Reconstructed Images')
        recon_images = make_grid(reconstructions, nrow=5, padding=2, normalize=True)
        plt.imshow(recon_images.permute(1, 2, 0))
        plt.axis('off')

        plt.show()

# Example usage, after evaluate_model_and_plot has been called. Pass the
# `device` selected earlier in the notebook instead of a hard-coded 'cuda',
# so this cell also runs on machines without a GPU.
visualize_reconstructions(model, test_loader, device=device, num_samples=5)