# Autoencoder with CNN

COS 475 final project: an autoencoder built from CNN layers.  
It is trained on network flows from the CSE-CIC-IDS2018 dataset.

### Imports

In [1]:
import torch
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import collections

from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


### Data loading

In [2]:
# Path to the cleaned flow CSV; adjust for your machine.
dataset_path = 'C:/Users/theob/Code/COS-475-Project/Dataset/Clean CSE/Cleaned Wed and Fri.csv'
flows = pd.read_csv(dataset_path)
# Work on the first 30,000 flows only. Take an explicit copy so that adding or
# overwriting columns on `flows_small` later neither triggers pandas'
# SettingWithCopyWarning nor depends on whether the slice is a view of `flows`.
flows_small = flows.iloc[:30000].copy()

### Clean the flow dataframe

In [None]:
def clean_dataframe(df):
    """Return a cleaned, numeric copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. object-dtype columns are coerced to numeric (unparseable values -> NaN);
      2. +/-inf is replaced by NaN;
      3. NaNs are filled with the column mean (a warning is printed for columns
         that are entirely NaN, which therefore cannot be filled);
      4. columns that are still entirely NaN are dropped;
      5. negative values are set to 0;
      6. columns containing only zeros are dropped;
      7. the row index is reset to 0..n-1.

    No rows are removed. A before/after summary of the shape is printed.

    Args:
        df: DataFrame to clean.

    Returns:
        The cleaned DataFrame.
    """
    initial_rows_count, initial_columns_count = df.shape

    df = df.copy()  # Work on a copy to avoid side-effects on the original DataFrame

    # Convert all object columns to numeric, errors turn into NaNs
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Replace infinities with NaNs so they do not poison the column means
    df = df.replace([np.inf, -np.inf], np.nan)

    # Fill NaN values with the mean of the columns, warn if the column is all NaN
    for col in df.columns:
        if df[col].isna().all():
            print(f"Warning: Column {col} is entirely NaN.")
        mean_value = df[col].mean()
        if pd.isna(mean_value):
            print(f"Warning: Mean is NaN for column {col}.")
        # Assign the result back rather than calling fillna(inplace=True) on
        # `df[col]`: that chained form acts on a temporary object, is deprecated,
        # and leaves `df` unchanged when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(mean_value)

    # Remove columns without any numeric data or that remain all NaN
    df = df.dropna(axis=1, how='all')

    # Replace negative values with zero
    df[df < 0] = 0

    # Drop all columns that contain only zeros, then renumber the rows
    df = df.loc[:, (df != 0).any(axis=0)].reset_index(drop=True)

    final_rows_count, final_columns_count = df.shape

    print(f"Initial number of columns: {initial_columns_count}, final number of columns: {final_columns_count}")
    print(f"Initial number of rows: {initial_rows_count}, final number of rows: {final_rows_count}")

    return df

def encode_labels(labels):
    """Map each label to an integer code and return the encoded list.

    Codes are assigned by sorted order of the distinct labels (smallest label
    gets 0). The number of distinct labels and the count of each label are
    printed, in code order.

    Args:
        labels: re-iterable collection of hashable, mutually sortable labels.

    Returns:
        A list with the integer code of every entry of ``labels``, in order.
    """
    label_counts = collections.Counter(labels)
    unique_labels = sorted(set(labels))

    # Position in the sorted list of distinct labels is the integer code.
    label_mapping = dict(zip(unique_labels, range(len(unique_labels))))

    print(f"Total number of unique labels: {len(unique_labels)}")
    print("Occurrences of each label:")
    # `unique_labels` is already in code order, so no extra sort is needed.
    for label in unique_labels:
        print(f"{label}: {label_counts[label]}")

    return list(map(label_mapping.__getitem__, labels))

# Encode the string labels as integers. `assign` builds a new frame instead of
# writing a column into `flows_small`, which was produced by slicing `flows`
# (in-place column assignment on such a slice triggers SettingWithCopyWarning).
flows_small = flows_small.assign(Label=encode_labels(flows_small['Label']))

# Separate the labels BEFORE cleaning. clean_dataframe() resets the row index and
# never drops rows, so resetting the index here keeps labels and cleaned
# features aligned position by position.
flow_labels = flows_small['Label'].reset_index(drop=True)

# Clean the feature space only: drop the label plus three non-feature columns.
# Keeping 'Label' out of clean_dataframe() means it cannot be removed as an
# "all-zero" column when every sampled flow has label 0.
flows_reduced = clean_dataframe(
    flows_small.drop(['Label', 'Timestamp', 'Active Max', 'Active Min'], axis=1)
)

# Kept for backward compatibility: cleaned features with the label column attached.
flows_semi_reduced = flows_reduced.assign(Label=flow_labels)

#### Create indices for mapping back to original inputs later

In [4]:
# Row index of the cleaned feature frame. clean_dataframe() resets the index, so
# these are the labels 0..n-1 and can be used with `flows_reduced.loc[...]`.
indices = flows_reduced.index

#### 2D-ify with Triangle Area Maps

In [None]:
def compute_matrix(vector):
    """Turn one feature vector into a max-normalized 2-D matrix.

    The vector is squared element-wise, the outer product of the squared vector
    with itself is taken, and the result is divided by its maximum when that
    maximum is positive.

    Args:
        vector: a single 1-D sequence of numbers. ``np.outer`` flattens its
            inputs, so passing a whole table yields one huge matrix rather than
            one matrix per row.

    Returns:
        A float ``(n, n)`` ndarray, where ``n`` is the number of elements.
    """
    # Force float up front: with integer input the in-place division below
    # cannot store its result in an integer array and raises.
    squared = np.square(np.asarray(vector, dtype=float))
    matrix = np.outer(squared, squared)
    if matrix.size == 0:
        # np.max has no identity for an empty array; nothing to normalize.
        return matrix
    max_val = np.max(matrix)
    if max_val > 0:
        matrix /= max_val
    return matrix

def vectors_to_matrices(df):
    """Convert every row of ``df`` into a 2-D matrix via ``compute_matrix``.

    Args:
        df: DataFrame whose rows are numeric feature vectors.

    Returns:
        A list with one matrix per row, in row order.

    Raises:
        ValueError: if ``df`` is not a DataFrame or holds a non-real value.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame")

    # Element-wise check that every value is real-valued. DataFrame.map only
    # exists in pandas >= 2.1; earlier releases expose it as applymap.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    if not elementwise(np.isreal).all().all():
        raise ValueError("All columns must be numeric")

    # One conversion of the whole frame to a float array, then iterate its rows
    # directly instead of paying for a `.iloc[i]` lookup per row.
    rows = df.to_numpy(dtype=float)
    return [compute_matrix(row) for row in rows]

def tam(df):
    """Return the outer product of each numeric row of ``df`` with itself.

    Rows whose Series dtype is not numeric are skipped, so the returned list can
    be shorter than ``df``. No squaring or normalization is applied.
    """
    outer_products = []
    for _, row in df.iterrows():
        # Guard clause: ignore rows that are not purely numeric.
        if not pd.api.types.is_numeric_dtype(row):
            continue
        values = row.astype(float).to_numpy()
        outer_products.append(np.outer(values, values))
    return outer_products

# Build one normalized matrix per flow. compute_matrix() handles a single 1-D
# feature vector (np.outer flattens whatever it receives), so the whole feature
# frame has to go through the per-row wrapper instead.
matrices = vectors_to_matrices(flows_reduced)

#### Convert matrices into tensors

In [None]:
# One float32 tensor per flow matrix.
# NOTE(review): `flow_tensors` is not referenced again in the visible code (the
# split below reads `matrices`) — confirm whether this copy is still needed.
flow_tensors = [torch.tensor(matrix, dtype=torch.float32) for matrix in matrices]

#### Train test split with the indices mapping to original inputs

In [None]:
from sklearn.utils import shuffle

# CustomDataset looks every sample up as data[indices[i]] / labels[indices[i]]
# and returns indices[i] alongside it. To make that returned index identify the
# original row of `flows_reduced`, both splits share the full `matrices` and
# label lists and differ only in WHICH original positions they select.
all_labels = list(flow_labels)

# Step 1: Original positions of benign (label 0) and malicious (any other label) flows
benign_indices = [idx for idx, label in enumerate(all_labels) if label == 0]
malicious_indices = [idx for idx, label in enumerate(all_labels) if label != 0]

# Step 2: Use all benign data for training
X_train = matrices
y_train = all_labels
idx_train = benign_indices

# Step 3: Test set includes all malicious flows and an equal number of benign ones.
# NOTE(review): these benign flows are also part of the training set, so the
# benign half of the test set is not held-out data — confirm this is intended.
X_test = matrices
y_test = all_labels
idx_test = malicious_indices + benign_indices[:len(malicious_indices)]

# Shuffle only the selection of test positions; the data and label lists stay
# in original row order so every index keeps pointing at its own row.
if idx_test:
    idx_test = shuffle(idx_test, random_state=42)

# Output sizes and the selected original row positions
print("Training set size:", len(idx_train), "Test set size:", len(idx_test))
print("Train indices:", idx_train, "Test indices:", idx_test)

#### Custom dataset

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset that serves items of ``data`` selected through ``indices``.

    Item ``i`` is looked up as ``data[indices[i]]`` (and ``labels[indices[i]]``
    when labels are given) and returned together with ``indices[i]``, so a
    consumer can trace every sample back to its position in ``data``.

    Args:
        data: indexable collection of 2-D arrays.
        indices: positions into ``data``/``labels`` that make up this dataset.
        labels: optional indexable collection of integer labels. When ``None``
            or empty, every item gets the placeholder label ``-1``.
        augment: stored on the instance; not used by ``__getitem__``.
        noise_level: stored on the instance; not used by ``__getitem__``.
    """

    def __init__(self, data, indices, labels=None, augment=False, noise_level=0.01):
        self.data = data
        self.indices = indices
        self.labels = labels
        self.augment = augment
        self.noise_level = noise_level

    def __len__(self):
        """Number of selected samples (length of ``indices``)."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(tensor of shape (1, H, W), label tensor, original index)``.

        Lookup errors are printed with context and then re-raised unchanged.
        """
        # Pre-bind so the IndexError handler can always format its message,
        # even when the failure is `self.indices[idx]` itself.
        actual_idx = None
        try:
            actual_idx = self.indices[idx]
            # unsqueeze(0) adds the single channel axis expected by Conv2d.
            data_item = torch.tensor(self.data[actual_idx], dtype=torch.float32).unsqueeze(0)
            # Explicit None/length test: truth-testing an array, tensor or
            # Series with more than one element raises instead of answering.
            has_labels = self.labels is not None and len(self.labels) > 0
            if has_labels:
                label = torch.tensor(self.labels[actual_idx], dtype=torch.long)
            else:
                label = torch.tensor(-1)
            if torch.isnan(data_item).any() or torch.isinf(data_item).any():
                print(f"NaN or Inf found in batch at index {idx}")
            return data_item, label, actual_idx
        except IndexError as e:
            print(f"IndexError: {e} - Requested idx: {idx}, Actual idx: {actual_idx}, Available data length: {len(self.data)}")
            raise
        except Exception as e:
            print(f"Unexpected error: {e} - Accessing index: {idx}")
            raise


#### Creating the dataset and dataloader objects

In [None]:
# Wrap both splits in CustomDataset; the augment flag is only set for training.
train_dataset = CustomDataset(data=X_train, indices=idx_train, labels=y_train, augment=True)
test_dataset = CustomDataset(data=X_test, indices=idx_test, labels=y_test, augment=False)

# Batches of 16 samples; only the training loader reshuffles its samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

#### Validate that the data loader objects aren't garbage

In [None]:
def check_dataloader(dl, name="DataLoader"):
    """Print a diagnostic summary of the first batch yielded by ``dl``.

    The batch is expected to be a ``(features, labels, indices)`` triple. Any
    exception raised while fetching or inspecting the batch is caught and
    reported on stdout rather than propagated. Always returns ``None``.

    Args:
        dl: iterable of batches.
        name: prefix used in every printed line.
    """
    try:
        batch = next(iter(dl))
        print(f"{name} - Batch Content: {batch}")  # Raw dump of the whole batch

        n_items = len(batch)
        if n_items != 3:
            print(f"Error: Batch does not contain three elements. Contains: {n_items} elements")
            return

        features, targets, sample_ids = batch
        print(f"{name} - First batch shapes: Features: {features.shape}, Labels: {targets.shape}, Indices: {sample_ids}")
        print(f"{name} - Data types: Features: {features.dtype}, Labels: {targets.dtype}")
        print(f"{name} - Sample labels: {targets[:10]}")
        print(f"{name} - Sample indices: {sample_ids[:10]}")
    except Exception as e:
        print(f"Error checking {name}: {e}")


#### Option to save the dataloader objects

In [None]:
# Serialize the complete DataLoader objects (each carries its dataset) so a
# later session can skip the preprocessing cells. The paths are absolute and
# machine-specific; adjust them before running.
torch.save(train_loader, "C:/Users/theob/Code/COS-475-Project/Data-for-wilder/CIC-train-clean.pth")
torch.save(test_loader, "C:/Users/theob/Code/COS-475-Project/Data-for-wilder/CIC-test-clean.pth")

#### Option to load in the prebuilt dataloaders

In [None]:
# The saved files hold pickled DataLoader objects, not plain tensors, so they
# must be loaded with weights_only=False: recent PyTorch releases default to
# weights_only=True, which refuses arbitrary Python objects.
# SECURITY: unpickling can execute arbitrary code — only load files you created
# yourself. CustomDataset must already be defined in the session.
train_loader = torch.load('C:/Users/theob/Code/COS-475-Project/Data-for-wilder/CIC-train-clean.pth', weights_only=False)
test_loader = torch.load('C:/Users/theob/Code/COS-475-Project/Data-for-wilder/CIC-test-clean.pth', weights_only=False)

#### Verify the shape of the batches and labels

In [None]:
# Inspect the first training batch only.
for data, labels, idx in train_loader:
    # Batches are laid out (batch, channel, height, width): the dataset adds the
    # channel axis with unsqueeze(0) and the loader stacks samples in front of
    # it. The width is therefore the LAST axis; axis 0 is the batch size.
    input_width = data.shape[-1]
    print("Input batch shape from DataLoader:", data.shape)
    print("Labels batch shape from DataLoader:", labels.shape)
    print("Indices batch shape from DataLoader:", idx.shape)
    break


### Ensure that cuda is available, else run on the CPU

In [None]:
# Check if CUDA is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

### Model definition

In [None]:
import torch.nn as nn
from torch.nn import BatchNorm2d
from torch.nn import Dropout

class ConvSequenceAutoencoder(nn.Module):
    """Single-stage convolutional autoencoder for one-channel 2-D inputs.

    Encoder: Conv2d(1 -> 4, 5x5 kernel, padding 1) + BatchNorm + ReLU + Dropout(0.5).
    Decoder: ConvTranspose2d(4 -> 1, 5x5 kernel, padding 1) + BatchNorm + ReLU
    + Dropout(0.5).

    With the default stride of 1 the convolution shrinks each spatial side by 2
    and the transposed convolution grows it back by 2, so the output has the
    same shape as the input.
    """

    def __init__(self):
        super().__init__()

        encoder_layers = [
            nn.Conv2d(1, 4, kernel_size=5, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(),
            nn.Dropout(p=0.5),
        ]
        decoder_layers = [
            nn.ConvTranspose2d(4, 1, kernel_size=5, padding=1),
            nn.BatchNorm2d(1),
            nn.ReLU(),
            nn.Dropout(p=0.5),
        ]

        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the latent feature maps and reconstruct it from them."""
        return self.decoder(self.encoder(x))

# Instantiate the autoencoder and move its parameters to the selected device.
model = ConvSequenceAutoencoder().to(device)

#### Test an input to ensure input and output are same shape for MSE calculating

In [None]:
def test_autoencoder(input_shape=(64, 1, 61, 61)):
    """Push one random batch through the module-level ``model`` and print shapes.

    Used to confirm that the reconstruction has the same shape as the input, as
    the MSE loss requires.

    Args:
        input_shape: ``(batch, channel, height, width)`` of the random probe;
            set it to the batch shape printed by the DataLoader check.
    """
    # Create the probe on the model's own device; a CPU tensor fed to a CUDA
    # model raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    input_tensor = torch.randn(*input_shape, device=model_device)
    print("Input shape:", input_tensor.shape)

    # Run in eval mode without gradients so this smoke test cannot update the
    # BatchNorm running statistics with random data; restore the mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output_tensor = model(input_tensor)
    finally:
        model.train(was_training)
    print("Output shape:", output_tensor.shape)

# Call the test function to check input and output shapes
test_autoencoder()

### Train the model

In [None]:
def train(model, train_loader, criterion, optimizer, epochs=5, patience=1, min_delta=0.001):
    """Train ``model`` to reconstruct its inputs, with early stopping.

    Each batch from ``train_loader`` is a ``(inputs, labels, indices)`` triple;
    only ``inputs`` is used, as both the model input and the loss target.

    Args:
        model: module to train (switched to training mode).
        train_loader: sized iterable of batches.
        criterion: loss called as ``criterion(outputs, inputs)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: maximum number of epochs.
        patience: consecutive epochs without sufficient improvement tolerated
            before stopping.
        min_delta: minimum decrease of the average epoch loss that counts as an
            improvement.

    Returns:
        List with the average loss of every completed epoch.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    if len(train_loader) == 0:
        raise ValueError("train_loader is empty; nothing to train on.")

    # Send batches to wherever the model actually lives, rather than guessing
    # from CUDA availability (the model may sit on the CPU on a GPU machine).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()  # Set the model to training mode
    best_loss = float('inf')
    epochs_no_improve = 0  # Consecutive epochs without improvement
    history = []

    for epoch in range(epochs):
        total_loss = 0.0
        for inputs, _, _ in train_loader:  # Only taking inputs, no labels or indices
            inputs = inputs.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)

            # Reconstruction loss: the target is the input itself
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Average of the per-batch losses for this epoch
        avg_loss = total_loss / len(train_loader)
        history.append(avg_loss)
        print(f'Epoch {epoch+1}: Loss = {avg_loss:.4f}')

        # Early stopping logic
        if best_loss - avg_loss > min_delta:
            best_loss = avg_loss
            epochs_no_improve = 0
            print("Loss improved.")
        else:
            epochs_no_improve += 1
            print("No improvement in loss.")
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after {patience} epochs without improvement.")
                break

    return history

# Adam with a small learning rate and weight decay; the loss is the mean squared
# error between the reconstruction and the input.
optimizer = Adam(model.parameters(), lr=0.00001, weight_decay=1e-5)
criterion = nn.MSELoss()

# Train for at most 100 epochs. With train()'s default patience of 1, training
# stops after the first epoch whose average loss fails to improve by min_delta.
train(model, train_loader, criterion, optimizer, epochs=100)

### Visualize reconstruction error on benign vs each type of attack

In [None]:
def compute_mse(reconstructed, inputs):
    """Per-sample mean squared error between two 4-D batches.

    The squared difference is averaged over dimensions 1, 2 and 3, leaving one
    error value per element of dimension 0.
    """
    residual = reconstructed - inputs
    return residual.pow(2).mean(dim=(1, 2, 3))

def accumulate_errors(model, test_loader, device):
    """Collect the per-sample reconstruction error and label of every test item.

    The model is switched to evaluation mode and run without gradient tracking.

    Args:
        model: autoencoder to evaluate.
        test_loader: iterable of ``(inputs, labels, indices)`` batches.
        device: device the inputs are moved to before the forward pass.

    Returns:
        ``(errors, labels)``: a 1-D ndarray of per-sample errors and a list with
        the matching labels, both in loader order.
    """
    model.eval()
    per_batch_errors, true_labels = [], []
    with torch.no_grad():
        for batch, batch_labels, _ in test_loader:
            batch = batch.to(device)
            batch_errors = compute_mse(model(batch), batch)
            per_batch_errors.append(batch_errors.cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())
    return np.concatenate(per_batch_errors), true_labels

def find_best_threshold(reconstruction_errors, true_labels, thresholds):
    """Pick the error threshold that best separates benign from attack samples.

    A sample is predicted as an attack when its reconstruction error exceeds the
    threshold. Labels are treated as benign when they equal 0 and as attack
    otherwise, so multi-class attack labels are scored correctly against the
    binary prediction. The accuracy of every candidate threshold is printed.

    Args:
        reconstruction_errors: per-sample reconstruction errors.
        true_labels: per-sample integer labels (0 = benign, non-zero = attack).
        thresholds: iterable of candidate thresholds.

    Returns:
        ``(best_threshold, best_accuracy)``; ``(0, 0)`` when no candidate
        reaches an accuracy above zero. Ties keep the earliest candidate.

    Raises:
        ValueError: if errors and labels are empty or differ in length.
    """
    errors = np.asarray(reconstruction_errors)
    # Collapse every attack class to 1: predictions are only ever 0 or 1, so
    # comparing them to raw class ids would count labels >= 2 as always wrong.
    is_attack = (np.asarray(true_labels) != 0).astype(int)
    if errors.size == 0 or errors.shape != is_attack.shape:
        raise ValueError("reconstruction_errors and true_labels must be non-empty and of equal length")

    best_accuracy = 0
    best_threshold = 0

    for threshold in thresholds:
        predictions = (errors > threshold).astype(int)
        # Fraction of samples whose binary prediction matches the binary truth.
        accuracy = float(np.mean(predictions == is_attack))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_threshold = threshold
        print(f'Threshold: {threshold:.4f}, Accuracy: {accuracy*100:.2f}%')

    return best_threshold, best_accuracy

def plot_all_classes(reconstruction_errors, true_labels, threshold):
    """Group the reconstruction errors by label and plot one histogram per label.

    Args:
        reconstruction_errors: ndarray of per-sample errors.
        true_labels: per-sample labels, aligned with ``reconstruction_errors``.
        threshold: decision threshold passed on to ``plot_histogram``.
    """
    unique_labels = np.unique(true_labels)
    # Boolean-mask the errors once per distinct label.
    error_dict = {
        label: reconstruction_errors[true_labels == label]
        for label in unique_labels
    }
    plot_histogram(error_dict, threshold, unique_labels)

def plot_histogram(error_dict, threshold, labels, zoom_percentile=100):
    """Plot overlaid histograms of reconstruction errors, one per label.

    Args:
        error_dict: mapping label -> array of reconstruction errors. It is not
            modified; non-finite values are filtered into a private copy.
        threshold: decision threshold, drawn as a dashed vertical line.
        labels: labels to plot; each must be a key of ``error_dict``.
        zoom_percentile: percentile of all errors used as the right x-limit
            (never below ``threshold``).
    """
    # Drop NaN/inf values without touching the caller's dictionary.
    finite_errors = {
        key: np.asarray(values)[np.isfinite(values)]
        for key, values in error_dict.items()
    }

    populated = [values for values in finite_errors.values() if values.size]
    if not populated:
        # Checked before creating the figure so no empty figure is left open.
        print("No valid error data available to plot.")
        return

    all_errors = np.concatenate(populated)
    upper_limit = max(np.percentile(all_errors, zoom_percentile), threshold)

    plt.figure(figsize=(12, 8))
    # pyplot.get_cmap is the supported way to get a resampled colormap;
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    colors = plt.get_cmap('viridis', len(labels))

    for idx, label in enumerate(labels):
        plt.hist(finite_errors[label], bins=20, alpha=0.7, label=f'Label {label}', color=colors(idx))

    # Four decimals: with two, thresholds below 0.005 would display as 0.00.
    plt.axvline(x=threshold, color='red', linestyle='--', label=f'Threshold = {threshold:.4f}')
    plt.legend()
    plt.xlabel('Reconstruction Error')
    plt.ylabel('Frequency')
    plt.title('Reconstruction Errors by Class')
    plt.xlim(left=0, right=upper_limit)
    plt.show()

# Main execution workflow:
# 1. per-sample reconstruction errors and labels for the whole test set
errors, labels = accumulate_errors(model, test_loader, device)
# 2. 50 evenly spaced candidate thresholds between 0.001 and 0.01
thresholds = np.linspace(0.001, 0.01, num=50)  # Example range and granularity
# 3. keep the candidate with the highest accuracy
best_threshold, best_accuracy = find_best_threshold(errors, labels, thresholds)
print(f'Best Threshold: {best_threshold:.4f}, Highest Accuracy: {best_accuracy*100:.2f}%')
# 4. per-label error histograms with the chosen threshold marked
plot_all_classes(errors, labels, best_threshold)

#### Inspect the original inputs that were misclassified

In [None]:

import torch.nn.functional as F

def map_predictions_to_original(test_loader, model, device, anomaly_threshold,
                                output_path='misclassified_malicious1.csv'):
    """Find malicious flows the model failed to flag and export their features.

    A sample counts as a false negative when its label is non-zero (malicious,
    any attack class) and its reconstruction error does not exceed
    ``anomaly_threshold``. The indices yielded by the loader are used as row
    labels of the module-level ``flows_reduced`` frame to fetch the features.

    Args:
        test_loader: iterable of ``(inputs, labels, indices)`` batches.
        model: trained autoencoder (switched to evaluation mode).
        device: device the inputs are moved to.
        anomaly_threshold: errors above this value are treated as anomalies.
        output_path: CSV file the missed rows are written to.

    Returns:
        List of the indices of the missed malicious samples.
    """
    misclassified_malicious = []
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        for inputs, labels, indices in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)

            # Per-sample reconstruction error: average over every non-batch dimension
            reconstruction_error = F.mse_loss(outputs, inputs, reduction='none').mean([1, 2, 3])

            # True where the sample was flagged as an anomaly
            flagged = (reconstruction_error > anomaly_threshold).cpu().numpy()

            # Any non-zero label is malicious; testing for `== 1` alone would
            # ignore every other attack class.
            is_malicious = labels.cpu().numpy() != 0

            # False negatives: malicious samples that were not flagged
            false_negatives = is_malicious & ~flagged
            misclassified_malicious.extend(indices.cpu().numpy()[false_negatives])

    # Save the feature rows of the missed malicious samples to a CSV file
    misclassified_df = flows_reduced.loc[misclassified_malicious]
    misclassified_df.to_csv(output_path, index=False)
    return misclassified_malicious

# Example usage:
anomaly_threshold = 0.0527  # Define a suitable threshold based on your application
results = map_predictions_to_original(test_loader, model, device, anomaly_threshold)
print("Misclassified malicious entries recorded.")


