In [3]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Configuration ---
DATASET_PATH = "/media/ssd/test/standardized-datasets/combined/combined_unsw_cicRed_botRed_netflow_10pct.csv"
# Define required columns based on design doc and typical NetFlow features
REQUIRED_COLUMNS = [
    'IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
    'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS',
    'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'Label', 'Attack', 'dataset_source', 'flow_id'
]

# --- 1. Data Loading ---
logging.info(f"Loading dataset from: {DATASET_PATH}")
if not os.path.exists(DATASET_PATH):
    logging.error(f"Dataset file not found at {DATASET_PATH}")
    # Handle error appropriately, maybe raise FileNotFoundError
    raise FileNotFoundError(f"Dataset file not found at {DATASET_PATH}")

try:
    # Load only necessary columns to save memory initially if needed, or load all
    # Consider using low_memory=False if dtype warnings appear
    df = pd.read_csv(DATASET_PATH, low_memory=False)
    logging.info(f"Dataset loaded successfully. Shape: {df.shape}")

    # Verify required columns
    missing_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_cols:
        logging.error(f"Missing required columns: {missing_cols}")
        raise ValueError(f"Dataset missing required columns: {missing_cols}")
    logging.info("All required columns are present.")

    # --- Optional: Basic Cleaning/Info ---
    logging.info(f"Memory usage: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
    # print(df.info())
    # print(df.head())
    # Handle potential NaNs if necessary (standardization should minimize this)
    # df.dropna(subset=['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT'], inplace=True)


except Exception as e:
    logging.error(f"Error during data loading or initial verification: {e}")
    raise

# --- 2. Graph Construction ---
logging.info("Starting graph construction...")

# Convert ports to string and handle potential float representations if necessary
df['L4_SRC_PORT'] = df['L4_SRC_PORT'].astype(int).astype(str)
df['L4_DST_PORT'] = df['L4_DST_PORT'].astype(int).astype(str)

# Combine IP and Port to create unique node identifiers
df['src_ip_port'] = df['IPV4_SRC_ADDR'] + ':' + df['L4_SRC_PORT']
df['dst_ip_port'] = df['IPV4_DST_ADDR'] + ':' + df['L4_DST_PORT']
logging.info("Created source and destination IP:Port identifiers.")

# Get all unique IP:Port combinations involved in flows
unique_src_nodes = df['src_ip_port'].unique()
unique_dst_nodes = df['dst_ip_port'].unique()
all_unique_nodes = pd.unique(np.concatenate((unique_src_nodes, unique_dst_nodes)))
logging.info(f"Found {len(all_unique_nodes)} unique IP:Port nodes.")

# Create mapping from IP:Port string to integer node ID
ip_port_to_id = {ip_port: i for i, ip_port in enumerate(all_unique_nodes)}
num_nodes = len(all_unique_nodes)
logging.info(f"Created mapping for {num_nodes} nodes.")

# Map source and destination IP:Ports to their integer IDs
src_ids = df['src_ip_port'].map(ip_port_to_id).values
dst_ids = df['dst_ip_port'].map(ip_port_to_id).values

# Create edge_index tensor [2, num_edges]
# Edges represent flows from source node to destination node
edge_index = torch.tensor([src_ids, dst_ids], dtype=torch.long)
logging.info(f"Created edge_index tensor with shape: {edge_index.shape}")

# --- Display some results ---
print(f"Number of flows (edges): {len(df)}")
print(f"Number of unique IP:Port nodes: {num_nodes}")
print(f"Edge index shape: {edge_index.shape}")
# print("First 5 rows with node IDs:")
# print(df[['src_ip_port', 'dst_ip_port']].head())
# print(f"Source IDs (first 5): {src_ids[:5]}")
# print(f"Destination IDs (first 5): {dst_ids[:5]}")
# print(f"Edge Index (first 5 columns):\n{edge_index[:, :5]}")

# Keep relevant data for next steps
flows_df = df # Keep the dataframe for feature engineering

2025-04-13 20:58:01,500 - INFO - Loading dataset from: /media/ssd/test/standardized-datasets/combined/combined_unsw_cicRed_botRed_netflow_10pct.csv
2025-04-13 20:58:05,102 - INFO - Dataset loaded successfully. Shape: (655094, 47)
2025-04-13 20:58:05,103 - INFO - All required columns are present.
2025-04-13 20:58:05,327 - INFO - Memory usage: 382.78 MB
2025-04-13 20:58:05,329 - INFO - Starting graph construction...
2025-04-13 20:58:06,014 - INFO - Created source and destination IP:Port identifiers.
2025-04-13 20:58:06,304 - INFO - Found 611553 unique IP:Port nodes.
2025-04-13 20:58:06,470 - INFO - Created mapping for 611553 nodes.
  edge_index = torch.tensor([src_ids, dst_ids], dtype=torch.long)
2025-04-13 20:58:07,430 - INFO - Created edge_index tensor with shape: torch.Size([2, 655094])


Number of flows (edges): 655094
Number of unique IP:Port nodes: 611553
Edge index shape: torch.Size([2, 655094])


In [4]:
# Cell 1.5: Data Sampling (New Cell)
import numpy as np
import logging
import pandas as pd
import gc

# --- Sampling Configuration ---
# Adjust these values based on memory capacity and desired dataset size
SAMPLED_SIZE_LARGE_CLASSES = 50000  # Target size for *sum* of samples from large classes
MIN_LARGE_CLASS_SIZE = 1000 # Threshold to consider a class 'large'

# --- Sampling Function ---
def create_imbalanced_subset(df, target_col, new_dataset_size_large_classes, min_large_class_size):
    """
    Create a smaller dataset while preserving class imbalance, focusing on reducing majority classes.

    Args:
        df (pd.DataFrame): Original dataset.
        target_col (str): Name of the target label column.
        new_dataset_size_large_classes (int): Target total number of samples from classes exceeding min_large_class_size.
        min_large_class_size (int): Minimum number of samples for a class to be considered 'large'.

    Returns:
        pd.DataFrame: A reduced dataset.
    """
    logging.info(f"Starting imbalanced sampling. Target size for large classes: {new_dataset_size_large_classes}")
    class_counts = df[target_col].value_counts()
    logging.info(f"Original class distribution:\\n{class_counts}")

    large_classes = class_counts[class_counts >= min_large_class_size]
    small_classes = class_counts[class_counts < min_large_class_size]

    total_large_samples_original = large_classes.sum()
    num_large_classes = len(large_classes)

    sampled_data = []

    if num_large_classes > 0 and total_large_samples_original > 0:
        logging.info(f"Found {num_large_classes} large classes (>= {min_large_class_size} samples).")
        # Calculate scaling factor based on the sum of large classes
        scaling_factor = min(1.0, new_dataset_size_large_classes / total_large_samples_original)
        logging.info(f"Scaling factor for large classes: {scaling_factor:.4f}")

        # Sample from large classes proportionally
        for class_label, original_count in large_classes.items():
            # Calculate proportional target size
            target_size = int(original_count * scaling_factor)
            # Ensure we don't sample more than available and respect min_large_class_size if scaled size is too small
            sample_size = max(1, min(target_size, original_count)) # Ensure at least 1 sample, don't exceed original count
            logging.info(f"  Sampling class '{class_label}': target size={target_size}, final sample size={sample_size}")
            sampled_class_df = df[df[target_col] == class_label].sample(n=sample_size, random_state=42, replace=False)
            sampled_data.append(sampled_class_df)
    else:
        logging.info("No large classes found or large classes sum to zero.")

    # Keep all samples from small classes
    if not small_classes.empty:
        logging.info(f"Keeping all samples for {len(small_classes)} small classes (< {min_large_class_size} samples).")
        small_class_df = df[df[target_col].isin(small_classes.index)]
        sampled_data.append(small_class_df)

    if not sampled_data:
        logging.warning("Sampling resulted in an empty dataset.")
        return pd.DataFrame(columns=df.columns)

    # Concatenate sampled dataframes
    df_sampled = pd.concat(sampled_data).reset_index(drop=True)
    logging.info(f"Finished sampling. New dataset size: {len(df_sampled)}")
    logging.info(f"New class distribution:\\n{df_sampled[target_col].value_counts()}")

    return df_sampled

# --- Apply Sampling ---
# Assuming 'df' is loaded from Cell 1 and 'Label' is the target column
try:
    # Ensure 'Label' column exists
    if 'Label' not in df.columns:
        raise KeyError("Target column 'Label' not found in DataFrame 'df'. Verify column names.")

    df_sampled = create_imbalanced_subset(
        df,
        target_col='Label',
        new_dataset_size_large_classes=SAMPLED_SIZE_LARGE_CLASSES,
        min_large_class_size=MIN_LARGE_CLASS_SIZE
    )
    # Clean up original large dataframe if memory is tight
    # del df
    # gc.collect()
except NameError:
    logging.error("Original DataFrame 'df' not found. Ensure Cell 1 executed successfully.")
    df_sampled = None # Set to None to indicate failure
except KeyError as e:
    logging.error(e)
    df_sampled = None # Set to None to indicate failure
except Exception as e:
     logging.error(f"An unexpected error occurred during sampling: {e}")
     df_sampled = None

# --- Check if sampling was successful ---
if df_sampled is not None and not df_sampled.empty:
    logging.info(f"Sampling successful. Sampled DataFrame shape: {df_sampled.shape}")
    print(f"Sampled DataFrame shape: {df_sampled.shape}")
    print(f"Sampled DataFrame memory usage: {df_sampled.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
else:
    logging.error("Sampling failed or resulted in an empty DataFrame. Stopping execution.")
    # Optionally raise an error to stop the notebook
    raise ValueError("Sampling failed.")


2025-04-13 20:58:10,797 - INFO - Starting imbalanced sampling. Target size for large classes: 50000
2025-04-13 20:58:10,804 - INFO - Original class distribution:\nLabel
0    341806
1    313288
Name: count, dtype: int64
2025-04-13 20:58:10,805 - INFO - Found 2 large classes (>= 1000 samples).
2025-04-13 20:58:10,806 - INFO - Scaling factor for large classes: 0.0763
2025-04-13 20:58:10,806 - INFO -   Sampling class '0': target size=26088, final sample size=26088
2025-04-13 20:58:10,938 - INFO -   Sampling class '1': target size=23911, final sample size=23911
2025-04-13 20:58:11,097 - INFO - Finished sampling. New dataset size: 49999
2025-04-13 20:58:11,099 - INFO - New class distribution:\nLabel
0    26088
1    23911
Name: count, dtype: int64
2025-04-13 20:58:11,102 - INFO - Sampling successful. Sampled DataFrame shape: (49999, 49)


Sampled DataFrame shape: (49999, 49)
Sampled DataFrame memory usage: 41.30 MB


In [5]:
# Cell 2: Feature Engineering (Revised for Tabular Output X, y on Sampled Data)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import torch
import numpy as np
import logging
import gc

# --- Ensure df_sampled is available from the previous cell ---
if 'df_sampled' not in locals() or df_sampled is None or df_sampled.empty:
    logging.error("Sampled DataFrame 'df_sampled' not available or empty. Cannot proceed with feature engineering.")
    # Optionally raise an error
    raise NameError("Sampled DataFrame 'df_sampled' is required but not available.")

# --- 3. Feature Engineering on Sampled Data ---
logging.info(f"Starting feature engineering on sampled data (Shape: {df_sampled.shape})...")

# Define feature columns based on design doc
numerical_cols = [
    'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS',
    'FLOW_DURATION_MILLISECONDS'
]
categorical_cols = [
    'PROTOCOL', 'L7_PROTO', 'TCP_FLAGS'
]
label_col = 'Label'

# Select relevant columns for processing X and y from the sampled data
columns_to_process = numerical_cols + categorical_cols + [label_col]

# Ensure all needed columns exist in df_sampled
missing_processing_cols = [col for col in columns_to_process if col not in df_sampled.columns]
if missing_processing_cols:
     logging.error(f"Sampled DataFrame missing required columns for processing: {missing_processing_cols}")
     raise ValueError(f"Missing columns in df_sampled: {missing_processing_cols}")

# Create a working copy for processing
flows_df_processed = df_sampled[columns_to_process].copy()
logging.info(f"DataFrame shape after selecting processing columns: {flows_df_processed.shape}")


# --- 3.1 Process Numerical Features ---
logging.info(f"Processing numerical features: {numerical_cols}")
# Ensure numeric types and handle potential errors/NaNs
for col in numerical_cols:
    flows_df_processed[col] = pd.to_numeric(flows_df_processed[col], errors='coerce')
# Fill NaNs created by coercion or existing NaNs (e.g., with 0 or median)
flows_df_processed[numerical_cols] = flows_df_processed[numerical_cols].fillna(0)

# Log transform (log1p to handle zeros) - often good for byte/packet counts
# Important: Apply log1p *before* scaling
log_transformed_features = np.log1p(flows_df_processed[numerical_cols].values)

# Scale numerical features
scaler = StandardScaler()
# Fit scaler only on numerical features, transform the log-transformed values
scaled_numerical_features = scaler.fit_transform(log_transformed_features)
logging.info("Applied Log1p and StandardScaler to numerical features.")

# --- 3.2 Process Categorical Features ---
logging.info(f"Processing categorical features: {categorical_cols}")
# Ensure categorical columns are treated as such (e.g., strings or category type)
# Convert to string first to handle potential mixed types or NaNs consistently
flows_df_processed[categorical_cols] = flows_df_processed[categorical_cols].astype(str)
# Fill any remaining NaNs in categorical columns (though astype(str) should handle most)
flows_df_processed[categorical_cols] = flows_df_processed[categorical_cols].fillna('missing') # Or another placeholder


# Check cardinality (optional but good practice, especially after sampling)
for col in categorical_cols:
    unique_count = flows_df_processed[col].nunique()
    logging.info(f"Unique values count in '{col}': {unique_count}")
    if unique_count > 1000: # Example threshold
         logging.warning(f"High cardinality in '{col}' ({unique_count}). OneHotEncoding might lead to very high dimensions.")


# One-Hot Encode categorical features using get_dummies
logging.info("Applying One-Hot Encoding (pd.get_dummies) to categorical features...")
# Using pandas get_dummies is often simpler for DataFrames
# Ensure consistent prefixing and handle potential NaN columns if 'missing' wasn't used for fillna
categorical_encoded_df = pd.get_dummies(
    flows_df_processed[categorical_cols],
    columns=categorical_cols,
    prefix=categorical_cols,
    dummy_na=False, # Set to True if NaNs weren't filled and you want explicit NaN columns
    dtype=int # Ensure resulting columns are integer type
)
logging.info(f"One-Hot Encoding resulted in {categorical_encoded_df.shape[1]} new feature columns.")


# --- 3.3 Combine Features into Matrix X ---
# Concatenate scaled numerical features and one-hot encoded categorical features
# Ensure alignment by using the index from flows_df_processed
logging.info(f"Shape of scaled numerical: {scaled_numerical_features.shape}")
logging.info(f"Shape of encoded categorical: {categorical_encoded_df.shape}")

# Convert numerical features to DataFrame to align indices easily
scaled_numerical_df = pd.DataFrame(scaled_numerical_features, index=flows_df_processed.index, columns=numerical_cols)

# Combine the feature DataFrames
# Make sure indices match before concatenation
X_features_list = [scaled_numerical_df, categorical_encoded_df.set_index(scaled_numerical_df.index)] # Align indices
X_df = pd.concat(X_features_list, axis=1)

# Convert the final feature DataFrame to a NumPy array
X = X_df.values
logging.info(f"Combined features into matrix X with shape: {X.shape}")

# --- 3.4 Prepare Labels Vector y ---
y = flows_df_processed[label_col].values.astype(np.int64) # Ensure labels are integers
logging.info(f"Created labels vector y with shape: {y.shape}")
unique_labels, counts = np.unique(y, return_counts=True)
logging.info(f"Label distribution in y: {dict(zip(unique_labels, counts))}")


# --- Clean up intermediate objects ---
del flows_df_processed
del scaled_numerical_df
del categorical_encoded_df
del X_features_list
#del X_df # Remove intermediate dataframe
gc.collect()
logging.info("Feature engineering on sampled data complete. Produced feature matrix X and label vector y.")

# --- Display final shapes ---
print(f"Final Feature Matrix X shape: {X.shape}")
print(f"Final Label Vector y shape: {y.shape}")

import pickle
import os

# Define the save directory and file name
save_dir = '/media/ssd/test/GNN/Standardized Models/CAGN-GAT/' # Relative path from notebook location might need adjustment
# Or use an absolute path if easier:
# save_dir = '/media/ssd/test/GNN/Adversarial Evaluation/' 
feature_list_filename = os.path.join(save_dir, 'cagn_gat_feature_list.pkl')

# Ensure the directory exists (it should, but just in case)
os.makedirs(save_dir, exist_ok=True) 

# Save the column names
feature_list = X_df.columns.tolist()
with open(feature_list_filename, 'wb') as f:
    pickle.dump(feature_list, f)
    
print(f"Saved {len(feature_list)} feature names to {feature_list_filename}")
# Optional: Clean up intermediate dataframe if memory is tight AFTER saving
# del X_df 
# gc.collect()

2025-04-13 20:58:17,614 - INFO - Starting feature engineering on sampled data (Shape: (49999, 49))...
2025-04-13 20:58:17,618 - INFO - DataFrame shape after selecting processing columns: (49999, 9)
2025-04-13 20:58:17,618 - INFO - Processing numerical features: ['IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'FLOW_DURATION_MILLISECONDS']
2025-04-13 20:58:17,629 - INFO - Applied Log1p and StandardScaler to numerical features.
2025-04-13 20:58:17,630 - INFO - Processing categorical features: ['PROTOCOL', 'L7_PROTO', 'TCP_FLAGS']
2025-04-13 20:58:17,698 - INFO - Unique values count in 'PROTOCOL': 220
2025-04-13 20:58:17,701 - INFO - Unique values count in 'L7_PROTO': 112
2025-04-13 20:58:17,704 - INFO - Unique values count in 'TCP_FLAGS': 28
2025-04-13 20:58:17,704 - INFO - Applying One-Hot Encoding (pd.get_dummies) to categorical features...
2025-04-13 20:58:17,856 - INFO - One-Hot Encoding resulted in 360 new feature columns.
2025-04-13 20:58:17,857 - INFO - Shape of scaled numerical: 

Final Feature Matrix X shape: (49999, 365)
Final Label Vector y shape: (49999,)
Saved 365 feature names to /media/ssd/test/GNN/Standardized Models/CAGN-GAT/cagn_gat_feature_list.pkl


In [11]:
# Cell 3: Adaptive Graph Construction

import torch
from torch_geometric.data import Data
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics.pairwise import pairwise_distances
from scipy import sparse # Need explicit import for sparse matrix operations
import numpy as np
import logging
import gc
import time

# Make sure X and y are available from Cell 2

# --- Define Adaptive Graph Construction Function ---
# (Based on the function found in the original CAGN-GAT Fusion notebook)
def adaptive_graph_construction(X, y, k=20, adaptive_metric='euclidean', threshold=0.5):
    """
    Adaptive graph construction based on feature similarity (k-NN intersection with distance threshold).

    Args:
        X (np.ndarray): Feature matrix (num_samples, num_features).
        y (np.ndarray): Labels (num_samples,).
        k (int): Number of neighbors for k-NN graph.
        adaptive_metric (str): Metric for feature similarity ('euclidean', 'cosine', etc.).
        threshold (float): Threshold for edge creation based on similarity distance.

    Returns:
        Data: PyTorch Geometric Data object with nodes=samples, x=features, edge_index=similarity_edges.
    """
    num_samples = X.shape[0]
    logging.info(f"Starting adaptive graph construction for {num_samples} samples...")
    logging.info(f"Parameters: k={k}, metric='{adaptive_metric}', threshold={threshold}")
    start_time_knn = time.time()

    # 1. Calculate k-NN graph adjacency
    logging.info("Calculating k-NN graph...")
    # Note: kneighbors_graph returns a sparse matrix in CSR format by default
    try:
        knn_adj = kneighbors_graph(X, k, mode='connectivity', metric=adaptive_metric, include_self=False, n_jobs=-1) # Use all available CPU cores
        logging.info(f"Calculated k-NN graph. Shape: {knn_adj.shape}, NNZ: {knn_adj.nnz}. Time: {time.time() - start_time_knn:.2f}s")
    except MemoryError as e:
        logging.error(f"MemoryError during k-NN graph calculation: {e}. Consider reducing k or features.")
        raise
    except Exception as e:
        logging.error(f"Error during k-NN graph calculation: {e}")
        raise

    # 2. Calculate pairwise distances and threshold
    start_time_dist = time.time()
    logging.info("Calculating pairwise distances (using sklearn) and applying threshold...")
    # This is the most memory-intensive part.
    try:
        # Calculate distances - use float32 for potentially lower memory usage if precision allows
        # Note: pairwise_distances returns a dense matrix.
        distances = pairwise_distances(X.astype(np.float32), metric=adaptive_metric, n_jobs=-1)
        logging.info(f"Calculated pairwise distances matrix. Shape: {distances.shape}. Time: {time.time() - start_time_dist:.2f}s")

        # Create adjacency matrix based on distance threshold
        # This creates a dense boolean matrix initially
        dist_adj_mask = (distances < threshold)
        # Ensure it's boolean, set diagonal to False (no self-loops from distance)
        np.fill_diagonal(dist_adj_mask, False)
        logging.info("Created distance threshold adjacency mask.")

        # Convert boolean mask to sparse CSR format for efficient intersection
        dist_adj_sparse = sparse.csr_matrix(dist_adj_mask)
        logging.info(f"Converted distance mask to sparse CSR. NNZ: {dist_adj_sparse.nnz}")

        # Cleanup large dense matrices asap
        del distances
        del dist_adj_mask
        gc.collect()


    except MemoryError as e:
         logging.error(f"MemoryError during pairwise distance calculation or conversion: {e}. Graph construction failed. Consider reducing features or using sampling.")
         raise
    except Exception as e:
         logging.error(f"Error during distance calculation/thresholding: {e}")
         raise

    # 3. Intersect k-NN graph and distance threshold graph
    start_time_intersect = time.time()
    logging.info("Intersecting k-NN (CSR) and distance threshold (CSR) graphs...")
    # Element-wise multiplication of sparse matrices performs intersection
    # Both knn_adj and dist_adj_sparse should be CSR for this to be efficient
    try:
        final_adj = knn_adj.multiply(dist_adj_sparse)
        logging.info(f"Intersection complete. Final Adj NNZ: {final_adj.nnz}. Time: {time.time() - start_time_intersect:.2f}s")
    except Exception as e:
        logging.error(f"Error during sparse matrix intersection: {e}")
        raise

    # 4. Create PyG Data Object
    start_time_pyg = time.time()
    # Get final edge index in COO format for PyG
    final_adj_coo = final_adj.tocoo()
    edge_index = torch.tensor(np.vstack((final_adj_coo.row, final_adj_coo.col)), dtype=torch.long)
    logging.info(f"Final number of edges: {edge_index.shape[1]}")

    # Convert features and labels to tensors
    # Use float32 for features to save memory
    features = torch.tensor(X, dtype=torch.float32)
    labels = torch.tensor(y, dtype=torch.long) # y should already be int64 from cell 2

    data = Data(x=features, edge_index=edge_index, y=labels)
    logging.info(f"Created PyG Data object: {data}. Time: {time.time() - start_time_pyg:.2f}s")

    # Clean up remaining large intermediate objects
    del knn_adj, dist_adj_sparse, final_adj, final_adj_coo
    gc.collect()
    logging.info("Cleaned up intermediate graph construction objects.")

    return data

# --- Build the Graph ---
# Assuming X and y are numpy arrays from the previous cell

# Build the graph using the function
# This might take time and significant memory!
logging.info("--- Starting Graph Construction Execution ---")
graph_build_start_time = time.time()
try:
    # Ensure y is numpy array when passed
    data = adaptive_graph_construction(X, y, k=20, threshold=0.5)
    print(f"Graph construction successful. Total time: {time.time() - graph_build_start_time:.2f}s")
    print(data)
except MemoryError:
    print("\n"+"="*20 + " MEMORY ERROR " + "="*20)
    print("MemoryError occurred during graph construction!")
    print("This is likely due to the high dimensionality of X (from OneHot)")
    print("or the large number of samples during distance calculation.")
    print("Consider falling back to LabelEncoding for categorical features")
    print("or reducing the number of samples if possible.")
    print("="*52)
    # Optionally: Set data to None or raise the error again
    data = None
    # Raise the error to stop notebook execution if preferred
    # raise
except Exception as e:
    print(f"\nAn unexpected error occurred during graph construction: {e}")
    data = None
    # raise

# Note: If successful, the 'data' object now holds the graph structure.
# Nodes represent the original flows/samples.
# data.x holds the feature matrix X.
# data.edge_index represents edges between samples based on feature similarity.
# data.y holds the labels.
# There is no data.edge_attr.


2025-04-12 23:37:07,049 - INFO - --- Starting Graph Construction Execution ---
2025-04-12 23:37:07,051 - INFO - Starting adaptive graph construction for 49999 samples...
2025-04-12 23:37:07,051 - INFO - Parameters: k=20, metric='euclidean', threshold=0.5
2025-04-12 23:37:07,052 - INFO - Calculating k-NN graph...
2025-04-12 23:37:12,843 - INFO - Calculated k-NN graph. Shape: (49999, 49999), NNZ: 999980. Time: 5.79s
2025-04-12 23:37:12,846 - INFO - Calculating pairwise distances (using sklearn) and applying threshold...
2025-04-12 23:37:36,990 - INFO - Calculated pairwise distances matrix. Shape: (49999, 49999). Time: 24.14s
2025-04-12 23:37:39,220 - INFO - Created distance threshold adjacency mask.
2025-04-12 23:37:54,851 - INFO - Converted distance mask to sparse CSR. NNZ: 73400740
2025-04-12 23:37:55,082 - INFO - Intersecting k-NN (CSR) and distance threshold (CSR) graphs...
2025-04-12 23:37:55,731 - INFO - Intersection complete. Final Adj NNZ: 954732. Time: 0.65s
2025-04-12 23:37:55,

Graph construction successful. Total time: 48.84s
Data(x=[49999, 365], edge_index=[2, 954732], y=[49999])


In [12]:
# Cell 4: Data Splitting (Node-based for Sampled Graph)
import torch
import numpy as np
import logging
from sklearn.model_selection import train_test_split

# Make sure the 'data' object is available from Cell 3
if 'data' not in locals() or data is None:
    logging.error("Data object 'data' not available. Cannot proceed with splitting.")
    raise NameError("Data object 'data' is required but not available.")

# --- 5. Data Splitting (Node Splitting) ---
logging.info("Splitting nodes into training, validation, and test sets...")

num_nodes = data.num_nodes
node_indices = np.arange(num_nodes)
labels = data.y.cpu().numpy() # Use labels for stratified split

# Define split proportions
train_ratio = 0.70
val_ratio = 0.15
test_ratio = 0.15 # Should sum to 1.0

# Ensure ratios sum to 1 (or close enough)
assert np.isclose(train_ratio + val_ratio + test_ratio, 1.0), "Split ratios must sum to 1.0"

# First split into train and temp (val + test), stratifying by labels
# Stratification helps maintain class distribution in splits, especially important after sampling
try:
    train_indices, temp_indices, y_train, y_temp = train_test_split(
        node_indices,
        labels, # Use labels for stratification
        train_size=train_ratio,
        random_state=42, # for reproducibility
        stratify=labels # Stratify based on node labels
    )
except ValueError as e:
    logging.warning(f"Could not stratify during train/temp split (perhaps too few samples in a class?): {e}. Proceeding without stratification.")
    train_indices, temp_indices = train_test_split(
        node_indices,
        train_size=train_ratio,
        random_state=42
    )
    y_temp = labels[temp_indices] # Need labels for the next stratification step


# Calculate remaining proportion for validation relative to the temp set size
# Adjust validation size calculation relative to the *remaining* data
val_relative_ratio = val_ratio / (val_ratio + test_ratio)

# Split temp into validation and test, stratifying by labels within the temp set
try:
    val_indices, test_indices, _, _ = train_test_split(
        temp_indices,
        y_temp, # Use labels from the temp set for stratification
        train_size=val_relative_ratio,
        random_state=42, # use same random_state for consistency
        stratify=y_temp # Stratify within the temp set
    )
except ValueError as e:
     logging.warning(f"Could not stratify during val/test split: {e}. Proceeding without stratification.")
     val_indices, test_indices = train_test_split(
        temp_indices,
        train_size=val_relative_ratio,
        random_state=42
    )


logging.info(f"Split complete:")
logging.info(f"  Train nodes: {len(train_indices)}")
logging.info(f"  Validation nodes: {len(val_indices)}")
logging.info(f"  Test nodes: {len(test_indices)}")

# Create boolean masks for nodes
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[train_indices] = True
val_mask[val_indices] = True
test_mask[test_indices] = True

# Add masks to the data object
# Ensure masks are on the same device as the data object
data.train_mask = train_mask.to(data.x.device)
data.val_mask = val_mask.to(data.x.device)
data.test_mask = test_mask.to(data.x.device)

logging.info("Added train_mask, val_mask, test_mask to the Data object (Node-based).")
print("Data object updated with node masks:")
print(data)
print(f"Number of training nodes: {data.train_mask.sum().item()}")
print(f"Number of validation nodes: {data.val_mask.sum().item()}")
print(f"Number of test nodes: {data.test_mask.sum().item()}")

# Clean up intermediate numpy arrays
del node_indices, labels, train_indices, temp_indices, val_indices, test_indices, y_train, y_temp
gc.collect()

2025-04-12 23:37:55,904 - INFO - Splitting nodes into training, validation, and test sets...
2025-04-12 23:37:55,923 - INFO - Split complete:
2025-04-12 23:37:55,924 - INFO -   Train nodes: 34999
2025-04-12 23:37:55,925 - INFO -   Validation nodes: 7500
2025-04-12 23:37:55,925 - INFO -   Test nodes: 7500
2025-04-12 23:37:55,927 - INFO - Added train_mask, val_mask, test_mask to the Data object (Node-based).


Data object updated with node masks:
Data(x=[49999, 365], edge_index=[2, 954732], y=[49999], train_mask=[49999], val_mask=[49999], test_mask=[49999])
Number of training nodes: 34999
Number of validation nodes: 7500
Number of test nodes: 7500


0

In [13]:
# Cell 5: Model Definition (CAGN)

import torch
import torch.nn.functional as F
from torch.nn import Linear, Sequential, ReLU, BatchNorm1d, Dropout
from torch_geometric.nn import GATConv, MessagePassing
import logging

# --- 6. Model Definition (CAGN) ---
logging.info("Defining CAGN model...")

# Define input/output dimensions based on the 'data' object from Cell 3
# Ensure 'data' exists and has the required attributes
if 'data' not in locals() or data is None:
    raise NameError("Data object 'data' is not available from graph construction.")
if not hasattr(data, 'num_node_features'):
     raise AttributeError("Data object does not have 'num_node_features'. Graph construction might have failed.")
if not hasattr(data, 'y'):
     raise AttributeError("Data object does not have 'y' (labels).")


node_feat_dim = data.num_node_features # Dimension of features in data.x
hidden_dim = 128                       # Hyperparameter: Size of hidden layers
# Determine output dimension based on labels (binary or multi-class)
num_classes = len(torch.unique(data.y))
output_dim = 1 if num_classes == 2 else num_classes # 1 for binary logits, num_classes for multi-class logits
logging.info(f"Input dim: {node_feat_dim}, Hidden dim: {hidden_dim}, Output dim: {output_dim} ({num_classes} classes)")


class CAGN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=8, dropout=0.6):
        super(CAGN, self).__init__()
        self.dropout_rate = dropout
        # Use edge_dim=None since the similarity graph doesn't have edge features
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=self.dropout_rate, edge_dim=None)
        # Input to conv2 is hidden_dim * heads because concat=True by default in GATConv
        self.conv2 = GATConv(hidden_dim * heads, hidden_dim, heads=1, concat=False, dropout=self.dropout_rate, edge_dim=None)
        # Input to conv3 is the output dimension of conv2, which is hidden_dim
        self.conv3 = GATConv(hidden_dim, output_dim, heads=1, concat=False, dropout=self.dropout_rate, edge_dim=None)

        # Projection head for contrastive loss (optional, but can sometimes help)
        # Let's project the output of conv2 (which is hidden_dim)
        # self.contrastive_proj = Linear(hidden_dim, hidden_dim)

        self.contrastive_loss_weight = 0.5  # Hyperparameter: Balance between cls and contrastive loss

    def forward(self, x, edge_index):
        x = F.dropout(x, p=self.dropout_rate, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x) # ELU activation is common after GAT layers
        x = F.dropout(x, p=self.dropout_rate, training=self.training)

        # Output of conv2 is the embedding 'z' used for contrastive loss
        z = self.conv2(x, edge_index)
        z = F.elu(z)
        z = F.dropout(z, p=self.dropout_rate, training=self.training)

        # Output of conv3 is the final classification logits
        out_logits = self.conv3(z, edge_index)

        # Return both the final logits and the intermediate embeddings
        return out_logits, z

    def contrastive_loss(self, z, labels, margin=1.0):
        """
        Calculates contrastive loss on node embeddings 'z'.
        Pulls nodes with the same label closer and pushes nodes with different labels apart.
        """
        # Ensure labels are on the same device as embeddings
        labels = labels.to(z.device)

        # Normalize embeddings
        norm_z = F.normalize(z, p=2, dim=1)

        # Calculate cosine similarity matrix
        sim_matrix = torch.mm(norm_z, norm_z.t())

        # Create positive and negative masks based on labels
        pos_mask = (labels.unsqueeze(1) == labels.unsqueeze(0)).float()
        neg_mask = 1 - pos_mask

        # Calculate loss for positive pairs (want similarity close to 1)
        # We use (1 - sim) for positive pairs, aiming to minimize this distance
        pos_loss = (1 - sim_matrix) * pos_mask

        # Calculate loss for negative pairs (want similarity less than margin)
        # We use relu(sim - margin), penalizing similarities > margin
        neg_loss = F.relu(sim_matrix - margin) * neg_mask

        # Average the losses over the number of pairs, avoiding division by zero
        # Summing over all pairs (upper/lower triangle, excluding diagonal implicitly by masks)
        num_pos_pairs = pos_mask.sum()
        num_neg_pairs = neg_mask.sum()

        pos_term = pos_loss.sum() / (num_pos_pairs + 1e-8) # Add epsilon for stability
        neg_term = neg_loss.sum() / (num_neg_pairs + 1e-8) # Add epsilon for stability

        # Combine positive and negative losses
        total_contrastive_loss = pos_term + neg_term

        return total_contrastive_loss


logging.info(f"Defined CAGN model.")
# Example instantiation (optional, for checking structure)
# model_cagn = CAGN(node_feat_dim, hidden_dim, output_dim)
# print(model_cagn)


2025-04-12 23:37:56,086 - INFO - Defining CAGN model...
2025-04-12 23:37:56,089 - INFO - Input dim: 365, Hidden dim: 128, Output dim: 1 (2 classes)
2025-04-12 23:37:56,090 - INFO - Defined CAGN model.


In [14]:
# Cell 5: Model Definition (CAGN - With Batched Contrastive Loss)

import torch
import torch.nn.functional as F
from torch.nn import Linear, Sequential, ReLU, BatchNorm1d, Dropout
from torch_geometric.nn import GATConv, MessagePassing
import logging

# --- 6. Model Definition (CAGN) ---
logging.info("Defining CAGN model...")

# Define input/output dimensions based on the 'data' object from Cell 3
# Ensure 'data' exists and has the required attributes
if 'data' not in locals() or data is None:
    raise NameError("Data object 'data' is not available from graph construction.")
if not hasattr(data, 'num_node_features'):
     raise AttributeError("Data object does not have 'num_node_features'. Graph construction might have failed.")
if not hasattr(data, 'y'):
     raise AttributeError("Data object does not have 'y' (labels).")


node_feat_dim = data.num_node_features # Dimension of features in data.x
# Keep hidden_dim and heads reduced as they were causing issues before
hidden_dim = 64                        # Hyperparameter: Size of hidden layers (Kept reduced)
gat_heads = 8                          # Number of attention heads (Kept reduced)
# Determine output dimension based on labels (binary or multi-class)
num_classes = len(torch.unique(data.y))
output_dim = 1 if num_classes == 2 else num_classes # 1 for binary logits, num_classes for multi-class logits
logging.info(f"Input dim: {node_feat_dim}, Hidden dim: {hidden_dim}, Output dim: {output_dim} ({num_classes} classes)")


class CAGN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=2, dropout=0.6): # Default heads=2
        super(CAGN, self).__init__()
        self.dropout_rate = dropout
        # Use edge_dim=None since the similarity graph doesn't have edge features
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=self.dropout_rate, edge_dim=None)
        # Input to conv2 is hidden_dim * heads because concat=True by default in GATConv
        self.conv2 = GATConv(hidden_dim * heads, hidden_dim, heads=1, concat=False, dropout=self.dropout_rate, edge_dim=None)
        # Input to conv3 is the output dimension of conv2, which is hidden_dim
        self.conv3 = GATConv(hidden_dim, output_dim, heads=1, concat=False, dropout=self.dropout_rate, edge_dim=None)

        self.contrastive_loss_weight = 0.5  # Hyperparameter: Balance between cls and contrastive loss

    def forward(self, x, edge_index):
        x = F.dropout(x, p=self.dropout_rate, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x) # ELU activation is common after GAT layers
        x = F.dropout(x, p=self.dropout_rate, training=self.training)

        # Output of conv2 is the embedding 'z' used for contrastive loss
        z = self.conv2(x, edge_index)
        z = F.elu(z)
        z = F.dropout(z, p=self.dropout_rate, training=self.training)

        # Output of conv3 is the final classification logits
        out_logits = self.conv3(z, edge_index)

        # Return both the final logits and the intermediate embeddings
        return out_logits, z

    def contrastive_loss(self, z, labels, margin=1.0, batch_size=1024):
        """
        Calculates contrastive loss on node embeddings 'z' in batches.
        Pulls nodes with the same label closer and pushes nodes with different labels apart.
        """
        # Ensure labels are on the same device as embeddings
        labels = labels.to(z.device)
        num_nodes = z.size(0)

        if num_nodes == 0:
            return torch.tensor(0.0, device=z.device, requires_grad=True) # Handle empty input

        # Normalize embeddings
        norm_z = F.normalize(z, p=2, dim=1)

        total_pos_loss = 0.0
        total_neg_loss = 0.0
        total_pos_pairs = 0.0
        total_neg_pairs = 0.0

        for i in range(0, num_nodes, batch_size):
            # Select batch embeddings and labels
            batch_indices = torch.arange(i, min(i + batch_size, num_nodes), device=z.device)
            batch_z = norm_z[batch_indices]
            batch_labels = labels[batch_indices]

            # Compute similarity between batch and all nodes
            sim_sub_matrix = torch.mm(batch_z, norm_z.t()) # Shape: [batch_size, num_nodes]

            # Create positive and negative masks for the batch vs all nodes
            pos_sub_mask = (batch_labels.unsqueeze(1) == labels.unsqueeze(0)).float() # Shape: [batch_size, num_nodes]
            # Avoid comparing node to itself within the batch's slice of the full matrix
            pos_sub_mask[torch.arange(batch_z.size(0)), batch_indices] = 0
            neg_sub_mask = 1 - pos_sub_mask
            # Also explicitly remove self-comparisons for negative mask, although usually covered by pos_mask
            neg_sub_mask[torch.arange(batch_z.size(0)), batch_indices] = 0


            # Calculate loss for this batch
            batch_pos_loss = (1 - sim_sub_matrix) * pos_sub_mask
            batch_neg_loss = F.relu(sim_sub_matrix - margin) * neg_sub_mask

            # Accumulate losses and pair counts
            total_pos_loss += batch_pos_loss.sum()
            total_neg_loss += batch_neg_loss.sum()
            total_pos_pairs += pos_sub_mask.sum()
            total_neg_pairs += neg_sub_mask.sum()

            # --- Memory cleanup within loop ---
            del sim_sub_matrix, pos_sub_mask, neg_sub_mask, batch_pos_loss, batch_neg_loss
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # --- End Memory cleanup ---


        # Final average loss
        pos_term = total_pos_loss / (total_pos_pairs + 1e-8) # Add epsilon for stability
        neg_term = total_neg_loss / (total_neg_pairs + 1e-8) # Add epsilon for stability

        # Check for NaN potential (if no positive or no negative pairs were found)
        if torch.isnan(pos_term): pos_term = 0.0
        if torch.isnan(neg_term): neg_term = 0.0

        total_contrastive_loss = pos_term + neg_term

        return total_contrastive_loss


logging.info(f"Defined CAGN model with batched contrastive loss.")

2025-04-12 23:37:56,114 - INFO - Defining CAGN model...
2025-04-12 23:37:56,117 - INFO - Input dim: 365, Hidden dim: 64, Output dim: 1 (2 classes)
2025-04-12 23:37:56,118 - INFO - Defined CAGN model with batched contrastive loss.


In [15]:
# Cell 6: Training & Evaluation Setup (All Models)

import torch
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve
import logging
import time
import gc
import numpy as np # Ensure numpy is imported
# --- Baseline Model Imports ---
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, GINConv # Import necessary layers
from torch.nn import Linear, Sequential, ReLU, BatchNorm1d
import torch.nn.functional as F # Ensure F is imported
# --- End Baseline Model Imports ---


# --- Baseline GNN Model Definitions ---
class GCN(torch.nn.Module):
    """Standard GCN model for baseline comparison."""
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training) # Add dropout
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training) # Add dropout
        x = self.conv3(x, edge_index) # Return logits
        return x

class BaselineGAT(torch.nn.Module): # Renamed to avoid conflict if CAGN uses GAT
    """Standard GAT model for baseline comparison."""
    def __init__(self, input_dim, hidden_dim, output_dim, heads=8, dropout=0.6):
        super(BaselineGAT, self).__init__()
        # Ensure hidden_dim is reasonable for multiplication by heads
        # Find the largest multiple of heads <= hidden_dim
        gat_hidden_dim_per_head = hidden_dim // heads
        gat_hidden_dim = gat_hidden_dim_per_head * heads
        if gat_hidden_dim == 0: # Handle case where hidden_dim < heads
             gat_hidden_dim_per_head = 1
             gat_hidden_dim = heads
             logging.warning(f"GAT hidden_dim ({hidden_dim}) < heads ({heads}). Setting hidden per head to 1.")


        if gat_hidden_dim != hidden_dim:
            logging.warning(f"Adjusting GAT hidden_dim from {hidden_dim} to {gat_hidden_dim} to be divisible by heads={heads}")


        self.conv1 = GATConv(input_dim, gat_hidden_dim_per_head, heads=heads, dropout=dropout, concat=True)
        # Input to conv2 is the concatenated output: gat_hidden_dim
        self.conv2 = GATConv(gat_hidden_dim, gat_hidden_dim_per_head, heads=heads, concat=True, dropout=dropout)
        # Input to conv3 is the concatenated output: gat_hidden_dim
        self.conv3 = GATConv(gat_hidden_dim, output_dim, heads=1, concat=False, dropout=dropout) # Final layer output dim is correct

    def forward(self, x, edge_index):
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training) # Match dropout used in GATConv
        x = F.elu(self.conv2(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv3(x, edge_index) # Return logits
        return x


class GraphSAGE(torch.nn.Module):
    """Standard GraphSAGE model for baseline comparison."""
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.conv3 = SAGEConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv3(x, edge_index) # Return logits
        return x

class GIN(torch.nn.Module):
    """Graph Isomorphism Network (GIN) model for baseline comparison."""
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GIN, self).__init__()
        nn1 = Sequential(Linear(input_dim, hidden_dim), ReLU(), Linear(hidden_dim, hidden_dim))
        self.conv1 = GINConv(nn1)
        self.bn1 = BatchNorm1d(hidden_dim)

        nn2 = Sequential(Linear(hidden_dim, hidden_dim), ReLU(), Linear(hidden_dim, hidden_dim))
        self.conv2 = GINConv(nn2)
        self.bn2 = BatchNorm1d(hidden_dim)

        nn3 = Sequential(Linear(hidden_dim, hidden_dim), ReLU(), Linear(hidden_dim, hidden_dim))
        self.conv3 = GINConv(nn3)
        self.bn3 = BatchNorm1d(hidden_dim)

        self.fc = Linear(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.bn1(x)
        x = F.relu(self.conv2(x, edge_index))
        x = self.bn2(x)
        x = F.relu(self.conv3(x, edge_index))
        x = self.bn3(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc(x) # Return logits
        return x

logging.info("Defined baseline GNN models: GCN, BaselineGAT, GraphSAGE, GIN.")
# --- End Baseline Model Definitions ---


# Make sure 'data' object (with masks) and 'CAGN' model class are available
if 'data' not in locals() or data is None:
    raise NameError("Data object 'data' is required but not available.")
if 'CAGN' not in globals():
      raise NameError("CAGN model class not defined. Ensure Cell 5 executed successfully.")
if not hasattr(data, 'train_mask'):
     raise AttributeError("Data object missing 'train_mask'. Ensure node splitting (Cell 4) executed successfully.")

# --- 7. Training & Evaluation Setup ---
logging.info("Setting up training and evaluation functions (Node Classification)...") # Renamed log message

# --- 7.1 Device Configuration ---
# Determine device early
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logging.info(f"Using device: {device}")

# --- 7.2 Loss Functions ---
# Determine if binary or multi-class based on output_dim derived from data.y
# Re-calculate based on data object just in case
num_classes_check = len(torch.unique(data.y))
output_dim_check = 1 if num_classes_check == 2 else num_classes_check
is_binary = (output_dim_check == 1)

if is_binary:
    # Calculate positive class weight for handling imbalance (using node labels)
    num_nodes = data.num_nodes
    train_labels = data.y[data.train_mask]
    num_positives = (train_labels == 1).sum().item()
    num_negatives = (train_labels == 0).sum().item()
    pos_weight = None
    if num_positives > 0 and num_negatives > 0:
        pos_weight_value = num_negatives / num_positives
        pos_weight = torch.tensor([pos_weight_value], device=device)
        logging.info(f"Calculated pos_weight for BCEWithLogitsLoss (Train set): {pos_weight_value:.4f} (on {pos_weight.device})")
    else:
        logging.warning("Could not calculate pos_weight for train set (num_positives or num_negatives is zero). Using default weighting.")

    criterion_cls = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    logging.info(f"Classification loss: {criterion_cls} initialized.")
else:
    # No need for pos_weight for multi-class CrossEntropyLoss
    # Weighting can be added via the 'weight' argument if needed
    criterion_cls = nn.CrossEntropyLoss()
    logging.info(f"Classification loss: {criterion_cls} initialized for multi-class.")


# Contrastive loss is handled by the CAGN model's method

# --- 7.3 Unified Training Function (Single Epoch) ---
def train_epoch(model, data, optimizer, criterion_cls, is_cagn_model=False):
    model.train()
    optimizer.zero_grad()

    # Perform forward pass
    if is_cagn_model:
        out_logits, z_embeddings = model(data.x, data.edge_index)
    else:
        # Baseline models return only logits
        out_logits = model(data.x, data.edge_index)

    # Squeeze logits if necessary (especially for binary classification outputting [N, 1])
    logits_train = out_logits[data.train_mask]
    if is_binary and logits_train.ndim > 1 and logits_train.shape[1] == 1:
        logits_train = logits_train.squeeze(1)
    elif logits_train.ndim == 0: # Handle case where mask selects only one sample
         logits_train = logits_train.unsqueeze(0)

    # Prepare labels
    labels_train = data.y[data.train_mask]
    target_labels = labels_train.float() if is_binary else labels_train.long()

    # Calculate classification loss on training nodes
    cls_loss = criterion_cls(logits_train, target_labels)

    # Calculate contrastive loss ONLY for CAGN
    contrast_loss = torch.tensor(0.0, device=cls_loss.device) # Default to 0
    if is_cagn_model:
        contrast_loss = model.contrastive_loss(z_embeddings[data.train_mask], data.y[data.train_mask])
        total_loss = cls_loss + model.contrastive_loss_weight * contrast_loss
    else:
        total_loss = cls_loss # Total loss is just classification loss for baselines

    # Backpropagation
    total_loss.backward()
    optimizer.step()

    # Return individual losses for logging
    return total_loss.item(), cls_loss.item(), contrast_loss.item()


# --- 7.4 Unified Evaluation Function ---
@torch.no_grad()
def evaluate(model, data, mask, criterion_cls, is_cagn_model=False):
    model.eval()

    # Perform forward pass
    if is_cagn_model:
        out_logits, z_embeddings = model(data.x, data.edge_index)
    else:
        out_logits = model(data.x, data.edge_index)
        z_embeddings = None # Not needed for baselines

    # Calculate classification loss on the specified mask
    logits_eval = out_logits[mask]
    if is_binary and logits_eval.ndim > 1 and logits_eval.shape[1] == 1:
         logits_eval = logits_eval.squeeze(1)
    elif logits_eval.ndim == 0: # Handle case where mask selects only one sample
         logits_eval = logits_eval.unsqueeze(0)

    labels_eval = data.y[mask]
    target_labels_eval = labels_eval.float() if is_binary else labels_eval.long()

    cls_loss = criterion_cls(logits_eval, target_labels_eval).item()

    # Contrastive loss calculation during evaluation is often skipped or handled differently
    # For simplicity, we report it as 0 here for evaluation metrics.
    contrast_loss = 0.0
    # if is_cagn_model and z_embeddings is not None:
        # You *could* calculate it, but it adds overhead to evaluation
        # contrast_loss = model.contrastive_loss(z_embeddings[mask], data.y[mask]).item()

    total_loss = cls_loss + (model.contrastive_loss_weight * contrast_loss if is_cagn_model else 0.0)

    # Get predictions
    if is_binary:
        preds_proba = torch.sigmoid(logits_eval) # Probabilities for binary
        preds = (preds_proba > 0.5).float()            # Threshold at 0.5
    else:
        preds_proba = F.softmax(logits_eval, dim=-1) # Probabilities for multi-class
        preds = preds_proba.argmax(dim=-1)             # Get class with highest probability

    # Get ground truth labels
    labels = data.y[mask]

    # Ensure labels and preds are on CPU for sklearn
    labels_cpu = labels.cpu().numpy()
    preds_cpu = preds.cpu().numpy()
    preds_proba_cpu = preds_proba.cpu().numpy()

    # --- Calculate Metrics ---
    accuracy = accuracy_score(labels_cpu, preds_cpu)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels_cpu, preds_cpu, average='binary' if is_binary else 'macro', zero_division=0
    )

    auc = 0.0
    # Ensure there are samples to evaluate and more than one class in the labels
    if mask.sum() > 0 and len(np.unique(labels_cpu)) > 1:
        try:
            if is_binary:
                # Use probabilities of the positive class (usually index 1 if output is [N] or index 0 if [N,1])
                proba_positive_class = preds_proba_cpu if preds_proba_cpu.ndim == 1 else preds_proba_cpu[:, 0]
                if len(np.unique(labels_cpu)) > 1: # Check again within binary case
                     auc = roc_auc_score(labels_cpu, proba_positive_class)
                else:
                     logging.warning(f"Skipping AUC: Only one class present in labels for the current mask.")
            elif preds_proba_cpu.ndim > 1 and preds_proba_cpu.shape[1] >= 2: # Check if enough classes for multi-class AUC
                auc = roc_auc_score(labels_cpu, preds_proba_cpu, multi_class='ovr', average='macro')
            else:
                 logging.warning("AUC calculation skipped: Not enough classes or incompatible probability shapes.")
        except ValueError as e:
            logging.warning(f"Could not calculate AUC: {e}. Setting AUC to 0.0")
            auc = 0.0
    else:
        # Log why AUC is skipped. Add check for mask.sum() > 0
        if mask.sum() == 0:
             logging.warning(f"AUC calculation skipped: No samples in the current mask.")
        else:
             logging.warning(f"AUC calculation skipped: Only one class ({np.unique(labels_cpu)}) present in labels for the current mask ({mask.sum()} samples).")


    metrics = {
        'loss': total_loss,
        'cls_loss': cls_loss,
        'contrast_loss': contrast_loss, # Will be 0 for baselines and potentially 0 for CAGN eval here
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc
    }
    return metrics


# --- 7.5 Device Configuration (Move data object) ---
try:
    if data.x.device != device:
         data = data.to(device)
         logging.info(f"Moved data object to {device}")
    else:
         logging.info(f"Data object already on {device}")
except Exception as e:
    logging.error(f"Failed to move data to {device}: {e}. Check GPU memory. Falling back to CPU.")
    device = torch.device('cpu')
    data = data.to(device)
    # Re-initialize binary classification criterion with pos_weight on CPU if needed
    if is_binary and pos_weight is not None and pos_weight.device != device:
        pos_weight = pos_weight.to(device)
        criterion_cls = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        logging.info(f"Fallback: Moved pos_weight back to {device} and re-initialized criterion_cls")

logging.info("Unified training loop and evaluation functions defined.")


2025-04-12 23:37:56,162 - INFO - Defined baseline GNN models: GCN, BaselineGAT, GraphSAGE, GIN.
2025-04-12 23:37:56,163 - INFO - Setting up training and evaluation functions (Node Classification)...
2025-04-12 23:37:56,164 - INFO - Using device: cuda
2025-04-12 23:37:57,680 - INFO - Calculated pos_weight for BCEWithLogitsLoss (Train set): 1.0910 (on cuda:0)
2025-04-12 23:37:57,682 - INFO - Classification loss: BCEWithLogitsLoss() initialized.
2025-04-12 23:37:57,704 - INFO - Moved data object to cuda
2025-04-12 23:37:57,705 - INFO - Unified training loop and evaluation functions defined.


In [16]:
# Cell 7: Main Dataset Benchmarking Loop

import pandas as pd
import torch
import gc
import logging
import time
import copy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch_geometric.data import Data
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics.pairwise import pairwise_distances
from scipy import sparse
import torch.optim as optim # Make sure optim is imported here if not globally
import torch.nn as nn      # Make sure nn is imported here if not globally

# --- Dataset Definitions ---
datasets_to_benchmark = {
    "Combined_10pct": "/media/ssd/test/standardized-datasets/combined/combined_unsw_cicRed_botRed_netflow_10pct.csv",
    "CICIDS2018_Reduced": "/media/ssd/test/standardized-datasets/netflow/nf_cic_ids2018_reduced_standardized.csv",
    "BotIoT_v2_Reduced": "/media/ssd/test/standardized-datasets/netflow/nf_bot_iot_v2_reduced_standardized.csv",
    "UNSW_NB15": "/media/ssd/test/standardized-datasets/netflow/nf_unsw_nb15_standardized.csv"
}

# --- Configurations (Defined within this cell for clarity) ---
# Make sure REQUIRED_COLUMNS, numerical_cols, categorical_cols are defined globally or adjusted here if needed per dataset
label_col = 'Label' # Assuming 'Label' is the target column in all standardized datasets
SAMPLED_SIZE_LARGE_CLASSES = 50000 # Or adjust as needed
MIN_LARGE_CLASS_SIZE = 1000
learning_rate = 0.001
weight_decay = 5e-4
epochs = 200
patience = 20
hidden_dim = 64       # Shared hidden dimension
gat_heads = 8         # Heads for GAT/CAGN
dropout_rate = 0.6    # Dropout for GAT/CAGN (<<<< DEFINED HERE)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

all_dataset_results = {} # Store results for each dataset

# --- Loop Through Datasets ---
for dataset_name, dataset_path in datasets_to_benchmark.items():
    print(f"\n{'='*30}\nProcessing Dataset: {dataset_name}\n{'='*30}")
    logging.info(f"Starting processing for dataset: {dataset_name} from {dataset_path}")

    # --- 1. Data Loading ---
    try:
        logging.info(f"Loading dataset: {dataset_name}")
        df = pd.read_csv(dataset_path, low_memory=False)
        logging.info(f"Dataset {dataset_name} loaded. Shape: {df.shape}")
        # Basic verification
        if label_col not in df.columns:
            logging.error(f"Dataset {dataset_name} missing target column: '{label_col}'")
            continue
    except FileNotFoundError:
        logging.error(f"Dataset file not found: {dataset_path}")
        continue
    except Exception as e:
        logging.error(f"Error loading dataset {dataset_name}: {e}")
        continue

    # --- 2. Sampling ---
    logging.info(f"Starting sampling for {dataset_name}...")
    # Make sure create_imbalanced_subset function is defined globally (e.g., from original Cell 1.5)
    try:
        df_sampled = create_imbalanced_subset(
            df,
            target_col=label_col,
            new_dataset_size_large_classes=SAMPLED_SIZE_LARGE_CLASSES,
            min_large_class_size=MIN_LARGE_CLASS_SIZE
        )
        if df_sampled is None or df_sampled.empty:
             logging.warning(f"Sampling resulted in empty dataframe for {dataset_name}. Skipping.")
             del df
             gc.collect()
             continue
        logging.info(f"Sampling complete for {dataset_name}. Sampled shape: {df_sampled.shape}")
        del df
        gc.collect()
    except Exception as e:
        logging.error(f"Error during sampling for {dataset_name}: {e}")
        continue

    # --- 3. Feature Engineering ---
    logging.info(f"Starting feature engineering for {dataset_name}...")
    try:
        # Ensure numerical_cols and categorical_cols are defined globally or adjusted if they vary per dataset
        numerical_cols = ['IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'FLOW_DURATION_MILLISECONDS'] # Example
        categorical_cols = ['PROTOCOL', 'L7_PROTO', 'TCP_FLAGS'] # Example

        columns_to_process = numerical_cols + categorical_cols + [label_col]
        missing_processing_cols = [col for col in columns_to_process if col not in df_sampled.columns]
        if missing_processing_cols:
             logging.error(f"Sampled {dataset_name} missing columns for FE: {missing_processing_cols}")
             del df_sampled
             gc.collect()
             continue

        flows_df_processed = df_sampled[columns_to_process].copy()

        # --- Process Numerical ---
        for col in numerical_cols:
            flows_df_processed[col] = pd.to_numeric(flows_df_processed[col], errors='coerce')
        flows_df_processed[numerical_cols] = flows_df_processed[numerical_cols].fillna(0)
        log_transformed_features = np.log1p(flows_df_processed[numerical_cols].values)
        scaler = StandardScaler()
        scaled_numerical_features = scaler.fit_transform(log_transformed_features)
        scaled_numerical_df = pd.DataFrame(scaled_numerical_features, index=flows_df_processed.index, columns=numerical_cols)

        # --- Process Categorical ---
        flows_df_processed[categorical_cols] = flows_df_processed[categorical_cols].astype(str).fillna('missing')
        categorical_encoded_df = pd.get_dummies(
            flows_df_processed[categorical_cols],
            columns=categorical_cols,
            prefix=categorical_cols,
            dummy_na=False,
            dtype=int
        )

        # --- Combine Features ---
        X_df = pd.concat([scaled_numerical_df, categorical_encoded_df.set_index(scaled_numerical_df.index)], axis=1)
        X = X_df.values
        y = flows_df_processed[label_col].values.astype(np.int64)

        logging.info(f"Feature engineering complete for {dataset_name}. X shape: {X.shape}, y shape: {y.shape}")
        del df_sampled, flows_df_processed, log_transformed_features, scaled_numerical_features
        del scaled_numerical_df, categorical_encoded_df, X_df
        gc.collect()

    except Exception as e:
        logging.error(f"Error during feature engineering for {dataset_name}: {e}")
        if 'df_sampled' in locals(): del df_sampled
        gc.collect()
        continue

    # --- 4. Graph Construction ---
    logging.info(f"Starting adaptive graph construction for {dataset_name}...")
    # Make sure adaptive_graph_construction function is defined globally (e.g., from original Cell 3)
    try:
        data = adaptive_graph_construction(X, y, k=20, threshold=0.5)
        logging.info(f"Graph construction complete for {dataset_name}. Data object: {data}")
        del X, y
        gc.collect()
    except MemoryError as e:
         logging.error(f"MemoryError during graph construction for {dataset_name}: {e}. Skipping dataset.")
         if 'X' in locals(): del X
         if 'y' in locals(): del y
         gc.collect()
         continue
    except Exception as e:
        logging.error(f"Error during graph construction for {dataset_name}: {e}")
        if 'X' in locals(): del X
        if 'y' in locals(): del y
        gc.collect()
        continue

    # --- 5. Data Splitting (Masks) ---
    logging.info(f"Creating train/val/test masks for {dataset_name}...")
    try:
        num_nodes = data.num_nodes
        node_indices = np.arange(num_nodes)
        labels_split = data.y.cpu().numpy()

        train_ratio = 0.70
        val_ratio = 0.15
        test_ratio = 0.15

        try:
            train_indices, temp_indices, _, y_temp = train_test_split(
                node_indices, labels_split, train_size=train_ratio, random_state=42, stratify=labels_split
            )
            val_relative_ratio = val_ratio / (val_ratio + test_ratio)
            val_indices, test_indices, _, _ = train_test_split(
                temp_indices, y_temp, train_size=val_relative_ratio, random_state=42, stratify=y_temp
            )
        except ValueError as split_error:
             logging.warning(f"Stratified split failed for {dataset_name}: {split_error}. Using non-stratified split.")
             train_indices, temp_indices = train_test_split(node_indices, train_size=train_ratio, random_state=42)
             val_relative_ratio = val_ratio / (val_ratio + test_ratio)
             val_indices, test_indices = train_test_split(temp_indices, train_size=val_relative_ratio, random_state=42)

        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[train_indices] = True
        val_mask[val_indices] = True
        test_mask[test_indices] = True

        data.train_mask = train_mask
        data.val_mask = val_mask
        data.test_mask = test_mask
        logging.info(f"Masks created for {dataset_name}. Train: {data.train_mask.sum()}, Val: {data.val_mask.sum()}, Test: {data.test_mask.sum()}")
        del node_indices, labels_split, train_indices, temp_indices, val_indices, test_indices
        if 'y_temp' in locals(): del y_temp
        gc.collect()
    except Exception as e:
        logging.error(f"Error during data splitting for {dataset_name}: {e}")
        del data
        gc.collect()
        continue

    # --- 6. Move Data to Device ---
    original_device = device # Store the intended device
    try:
        data = data.to(device)
        logging.info(f"Moved data for {dataset_name} to {device}")
    except Exception as e:
        logging.error(f"Failed to move data for {dataset_name} to {device}: {e}. Trying CPU.")
        try:
            device = torch.device('cpu') # Fallback device for this dataset run
            data = data.to(device)
            logging.info(f"Using CPU for {dataset_name}.")
        except Exception as cpu_e:
             logging.error(f"Failed to move data for {dataset_name} to CPU: {cpu_e}. Skipping dataset.")
             del data
             gc.collect()
             continue

    # --- 7. Model Training & Evaluation Loop (Inner Loop) ---
    node_feat_dim = data.num_node_features
    num_classes = len(torch.unique(data.y))
    output_dim = 1 if num_classes == 2 else num_classes
    is_binary_check = (output_dim == 1)

    # Recalculate criterion based on current device/labels
    pos_weight_current = None # Reset pos_weight for each dataset
    if is_binary_check:
        train_labels = data.y[data.train_mask]
        num_positives = (train_labels == 1).sum().item()
        num_negatives = (train_labels == 0).sum().item()
        if num_positives > 0 and num_negatives > 0:
            pos_weight_value = num_negatives / num_positives
            pos_weight_current = torch.tensor([pos_weight_value], device=device) # Use current device
        criterion_cls = nn.BCEWithLogitsLoss(pos_weight=pos_weight_current)
    else:
        criterion_cls = nn.CrossEntropyLoss()


    models_to_train = {
        "CAGN": CAGN(node_feat_dim, hidden_dim, output_dim, heads=gat_heads, dropout=dropout_rate),
        "GCN": GCN(node_feat_dim, hidden_dim, output_dim),
        "BaselineGAT": BaselineGAT(node_feat_dim, hidden_dim, output_dim, heads=gat_heads, dropout=dropout_rate),
        "GraphSAGE": GraphSAGE(node_feat_dim, hidden_dim, output_dim),
        "GIN": GIN(node_feat_dim, hidden_dim, output_dim)
    }

    current_dataset_results_list = []
    current_dataset_histories = {}

    for model_name, model_instance in models_to_train.items():
        print(f"    --- Training {model_name} for {dataset_name} ---")
        logging.info(f"Starting training for {model_name} on {dataset_name}")
        model = model_instance.to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=10, min_lr=1e-6)
        is_cagn = isinstance(model, CAGN)
        best_val_loss = float('inf')
        epochs_no_improve = 0
        best_model_state = None
        history = {'train_loss': [], 'train_cls_loss': [], 'train_contrast_loss': [],
                   'val_loss': [], 'val_cls_loss': [], 'val_contrast_loss': [],
                   'val_f1': [], 'val_auc': []}
        loop_start_time = time.time()

        for epoch in range(1, epochs + 1):
            epoch_start_time = time.time()
            train_total_loss, train_cls_loss, train_contrast_loss = train_epoch(
                model, data, optimizer, criterion_cls, is_cagn_model=is_cagn
            )
            val_metrics = evaluate(model, data, data.val_mask, criterion_cls, is_cagn_model=is_cagn)
            val_loss = val_metrics['loss']
            val_f1 = val_metrics['f1']
            val_auc = val_metrics['auc']
            scheduler_loss = val_metrics['cls_loss'] if not is_cagn else val_loss
            scheduler.step(scheduler_loss)

            history['train_loss'].append(train_total_loss)
            history['train_cls_loss'].append(train_cls_loss)
            history['train_contrast_loss'].append(train_contrast_loss)
            history['val_loss'].append(val_metrics['loss'])
            history['val_cls_loss'].append(val_metrics['cls_loss'])
            history['val_contrast_loss'].append(val_metrics['contrast_loss'])
            history['val_f1'].append(val_f1)
            history['val_auc'].append(val_auc)

            epoch_duration = time.time() - epoch_start_time
            if epoch % 50 == 0 or epoch == 1:
                 train_loss_str = f"{train_total_loss:.4f} (CLS: {train_cls_loss:.4f}" + (f", Contr: {train_contrast_loss:.4f})" if is_cagn else ")")
                 val_loss_str = f"{val_metrics['loss']:.4f} (CLS: {val_metrics['cls_loss']:.4f})"
                 print(f'    Epoch: {epoch:03d}, Train Loss: {train_loss_str}, Val Loss: {val_loss_str}, Val F1: {val_f1:.4f}, Val AUC: {val_auc:.4f}')

            monitor_loss = val_metrics['cls_loss'] if not is_cagn else val_loss
            if monitor_loss < best_val_loss:
                best_val_loss = monitor_loss
                epochs_no_improve = 0
                best_model_state = copy.deepcopy(model.state_dict())
            else:
                epochs_no_improve += 1
            if epochs_no_improve >= patience:
                logging.info(f"Early stopping for {model_name} on {dataset_name} at epoch {epoch}.")
                break

        total_training_time = time.time() - loop_start_time
        current_dataset_histories[model_name] = history

        if best_model_state:
            model.load_state_dict(best_model_state)
        test_metrics = evaluate(model, data, data.test_mask, criterion_cls, is_cagn_model=is_cagn)
        print(f"    Test F1 for {model_name}: {test_metrics['f1']:.4f}, AUC: {test_metrics['auc']:.4f}")

                # --- Save the best model state (Added) ---
        # Check if the current model is CAGN and if a best state was found during training
        if model_name == "CAGN" and best_model_state is not None:
            save_dir = "/media/ssd/test/GNN/Standardized Models/CAGN-GAT/" # Define save directory
            # Save the best model for the specific dataset being processed
            save_path = os.path.join(save_dir, f"best_cagn_model_{dataset_name}.pt") # Include dataset name in filename
            try:
                os.makedirs(save_dir, exist_ok=True) # Ensure directory exists
                torch.save(best_model_state, save_path) # Save the state dictionary
                logging.info(f"Saved best {model_name} model state for {dataset_name} to {save_path}")
            except Exception as save_e:
                logging.error(f"Error saving {model_name} model state for {dataset_name}: {save_e}")
        # --- End Save Model State ---
        
        current_dataset_results_list.append({
            'Model': model_name,
            'Accuracy': test_metrics['accuracy'],
            'Precision': test_metrics['precision'],
            'Recall': test_metrics['recall'],
            'F1': test_metrics['f1'],
            'AUC': test_metrics['auc'],
            'Test Loss (CLS)': test_metrics['cls_loss'],
            'Training Time (s)': total_training_time
        })

        del model, model_instance, optimizer, scheduler, history, best_model_state, test_metrics
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        # --- End Inner Training Loop Placeholder ---

    # --- 8. Store and Display Results for Current Dataset ---
    if current_dataset_results_list:
        results_df = pd.DataFrame(current_dataset_results_list)
        results_df = results_df.sort_values(by='F1', ascending=False)
        print(f"\n--- Results for {dataset_name} ---")
        print(results_df.round(4).to_markdown(index=False))
        all_dataset_results[dataset_name] = results_df
        # Optional plotting per dataset can go here

    # --- 9. Clean up before next dataset ---
    del data
    if 'train_mask' in locals(): del train_mask, val_mask, test_mask
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    # Reset device to original intended device if it was changed to CPU
    device = original_device


# --- End of Dataset Loop ---
print(f"\n{'='*30}\nBenchmark Complete\n{'='*30}")


# --- Final Summary Cell (e.g., New Cell 8) ---
print("\n--- Overall Benchmark Summary ---")
for dataset_name, results_df in all_dataset_results.items():
    print(f"\n--- Results for {dataset_name} ---")
    print(results_df.round(4).to_markdown(index=False))

# You could also concatenate results into a single DataFrame
# combined_results = pd.concat(all_dataset_results, names=['Dataset']).reset_index(level=0)
# print("\n--- Combined Results Table ---")
# print(combined_results.round(4).to_markdown(index=False))

2025-04-12 23:37:58,070 - INFO - Starting processing for dataset: Combined_10pct from /media/ssd/test/standardized-datasets/combined/combined_unsw_cicRed_botRed_netflow_10pct.csv
2025-04-12 23:37:58,071 - INFO - Loading dataset: Combined_10pct



Processing Dataset: Combined_10pct


2025-04-12 23:38:01,560 - INFO - Dataset Combined_10pct loaded. Shape: (655094, 47)
2025-04-12 23:38:01,564 - INFO - Starting sampling for Combined_10pct...
2025-04-12 23:38:01,564 - INFO - Starting imbalanced sampling. Target size for large classes: 50000
2025-04-12 23:38:01,571 - INFO - Original class distribution:\nLabel
0    341806
1    313288
Name: count, dtype: int64
2025-04-12 23:38:01,572 - INFO - Found 2 large classes (>= 1000 samples).
2025-04-12 23:38:01,573 - INFO - Scaling factor for large classes: 0.0763
2025-04-12 23:38:01,573 - INFO -   Sampling class '0': target size=26088, final sample size=26088
2025-04-12 23:38:01,648 - INFO -   Sampling class '1': target size=23911, final sample size=23911
2025-04-12 23:38:01,743 - INFO - Finished sampling. New dataset size: 49999
2025-04-12 23:38:01,745 - INFO - New class distribution:\nLabel
0    26088
1    23911
Name: count, dtype: int64
2025-04-12 23:38:01,806 - INFO - Sampling complete for Combined_10pct. Sampled shape: (49999

    --- Training CAGN for Combined_10pct ---
    Epoch: 001, Train Loss: 1.4275 (CLS: 0.9477, Contr: 0.9596), Val Loss: 0.6366 (CLS: 0.6366), Val F1: 0.7599, Val AUC: 0.8789


2025-04-12 23:39:18,956 - INFO - Early stopping for CAGN on Combined_10pct at epoch 43.
2025-04-12 23:39:18,996 - INFO - Saved best CAGN model state for Combined_10pct to /media/ssd/test/GNN/Standardized Models/CAGN-GAT/best_cagn_model_Combined_10pct.pt


    Test F1 for CAGN: 0.8233, AUC: 0.9582


2025-04-12 23:39:19,264 - INFO - Starting training for GCN on Combined_10pct


    --- Training GCN for Combined_10pct ---
    Epoch: 001, Train Loss: 0.7043 (CLS: 0.7043), Val Loss: 0.6891 (CLS: 0.6891), Val F1: 0.7305, Val AUC: 0.8326
    Epoch: 050, Train Loss: 0.2839 (CLS: 0.2839), Val Loss: 0.2642 (CLS: 0.2642), Val F1: 0.9187, Val AUC: 0.9630
    Epoch: 100, Train Loss: 0.2162 (CLS: 0.2162), Val Loss: 0.2062 (CLS: 0.2062), Val F1: 0.9403, Val AUC: 0.9723
    Epoch: 150, Train Loss: 0.1927 (CLS: 0.1927), Val Loss: 0.1885 (CLS: 0.1885), Val F1: 0.9432, Val AUC: 0.9755
    Epoch: 200, Train Loss: 0.1784 (CLS: 0.1784), Val Loss: 0.1789 (CLS: 0.1789), Val F1: 0.9480, Val AUC: 0.9771
    Test F1 for GCN: 0.9433, AUC: 0.9770


2025-04-12 23:39:24,373 - INFO - Starting training for BaselineGAT on Combined_10pct


    --- Training BaselineGAT for Combined_10pct ---
    Epoch: 001, Train Loss: 0.8349 (CLS: 0.8349), Val Loss: 0.7243 (CLS: 0.7243), Val F1: 0.3211, Val AUC: 0.5115
    Epoch: 050, Train Loss: 0.3961 (CLS: 0.3961), Val Loss: 0.2809 (CLS: 0.2809), Val F1: 0.9169, Val AUC: 0.9598
    Epoch: 100, Train Loss: 0.3362 (CLS: 0.3362), Val Loss: 0.2257 (CLS: 0.2257), Val F1: 0.9309, Val AUC: 0.9698
    Epoch: 150, Train Loss: 0.3244 (CLS: 0.3244), Val Loss: 0.2151 (CLS: 0.2151), Val F1: 0.9341, Val AUC: 0.9698
    Epoch: 200, Train Loss: 0.3179 (CLS: 0.3179), Val Loss: 0.2126 (CLS: 0.2126), Val F1: 0.9345, Val AUC: 0.9704
    Test F1 for BaselineGAT: 0.9290, AUC: 0.9725


2025-04-12 23:39:31,902 - INFO - Starting training for GraphSAGE on Combined_10pct


    --- Training GraphSAGE for Combined_10pct ---
    Epoch: 001, Train Loss: 0.7214 (CLS: 0.7214), Val Loss: 0.7140 (CLS: 0.7140), Val F1: 0.7014, Val AUC: 0.7901
    Epoch: 050, Train Loss: 0.3043 (CLS: 0.3043), Val Loss: 0.2964 (CLS: 0.2964), Val F1: 0.9019, Val AUC: 0.9562
    Epoch: 100, Train Loss: 0.1936 (CLS: 0.1936), Val Loss: 0.1872 (CLS: 0.1872), Val F1: 0.9460, Val AUC: 0.9760
    Epoch: 150, Train Loss: 0.1561 (CLS: 0.1561), Val Loss: 0.1555 (CLS: 0.1555), Val F1: 0.9539, Val AUC: 0.9823
    Epoch: 200, Train Loss: 0.1380 (CLS: 0.1380), Val Loss: 0.1415 (CLS: 0.1415), Val F1: 0.9579, Val AUC: 0.9850
    Test F1 for GraphSAGE: 0.9554, AUC: 0.9855


2025-04-12 23:39:42,106 - INFO - Starting training for GIN on Combined_10pct


    --- Training GIN for Combined_10pct ---
    Epoch: 001, Train Loss: 0.7936 (CLS: 0.7936), Val Loss: 0.9884 (CLS: 0.9884), Val F1: 0.0055, Val AUC: 0.4501
    Epoch: 050, Train Loss: 0.3150 (CLS: 0.3150), Val Loss: 0.3455 (CLS: 0.3455), Val F1: 0.8190, Val AUC: 0.9419
    Epoch: 100, Train Loss: 0.1472 (CLS: 0.1472), Val Loss: 0.2733 (CLS: 0.2733), Val F1: 0.9595, Val AUC: 0.9822
    Epoch: 150, Train Loss: 0.1028 (CLS: 0.1028), Val Loss: 0.1139 (CLS: 0.1139), Val F1: 0.9652, Val AUC: 0.9925


2025-04-12 23:39:52,689 - INFO - Early stopping for GIN on Combined_10pct at epoch 195.


    Test F1 for GIN: 0.9650, AUC: 0.9933


2025-04-12 23:39:53,088 - INFO - Starting processing for dataset: CICIDS2018_Reduced from /media/ssd/test/standardized-datasets/netflow/nf_cic_ids2018_reduced_standardized.csv
2025-04-12 23:39:53,089 - INFO - Loading dataset: CICIDS2018_Reduced



--- Results for Combined_10pct ---
| Model       |   Accuracy |   Precision |   Recall |     F1 |    AUC |   Test Loss (CLS) |   Training Time (s) |
|:------------|-----------:|------------:|---------:|-------:|-------:|------------------:|--------------------:|
| GIN         |     0.9669 |      0.976  |   0.9543 | 0.965  | 0.9933 |            0.1036 |             10.5824 |
| GraphSAGE   |     0.9579 |      0.9669 |   0.9442 | 0.9554 | 0.9855 |            0.1432 |              9.9704 |
| GCN         |     0.9469 |      0.9645 |   0.9231 | 0.9433 | 0.977  |            0.186  |              4.8885 |
| BaselineGAT |     0.9321 |      0.9296 |   0.9284 | 0.929  | 0.9725 |            0.2105 |              7.3028 |
| CAGN        |     0.8507 |      0.9484 |   0.7273 | 0.8233 | 0.9582 |            0.3551 |             29.3136 |

Processing Dataset: CICIDS2018_Reduced


2025-04-12 23:40:04,523 - INFO - Dataset CICIDS2018_Reduced loaded. Shape: (2080710, 47)
2025-04-12 23:40:04,524 - INFO - Starting sampling for CICIDS2018_Reduced...
2025-04-12 23:40:04,524 - INFO - Starting imbalanced sampling. Target size for large classes: 50000
2025-04-12 23:40:04,540 - INFO - Original class distribution:\nLabel
0    1497201
1     583509
Name: count, dtype: int64
2025-04-12 23:40:04,541 - INFO - Found 2 large classes (>= 1000 samples).
2025-04-12 23:40:04,542 - INFO - Scaling factor for large classes: 0.0240
2025-04-12 23:40:04,542 - INFO -   Sampling class '0': target size=35978, final sample size=35978
2025-04-12 23:40:04,965 - INFO -   Sampling class '1': target size=14021, final sample size=14021
2025-04-12 23:40:05,193 - INFO - Finished sampling. New dataset size: 49999
2025-04-12 23:40:05,195 - INFO - New class distribution:\nLabel
0    35978
1    14021
Name: count, dtype: int64
2025-04-12 23:40:05,196 - INFO - Sampling complete for CICIDS2018_Reduced. Sample

    --- Training CAGN for CICIDS2018_Reduced ---
    Epoch: 001, Train Loss: 1.7133 (CLS: 1.2324, Contr: 0.9617), Val Loss: 0.8971 (CLS: 0.8971), Val F1: 0.6224, Val AUC: 0.8274
    Epoch: 050, Train Loss: 0.9818 (CLS: 0.6230, Contr: 0.7176), Val Loss: 0.3747 (CLS: 0.3747), Val F1: 0.8583, Val AUC: 0.9562
    Epoch: 100, Train Loss: 0.9382 (CLS: 0.5903, Contr: 0.6958), Val Loss: 0.3493 (CLS: 0.3493), Val F1: 0.8785, Val AUC: 0.9602


2025-04-12 23:42:29,837 - INFO - Early stopping for CAGN on CICIDS2018_Reduced at epoch 149.
2025-04-12 23:42:29,874 - INFO - Saved best CAGN model state for CICIDS2018_Reduced to /media/ssd/test/GNN/Standardized Models/CAGN-GAT/best_cagn_model_CICIDS2018_Reduced.pt


    Test F1 for CAGN: 0.8761, AUC: 0.9623


2025-04-12 23:42:30,138 - INFO - Starting training for GCN on CICIDS2018_Reduced


    --- Training GCN for CICIDS2018_Reduced ---
    Epoch: 001, Train Loss: 1.0860 (CLS: 1.0860), Val Loss: 1.0504 (CLS: 1.0504), Val F1: 0.2698, Val AUC: 0.3299
    Epoch: 050, Train Loss: 0.4354 (CLS: 0.4354), Val Loss: 0.4197 (CLS: 0.4197), Val F1: 0.8453, Val AUC: 0.9481
    Epoch: 100, Train Loss: 0.2793 (CLS: 0.2793), Val Loss: 0.2626 (CLS: 0.2626), Val F1: 0.9245, Val AUC: 0.9727
    Epoch: 150, Train Loss: 0.2406 (CLS: 0.2406), Val Loss: 0.2309 (CLS: 0.2309), Val F1: 0.9431, Val AUC: 0.9759


2025-04-12 23:42:35,246 - INFO - Starting training for BaselineGAT on CICIDS2018_Reduced


    Epoch: 200, Train Loss: 0.2211 (CLS: 0.2211), Val Loss: 0.2167 (CLS: 0.2167), Val F1: 0.9470, Val AUC: 0.9778
    Test F1 for GCN: 0.9455, AUC: 0.9787
    --- Training BaselineGAT for CICIDS2018_Reduced ---
    Epoch: 001, Train Loss: 1.0975 (CLS: 1.0975), Val Loss: 0.8761 (CLS: 0.8761), Val F1: 0.6312, Val AUC: 0.8441
    Epoch: 050, Train Loss: 0.5565 (CLS: 0.5565), Val Loss: 0.3814 (CLS: 0.3814), Val F1: 0.8450, Val AUC: 0.9465
    Epoch: 100, Train Loss: 0.4923 (CLS: 0.4923), Val Loss: 0.3163 (CLS: 0.3163), Val F1: 0.8615, Val AUC: 0.9635
    Epoch: 150, Train Loss: 0.4684 (CLS: 0.4684), Val Loss: 0.2863 (CLS: 0.2863), Val F1: 0.8866, Val AUC: 0.9665


2025-04-12 23:42:42,805 - INFO - Starting training for GraphSAGE on CICIDS2018_Reduced


    Epoch: 200, Train Loss: 0.4508 (CLS: 0.4508), Val Loss: 0.2725 (CLS: 0.2725), Val F1: 0.9128, Val AUC: 0.9668
    Test F1 for BaselineGAT: 0.9081, AUC: 0.9678
    --- Training GraphSAGE for CICIDS2018_Reduced ---
    Epoch: 001, Train Loss: 0.9992 (CLS: 0.9992), Val Loss: 0.9851 (CLS: 0.9851), Val F1: 0.5148, Val AUC: 0.7066
    Epoch: 050, Train Loss: 0.4006 (CLS: 0.4006), Val Loss: 0.3786 (CLS: 0.3786), Val F1: 0.8741, Val AUC: 0.9561
    Epoch: 100, Train Loss: 0.2630 (CLS: 0.2630), Val Loss: 0.2531 (CLS: 0.2531), Val F1: 0.9278, Val AUC: 0.9708
    Epoch: 150, Train Loss: 0.2267 (CLS: 0.2267), Val Loss: 0.2198 (CLS: 0.2198), Val F1: 0.9445, Val AUC: 0.9772


2025-04-12 23:42:49,064 - INFO - Starting training for GIN on CICIDS2018_Reduced


    Epoch: 200, Train Loss: 0.2124 (CLS: 0.2124), Val Loss: 0.2068 (CLS: 0.2068), Val F1: 0.9434, Val AUC: 0.9788
    Test F1 for GraphSAGE: 0.9451, AUC: 0.9772
    --- Training GIN for CICIDS2018_Reduced ---
    Epoch: 001, Train Loss: 1.1166 (CLS: 1.1166), Val Loss: 0.9420 (CLS: 0.9420), Val F1: 0.4982, Val AUC: 0.7778
    Epoch: 050, Train Loss: 0.4640 (CLS: 0.4640), Val Loss: 0.4822 (CLS: 0.4822), Val F1: 0.7852, Val AUC: 0.9435
    Epoch: 100, Train Loss: 0.2262 (CLS: 0.2262), Val Loss: 0.3945 (CLS: 0.3945), Val F1: 0.7914, Val AUC: 0.9691
    Epoch: 150, Train Loss: 0.1729 (CLS: 0.1729), Val Loss: 0.2076 (CLS: 0.2076), Val F1: 0.9548, Val AUC: 0.9791
    Epoch: 200, Train Loss: 0.1672 (CLS: 0.1672), Val Loss: 0.1857 (CLS: 0.1857), Val F1: 0.9525, Val AUC: 0.9827
    Test F1 for GIN: 0.9544, AUC: 0.9837


2025-04-12 23:42:56,216 - INFO - Starting processing for dataset: BotIoT_v2_Reduced from /media/ssd/test/standardized-datasets/netflow/nf_bot_iot_v2_reduced_standardized.csv
2025-04-12 23:42:56,217 - INFO - Loading dataset: BotIoT_v2_Reduced



--- Results for CICIDS2018_Reduced ---
| Model       |   Accuracy |   Precision |   Recall |     F1 |    AUC |   Test Loss (CLS) |   Training Time (s) |
|:------------|-----------:|------------:|---------:|-------:|-------:|------------------:|--------------------:|
| GIN         |     0.9752 |      0.9853 |   0.9253 | 0.9544 | 0.9837 |            0.1803 |              6.7868 |
| GCN         |     0.9703 |      0.9728 |   0.9196 | 0.9455 | 0.9787 |            0.2152 |              4.9405 |
| GraphSAGE   |     0.97   |      0.9704 |   0.9211 | 0.9451 | 0.9772 |            0.2094 |              6.0875 |
| BaselineGAT |     0.9481 |      0.9027 |   0.9135 | 0.9081 | 0.9678 |            0.2715 |              7.3861 |
| CAGN        |     0.9287 |      0.8541 |   0.8992 | 0.8761 | 0.9623 |            0.3424 |             99.8804 |

Processing Dataset: BotIoT_v2_Reduced


2025-04-12 23:42:57,994 - INFO - Dataset BotIoT_v2_Reduced loaded. Shape: (345256, 47)
2025-04-12 23:42:57,996 - INFO - Starting sampling for BotIoT_v2_Reduced...
2025-04-12 23:42:57,996 - INFO - Starting imbalanced sampling. Target size for large classes: 50000
2025-04-12 23:42:58,000 - INFO - Original class distribution:\nLabel
1    339853
0      5403
Name: count, dtype: int64
2025-04-12 23:42:58,001 - INFO - Found 2 large classes (>= 1000 samples).
2025-04-12 23:42:58,001 - INFO - Scaling factor for large classes: 0.1448
2025-04-12 23:42:58,002 - INFO -   Sampling class '1': target size=49217, final sample size=49217
2025-04-12 23:42:58,071 - INFO -   Sampling class '0': target size=782, final sample size=782
2025-04-12 23:42:58,103 - INFO - Finished sampling. New dataset size: 49999
2025-04-12 23:42:58,105 - INFO - New class distribution:\nLabel
1    49217
0      782
Name: count, dtype: int64
2025-04-12 23:42:58,105 - INFO - Sampling complete for BotIoT_v2_Reduced. Sampled shape: (

    --- Training CAGN for BotIoT_v2_Reduced ---
    Epoch: 001, Train Loss: 0.5182 (CLS: 0.0296, Contr: 0.9772), Val Loss: 0.0176 (CLS: 0.0176), Val F1: 0.9242, Val AUC: 0.9179
    Epoch: 050, Train Loss: 0.3415 (CLS: 0.0169, Contr: 0.6492), Val Loss: 0.0100 (CLS: 0.0100), Val F1: 0.9670, Val AUC: 0.9226
    Epoch: 100, Train Loss: 0.3358 (CLS: 0.0170, Contr: 0.6378), Val Loss: 0.0095 (CLS: 0.0095), Val F1: 0.9682, Val AUC: 0.9392


2025-04-12 23:45:18,875 - INFO - Early stopping for CAGN on BotIoT_v2_Reduced at epoch 112.
2025-04-12 23:45:18,912 - INFO - Saved best CAGN model state for BotIoT_v2_Reduced to /media/ssd/test/GNN/Standardized Models/CAGN-GAT/best_cagn_model_BotIoT_v2_Reduced.pt


    Test F1 for CAGN: 0.9688, AUC: 0.9638


2025-04-12 23:45:19,180 - INFO - Starting training for GCN on BotIoT_v2_Reduced


    --- Training GCN for BotIoT_v2_Reduced ---
    Epoch: 001, Train Loss: 0.0244 (CLS: 0.0244), Val Loss: 0.0217 (CLS: 0.0217), Val F1: 0.0000, Val AUC: 0.7933
    Epoch: 050, Train Loss: 0.0103 (CLS: 0.0103), Val Loss: 0.0093 (CLS: 0.0093), Val F1: 0.9242, Val AUC: 0.9796
    Epoch: 100, Train Loss: 0.0065 (CLS: 0.0065), Val Loss: 0.0058 (CLS: 0.0058), Val F1: 0.9679, Val AUC: 0.9879
    Epoch: 150, Train Loss: 0.0051 (CLS: 0.0051), Val Loss: 0.0047 (CLS: 0.0047), Val F1: 0.9765, Val AUC: 0.9913


2025-04-12 23:45:24,216 - INFO - Starting training for BaselineGAT on BotIoT_v2_Reduced


    Epoch: 200, Train Loss: 0.0044 (CLS: 0.0044), Val Loss: 0.0042 (CLS: 0.0042), Val F1: 0.9766, Val AUC: 0.9935
    Test F1 for GCN: 0.9779, AUC: 0.9939
    --- Training BaselineGAT for BotIoT_v2_Reduced ---
    Epoch: 001, Train Loss: 0.0480 (CLS: 0.0480), Val Loss: 0.0233 (CLS: 0.0233), Val F1: 0.0519, Val AUC: 0.4818
    Epoch: 050, Train Loss: 0.0171 (CLS: 0.0171), Val Loss: 0.0096 (CLS: 0.0096), Val F1: 0.9241, Val AUC: 0.9577
    Epoch: 100, Train Loss: 0.0130 (CLS: 0.0130), Val Loss: 0.0081 (CLS: 0.0081), Val F1: 0.9243, Val AUC: 0.9586
    Epoch: 150, Train Loss: 0.0133 (CLS: 0.0133), Val Loss: 0.0074 (CLS: 0.0074), Val F1: 0.9243, Val AUC: 0.9607


2025-04-12 23:45:31,791 - INFO - Starting training for GraphSAGE on BotIoT_v2_Reduced


    Epoch: 200, Train Loss: 0.0113 (CLS: 0.0113), Val Loss: 0.0070 (CLS: 0.0070), Val F1: 0.9243, Val AUC: 0.9637
    Test F1 for BaselineGAT: 0.9245, AUC: 0.9680
    --- Training GraphSAGE for BotIoT_v2_Reduced ---
    Epoch: 001, Train Loss: 0.0212 (CLS: 0.0212), Val Loss: 0.0205 (CLS: 0.0205), Val F1: 0.1129, Val AUC: 0.9555
    Epoch: 050, Train Loss: 0.0089 (CLS: 0.0089), Val Loss: 0.0085 (CLS: 0.0085), Val F1: 0.9499, Val AUC: 0.9879
    Epoch: 100, Train Loss: 0.0045 (CLS: 0.0045), Val Loss: 0.0042 (CLS: 0.0042), Val F1: 0.9691, Val AUC: 0.9914
    Epoch: 150, Train Loss: 0.0038 (CLS: 0.0038), Val Loss: 0.0037 (CLS: 0.0037), Val F1: 0.9691, Val AUC: 0.9919


2025-04-12 23:45:36,456 - INFO - Starting training for GIN on BotIoT_v2_Reduced


    Epoch: 200, Train Loss: 0.0036 (CLS: 0.0036), Val Loss: 0.0035 (CLS: 0.0035), Val F1: 0.9731, Val AUC: 0.9927
    Test F1 for GraphSAGE: 0.9737, AUC: 0.9941
    --- Training GIN for BotIoT_v2_Reduced ---
    Epoch: 001, Train Loss: 0.0242 (CLS: 0.0242), Val Loss: 0.0457 (CLS: 0.0457), Val F1: 0.0000, Val AUC: 0.5236
    Epoch: 050, Train Loss: 0.0112 (CLS: 0.0112), Val Loss: 0.0139 (CLS: 0.0139), Val F1: 0.9900, Val AUC: 0.9252
    Epoch: 100, Train Loss: 0.0068 (CLS: 0.0068), Val Loss: 0.0096 (CLS: 0.0096), Val F1: 0.9725, Val AUC: 0.9660


2025-04-12 23:45:40,199 - INFO - Early stopping for GIN on BotIoT_v2_Reduced at epoch 142.


    Test F1 for GIN: 0.9777, AUC: 0.9664

--- Results for BotIoT_v2_Reduced ---
| Model       |   Accuracy |   Precision |   Recall |     F1 |    AUC |   Test Loss (CLS) |   Training Time (s) |
|:------------|-----------:|------------:|---------:|-------:|-------:|------------------:|--------------------:|
| GCN         |     0.9573 |      0.9997 |   0.9569 | 0.9779 | 0.9939 |            0.004  |              4.8673 |
| GIN         |     0.9569 |      0.9992 |   0.9571 | 0.9777 | 0.9664 |            0.0078 |              3.7422 |
| GraphSAGE   |     0.9496 |      0.9999 |   0.9489 | 0.9737 | 0.9941 |            0.0035 |              4.497  |
| CAGN        |     0.9404 |      0.9993 |   0.9401 | 0.9688 | 0.9638 |            0.0086 |             75.6037 |
| BaselineGAT |     0.8617 |      0.9998 |   0.8597 | 0.9245 | 0.968  |            0.0068 |              7.4019 |


2025-04-12 23:45:40,533 - INFO - Starting processing for dataset: UNSW_NB15 from /media/ssd/test/standardized-datasets/netflow/nf_unsw_nb15_standardized.csv
2025-04-12 23:45:40,533 - INFO - Loading dataset: UNSW_NB15



Processing Dataset: UNSW_NB15


2025-04-12 23:45:52,967 - INFO - Dataset UNSW_NB15 loaded. Shape: (2390275, 47)
2025-04-12 23:45:52,969 - INFO - Starting sampling for UNSW_NB15...
2025-04-12 23:45:52,969 - INFO - Starting imbalanced sampling. Target size for large classes: 50000
2025-04-12 23:45:52,987 - INFO - Original class distribution:\nLabel
0    2295222
1      95053
Name: count, dtype: int64
2025-04-12 23:45:52,989 - INFO - Found 2 large classes (>= 1000 samples).
2025-04-12 23:45:52,989 - INFO - Scaling factor for large classes: 0.0209
2025-04-12 23:45:52,990 - INFO -   Sampling class '0': target size=48011, final sample size=48011
2025-04-12 23:45:53,593 - INFO -   Sampling class '1': target size=1988, final sample size=1988
2025-04-12 23:45:53,673 - INFO - Finished sampling. New dataset size: 49999
2025-04-12 23:45:53,675 - INFO - New class distribution:\nLabel
0    48011
1     1988
Name: count, dtype: int64
2025-04-12 23:45:53,676 - INFO - Sampling complete for UNSW_NB15. Sampled shape: (49999, 47)
2025-04-

    --- Training CAGN for UNSW_NB15 ---
    Epoch: 001, Train Loss: 2.4032 (CLS: 1.9140, Contr: 0.9783), Val Loss: 1.2883 (CLS: 1.2883), Val F1: 0.1175, Val AUC: 0.6244


2025-04-12 23:47:07,698 - INFO - Early stopping for CAGN on UNSW_NB15 at epoch 40.
2025-04-12 23:47:07,735 - INFO - Saved best CAGN model state for UNSW_NB15 to /media/ssd/test/GNN/Standardized Models/CAGN-GAT/best_cagn_model_UNSW_NB15.pt


    Test F1 for CAGN: 0.1824, AUC: 0.8942


2025-04-12 23:47:08,000 - INFO - Starting training for GCN on UNSW_NB15


    --- Training GCN for UNSW_NB15 ---
    Epoch: 001, Train Loss: 1.3907 (CLS: 1.3907), Val Loss: 1.3303 (CLS: 1.3303), Val F1: 0.0807, Val AUC: 0.5779
    Epoch: 050, Train Loss: 0.7065 (CLS: 0.7065), Val Loss: 0.6553 (CLS: 0.6553), Val F1: 0.4467, Val AUC: 0.9370
    Epoch: 100, Train Loss: 0.4936 (CLS: 0.4936), Val Loss: 0.4758 (CLS: 0.4758), Val F1: 0.5074, Val AUC: 0.9671
    Epoch: 150, Train Loss: 0.4191 (CLS: 0.4191), Val Loss: 0.4191 (CLS: 0.4191), Val F1: 0.6033, Val AUC: 0.9772


2025-04-12 23:47:13,092 - INFO - Starting training for BaselineGAT on UNSW_NB15


    Epoch: 200, Train Loss: 0.3578 (CLS: 0.3578), Val Loss: 0.3768 (CLS: 0.3768), Val F1: 0.4179, Val AUC: 0.9820
    Test F1 for GCN: 0.4212, AUC: 0.9864
    --- Training BaselineGAT for UNSW_NB15 ---
    Epoch: 001, Train Loss: 1.7527 (CLS: 1.7527), Val Loss: 1.3513 (CLS: 1.3513), Val F1: 0.0706, Val AUC: 0.4891
    Epoch: 050, Train Loss: 0.9758 (CLS: 0.9758), Val Loss: 0.7231 (CLS: 0.7231), Val F1: 0.4066, Val AUC: 0.9342
    Epoch: 100, Train Loss: 0.8462 (CLS: 0.8462), Val Loss: 0.5814 (CLS: 0.5814), Val F1: 0.3631, Val AUC: 0.9522
    Epoch: 150, Train Loss: 0.8078 (CLS: 0.8078), Val Loss: 0.5622 (CLS: 0.5622), Val F1: 0.3703, Val AUC: 0.9531


2025-04-12 23:47:20,761 - INFO - Starting training for GraphSAGE on UNSW_NB15


    Epoch: 200, Train Loss: 0.7819 (CLS: 0.7819), Val Loss: 0.5558 (CLS: 0.5558), Val F1: 0.3696, Val AUC: 0.9534
    Test F1 for BaselineGAT: 0.3829, AUC: 0.9522
    --- Training GraphSAGE for UNSW_NB15 ---
    Epoch: 001, Train Loss: 1.3195 (CLS: 1.3195), Val Loss: 1.3116 (CLS: 1.3116), Val F1: 0.0764, Val AUC: 0.8215
    Epoch: 050, Train Loss: 0.6804 (CLS: 0.6804), Val Loss: 0.6695 (CLS: 0.6695), Val F1: 0.5332, Val AUC: 0.9536
    Epoch: 100, Train Loss: 0.2843 (CLS: 0.2843), Val Loss: 0.2693 (CLS: 0.2693), Val F1: 0.7530, Val AUC: 0.9837
    Epoch: 150, Train Loss: 0.2071 (CLS: 0.2071), Val Loss: 0.1858 (CLS: 0.1858), Val F1: 0.7872, Val AUC: 0.9947
    Epoch: 200, Train Loss: 0.1688 (CLS: 0.1688), Val Loss: 0.1496 (CLS: 0.1496), Val F1: 0.8051, Val AUC: 0.9964
    Test F1 for GraphSAGE: 0.8326, AUC: 0.9958


2025-04-12 23:47:28,102 - INFO - Starting training for GIN on UNSW_NB15


    --- Training GIN for UNSW_NB15 ---
    Epoch: 001, Train Loss: 1.3982 (CLS: 1.3982), Val Loss: 1.4246 (CLS: 1.4246), Val F1: 0.0762, Val AUC: 0.4905
    Epoch: 050, Train Loss: 0.7010 (CLS: 0.7010), Val Loss: 1.0064 (CLS: 1.0064), Val F1: 0.6201, Val AUC: 0.9435
    Epoch: 100, Train Loss: 0.3431 (CLS: 0.3431), Val Loss: 0.4849 (CLS: 0.4849), Val F1: 0.8646, Val AUC: 0.9939
    Epoch: 150, Train Loss: 0.1620 (CLS: 0.1620), Val Loss: 0.1775 (CLS: 0.1775), Val F1: 0.8943, Val AUC: 0.9977
    Epoch: 200, Train Loss: 0.2134 (CLS: 0.2134), Val Loss: 0.9718 (CLS: 0.9718), Val F1: 0.3267, Val AUC: 0.9859
    Test F1 for GIN: 0.8702, AUC: 0.9973

--- Results for UNSW_NB15 ---
| Model       |   Accuracy |   Precision |   Recall |     F1 |    AUC |   Test Loss (CLS) |   Training Time (s) |
|:------------|-----------:|------------:|---------:|-------:|-------:|------------------:|--------------------:|
| GIN         |     0.9883 |      0.7763 |   0.9899 | 0.8702 | 0.9973 |            0.1336 |