In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
from scipy.sparse import coo_matrix
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
import torch.optim as optim
import time
from tqdm import tqdm

# Read the CSV into a DataFrame and report how long the read took.
file_path = 'CIC_data.csv'  # Update this with the actual path to your CSV file
start_time = time.time()
data = pd.read_csv(file_path, low_memory=False)
elapsed = time.time() - start_time
print(f"Data loading time: {elapsed:.2f} seconds")

# Header names may carry stray leading/trailing spaces; normalise them so
# that the 'Label' lookup below resolves.
data.columns = data.columns.str.strip()

# Apply the same whitespace clean-up to the class names themselves.
label_column = data['Label'].str.strip()
data['Label'] = label_column

# Show which classes are present in the full dataset ...
unique_labels = label_column.unique()
print(f"Unique labels in the dataset: {unique_labels}")

# ... and how many rows belong to each of them.
label_counts = label_column.value_counts()
print("Label distribution in the dataset:")
print(label_counts)

# Sample a fraction of the rows (20%); the fixed seed keeps the subset
# reproducible between runs.
data_sampled = data.sample(frac=0.2, random_state=42)

# Convert all columns to numeric, coerce errors to NaN.
# The text 'Label' column turns into an all-NaN column here; the real labels
# are read back from data_sampled below and the column is dropped from the
# feature matrix.
start_time = time.time()
data_numeric = data_sampled.apply(pd.to_numeric, errors='coerce')
print(f"Conversion to numeric time: {time.time() - start_time:.2f} seconds")

# Fill NaN values with the mean of each column.
# The per-column statistics are computed with +/-inf masked out: a single
# infinite entry would otherwise make the column mean inf (or NaN), so the
# missing values of that column would be "filled" with inf instead of a
# representative value.
start_time = time.time()
finite_only = data_numeric.replace([np.inf, -np.inf], np.nan)
column_mean = finite_only.mean()
column_max = finite_only.max()
column_min = finite_only.min()
data_filled = data_numeric.fillna(column_mean)
print(f"Filling NaNs time: {time.time() - start_time:.2f} seconds")

# Handle infinite values: clamp +inf to the largest finite value of the
# column and -inf to the smallest finite value of the column.
start_time = time.time()
data_filled = data_filled.replace(np.inf, np.nan).fillna(column_max)
data_filled = data_filled.replace(-np.inf, np.nan).fillna(column_min)
print(f"Handling infinities time: {time.time() - start_time:.2f} seconds")

# Columns without a single finite value have NaN statistics and are still
# NaN at this point; zero them so the scaler receives a fully finite matrix.
start_time = time.time()
if data_filled.isnull().values.any():
    data_filled = data_filled.fillna(0)
print(f"Final NaN check and fill time: {time.time() - start_time:.2f} seconds")

# Extract labels (taken from the un-coerced sample so the class names survive)
labels = data_sampled['Label']
data_filled = data_filled.drop(columns=['Label'])

# Summarise the classes that are present in the sampled subset.
unique_labels = labels.unique()
print(f"Unique labels in the sampled dataset: {unique_labels}")

label_counts = labels.value_counts()
print("Label distribution in the sampled dataset:")
print(label_counts)

# Training a classifier needs at least two classes; stop early otherwise.
if len(unique_labels) < 2:
    raise ValueError("The dataset contains only one class. Add more data with different classes to train the model.")

# Standardise the feature columns and report how long it took.
start_time = time.time()
scaler = StandardScaler()
data_scaled = scaler.fit(data_filled).transform(data_filled)
elapsed = time.time() - start_time
print(f"Normalization time: {elapsed:.2f} seconds")

# The scaled matrix becomes the float32 node-feature tensor of the graph.
node_features = torch.tensor(data_scaled, dtype=torch.float32)
print("Conversion to PyTorch tensors completed.")

# Create a k-nearest neighbors graph over the scaled feature rows.
# NOTE: kneighbors_graph is one blocking call, so the tqdm bar cannot advance
# while it runs; it simply jumps from 0% to 100% when the call returns.
start_time = time.time()
k = 5
print("Constructing k-nearest neighbors graph...")
with tqdm(total=data_scaled.shape[0]) as pbar:
    knn_graph = kneighbors_graph(data_scaled, k, mode='connectivity', include_self=False)
    pbar.update(data_scaled.shape[0])
print(f"k-nearest neighbors graph construction time: {time.time() - start_time:.2f} seconds")

# Convert knn_graph to COO format to get parallel row/col index arrays
knn_graph_coo = coo_matrix(knn_graph)

# Create edge_index from the COO indices.
# The two index arrays are stacked into a single (2, num_edges) ndarray
# first: handing torch.tensor a Python list of ndarrays is extremely slow
# for millions of edges (PyTorch emits a warning about exactly this).
# NOTE(review): row i of the kNN matrix lists i's neighbours, so these edges
# point from each node to its neighbours; confirm this is the direction in
# which SAGEConv is meant to aggregate (swap row/col otherwise).
edge_index = torch.tensor(
    np.vstack((knn_graph_coo.row, knn_graph_coo.col)),
    dtype=torch.long,
)
print(f"Edge index shape: {edge_index.shape}")

# Create the PyTorch Geometric data object
graph_data = Data(x=node_features, edge_index=edge_index)

# Convert labels to numeric: each class name gets its position in
# unique_labels as integer id.
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
labels_numeric = labels.map(label_mapping).values
labels_tensor = torch.tensor(labels_numeric, dtype=torch.long)

# Add labels to the graph data
graph_data.y = labels_tensor

# Check label distribution (now keyed by integer class id)
print("Label distribution in the dataset:")
print(pd.Series(labels_numeric).value_counts())

# Create a random ~80/20 train/test split over the nodes.
# A dedicated, seeded generator makes the split reproducible (matching the
# seeded row sampling above) without touching the global RNG state.
split_generator = torch.Generator().manual_seed(42)
train_mask = torch.rand(node_features.size(0), generator=split_generator) < 0.8
test_mask = ~train_mask
graph_data.train_mask = train_mask
graph_data.test_mask = test_mask

# int(...) so the sizes print as plain numbers instead of "tensor(N)"
print(f"Train size: {int(graph_data.train_mask.sum())}, Test size: {int(graph_data.test_mask.sum())}")

# Define the GraphSAGE model
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE network for node classification.

    The first SAGEConv layer maps ``in_channels`` to ``hidden_channels`` and
    is followed by a ReLU; the second maps ``hidden_channels`` to
    ``out_channels``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities (log-softmax over dim 1).

        ``data`` must expose ``x`` (node features) and ``edge_index``.
        """
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        scores = self.conv2(hidden, data.edge_index)
        return F.log_softmax(scores, dim=1)

# Initialize the model: one input channel per feature column, one output
# channel per class.
model = GraphSAGE(in_channels=node_features.size(1), hidden_channels=128, out_channels=len(label_mapping))
print(model)

# Define the loss and optimizer.
# GraphSAGE.forward already returns log-probabilities (log_softmax), so the
# matching criterion is NLLLoss; CrossEntropyLoss would apply log_softmax a
# second time on top of them.
criterion = torch.nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# One full-batch optimisation step
def train():
    """Run a single training step and return its loss as a Python float.

    The forward pass covers the whole graph; the loss is computed only on
    the nodes selected by ``graph_data.train_mask``.
    """
    model.train()
    optimizer.zero_grad()
    mask = graph_data.train_mask
    log_probs = model(graph_data)
    loss = criterion(log_probs[mask], graph_data.y[mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# Test function
def test():
    """Return the accuracy of the model on the test nodes.

    The forward pass covers the whole graph; only nodes selected by
    ``graph_data.test_mask`` are scored. Returns NaN when the test mask
    selects no nodes, instead of raising ZeroDivisionError.
    """
    model.eval()
    mask = graph_data.test_mask
    num_test = int(mask.sum())
    if num_test == 0:
        return float("nan")
    # Evaluation needs no gradients; skipping autograd bookkeeping avoids
    # building a graph over every node on each evaluation.
    with torch.no_grad():
        pred = model(graph_data).argmax(dim=1)
    correct = int((pred[mask] == graph_data.y[mask]).sum())
    return correct / num_test

# Train for 100 epochs; after every 10th epoch report the loss of that epoch
# together with the accuracy on the test nodes.
for epoch in range(1, 101):
    loss = train()
    if epoch % 10 != 0:
        continue
    acc = test()
    print(f'Epoch: {epoch}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}')

# Save the model
#torch.save(model.state_dict(), 'graphsage_model.pth')
#print("Model saved to graphsage_model.pth")

Data loading time: 29.90 seconds
Unique labels in the dataset: ['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']
Label distribution in the dataset:
Label
BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64
Conversion to numeric time: 0.67 seconds
Filling NaNs time: 0.55 seconds

  0%|          | 0/566149 [00:00<?, ?it/s]