In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
import torch.optim as optim
import time

# Load the CSV file
file_path = 'CIC_data.csv'
start_time = time.time()
data = pd.read_csv(file_path, low_memory=False)
print(f"Data loading time: {time.time() - start_time:.2f} seconds")

# Extract labels BEFORE any numeric coercion or imputation.
# pd.to_numeric(errors='coerce') turns every non-numeric label (e.g. a class
# name) into NaN, and the fill steps below would then replace the whole
# column with a single constant, collapsing all samples into one class.
# Numeric labels with gaps would likewise be "imputed" with a column mean.
labels = data['Label']

# Convert all feature columns to numeric, coerce errors to NaN
start_time = time.time()
data_numeric = data.drop(columns=['Label']).apply(pd.to_numeric, errors='coerce')
print(f"Conversion to numeric time: {time.time() - start_time:.2f} seconds")

# Fill NaN values with the mean of each column.
# The statistics are computed on a copy with +/-inf masked out; otherwise a
# single infinite entry makes the column mean inf (or NaN), and the NaNs of
# that column would not be replaced by a meaningful value.
start_time = time.time()
finite_values = data_numeric.replace([np.inf, -np.inf], np.nan)
column_means = finite_values.mean()
column_maxes = finite_values.max()
del finite_values  # only the per-column statistics are needed from here on
data_filled = data_numeric.fillna(column_means)
print(f"Filling NaNs time: {time.time() - start_time:.2f} seconds")

# Handle infinite values: replace them with the largest finite value of
# their column.
# NOTE(review): -inf is also mapped to the column maximum, as in the original
# logic; switch to the column minimum if negative infinities can occur.
start_time = time.time()
data_filled = data_filled.replace([np.inf, -np.inf], np.nan)
data_filled = data_filled.fillna(column_maxes)
print(f"Handling infinities time: {time.time() - start_time:.2f} seconds")

# Check again for any remaining NaNs and fill them.
# Columns with no finite value at all (e.g. text columns coerced to NaN) have
# NaN statistics and are still empty here; they are zero-filled.
start_time = time.time()
if data_filled.isnull().values.any():
    data_filled = data_filled.fillna(0)  # Alternatively, you can use other strategies to fill NaNs
print(f"Final NaN check and fill time: {time.time() - start_time:.2f} seconds")

# Standardize every feature column with scikit-learn's StandardScaler.
# The fitted scaler is kept in `scaler` so it remains available afterwards.
start_time = time.time()
scaler = StandardScaler()
scaler.fit(data_filled)
data_scaled = scaler.transform(data_filled)
elapsed = time.time() - start_time
print(f"Normalization time: {elapsed:.2f} seconds")

# Node feature matrix: one row per sample, stored as float32
node_features = torch.tensor(
    data_scaled,
    dtype=torch.float32,
)
print("Conversion to PyTorch tensors completed.")

# Create a k-nearest neighbors graph
start_time = time.time()
k = 10  # You can adjust this parameter
knn_graph = kneighbors_graph(data_scaled, k, mode='connectivity', include_self=False)
print(f"k-nearest neighbors graph construction time: {time.time() - start_time:.2f} seconds")

# knn_graph.nonzero() returns the pair (row_indices, col_indices). Stacking
# them gives the [2, num_edges] layout that PyTorch Geometric expects for
# edge_index, so no transpose is applied: transposing would produce
# [num_edges, 2], which the message-passing layers reject.
# NOTE(review): each edge runs from a sample (row) to one of its k neighbours
# (column); swap the two rows if messages should flow from the neighbours
# into the sample instead -- confirm the intended direction.
edge_index = torch.tensor(np.vstack(knn_graph.nonzero()), dtype=torch.long).contiguous()

# Create the PyTorch Geometric data object
graph_data = Data(x=node_features, edge_index=edge_index)

# Encode each distinct label as an integer class id, numbered in order of
# first appearance, and attach the encoded labels to the graph as `y`.
label_mapping = {}
for class_id, class_label in enumerate(labels.unique()):
    label_mapping[class_label] = class_id
labels_numeric = labels.map(label_mapping).values
labels_tensor = torch.tensor(labels_numeric, dtype=torch.long)
graph_data.y = labels_tensor

# Random node-level split: each node lands in the training set with
# probability 0.8, and every remaining node goes to the test set.
train_mask = torch.rand(node_features.size(0)).lt(0.8)
test_mask = torch.logical_not(train_mask)
graph_data.train_mask, graph_data.test_mask = train_mask, test_mask

# Define the GraphSAGE model
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE node classifier.

    Stacks two ``SAGEConv`` layers with a ReLU in between and returns
    per-node log-probabilities over ``out_channels`` classes.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two convolution layers.

        Args:
            in_channels: Number of input features per node.
            hidden_channels: Width of the hidden representation.
            out_channels: Number of output classes.
        """
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return log-softmax class scores for every node of ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.
        """
        edges = data.edge_index
        hidden = F.relu(self.conv1(data.x, edges))
        scores = self.conv2(hidden, edges)
        return F.log_softmax(scores, dim=1)

# Initialize the model: one output channel per distinct label
model = GraphSAGE(in_channels=node_features.size(1), hidden_channels=128, out_channels=len(label_mapping))
print(model)

# Define the loss and optimizer.
# The model's forward pass already ends in log_softmax, so its outputs are
# log-probabilities. NLLLoss is the criterion that consumes log-probabilities;
# CrossEntropyLoss expects raw logits and would apply log_softmax a second
# time. The loss value is the same, without the redundant normalization.
criterion = torch.nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Single optimization step
def train():
    """Run one full-batch training step and return the loss as a float.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``graph_data``; only nodes selected by ``graph_data.train_mask``
    contribute to the loss.
    """
    model.train()
    optimizer.zero_grad()

    selected = graph_data.train_mask
    predictions = model(graph_data)
    step_loss = criterion(predictions[selected], graph_data.y[selected])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

# Test function
def test():
    """Return the classification accuracy on the test nodes.

    Uses the module-level ``model`` and ``graph_data``. The forward pass runs
    under ``torch.no_grad()`` so that evaluation does not record an autograd
    graph for the whole input graph on every call.

    Returns:
        Fraction of nodes selected by ``graph_data.test_mask`` whose predicted
        class equals ``graph_data.y``; ``nan`` when the test mask selects no
        node (accuracy is undefined in that case).
    """
    model.eval()
    with torch.no_grad():
        out = model(graph_data)
    pred = out.argmax(dim=1)

    mask = graph_data.test_mask
    total = int(mask.sum())
    if total == 0:
        # Avoid a ZeroDivisionError when the random split left no test nodes.
        return float('nan')

    correct = (pred[mask] == graph_data.y[mask]).sum()
    acc = int(correct) / total
    return acc

# Run 400 training epochs; report loss and test accuracy on every 10th epoch
for epoch in range(1, 401):
    loss = train()
    if epoch % 10 != 0:
        continue
    acc = test()
    print(f'Epoch: {epoch}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}')


# Save the model
#torch.save(model.state_dict(), 'graphsage_model.pth')
#print("Model saved to graphsage_model.pth")


Data loading time: 25.74 seconds
Conversion to numeric time: 3.03 seconds
Filling NaNs time: 1.74 seconds
Handling infinities time: 5.85 seconds
Final NaN check and fill time: 0.75 seconds
Normalization time: 4.07 seconds
Conversion to PyTorch tensors completed.
