In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
import torch.optim as optim
import time

# Load the CSV file
file_path = 'train_dataset2.csv'
data = pd.read_csv(file_path, low_memory=False)

# Convert all columns to numeric; anything that cannot be parsed becomes NaN.
# NOTE: a column that is entirely non-numeric becomes all-NaN and ends up as a
# constant 0 column below.
data_numeric = data.apply(pd.to_numeric, errors='coerce')

# Column statistics must be computed on FINITE values only. Taking the mean
# while +/-inf is still present yields an inf/NaN mean, which would make the
# missing-value fill below fall through to the column max instead of the mean.
finite_only = data_numeric.replace([np.inf, -np.inf], np.nan)
column_mean = finite_only.mean()
column_max = finite_only.max()
column_min = finite_only.min()

# 1) Missing values -> finite column mean (infinities are left in place here
#    so they can be told apart from missing values in the next steps).
data_filled = data_numeric.fillna(column_mean)

# 2) +inf -> largest finite value of the column.
data_filled = data_filled.replace(np.inf, np.nan).fillna(column_max)

# 3) -inf -> smallest finite value of the column.
data_filled = data_filled.replace(-np.inf, np.nan).fillna(column_min)

# Columns with no finite value at all have NaN statistics, so their cells are
# still NaN at this point; fall back to 0 for those.
if data_filled.isnull().values.any():
    data_filled = data_filled.fillna(0)  # Alternatively, you can use other strategies to fill NaNs

# Normalize the data (zero mean, unit variance per column)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_filled)

# Convert to PyTorch tensors
node_features = torch.tensor(data_scaled, dtype=torch.float32)

# Create a k-nearest neighbors graph over the scaled feature rows
k = 10  # You can adjust this parameter
knn_graph = kneighbors_graph(data_scaled, k, mode='connectivity', include_self=False)

# Stack the (row, col) index arrays into one (2, num_edges) ndarray before
# handing it to torch; building a tensor from a tuple of ndarrays is a slow path.
# NOTE(review): row = query node, col = one of its k neighbours. Confirm this
# edge direction is the intended message-passing direction, or symmetrise.
edge_index = torch.tensor(np.vstack(knn_graph.nonzero()), dtype=torch.long)

# Create the PyTorch Geometric data object
graph_data = Data(x=node_features, edge_index=edge_index)

print(graph_data)

class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE network built from two stacked ``SAGEConv`` layers.

    The first layer maps ``in_channels`` to ``hidden_channels`` and is followed
    by a ReLU; the second maps ``hidden_channels`` to ``out_channels``. The
    forward pass returns log-probabilities (log-softmax over dimension 1).
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two convolution layers.

        Args:
            in_channels: Size of each input node feature vector.
            hidden_channels: Output size of the first convolution.
            out_channels: Output size of the second convolution.
        """
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-row log-probabilities for ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``
                (graph connectivity), e.g. a PyTorch Geometric ``Data``.

        Returns:
            Output of the second convolution after ``log_softmax`` along
            dimension 1.
        """
        edges = data.edge_index
        hidden = F.relu(self.conv1(data.x, edges))
        scores = self.conv2(hidden, edges)
        return F.log_softmax(scores, dim=1)

# Build the model once (it was previously constructed twice, with the first
# instance thrown away after printing) and show its architecture.
model = GraphSAGE(in_channels=node_features.size(1), hidden_channels=128, out_channels=2)
print(model)

# Assume labels are available and split data into training and test sets.
# For demonstration, dummy labels are drawn uniformly at random from {0, 1}.
# NOTE: because these labels carry no signal, the loss is expected to settle
# near ln(2) ~= 0.693 and test accuracy near 50%. Replace with real labels
# before drawing any conclusion from the numbers printed during training.
labels = torch.randint(0, 2, (node_features.size(0),))
train_mask = torch.rand(node_features.size(0)) < 0.8  # ~80% of nodes for training

# Prepare the data for training
graph_data.y = labels
graph_data.train_mask = train_mask
graph_data.test_mask = ~train_mask

# Define the loss and optimizer.
# The model already returns log-probabilities (log_softmax), so the matching
# criterion is NLLLoss. CrossEntropyLoss expects raw logits and would apply
# log_softmax a second time (redundant work, same loss value).
criterion = torch.nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

#Timing the training process
start_time = time.time()
# Training loop
def train():
    """Run one full-graph optimisation step and return the training loss.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``graph_data``. The loss is computed only on rows selected by
    ``graph_data.train_mask``.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    selected = graph_data.train_mask
    predictions = model(graph_data)
    step_loss = criterion(predictions[selected], graph_data.y[selected])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

# Test function
def test():
    """Evaluate the module-level ``model`` on the held-out nodes.

    Runs a forward pass over ``graph_data`` in eval mode and compares the
    arg-max prediction with ``graph_data.y`` on rows selected by
    ``graph_data.test_mask``.

    Returns:
        Fraction of test rows predicted correctly, or ``nan`` when the test
        mask selects no rows (instead of raising ``ZeroDivisionError``).
    """
    model.eval()
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # avoids building a full computation graph on every call.
    with torch.no_grad():
        out = model(graph_data)
    pred = out.argmax(dim=1)

    test_mask = graph_data.test_mask
    total = int(test_mask.sum())
    if total == 0:
        return float('nan')

    correct = (pred[test_mask] == graph_data.y[test_mask]).sum()
    acc = int(correct) / total
    return acc

# Run 200 training epochs; every tenth epoch, evaluate on the test split and
# report the latest training loss together with the test accuracy.
for epoch in range(1, 201):
    loss = train()
    if epoch % 10:
        continue  # only report on multiples of ten
    acc = test()
    print(f'Epoch: {epoch}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}')

# Wall-clock time since start_time was recorded (includes periodic evaluation)
end_time = time.time()
print(f"Training time: {end_time-start_time:.2f} seconds")


Data(x=[760151, 85], edge_index=[2, 7601510])
GraphSAGE(
  (conv1): SAGEConv(85, 128, aggr=mean)
  (conv2): SAGEConv(128, 2, aggr=mean)
)
Epoch: 10, Loss: 0.7054, Test Accuracy: 0.5014
Epoch: 20, Loss: 0.6959, Test Accuracy: 0.5011
Epoch: 30, Loss: 0.6938, Test Accuracy: 0.5009
Epoch: 40, Loss: 0.6933, Test Accuracy: 0.4994
Epoch: 50, Loss: 0.6931, Test Accuracy: 0.5006
Epoch: 60, Loss: 0.6930, Test Accuracy: 0.5007
Epoch: 70, Loss: 0.6929, Test Accuracy: 0.4987
Epoch: 80, Loss: 0.6929, Test Accuracy: 0.5003
Epoch: 90, Loss: 0.6929, Test Accuracy: 0.5003
Epoch: 100, Loss: 0.6928, Test Accuracy: 0.5004
Epoch: 110, Loss: 0.6928, Test Accuracy: 0.5006
Epoch: 120, Loss: 0.6928, Test Accuracy: 0.5003
Epoch: 130, Loss: 0.6927, Test Accuracy: 0.5004
Epoch: 140, Loss: 0.6927, Test Accuracy: 0.5007
Epoch: 150, Loss: 0.6927, Test Accuracy: 0.5002
Epoch: 160, Loss: 0.6927, Test Accuracy: 0.5004
Epoch: 170, Loss: 0.6926, Test Accuracy: 0.5004
Epoch: 180, Loss: 0.6926, Test Accuracy: 0.5003
Epoch: 