### Training GNNs

#### Step 1: Prepare Data for GNN

In [4]:
import torch
from torch_geometric.data import Data
import networkx as nx
import pandas as pd

# Function to convert data to PyTorch Geometric format
def create_pyg_data(features_df, classes_df, edgelist_df):
    # Create a directed graph
    G = nx.DiGraph()

    # Add nodes with features and labels
    for _, row in features_df.iterrows():
        tx_id = row['txId']
        features = row[1:].tolist()
        tx_class = classes_df.loc[classes_df['txId'] == tx_id, 'class'].values[0]
        G.add_node(tx_id, features=features, label=tx_class)

    # Add edges
    for _, row in edgelist_df.iterrows():
        G.add_edge(row['txId1'], row['txId2'])

    # Convert to PyTorch Geometric data format
    edge_index = torch.tensor(list(G.edges)).t().contiguous()
    x = torch.tensor([G.nodes[node]['features'] for node in G.nodes], dtype=torch.float)
    y = torch.tensor([G.nodes[node]['label'] for node in G.nodes], dtype=torch.long)
    
    data = Data(x=x, edge_index=edge_index, y=y)
    return data

# Load the split datasets
X_train = pd.read_csv('../data/splits/training_features.csv')
y_train = pd.read_csv('../data/splits/training_labels.csv')
X_test = pd.read_csv('../data/splits/test_features.csv')
y_test = pd.read_csv('../data/splits/test_labels.csv')
edgelist_df = pd.read_csv('../data/filtered/filtered_edgelist.csv')

# Merge features and labels for train and test datasets
train_data = create_pyg_data(X_train, y_train, edgelist_df)
test_data = create_pyg_data(X_test, y_test, edgelist_df)

print(train_data)
print(test_data)


KeyError: 'txId'

#### Step 2: Define the GNN Model

In [None]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Initialize the model
num_features = train_data.num_node_features
num_classes = len(set(train_data.y.numpy()))
model = GCN(num_features, num_classes)
print(model)


#### Step 3: Train the Model

In [None]:
import torch.optim as optim

# Training function
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out, data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 200
for epoch in range(num_epochs):
    loss = train(model, train_data, optimizer, criterion)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}')


#### Step 4: Evaluate the Model

In [None]:
def test(model, data):
    model.eval()
    out = model(data)
    pred = out.argmax(dim=1)
    correct = pred == data.y
    acc = int(correct.sum()) / correct.size(0)
    return acc

# Testing the model
train_acc = test(model, train_data)
test_acc = test(model, test_data)
print(f'Train Accuracy: {train_acc:.4f}')
print(f'Test Accuracy: {test_acc:.4f}')
