### Training GNNs

#### Step 1: Prepare Data for GNN

In [54]:
# Step 1: Import necessary libraries
import pandas as pd
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_scipy_sparse_matrix
import networkx as nx
from sklearn.model_selection import train_test_split

In [110]:
# Load the filtered datasets
classes_df = pd.read_csv('../data/filtered/filtered_classes.csv')
edgelist_df = pd.read_csv('../data/filtered/filtered_edgelist.csv')
features_df = pd.read_csv('../data/filtered/filtered_features.csv', header=None)

# Rename columns for features_df
features_df.columns = ['txId'] + [f'feature_{i}' for i in range(1, features_df.shape[1])]

# Merge features and classes data
data = pd.merge(features_df, classes_df, on='txId')

# Display the first few rows of the merged data
data.head()

Unnamed: 0,txId,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_158,feature_159,feature_160,feature_161,feature_162,feature_163,feature_164,feature_165,feature_166,class
0,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,2
1,232029206,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,-0.163645,...,-0.613614,0.241128,0.241406,0.60412,0.008632,-0.131155,0.333211,-0.120613,-0.119792,2
2,232344069,1,-0.147852,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.137933,...,-0.613614,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2
3,27553029,1,-0.151357,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.141519,...,-0.582077,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2
4,3881097,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.02914,0.242712,-0.16364,...,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.05445,-1.760926,-1.760984,2


In [111]:
# Separate features and labels
X = data.drop(columns=['class'])
y = data[['txId', 'class']]

In [55]:
# Step 2: Prepare the data
# Convert edgelist to adjacency matrix
G = nx.from_pandas_edgelist(edgelist_df, 'txId1', 'txId2')
adj_matrix = nx.adjacency_matrix(G)

In [57]:
G

<networkx.classes.graph.Graph at 0x7f657ebfe8e0>

In [58]:
# Convert adjacency matrix to edge index format
edge_index, _ = from_scipy_sparse_matrix(adj_matrix)

In [112]:
# Combine features and labels into a single data object
# Assuming features are sorted by txId
features = torch.tensor(X.drop('txId', axis=1).values, dtype=torch.float)
labels = torch.tensor(y['class'].values, dtype=torch.long)

In [113]:
data = Data(x=features, edge_index=edge_index, y=labels)

In [114]:
data

Data(x=[46564, 166], edge_index=[2, 73248], y=[46564])

In [115]:
# Save the prepared data object
torch.save(data, 'graph_data.pt')

### Step 2: Define the GCN Model

In [116]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 16)
        self.linear = torch.nn.Linear(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        x = F.dropout(x, training=self.training)
        x = self.linear(x)
        
        return F.log_softmax(x, dim=1)

In [117]:
# Load the prepared data object
data = torch.load('graph_data.pt')

In [118]:
# Instantiate the model
num_node_features = data.num_features
num_classes = len(torch.unique(data.y))

In [119]:
model = GCN(num_node_features, num_classes)

In [120]:
# Print the model architecture
print(model)

GCN(
  (conv1): GCNConv(166, 16)
  (conv2): GCNConv(16, 16)
  (linear): Linear(in_features=16, out_features=2, bias=True)
)


### Step 3: Training the GCN Model

In [121]:
import torch
import torch.optim as optim
from torch_geometric.data import DataLoader

# Adjust the labels to be in the range [0, 1]
data.y = data.y - 1

# Define the loss function and the optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training function
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

In [122]:
# Evaluation function
def evaluate():
    model.eval()
    _, pred = model(data).max(dim=1)
    correct = float (pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
    acc = correct / data.test_mask.sum().item()
    return acc

In [123]:
# Create masks for train and test data
data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.train_mask[:int(data.num_nodes * 0.8)] = True
data.test_mask = ~data.train_mask

In [124]:
# Training loop
epochs = 2000
for epoch in range(epochs):
    loss = train()
    if epoch % 10 == 0:
        acc = evaluate()
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}')

Epoch 000, Loss: 0.7652, Test Accuracy: 0.9490
Epoch 010, Loss: 0.3551, Test Accuracy: 0.9492
Epoch 020, Loss: 0.3140, Test Accuracy: 0.9492
Epoch 030, Loss: 0.2935, Test Accuracy: 0.9492
Epoch 040, Loss: 0.2848, Test Accuracy: 0.9492
Epoch 050, Loss: 0.2704, Test Accuracy: 0.9492
Epoch 060, Loss: 0.2611, Test Accuracy: 0.9492
Epoch 070, Loss: 0.2547, Test Accuracy: 0.9492
Epoch 080, Loss: 0.2495, Test Accuracy: 0.9492
Epoch 090, Loss: 0.2439, Test Accuracy: 0.9492
Epoch 100, Loss: 0.2401, Test Accuracy: 0.9492
Epoch 110, Loss: 0.2368, Test Accuracy: 0.9492
Epoch 120, Loss: 0.2343, Test Accuracy: 0.9492
Epoch 130, Loss: 0.2315, Test Accuracy: 0.9492
Epoch 140, Loss: 0.2291, Test Accuracy: 0.9492
Epoch 150, Loss: 0.2253, Test Accuracy: 0.9492
Epoch 160, Loss: 0.2234, Test Accuracy: 0.9492
Epoch 170, Loss: 0.2220, Test Accuracy: 0.9492
Epoch 180, Loss: 0.2209, Test Accuracy: 0.9492
Epoch 190, Loss: 0.2172, Test Accuracy: 0.9492
Epoch 200, Loss: 0.2156, Test Accuracy: 0.9492
Epoch 210, Lo

In [125]:
# Save the trained model weights
torch.save(model.state_dict(), 'gcn_model_weights.pth')
print("Model weights saved to 'gcn_model_weights.pth'")


Model weights saved to 'gcn_model_weights.pth'


### Loading the weights

In [109]:
# Define the GCN model (make sure the model architecture matches the saved one)
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 16)
        self.linear = torch.nn.Linear(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        x = F.dropout(x, training=self.training)
        x = self.linear(x)
        
        return F.log_softmax(x, dim=1)

# Instantiate the model
model = GCN(num_node_features, num_classes)

# Load the saved model weights
model.load_state_dict(torch.load('gcn_model_weights.pth'))
model.eval()  # Set the model to evaluation mode
print("Model weights loaded from 'gcn_model_weights.pth'")


Model weights loaded from 'gcn_model_weights.pth'
