In [13]:
# Importing required libraries
import pandas as pd
import json
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [14]:
# Prepare graph data
def prepare_graph_data(edges_file, features_file, targets_file):
    import numpy as np
    
    # Load edges
    edges_df = pd.read_csv(edges_file)
    edge_index = torch.tensor(edges_df.values.T, dtype=torch.long)  # Shape: [2, num_edges]
    
    # Load features
    with open(features_file, "r") as f:
        features_json = json.load(f)
    
    # Map features to a consistent matrix
    node_ids = list(map(int, features_json.keys()))  # Convert node IDs to integers
    node_map = {node_id: i for i, node_id in enumerate(sorted(node_ids))}  # Map node IDs to indices
    
    # Create a feature matrix
    num_nodes = len(node_ids)
    max_feature_id = max(max(features) for features in features_json.values())
    features_tensor = torch.zeros((num_nodes, max_feature_id + 1), dtype=torch.float)
    for node_id, feature_list in features_json.items():
        node_idx = node_map[int(node_id)]
        features_tensor[node_idx, feature_list] = 1  # One-hot encode features
    
    # Load targets
    targets_df = pd.read_csv(targets_file)
    target_mapping = {row["id"]: row["new_id"] for _, row in targets_df.iterrows()}
    target_labels = torch.zeros(num_nodes, dtype=torch.long)
    
    # Align targets with node indices
    for _, row in targets_df.iterrows():
        if row["id"] in node_map:
            target_labels[node_map[row["id"]]] = 1 if row["mature"] else 0  # Binary labels
    
    # Ensure edge indices map to node indices
    edges_mapped = np.array([[node_map.get(src, -1), node_map.get(dst, -1)] 
                              for src, dst in edges_df.values if src in node_map and dst in node_map])
    edges_mapped = edges_mapped[edges_mapped.min(axis=1) >= 0]  # Remove invalid edges
    edge_index = torch.tensor(edges_mapped.T, dtype=torch.long)  # Shape: [2, num_edges]
    
    # Create PyTorch Geometric Data object
    data = Data(x=features_tensor, edge_index=edge_index, y=target_labels)
    
    # Split data into train/val/test
    train_idx, test_idx = train_test_split(range(len(data.y)), test_size=0.3, stratify=data.y, random_state=42)
    val_idx, test_idx = train_test_split(test_idx, test_size=0.5, stratify=data.y[test_idx], random_state=42)
    
    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    
    train_mask[train_idx] = True
    val_mask[val_idx] = True
    test_mask[test_idx] = True
    
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask
    
    return data

In [15]:
# Define GraphSAGE model
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

In [16]:
# Train the model
def train_model(data, model, optimizer, criterion, epochs=200):
    best_val_f1 = 0
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        
        # Evaluate on validation set
        val_f1 = evaluate_model(data, model, data.val_mask)
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), "best_model.pt")
        
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}, Val F1: {val_f1:.4f}")
    
    return best_val_f1



In [17]:
# Evaluate the model
def evaluate_model(data, model, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        preds = out[mask].argmax(dim=1)
        f1 = f1_score(data.y[mask].cpu(), preds.cpu(), average="macro")
    return f1

In [18]:
# Main execution
if __name__ == "__main__":
    # File paths
    edges_file = "./dataset/twitch/ENGB/musae_ENGB_edges.csv"
    features_file = "./dataset/twitch/ENGB/musae_ENGB_features.json"
    targets_file = "./dataset/twitch/ENGB/musae_ENGB_target.csv"

    # Prepare data
    data = prepare_graph_data(edges_file, features_file, targets_file)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = data.to(device)

    # Model, optimizer, and loss
    model = GraphSAGE(in_channels=data.num_node_features, hidden_channels=64, out_channels=2).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    # Train model
    print("Training the model...")
    train_model(data, model, optimizer, criterion)

    # Test the model
    print("Testing the model...")
    model.load_state_dict(torch.load("best_model.pt"))
    test_f1 = evaluate_model(data, model, data.test_mask)
    print(f"Test F1-score: {test_f1:.4f}")

Training the model...
Epoch 10, Loss: 0.0002, Val F1: 1.0000
Epoch 20, Loss: 0.0000, Val F1: 1.0000
Epoch 30, Loss: 0.0000, Val F1: 1.0000
Epoch 40, Loss: 0.0000, Val F1: 1.0000
Epoch 50, Loss: 0.0000, Val F1: 1.0000
Epoch 60, Loss: 0.0000, Val F1: 1.0000
Epoch 70, Loss: 0.0000, Val F1: 1.0000
Epoch 80, Loss: 0.0000, Val F1: 1.0000
Epoch 90, Loss: 0.0000, Val F1: 1.0000
Epoch 100, Loss: 0.0000, Val F1: 1.0000
Epoch 110, Loss: 0.0000, Val F1: 1.0000
Epoch 120, Loss: 0.0000, Val F1: 1.0000
Epoch 130, Loss: 0.0000, Val F1: 1.0000
Epoch 140, Loss: 0.0000, Val F1: 1.0000
Epoch 150, Loss: 0.0000, Val F1: 1.0000
Epoch 160, Loss: 0.0000, Val F1: 1.0000
Epoch 170, Loss: 0.0000, Val F1: 1.0000
Epoch 180, Loss: 0.0000, Val F1: 1.0000
Epoch 190, Loss: 0.0000, Val F1: 1.0000
Epoch 200, Loss: 0.0000, Val F1: 1.0000
Testing the model...
Test F1-score: 1.0000


  model.load_state_dict(torch.load("best_model.pt"))
