In [13]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
import torch.optim as optim
import os
# Move two steps up to access the 'ml_graphs' folder

# Load the processed dataset
try:
    data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt")
except FileNotFoundError:
    os.chdir(os.path.abspath(os.path.join(os.getcwd(), "../")))
    data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt")

# Extract the paper_cites_edge_index (for the paper -> cites -> paper relation)
paper_cites_edge_index = data.edge_index_dict[('paper', 'cites', 'paper')]

# Extract the paper node features (assuming paper feature is in data.x['paper'])
paper_node_features = data.x_dict['paper']

# Get the number of papers
num_papers = paper_node_features.size(0)



  data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt")


In [14]:
class GraphSAGEModel(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGEModel, self).__init__()
        # Two GraphSAGE layers
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
    
    def forward(self, x, edge_index):
        # Apply first GraphSAGE layer with ReLU activation
        x = F.relu(self.conv1(x, edge_index))
        # Apply second GraphSAGE layer
        x = self.conv2(x, edge_index)
        return x


In [21]:
import numpy as np

# Get the positive edges for paper_cites_edge_index
edges = paper_cites_edge_index.T.numpy()

edges_sample = edges[:20]
# Shuffle edges and split them into train and test sets (80% train, 20% test)
np.random.shuffle(edges_sample)
train_size = int(0.8 * len(edges_sample))
train_edges = edges_sample[:train_size]
test_edges = edges_sample[train_size:]

# Generate negative edges (pairs of nodes without edges between them)
def generate_negative_edges(num_samples, num_papers, existing_edges):
    neg_edges = set()
    while len(neg_edges) < num_samples:
        u = np.random.randint(0, num_papers)
        v = np.random.randint(0, num_papers)
        if (u, v) not in existing_edges and (v, u) not in existing_edges:
            neg_edges.add((u, v))
    return np.array(list(neg_edges))

# Get negative edges for training and testing
train_neg_edges = generate_negative_edges(len(train_edges), num_papers, set(map(tuple, train_edges)))
test_neg_edges = generate_negative_edges(len(test_edges), num_papers, set(map(tuple, train_edges)))

# Combine positive and negative edges for training
train_edges = np.concatenate([train_edges, train_neg_edges], axis=0)

# Convert the edges to PyTorch tensors
train_edges = torch.tensor(train_edges, dtype=torch.long).T  # Shape (2, num_edges)
test_edges = torch.tensor(test_edges, dtype=torch.long).T  # Shape (2, num_edges)


In [22]:
# Initialize the model, loss function, and optimizer
in_channels = paper_node_features.size(1)  # Size of paper node features
hidden_channels = 128
out_channels = 128  # Output embedding size
model = GraphSAGEModel(in_channels, hidden_channels, out_channels)

optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.BCEWithLogitsLoss()

# Train the model for edge prediction
num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    
    optimizer.zero_grad()
    
    # Forward pass: Compute node embeddings
    node_embeddings = model(paper_node_features, paper_cites_edge_index)
    
    # For each edge pair, compute the probability of the edge
    u_embed = node_embeddings[train_edges[0]]
    v_embed = node_embeddings[train_edges[1]]
    
    # Compute the edge probability using cosine similarity (you can experiment with different methods)
    edge_prob = torch.sigmoid((u_embed * v_embed).sum(dim=1))  # Dot product for edge prediction
    
    # Create labels: 1 for positive edges (train), 0 for negative edges
    labels = torch.cat([torch.ones(len(train_edges) // 2), torch.zeros(len(train_edges) // 2)]).float()
    
    # Compute the loss
    loss = loss_fn(edge_prob, labels)
    
    # Backpropagation
    loss.backward()
    optimizer.step()
    
    if epoch % 100 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")


ValueError: Target size (torch.Size([2])) must be the same as input size (torch.Size([32]))

In [None]:
# Evaluate the model on test data
model.eval()

# Forward pass: Get node embeddings for the test set
node_embeddings = model(paper_node_features, paper_cites_edge_index)

# Get embeddings for test edges
u_embed_test = node_embeddings[test_edges[0]]
v_embed_test = node_embeddings[test_edges[1]]

# Compute edge probabilities for the test set
test_edge_prob = torch.sigmoid((u_embed_test * v_embed_test).sum(dim=1))

# Evaluate using a threshold (0.5)
pred_labels = (test_edge_prob > 0.5).float()

# Compare predicted labels to actual labels
# Test labels are 1 for positive edges, 0 for negative edges
test_labels = torch.cat([torch.ones(len(test_edges) // 2), torch.zeros(len(test_edges) // 2)]).float()

# Compute accuracy
accuracy = (pred_labels == test_labels).float().mean()
print(f"Test Accuracy: {accuracy:.4f}")
