In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import warnings
import json
import csv
# Suppress all warnings emitted from this point on.
warnings.filterwarnings(action='ignore')

# Filesystem layout: the dataset directory sits four levels above the
# current working directory.
NOTEBOOK_DIR = Path.cwd()
BASE_DIR = NOTEBOOK_DIR.parent.parent.parent.parent
DATA_DIR = BASE_DIR.joinpath("oc_mini")

# Make the gnn package (one directory up) importable.
sys.path.insert(0, str(NOTEBOOK_DIR.parent))

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


## Configuration

You can swap the transformer model and the data source by editing the variables defined in the code cells below (for example `model_name` and the data paths).

# GNN Baseline 1: Train-only Graph with Frozen Transformer

**Experiment Goal**: Demonstrate that test nodes receive NO benefit from the GNN when it is applied only to a train-only graph that excludes them.

**Setup**:
- 90% train nodes, 10% test nodes  
- Create train-only graph (edges only between train nodes, indices 0..N-1)
- Compute embeddings separately:
  - **Train nodes**: Transformer → GNN (graph-enhanced)
  - **Test nodes**: Transformer ONLY (no GNN applied)
- Transformer (SciBERT) is frozen
- Only GNN layers are trainable
- Input: Title + Abstract concatenated

**Expected Result**: Test nodes will have pure transformer embeddings (no graph enhancement), demonstrating the degradation when train/test graphs are separated.

---

In [2]:
# Import GNN modules
from model import TransformerGNN
from graph_utils import (
    create_induced_subgraph,
    create_train_only_graph,
    analyze_graph_statistics,
    get_node_texts
)
from split_utils import create_node_based_split

# Locations of the edge list and node metadata CSV files under DATA_DIR
edgelist_path = DATA_DIR.joinpath("network", "oc_mini_edgelist.csv")
metadata_path = DATA_DIR.joinpath("metadata", "oc_mini_node_metadata.csv")

In [3]:
# Read the node metadata table from disk
metadata_df = pd.read_csv(metadata_path)
print(f"Metadata loaded: {len(metadata_df)} entries")

# Split the nodes 90/10 into train and test; ids are passed on as strings
id_values = metadata_df['id'].values
all_node_ids = list(map(str, id_values))
train_nodes, test_nodes = create_node_based_split(
    all_node_ids,
    test_ratio=0.1,
    seed=42,
)

# Report each partition's size and its share of all nodes
print(f"\nTrain nodes: {len(train_nodes)} ({len(train_nodes)/len(all_node_ids)*100:.1f}%)")
print(f"Test nodes: {len(test_nodes)} ({len(test_nodes)/len(all_node_ids)*100:.1f}%)")

# Last expression of the cell: preview the first rows of the table
metadata_df.head()

Metadata loaded: 14442 entries

Train nodes: 12998 (90.0%)
Test nodes: 1444 (10.0%)


Unnamed: 0,id,doi,title,abstract
0,128,10.1101/2021.05.10.443415,Improved protein contact prediction using dime...,AbstractDeep residual learning has shown great...
1,163,10.1101/2021.05.07.443114,Following the Trail of One Million Genomes: Fo...,AbstractSevere acute respiratory syndrome coro...
2,200,10.1101/2021.05.11.443555,Mechanism of molnupiravir-induced SARS-CoV-2 m...,Molnupiravir is an orally available antiviral ...
3,941,10.3390/ijms20020449,Bactericidal and Cytotoxic Properties of Silve...,Silver nanoparticles (AgNPs) can be synthesize...
4,1141,10.3390/ijms20040865,Silver Nanoparticles: Synthesis and Applicatio...,"Over the past few decades, metal nanoparticles..."


In [4]:
# Build the graph the GNN will operate on: training nodes only, with their
# indices remapped to 0..N-1. Test nodes are left out of this graph entirely.

train_graph = create_train_only_graph(edgelist_path, train_nodes)
train_edge_index, train_node_to_idx, train_idx_to_node = train_graph

# Summarise the result: number of mapped nodes and number of edge_index columns
print("\nTrain-only graph created:")
print(f"  Train nodes: {len(train_node_to_idx)}")
print(f"  Edges: {train_edge_index.shape[1]}")

Loading edgelist from /home/vikramr2/oc_mini/network/oc_mini_edgelist.csv...
  Full graph: 111873 edges

Filtering to edges between 12998 training nodes...
  Filtered edges: 91586

Node mapping (train only):
  Train nodes: 12998
  Index range: 0 to 12997

Final edge_index shape: torch.Size([2, 183172])
  Directed edges: 183172

Train-only graph created:
  Train nodes: 12998
  Edges: 183172


In [5]:
# Build the TransformerGNN on top of SciBERT.
# The transformer is requested frozen, so only the remaining layers can train.

model_name = 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

gnn_settings = dict(
    gnn_type='gcn',            # GCN layers
    hidden_dim=768,            # chosen to match the SciBERT output size
    num_gnn_layers=2,          # two GNN layers
    dropout=0.1,
    pooling='cls',
    freeze_transformer=True,   # IMPORTANT: keep the transformer frozen
)
model = TransformerGNN(model_name=model_name, **gnn_settings).to(device)

# Count parameters once, split by whether they receive gradients
all_params = list(model.parameters())
n_total = sum(p.numel() for p in all_params)
n_trainable = sum(p.numel() for p in all_params if p.requires_grad)
n_frozen = sum(p.numel() for p in all_params if not p.requires_grad)

print("\nModel Summary:")
print(f"  Transformer: {model_name}")
print(f"  Total parameters: {n_total:,}")
print(f"  Trainable (GNN only): {n_trainable:,}")
print(f"  Frozen (Transformer): {n_frozen:,}")

✓ Transformer weights frozen
✓ Created 2-layer GCN model
  Total GNN parameters: 1,181,184

Model Summary:
  Transformer: allenai/scibert_scivocab_uncased
  Total parameters: 111,102,720
  Trainable (GNN only): 1,184,256
  Frozen (Transformer): 109,918,464


In [6]:
# BASELINE DEMONSTRATION - CORRECTED APPROACH
# 1. Compute GNN embeddings ONLY on train nodes (using train-only graph)
# 2. Compute transformer-only embeddings for test nodes
# This way test nodes truly get transformer-only, demonstrating the degradation
#
# NOTE(review): no optimisation step is visible before this cell, so the GNN
# weights used here appear to be whatever TransformerGNN initialised them to.
# Confirm whether training is expected to happen before this cell is run.


def _encode_texts(texts, desc, *, encoder, tokenizer, device, batch_size=32, max_length=512):
    """Embed a list of texts with the model's text encoder, one mini-batch at a time.

    Args:
        texts: Sequence of strings to embed (must support ``len`` and slicing).
        desc: Label shown on the tqdm progress bar.
        encoder: Object exposing ``encode_text(input_ids, attention_mask)``.
        tokenizer: Callable tokenizer returning a batch with ``input_ids`` and
            ``attention_mask`` that can be moved with ``.to(device)``.
        device: Device the tokenized batch is moved to before encoding.
        batch_size: Number of texts tokenized and encoded per step.
        max_length: Token limit passed to the tokenizer; longer inputs are truncated.

    Returns:
        The per-batch outputs of ``encoder.encode_text`` concatenated along dim 0.

    Raises:
        ValueError: If ``texts`` is empty (there would be nothing to concatenate).
    """
    if len(texts) == 0:
        raise ValueError(f"No texts to encode for '{desc}'")

    chunks = []
    # Inference only: gradients are not needed for any of these forward passes.
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc=desc):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            ).to(device)
            chunks.append(encoder.encode_text(inputs['input_ids'], inputs['attention_mask']))
    return torch.cat(chunks, dim=0)


print("="*70)
print("BASELINE: Separate Train (GNN) vs Test (Transformer-only)")
print("="*70)

model.eval()
batch_size = 32

# Step 1: Get transformer embeddings for TRAIN nodes only
print("\n1. Computing transformer embeddings for TRAIN nodes...")
# Order train nodes by their index in the train graph (0..N-1) so that row i
# of the embedding matrix lines up with node index i used by train_edge_index.
train_node_list = [train_idx_to_node[i] for i in range(len(train_idx_to_node))]
train_texts = get_node_texts(train_node_list, metadata_df)
assert len(train_texts) == len(train_node_list), \
    "get_node_texts must return exactly one text per train node"

train_transformer_embs = _encode_texts(
    train_texts,
    "Train transformer",
    encoder=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=batch_size,
)
print(f"   Train transformer embeddings: {train_transformer_embs.shape}")

# Step 2: Apply GNN ONLY to train nodes with train-only edge index
print("\n2. Applying GNN to TRAIN nodes only...")
with torch.no_grad():
    train_gnn_embs = model(train_transformer_embs, train_edge_index.to(device))
print(f"   Train GNN embeddings: {train_gnn_embs.shape}")
assert train_gnn_embs.shape[0] == len(train_node_list), \
    "GNN output rows must match the number of train nodes"

# Step 3: Get transformer embeddings for TEST nodes (no GNN!)
print("\n3. Computing transformer-only embeddings for TEST nodes...")
test_texts = get_node_texts(test_nodes, metadata_df)
assert len(test_texts) == len(test_nodes), \
    "get_node_texts must return exactly one text per test node"

test_transformer_embs = _encode_texts(
    test_texts,
    "Test transformer",
    encoder=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=batch_size,
)
print(f"   Test transformer embeddings: {test_transformer_embs.shape}")

# Step 4: Create combined embedding dictionary
print("\n4. Creating combined embedding dictionary...")

# Train nodes (with GNN): row i belongs to train_node_list[i]
train_gnn_embs_np = train_gnn_embs.cpu().numpy()
embeddings_dict = dict(zip(train_node_list, train_gnn_embs_np))

# Test nodes (transformer-only): row i belongs to the i-th test node
test_transformer_embs_np = test_transformer_embs.cpu().numpy()
embeddings_dict.update(zip(test_nodes, test_transformer_embs_np))

# A shared id would make the test embedding silently overwrite the train one.
assert len(embeddings_dict) == len(train_node_list) + len(test_nodes), \
    "train and test node ids overlap or contain duplicates"

print(f"   Total embeddings: {len(embeddings_dict)}")
print(f"   Train (GNN-enhanced): {len(train_node_list)}")
print(f"   Test (Transformer-only): {len(test_nodes)}")

print("\n" + "="*70)
print("BASELINE RESULT:")
print("- Train nodes: Transformer → GNN (graph-enhanced embeddings)")
print("- Test nodes: Transformer ONLY (no graph information)")
print("\nThis demonstrates that test nodes get NO benefit from the GNN")
print("because they have no edges in the training graph.")
print("="*70)

BASELINE: Separate Train (GNN) vs Test (Transformer-only)

1. Computing transformer embeddings for TRAIN nodes...


Train transformer:   0%|          | 0/407 [00:00<?, ?it/s]

   Train transformer embeddings: torch.Size([12998, 768])

2. Applying GNN to TRAIN nodes only...
   Train GNN embeddings: torch.Size([12998, 768])

3. Computing transformer-only embeddings for TEST nodes...


Test transformer:   0%|          | 0/46 [00:00<?, ?it/s]

   Test transformer embeddings: torch.Size([1444, 768])

4. Creating combined embedding dictionary...
   Total embeddings: 14442
   Train (GNN-enhanced): 12998
   Test (Transformer-only): 1444

BASELINE RESULT:
- Train nodes: Transformer → GNN (graph-enhanced embeddings)
- Test nodes: Transformer ONLY (no graph information)

This demonstrates that test nodes get NO benefit from the GNN
because they have no edges in the training graph.
