In [15]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import warnings
import json
import csv
warnings.filterwarnings('ignore')

# Paths
# NOTE(review): BASE_DIR is derived from the current working directory, so the
# notebook must be launched from its own folder (three levels below the
# project root) for these paths to resolve.
BASE_DIR = Path.cwd().parent.parent.parent
DATA_DIR = BASE_DIR / "oc_mini"

# Add dcat package to path. The membership check keeps the cell idempotent:
# re-running it no longer prepends a duplicate entry to sys.path each time.
_dcat_path = str(BASE_DIR / "dcat")
if _dcat_path not in sys.path:
    sys.path.insert(0, _dcat_path)

# Device: pick the best backend that is actually available instead of
# hard-coding "mps", which only exists on Apple Silicon builds of PyTorch.
# The getattr guard covers PyTorch builds that predate torch.backends.mps.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: mps


In [16]:
from notebook_utils import load_cluster_and_metadata

# Input files: the disjoint cluster assignments and the per-node metadata
tree_path = DATA_DIR.joinpath("clustering", "disjoint", "oc_mini_clusters_0.001.csv")
metadata_path = DATA_DIR.joinpath("metadata", "oc_mini_node_metadata.csv")

cluster_df, metadata_df = load_cluster_and_metadata(tree_path, metadata_path)

# Preview both frames: 10 rows of cluster assignments, then the default
# 5 rows of metadata
print("\nFirst few rows of cluster data:", cluster_df.head(10), sep="\n")

print("\nFirst few rows of metadata:", metadata_df.head(), sep="\n")


Loading cluster data from /Users/rajkiritivelicheti/Documents/CS_546_NLP/project/oc_mini/clustering/disjoint/oc_mini_clusters_0.001.csv...
  ✓ Loaded: 19705 node-cluster assignments

Loading metadata from /Users/rajkiritivelicheti/Documents/CS_546_NLP/project/oc_mini/metadata/oc_mini_node_metadata.csv...
  ✓ Loaded: 14442 entries

Cluster Statistics:
  Unique nodes: 19705
  Unique clusters: 5
  Mean cluster size: 3941.00
  Median cluster size: 2480
  Largest cluster: 8989 nodes

First few rows of cluster data:
      node  cluster
0    45066        5
1   989648        0
2  1146632        0
3  3732252        0
4  9488729        5
5  9489474        5
6  9489060        5
7  6382148        5
8  6382959        5
9  1623959        5

First few rows of metadata:
     id                        doi  \
0   128  10.1101/2021.05.10.443415   
1   163  10.1101/2021.05.07.443114   
2   200  10.1101/2021.05.11.443555   
3   941       10.3390/ijms20020449   
4  1141       10.3390/ijms20040865   

      

In [17]:
# Refresh split_utils so that edits made to the module on disk take effect
# without restarting the kernel, then re-bind the two helpers used below
import importlib
import split_utils

split_utils = importlib.reload(split_utils)
create_node_based_split = split_utils.create_node_based_split
print_split_info = split_utils.print_split_info

In [18]:
from split_utils import create_node_based_split, print_split_info

# Split configuration, named so the values are easy to find and tune
TEST_RATIO = 0.1  # 10% of nodes from each cluster for testing
SPLIT_SEED = 42   # fixed seed so the split is reproducible

# Create train/test split at NODE level WITHIN each cluster
# This ensures all clusters have both train and test nodes
# Test nodes can be used to evaluate cluster membership prediction
train_node_ids, test_node_ids = create_node_based_split(
    cluster_df,
    metadata_df,
    test_ratio=TEST_RATIO,
    seed=SPLIT_SEED
)

# Sanity checks: every downstream metric is reported on the test nodes, so a
# node appearing in both sets (leakage) or an empty test set would silently
# invalidate the results. Fail here instead.
_leaked_nodes = set(train_node_ids) & set(test_node_ids)
assert not _leaked_nodes, f"{len(_leaked_nodes)} node(s) appear in both train and test sets"
assert len(test_node_ids) > 0, "test split is empty"

# Print detailed split information
print_split_info(train_node_ids, test_node_ids, cluster_df)

Creating node-based split from 5 clusters...
  Cluster 5: 2922 nodes -> 2630 train, 292 test
  Cluster 0: 7058 nodes -> 6353 train, 705 test
  Cluster 40: 1447 nodes -> 1303 train, 144 test
  Cluster 58: 1728 nodes -> 1556 train, 172 test
  Cluster 30: 1287 nodes -> 1159 train, 128 test

Node-based Split Summary:
  Train nodes: 13001 (from all clusters)
  Test nodes: 1441 (from all clusters)
  Total: 14442 nodes

TRAIN/TEST SPLIT SUMMARY

TRAIN SET:
  Nodes: 13001
  Clusters represented: 5
  Avg nodes per cluster: 2600.20
  Median nodes per cluster: 1556
  Range: [1159, 6353]

TEST SET:
  Nodes: 1441
  Clusters represented: 5
  Avg nodes per cluster: 288.20
  Median nodes per cluster: 172
  Range: [128, 705]

CLUSTER DISTRIBUTION:
  Cluster 0: 7058 total -> 6353 train (90.0%), 705 test (10.0%)
  Cluster 5: 2922 total -> 2630 train (90.0%), 292 test (10.0%)
  Cluster 30: 1287 total -> 1159 train (90.1%), 128 test (9.9%)
  Cluster 40: 1447 total -> 1303 train (90.0%), 144 test (10.0%)
  

In [None]:
from notebook_utils import train_disjoint_model, plot_training_history

# Training hyperparameters.
# MODEL_NAME, MARGIN and SAMPLES_PER_CLUSTER were changed to match the hcat
# experiments.
MODEL_NAME = 'allenai/scibert_scivocab_uncased'
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
MARGIN = 0.5
SAMPLES_PER_CLUSTER = 3

# NOTE(review): the output recorded below this cell reports a different
# encoder (ncbi/MedCPT-Article-Encoder) and margin (1.0) than configured here,
# and ends with an MPS out-of-memory error. Either the output is stale or
# train_disjoint_model overrides these arguments. Verify and re-run.

# Everything passed to the trainer, gathered in one place. Only the nodes in
# train_node_ids are used for training.
training_kwargs = dict(
    cluster_df=cluster_df,
    metadata_df=metadata_df,
    train_node_ids=train_node_ids,
    model_name=MODEL_NAME,
    device=device,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    lr=LEARNING_RATE,
    margin=MARGIN,
    samples_per_cluster=SAMPLES_PER_CLUSTER,
    pooling='cls',
)
model, tokenizer, history = train_disjoint_model(**training_kwargs)

# Plot training history
plot_training_history(history)


DISJOINT CLUSTER TRIPLET LOSS TRAINING

Loading ncbi/MedCPT-Article-Encoder...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]


Creating training dataset...
Generating triplets from 5 clusters...
Total nodes: 19705


Generating triplets: 100%|██████████| 5/5 [00:00<00:00, 90.37it/s]
Generating triplets: 100%|██████████| 5/5 [00:00<00:00, 90.37it/s]


Generated 25 triplets
  Generated 25 training triplets
  Train samples: 22
  Val samples: 3

Triplet Margin Loss:
  Margin: 1.0

Training for 3 epochs...

Epoch 1/3


Training:   0%|          | 0/2 [00:47<?, ?it/s]
Training:   0%|          | 0/2 [00:47<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 17.91 GB, other allocations: 218.67 MB, max allowed: 18.13 GB). Tried to allocate 24.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
from notebook_utils import compute_embeddings

# Embed every node that has metadata (train and test alike); the evaluation
# cells further down restrict themselves to the test nodes.
print("Computing embeddings for all nodes...")
embeddings_dict = compute_embeddings(
    model=model,
    tokenizer=tokenizer,
    metadata_df=metadata_df,
    device=device,
    batch_size=32,
)

# Count how many held-out nodes actually received an embedding
n_test_embedded = sum(1 for node_id in test_node_ids if node_id in embeddings_dict)

print(f"\nTotal embeddings computed: {len(embeddings_dict)}")
print(f"Test nodes with embeddings: {n_test_embedded}")

# Evaluation on Test Set

Now we evaluate model performance on the **held-out test nodes**. These nodes were excluded during training, so they let us test whether the model learned to predict cluster membership for nodes it has never seen.

In [None]:
from evaluation import evaluate_cluster_quality

# Cluster-structure quality of the embeddings, scored on the held-out
# (test) nodes only
cluster_quality_results = evaluate_cluster_quality(
    cluster_df=cluster_df,
    embeddings_dict=embeddings_dict,
    test_node_ids=test_node_ids,
)

In [None]:
from evaluation import evaluate_intra_cluster_similarity, plot_similarity_distributions

# Intra- vs. inter-cluster similarity, restricted to the held-out nodes.
# n_samples is presumably the number of sampled pairs; confirm against
# the evaluation module.
intra_inter_results = evaluate_intra_cluster_similarity(
    cluster_df=cluster_df,
    embeddings_dict=embeddings_dict,
    n_samples=1000,
    test_node_ids=test_node_ids,
)

# Show the two similarity distributions
plot_similarity_distributions(intra_inter_results)

In [None]:
from evaluation import evaluate_retrieval, plot_retrieval_results

# Retrieval check: starting from a held-out node, are other members of its
# cluster retrieved? Scored at several cut-offs k.
retrieval_results = evaluate_retrieval(
    cluster_df=cluster_df,
    embeddings_dict=embeddings_dict,
    k_values=[5, 10, 20, 50],
    test_node_ids=test_node_ids,
)

# Plot retrieval performance across the k cut-offs
plot_retrieval_results(retrieval_results)

# Link Prediction Evaluation

Link prediction is the primary evaluation metric, the same one used in the hcat experiments.

In [None]:
# Add path to utils for link prediction. The membership check keeps the cell
# idempotent: re-running it no longer prepends a duplicate sys.path entry.
_link_pred_dir = str(BASE_DIR / "utils" / "evaluation")
if _link_pred_dir not in sys.path:
    sys.path.insert(0, _link_pred_dir)

from link_prediction import evaluate_network_link_prediction, plot_link_prediction_results

# Path to edgelist (if available)
edgelist_path = DATA_DIR / "network" / "oc_mini_edgelist.csv"

# The evaluation is skipped (with a message) when the edgelist is missing.
# In that case link_pred_results is NOT defined, which the comparison and
# save cells detect before using it.
if edgelist_path.exists():
    print(f"Found edgelist at {edgelist_path}")

    # Evaluate link prediction on TEST nodes only
    # K values matching hcat experiments
    link_pred_results = evaluate_network_link_prediction(
        edgelist_path=str(edgelist_path),
        embeddings_dict=embeddings_dict,
        test_nodes=test_node_ids,  # Only evaluate on test nodes
        k_values=[5, 10, 20, 50, 100, 500, 1000, 2000],
        compute_auc=True,
        num_negative_samples=10
    )

    # Plot results; the AUC entry is optional, hence .get()
    if 'topk' in link_pred_results:
        plot_link_prediction_results(
            link_pred_results['topk'],
            link_pred_results.get('auc')
        )
else:
    print(f"Edgelist not found at {edgelist_path}")
    print("Skipping link prediction evaluation.")

# Baseline Comparison: Evaluate Pre-trained Model (No Fine-tuning)

We also compute embeddings with the base model, without any fine-tuning, so that the gain from fine-tuning can be measured against it.

In [None]:
# Load the untouched pre-trained checkpoint (same MODEL_NAME as the
# fine-tuned run) to serve as the no-fine-tuning reference
print("Loading baseline (pre-trained) model...")
baseline_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
baseline_model = AutoModel.from_pretrained(MODEL_NAME)
baseline_model = baseline_model.to(device)

# Embed the same metadata with the baseline model, using the same batch size
# as the fine-tuned embedding pass
print("Computing baseline embeddings...")
baseline_embeddings_dict = compute_embeddings(
    model=baseline_model,
    tokenizer=baseline_tokenizer,
    device=device,
    metadata_df=metadata_df,
    batch_size=32,
)

print(f"Baseline embeddings computed: {len(baseline_embeddings_dict)}")

In [None]:
# Section banner
_rule = "=" * 80
print(f"\n{_rule}")
print("BASELINE MODEL EVALUATION")
print(_rule)

# Score the baseline embeddings on the same test nodes and with the same
# link-prediction settings as the fine-tuned model (link prediction only,
# matching hcat)
if not edgelist_path.exists():
    print("Edgelist not found, skipping baseline link prediction")
else:
    baseline_link_pred = evaluate_network_link_prediction(
        embeddings_dict=baseline_embeddings_dict,
        edgelist_path=str(edgelist_path),
        test_nodes=test_node_ids,
        k_values=[5, 10, 20, 50, 100, 500, 1000, 2000],
        num_negative_samples=10,
        compute_auc=True,
    )

# Comparison: Fine-tuned vs Baseline

Compare the performance of fine-tuned model vs baseline on **link prediction** (matching hcat experiments).

In [None]:
def relative_improvement_pct(baseline, tuned):
    """Return the relative change of ``tuned`` over ``baseline``, in percent.

    Returns ``None`` when ``baseline`` is zero: the relative change is
    undefined there, and reporting 0% would hide a real gain.
    """
    if baseline == 0:
        return None
    return ((tuned - baseline) / baseline) * 100


def format_improvement(pct):
    """Format a value from ``relative_improvement_pct``; ``None`` becomes "n/a"."""
    return "n/a" if pct is None else f"{pct:+.2f}%"


# Create comparison table for link prediction (matching hcat experiments).
# The result variables are checked first so that the cell falls through to the
# "not available" message, rather than raising NameError on edgelist_path,
# when the link prediction cells were skipped.
if 'link_pred_results' in locals() and 'baseline_link_pred' in locals() and edgelist_path.exists():

    bl_summary = baseline_link_pred['topk']['summary']
    ft_summary = link_pred_results['topk']['summary']
    k_values = link_pred_results['topk']['k_values']

    # One record per K; the frame is built once from the full list
    comparison_rows = []
    for k in k_values:
        bl_prec = bl_summary[k]['precision@k']
        ft_prec = ft_summary[k]['precision@k']
        comparison_rows.append({
            'K': k,
            'Baseline Precision@K': f"{bl_prec:.4f}",
            'Fine-tuned Precision@K': f"{ft_prec:.4f}",
            'Improvement': format_improvement(relative_improvement_pct(bl_prec, ft_prec)),
        })

    # Explicit columns keep the header stable even if k_values is empty
    comparison_df = pd.DataFrame(
        comparison_rows,
        columns=['K', 'Baseline Precision@K', 'Fine-tuned Precision@K', 'Improvement'],
    )

    print("\n" + "="*80)
    print("PERFORMANCE COMPARISON: BASELINE VS FINE-TUNED (LINK PREDICTION)")
    print("="*80)
    print(comparison_df.to_string(index=False))

    # AUC comparison. .get() plus a truthiness test also skips the case where
    # the 'auc' key is present but holds None.
    bl_auc_entry = baseline_link_pred.get('auc')
    ft_auc_entry = link_pred_results.get('auc')
    if bl_auc_entry and ft_auc_entry:
        bl_auc = bl_auc_entry['auc_roc']
        ft_auc = ft_auc_entry['auc_roc']
        # Same zero-baseline handling as the precision rows above
        auc_improvement = format_improvement(relative_improvement_pct(bl_auc, ft_auc))

        print(f"\nAUC-ROC:")
        print(f"  Baseline: {bl_auc:.4f}")
        print(f"  Fine-tuned: {ft_auc:.4f}")
        print(f"  Improvement: {auc_improvement}")

    print("="*80)
else:
    print("Link prediction results not available for comparison")

In [None]:
# Save results to JSON
from datetime import datetime


def json_default(value):
    """Fallback encoder passed to ``json.dumps`` as ``default``.

    Converts any object exposing ``tolist()`` (NumPy scalars and arrays, for
    example) to plain Python values. Anything else raises ``TypeError``, which
    is what the json module expects from a ``default`` hook it cannot satisfy.
    """
    if hasattr(value, "tolist"):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


if 'link_pred_results' in locals():
    # One timestamp for both the payload and the filename, so the two always
    # agree
    run_time = datetime.now()

    results_data = {
        'timestamp': run_time.isoformat(),
        'model_name': MODEL_NAME,
        'hyperparameters': {
            'epochs': EPOCHS,
            'batch_size': BATCH_SIZE,
            'learning_rate': LEARNING_RATE,
            'margin': MARGIN,
            'samples_per_cluster': SAMPLES_PER_CLUSTER
        },
        'split': {
            'train_nodes': len(train_node_ids),
            'test_nodes': len(test_node_ids),
            'test_ratio': len(test_node_ids) / (len(train_node_ids) + len(test_node_ids))
        },
        'fine_tuned': {
            'link_prediction': {
                'topk': link_pred_results['topk']['summary'],
                'auc': link_pred_results.get('auc')
            }
        }
    }

    # Baseline results are optional; include them only if that cell ran
    if 'baseline_link_pred' in locals():
        results_data['baseline'] = {
            'link_prediction': {
                'topk': baseline_link_pred['topk']['summary'],
                'auc': baseline_link_pred.get('auc')
            }
        }

    # Serialize fully BEFORE touching the filesystem, so a serialization error
    # cannot leave a truncated JSON file behind
    payload = json.dumps(results_data, indent=2, default=json_default)

    # Create results directory (relative to the current working directory)
    results_dir = Path.cwd() / "results"
    results_dir.mkdir(exist_ok=True)

    results_file = results_dir / f"disjoint_results_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
    results_file.write_text(payload)

    print(f"\nResults saved to: {results_file}")
else:
    print("Link prediction results not available to save")