In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import warnings
import json
# Silence all Python warnings so cell output stays readable.
warnings.filterwarnings('ignore')

# Project layout, resolved relative to the notebook's working directory:
# the base directory is taken to be two levels above it.
BASE_DIR = Path.cwd().parent.parent
DATA_DIR = BASE_DIR.joinpath("oc_mini")

# Put the cat/dcat source directory first on the import path so that later
# cells can import project modules from it.
dcat_source_dir = BASE_DIR.joinpath("cat", "dcat")
sys.path.insert(0, str(dcat_source_dir))

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Input files: per-node metadata and the disjoint clustering assignment.
metadata_path = DATA_DIR.joinpath("metadata", "oc_mini_node_metadata.csv")
clustering_path = DATA_DIR.joinpath("clustering", "disjoint", "oc_mini_clusters_0.001.csv")

metadata_df = pd.read_csv(metadata_path)
clustering_df = pd.read_csv(clustering_path)

print(f"Metadata loaded: {len(metadata_df)} entries")
print(f"Clustering loaded: {len(clustering_df)} entries")

# The held-out node set is fixed here, before any training happens, so the
# very same nodes can be handed to training (as validation nodes) and to
# every evaluation cell further down.
from notebook_utils import create_test_split

# Node ids are normalised to strings before splitting.
all_node_ids = list(map(str, metadata_df['id'].values))
test_val_nodes = create_test_split(all_node_ids, test_ratio=0.1, seed=42)

print(f"\nTest/Val set: {len(test_val_nodes)} nodes ({len(test_val_nodes)/len(all_node_ids)*100:.1f}%)")
print(f"Train set: {len(all_node_ids) - len(test_val_nodes)} nodes")

# Last expression of the cell: preview of the metadata table.
metadata_df.head()

In [None]:
from train import train_model

# Fine-tune SciBERT with the standard (non-adaptive) triplet loss.
# All settings are gathered in one mapping so they can be read at a glance;
# the held-out nodes are passed as validation nodes so validation and the
# later evaluation cells look at the same node set.
triplet_training_args = {
    'clustering_csv_path': str(clustering_path),
    'metadata_csv_path': str(metadata_path),
    'output_dir': str(BASE_DIR / "cat" / "models" / "finetuned_dcat_triplet"),
    'model_name': 'allenai/scibert_scivocab_uncased',
    'device': str(device),
    'batch_size': 16,
    'epochs': 3,
    'lr': 1e-5,
    'margin': 0.5,             # standard triplet margin
    'samples_per_node': 3,     # author's estimate: ~43K triplets for 14K nodes
    'pooling': 'cls',
    'loss_type': 'triplet',    # standard triplet loss (not adaptive)
    'val_nodes': test_val_nodes,
}

finetuned_model, tokenizer, history = train_model(**triplet_training_args)

In [None]:
# Training vs. validation loss per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 5))

loss_series = (
    ('train_loss', 'o', 'Train Loss'),
    ('val_loss', 's', 'Val Loss'),
)
for history_key, marker_style, curve_label in loss_series:
    ax.plot(history[history_key], marker=marker_style, label=curve_label, linewidth=2)

ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Loss', fontsize=12)
ax.set_title('Disjoint Clustering Triplet Loss Training', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.show()

# Losses recorded for the last epoch.
print(f"\nFinal train loss: {history['train_loss'][-1]:.4f}")
print(f"Final val loss: {history['val_loss'][-1]:.4f}")

In [None]:
from notebook_utils import compute_embeddings

# Reload the model from the directory train_model was told to write to,
# instead of reusing the in-memory model returned by training.
# NOTE(review): presumably train_model saves its best checkpoint here
# (rather than the final-epoch weights) -- confirm in train.py.
output_dir = BASE_DIR / "cat" / "models" / "finetuned_dcat_triplet"
checkpoint_dir = str(output_dir)

print("Loading best saved model...")
best_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
best_model = AutoModel.from_pretrained(checkpoint_dir)
best_model = best_model.to(device)
best_model.eval()  # inference mode

print(f"✓ Loaded best model from: {output_dir}")

# Embed every node listed in the metadata table with the reloaded model.
print("\nComputing embeddings with best model...")
embeddings_dict = compute_embeddings(
    best_model,
    best_tokenizer,
    metadata_df,
    device,
    batch_size=32,
)

print(f"✓ Computed embeddings for {len(embeddings_dict)} nodes")

In [None]:
# Make the experiments directory importable before pulling in the
# link-prediction helpers.
experiments_dir = BASE_DIR / "cat" / "experiments"
sys.path.insert(0, str(experiments_dir))

from network_link_prediction import evaluate_network_link_prediction, plot_link_prediction_results

edgelist_path = DATA_DIR / "network" / "oc_mini_edgelist.csv"

# Score the fine-tuned embeddings on the held-out nodes (the same node set
# that was passed to training as validation nodes).
print(f"Evaluating on test set: {len(test_val_nodes)} nodes")
finetuned_link_eval_args = dict(
    edgelist_path=str(edgelist_path),
    embeddings_dict=embeddings_dict,
    test_nodes=test_val_nodes,
    k_values=[5, 10, 20, 50, 100],
    compute_auc=True,
    num_negative_samples=10,
)
results = evaluate_network_link_prediction(**finetuned_link_eval_args)

# Plot the top-k and AUC parts of the result.
plot_link_prediction_results(results['topk'], results['auc'])

In [None]:
# Baseline for comparison: the pretrained SciBERT checkpoint with no
# fine-tuning. The identifier is named once so the model and its tokenizer
# cannot drift apart.
BASELINE_MODEL_NAME = "allenai/scibert_scivocab_uncased"

print("Loading baseline model...")
baseline_model = AutoModel.from_pretrained(BASELINE_MODEL_NAME).to(device)
# Switch to inference mode explicitly, mirroring how the fine-tuned model is
# prepared, so both models are embedded under the same conditions.
baseline_model.eval()
# NOTE: this rebinds `tokenizer`, replacing the tokenizer object returned by
# train_model earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(BASELINE_MODEL_NAME)

# Embed every node listed in the metadata table with the baseline model,
# using the same batch size as for the fine-tuned model.
print("Computing baseline embeddings...")
baseline_embeddings = compute_embeddings(
    baseline_model,
    tokenizer,
    metadata_df,
    device,
    batch_size=32
)

print(f"✓ Baseline embeddings: {len(baseline_embeddings)} nodes")

In [None]:
# Link prediction for the baseline embeddings, on the same held-out nodes
# and with the same k values / negative-sample count as the fine-tuned run.
print("Evaluating baseline...")
baseline_link_eval_args = dict(
    edgelist_path=str(edgelist_path),
    embeddings_dict=baseline_embeddings,
    test_nodes=test_val_nodes,
    k_values=[5, 10, 20, 50, 100],
    compute_auc=True,
    num_negative_samples=10,
)
baseline_results = evaluate_network_link_prediction(**baseline_link_eval_args)

In [None]:
def _percent_change(new_value, reference_value):
    """Return the change from reference_value to new_value, in percent.

    Returns NaN when reference_value is zero, because a relative change is
    undefined there; this avoids a ZeroDivisionError (or a silent inf) when a
    baseline metric happens to be 0.
    """
    if reference_value == 0:
        return float('nan')
    return ((new_value - reference_value) / reference_value) * 100


# Precision@K of both models, one table row per K.
k_values = [5, 10, 20, 50, 100]
comparison_data = []

for k in k_values:
    baseline_prec = baseline_results['topk']['summary'][k]['precision@k']
    finetuned_prec = results['topk']['summary'][k]['precision@k']

    comparison_data.append({
        'K': k,
        'Baseline Precision': baseline_prec,
        'Fine-tuned Precision': finetuned_prec,
        'Improvement': finetuned_prec - baseline_prec,
        'Improvement %': _percent_change(finetuned_prec, baseline_prec)
    })

comparison_df = pd.DataFrame(comparison_data)

print("\n" + "="*80)
print("PRECISION@K COMPARISON: Baseline vs Fine-tuned (DCAT Triplet Loss)")
print("="*80)
print(comparison_df.to_string(index=False))

# AUC comparison: absolute difference followed by the relative change.
baseline_auc = baseline_results['auc']
finetuned_auc = results['auc']

print("\n" + "="*80)
print("AUC METRICS")
print("="*80)
print(f"Baseline AUC-ROC:    {baseline_auc['auc_roc']:.4f}")
print(f"Fine-tuned AUC-ROC:  {finetuned_auc['auc_roc']:.4f}")
print(f"Improvement:         {finetuned_auc['auc_roc'] - baseline_auc['auc_roc']:.4f} " +
      f"({_percent_change(finetuned_auc['auc_roc'], baseline_auc['auc_roc']):.1f}%)")
print()
print(f"Baseline AUC-PR:     {baseline_auc['auc_pr']:.4f}")
print(f"Fine-tuned AUC-PR:   {finetuned_auc['auc_pr']:.4f}")
print(f"Improvement:         {finetuned_auc['auc_pr'] - baseline_auc['auc_pr']:.4f} " +
      f"({_percent_change(finetuned_auc['auc_pr'], baseline_auc['auc_pr']):.1f}%)")

# Visualize the baseline's link-prediction results.
plot_link_prediction_results(baseline_results['topk'], baseline_results['auc'])

In [None]:
from sklearn.metrics import roc_curve

# Side-by-side comparison plot: ROC curves (left) and Hit Rate @ K (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# AUC-ROC
ax = axes[0]

# Each ROC curve pairs a model's scores with the labels produced by that
# same evaluation run. The baseline curve previously reused the fine-tuned
# run's labels, which is only valid if both runs evaluated exactly the same
# pairs in the same order.
fpr, tpr, _ = roc_curve(results['auc']['y_true'], results['auc']['y_scores'])
baseline_fpr, baseline_tpr, _ = roc_curve(
    baseline_results['auc']['y_true'],
    baseline_results['auc']['y_scores'],
)

ax.plot(baseline_fpr, baseline_tpr, linewidth=2, label=f"Baseline = {baseline_results['auc']['auc_roc']:.3f}", color='#95a5a6')
ax.plot(fpr, tpr, linewidth=2, label=f"D-CAT = {results['auc']['auc_roc']:.3f}", color='#3498db')
ax.plot([0, 1], [0, 1], 'k--', alpha=0.3)  # chance diagonal
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('AUC-ROC', fontsize=13, fontweight='bold')
ax.legend(fontsize=11)

# Hit Rate @ K (uses k_values defined in the comparison-table cell)
ax = axes[1]
x = np.arange(len(k_values))
width = 0.35

# These are hit rates, not precisions, so they are named accordingly.
baseline_hit_rates = [baseline_results['topk']['summary'][k]['hit_rate@k'] for k in k_values]
finetuned_hit_rates = [results['topk']['summary'][k]['hit_rate@k'] for k in k_values]

# Grouped bars: baseline to the left of each tick, fine-tuned to the right.
bars1 = ax.bar(x - width/2, baseline_hit_rates, width, label='Baseline', alpha=0.8, color='#95a5a6')
bars2 = ax.bar(x + width/2, finetuned_hit_rates, width, label='D-CAT', alpha=0.8, color='#3498db')

ax.set_xlabel('K', fontsize=12)
ax.set_ylabel('Hits@K', fontsize=12)
ax.set_title('Link Prediction: Hits@K', fontsize=13, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(k_values)
ax.legend(fontsize=11)
ax.grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Map each node id (as a string) to its "title abstract" text for the
# content evaluation. Missing titles/abstracts count as empty strings, and
# nodes whose combined text ends up empty are left out.
text_dict = {}
node_fields = zip(metadata_df['id'], metadata_df['title'], metadata_df['abstract'])
for raw_id, raw_title, raw_abstract in node_fields:
    title = '' if pd.isna(raw_title) else str(raw_title)
    abstract = '' if pd.isna(raw_abstract) else str(raw_abstract)

    combined_text = f"{title} {abstract}".strip()
    if not combined_text:
        continue
    text_dict[str(raw_id)] = combined_text

print(f"Created text dictionary with {len(text_dict)} entries")

# Sanity check: how many node ids have both an embedding and a text entry.
overlap = set(embeddings_dict.keys()).intersection(text_dict.keys())
print(f"Overlap: {len(overlap)} keys in common")

In [None]:
# Put the cat directory on the import path so the `utils` package resolves.
cat_dir = BASE_DIR / "cat"
sys.path.insert(0, str(cat_dir))

from utils.evaluation.content_eval import evaluate_content_preservation

# Content-preservation evaluation of fine-tuned vs. baseline embeddings on
# the held-out nodes, with a fixed sample size and random state.
content_eval_args = dict(
    embeddings_dict=embeddings_dict,
    baseline_embeddings_dict=baseline_embeddings,
    content_dict=text_dict,
    test_nodes=test_val_nodes,
    sample_size=2000,
    random_state=42,
)
content_results = evaluate_content_preservation(**content_eval_args)

In [None]:
# Spearman correlations reported by the content-preservation evaluation.
baseline_spearman = content_results['baseline']['spearman_correlation']
finetuned_spearman = content_results['finetuned']['spearman_correlation']

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# A single group of two bars, one per model.
x = np.arange(1)
width = 0.35

bars1 = ax.bar(x - width/2, [baseline_spearman], width, label='Baseline', alpha=0.8, color='#95a5a6')
bars2 = ax.bar(x + width/2, [finetuned_spearman], width, label='D-CAT', alpha=0.8, color='#3498db')

ax.set_ylabel('Spearman Correlation', fontsize=12)
ax.set_title('Content Preservation', fontsize=13, fontweight='bold')
# There is only one group, so the x axis carries no information: hide its
# ticks and labels. Done on `ax` directly rather than through pyplot's
# implicit "current axes".
ax.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)
ax.legend(fontsize=11)
ax.grid(True, axis='y', alpha=0.3)

plt.show()

In [None]:
from utils.evaluation.graph_dist_eval import evaluate_distance_correlation, plot_distance_correlation

edgelist_path = DATA_DIR / "network" / "oc_mini_edgelist.csv"

# Distance-correlation evaluation for the fine-tuned embeddings on the
# held-out nodes (cosine embedding distance, stratified sampling).
finetuned_distance_args = dict(
    edgelist_path=edgelist_path,
    embeddings_dict=embeddings_dict,
    test_nodes=test_val_nodes,
    num_samples_per_node=100,
    max_graph_distance=10,
    embedding_distance_metric='cosine',
    sampling_strategy='stratified',
)
finetuned_distance_evaluation = evaluate_distance_correlation(**finetuned_distance_args)

plot_distance_correlation(finetuned_distance_evaluation)

In [None]:
# Same distance-correlation evaluation, this time for the baseline
# embeddings; every other setting matches the fine-tuned run.
baseline_distance_args = dict(
    edgelist_path=edgelist_path,
    embeddings_dict=baseline_embeddings,
    test_nodes=test_val_nodes,
    num_samples_per_node=100,
    max_graph_distance=10,
    embedding_distance_metric='cosine',
    sampling_strategy='stratified',
)
baseline_distance_evaluation = evaluate_distance_correlation(**baseline_distance_args)

plot_distance_correlation(baseline_distance_evaluation)

In [None]:
# Clustering-structure evaluation, starting with the baseline embeddings.
from cluster_utils import DisjointClustering
from utils.evaluation.clustering_eval import evaluate_clustering_structure

# Wrap the clustering table loaded earlier in the project's clustering object.
clustering = DisjointClustering(clustering_df)

baseline_cluster_args = dict(
    clustering=clustering,
    embeddings_dict=baseline_embeddings,
    test_nodes=test_val_nodes,
    distance_sample_size=2000,
    random_state=42,
)
baseline_cluster_results = evaluate_clustering_structure(**baseline_cluster_args)

In [None]:
# Clustering-structure evaluation for the fine-tuned embeddings, with the
# same clustering object, held-out nodes, sample size and random state as
# the baseline run.
finetuned_cluster_args = dict(
    clustering=clustering,
    embeddings_dict=embeddings_dict,
    test_nodes=test_val_nodes,
    distance_sample_size=2000,
    random_state=42,
)
finetuned_cluster_results = evaluate_clustering_structure(**finetuned_cluster_args)

In [None]:
def _to_jsonable(obj):
    """Fallback for json.dumps: convert NumPy values to plain Python ones.

    Arrays become (nested) lists and NumPy scalars (floating, integer, bool,
    ...) become the matching Python scalar. Anything else raises TypeError,
    which is what the json module expects from a `default` hook; returning
    the object unchanged would instead surface as a misleading
    "Circular reference detected" error.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Save results
results_dir = BASE_DIR / "cat" / "dcat" / "experiments" / "results"
results_dir.mkdir(parents=True, exist_ok=True)

# Output file name -> result object; every file goes through the same
# converter, so arrays are handled uniformly for all of them.
results_to_save = {
    "content_results.json": content_results,
    "finetuned_cluster_results.json": finetuned_cluster_results,
    "baseline_cluster_results.json": baseline_cluster_results,
    "baseline_distance_results.json": baseline_distance_evaluation,
    "finetuned_distance_results.json": finetuned_distance_evaluation,
    "link_prediction_results.json": results,
    "baseline_link_prediction_results.json": baseline_results,
}

for filename, payload in results_to_save.items():
    # Serialize fully in memory first so a serialization error cannot leave
    # a truncated JSON file behind.
    serialized = json.dumps(payload, default=_to_jsonable)
    with open(results_dir / filename, 'w') as f:
        f.write(serialized)

print("✓ All results saved to:", results_dir)