In [20]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_auc_score, average_precision_score

In [21]:
# Preferential Attachment

# 1. Load edge list (assumes columns: source,target,weight) 
EDGE_CSV = "ingredient_cooccur_graph.csv"  # path can be changed as needed

print("Loading edge list …")
edges_df = pd.read_csv(EDGE_CSV, header=0)

author_columns = ["source", "target"]
if len(edges_df.columns) < 2:
    raise ValueError("Edge CSV must contain at least two columns: source, target")

pos_edges = list(zip(edges_df["source"], edges_df["target"]))
print(f"Positive edges loaded: {len(pos_edges):,}")


# 2. Train / validation / test split 
print("Splitting into train/val/test …")
train_edges, tmp_edges = train_test_split(pos_edges, test_size=0.30, random_state=42)
val_edges,  test_edges = train_test_split(tmp_edges, test_size=0.50, random_state=42)

print(f"Train edges: {len(train_edges):,}  |  Val edges: {len(val_edges):,}  |  Test edges: {len(test_edges):,}")


# 3. Build the training graph 
G_train = nx.Graph()
all_nodes = set(edges_df["source"]).union(set(edges_df["target"]))
G_train.add_nodes_from(all_nodes)
G_train.add_edges_from(train_edges)

degree_dict = dict(G_train.degree())  # Pre‑compute degrees for PA


# 4. Preferential‑Attachment scoring function 
def pa_score(u: str, v: str) -> int:
    """PA score = degree(u) × degree(v).  If node unseen in train, degree=0."""
    return degree_dict.get(u, 0) * degree_dict.get(v, 0)


# 5. Negative sampling helper 
random.seed(42)
np.random.seed(42)

def sample_negative_edges(num_samples: int, forbidden: set) -> list:
    """Randomly sample `num_samples` non‑existing edges from the node set."""
    neg = set()
    nodes = list(all_nodes)
    while len(neg) < num_samples:
        u, v = random.sample(nodes, 2)
        if (u, v) not in forbidden and (v, u) not in forbidden and not G_train.has_edge(u, v):
            neg.add((u, v))
    return list(neg)

forbidden_pairs = set(pos_edges)
neg_test_edges = sample_negative_edges(len(test_edges), forbidden_pairs)


# 6. Evaluate on original test set
test_pairs = test_edges + neg_test_edges
test_labels = [1] * len(test_edges) + [0] * len(neg_test_edges)
pa_scores  = [pa_score(u, v) for u, v in test_pairs]

auc_orig = roc_auc_score(test_labels, pa_scores)
ap_orig  = average_precision_score(test_labels, pa_scores)

print("\nPA – Original test metrics:")
print(f"  AUC: {auc_orig:.4f},  AP: {ap_orig:.4f}")


# 7. Rare‑subset evaluation (nodes with degree ≤ 25th percentile) 
deg_values = list(degree_dict.values())
if len(deg_values) == 0:
    raise ValueError("Graph has no edges after train split – check data.")

deg_threshold = np.percentile(deg_values, 25)

def is_rare(edge):
    u, v = edge
    return degree_dict.get(u, 0) <= deg_threshold and degree_dict.get(v, 0) <= deg_threshold

rare_pos = [e for e in test_edges     if is_rare(e)]
rare_neg = [e for e in neg_test_edges if is_rare(e)]

if len(rare_pos) > 0 and len(rare_neg) > 0:
    rare_pairs  = rare_pos + rare_neg
    rare_labels = [1]*len(rare_pos) + [0]*len(rare_neg)
    rare_scores = [pa_score(u, v) for u, v in rare_pairs]

    auc_rare = roc_auc_score(rare_labels, rare_scores)
    ap_rare  = average_precision_score(rare_labels, rare_scores)

    print("\nPA – Rare test metrics (≤25th‑percentile degree nodes):")
    print(f"  AUC: {auc_rare:.4f},  AP: {ap_rare:.4f}")
else:
    print("\n[INFO] Not enough rare pairs to compute rare‑subset metrics.")





Loading edge list …
Positive edges loaded: 54,138
Splitting into train/val/test …
Train edges: 37,896  |  Val edges: 8,121  |  Test edges: 8,121

PA – Original test metrics:
  AUC: 0.9517,  AP: 0.9550

PA – Rare test metrics (≤25th‑percentile degree nodes):
  AUC: 0.6336,  AP: 0.0080


In [22]:
#  DeepWalk (torch) + MLP 
#  1. Perform random walks on graph.
#  2. Train Skip-gram model using PyTorch.
#  3. Train MLP for link prediction.
#  No gensim needed.

import torch
import torch.nn as nn

# ---------------------------- Hyper-parameters -------------------------------
EDGE_CSV = "ingredient_cooccur_graph.csv"
EMB_DIM  = 64
WALK_LEN = 20
NUM_WALKS = 10
WINDOW_SIZE = 5
SG_EPOCHS = 5
MLP_EPOCHS = 20
LR_SG = 0.01
LR_MLP = 0.001
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---------------------------- 1. Load edge list ------------------------------
print("Loading edge list …")
edf = pd.read_csv(EDGE_CSV, header=0)
positives = list(zip(edf["source"], edf["target"]))
all_nodes = set(edf["source"]).union(edf["target"])
print(f"Total positive edges: {len(positives):,}")

# ---------------------------- 2. Split data ----------------------------------
print("Splitting into train/val/test …")
train_pos, tmp_pos = train_test_split(positives, test_size=0.30, random_state=SEED)
val_pos,   test_pos = train_test_split(tmp_pos,    test_size=0.50, random_state=SEED)
print(f"Train={len(train_pos):,}, Val={len(val_pos):,}, Test={len(test_pos):,}")

# ---------------------------- 3. Build graph ---------------------------------
G = nx.Graph()
G.add_nodes_from(all_nodes)
G.add_edges_from(train_pos)
nodes = list(G.nodes())
node_to_idx = {node: idx for idx, node in enumerate(nodes)}

# ---------------------------- 4. Random Walks -------------------------------
print("Generating random walks …")
def generate_walks(G, num_walks, walk_length):
    walks = []
    for _ in range(num_walks):
        random.shuffle(nodes)
        for node in nodes:
            walk = [node]
            while len(walk) < walk_length:
                neighbors = list(G.neighbors(walk[-1]))
                if neighbors:
                    walk.append(random.choice(neighbors))
                else:
                    break
            walks.append(walk)
    return walks

walks = generate_walks(G, NUM_WALKS, WALK_LEN)

# ---------------------------- 5. Prepare Skip-gram Training Data ------------
print("Preparing skip-gram pairs …")
sg_pairs = []
for walk in walks:
    for i, center in enumerate(walk):
        window = walk[max(0, i-WINDOW_SIZE):i] + walk[i+1:i+1+WINDOW_SIZE]
        for context in window:
            sg_pairs.append((node_to_idx[center], node_to_idx[context]))

print(f"Total training pairs: {len(sg_pairs):,}")

# ---------------------------- 6. Define Skip-gram Model ----------------------
class SkipGram(nn.Module):
    def __init__(self, num_nodes, emb_dim):
        super().__init__()
        self.emb = nn.Embedding(num_nodes, emb_dim)

    def forward(self, center, context):
        center_emb = self.emb(center)
        context_emb = self.emb(context)
        scores = (center_emb * context_emb).sum(dim=1)
        return scores

num_nodes = len(nodes)
sg_model = SkipGram(num_nodes, EMB_DIM)
optimizer_sg = torch.optim.Adam(sg_model.parameters(), lr=LR_SG)
loss_fn_sg = nn.BCEWithLogitsLoss()

# ---------------------------- 7. Train Skip-gram -----------------------------
print("Training Skip-gram …")
centers = torch.tensor([c for c, _ in sg_pairs], dtype=torch.long)
contexts = torch.tensor([c for _, c in sg_pairs], dtype=torch.long)
labels = torch.ones(len(sg_pairs))

for epoch in range(1, SG_EPOCHS+1):
    sg_model.train()
    optimizer_sg.zero_grad()
    preds = sg_model(centers, contexts)
    loss = loss_fn_sg(preds, labels)
    loss.backward()
    optimizer_sg.step()
    print(f"Epoch {epoch:02d} | Skip-gram Loss: {loss.item():.4f}")

# Get final embeddings
with torch.no_grad():
    embeddings = sg_model.emb.weight.data.cpu().numpy()

# ---------------------------- 8. Edge Feature Construction ------------------
def get_edge_feature(u, v):
    """Concatenate the embeddings for edge (u,v) into a feature vector."""
    if u not in node_to_idx or v not in node_to_idx:
        return np.zeros(2 * EMB_DIM)
    emb_u = embeddings[node_to_idx[u]]
    emb_v = embeddings[node_to_idx[v]]
    return np.concatenate([emb_u, emb_v])

# ---------------------------- 9. Negative Sampling ---------------------------
print("Sampling negatives …")
def sample_neg(G, k, forb):
    neg = set()
    while len(neg) < k:
        u, v = random.sample(nodes, 2)
        if (u, v) not in forb and (v, u) not in forb and not G.has_edge(u, v):
            neg.add((u, v))
    return list(neg)

forbidden = set(positives)
neg_train = sample_neg(G, len(train_pos), forbidden)
neg_val   = sample_neg(G, len(val_pos), forbidden)
neg_test  = sample_neg(G, len(test_pos), forbidden)

# ---------------------------- 10. Build datasets -----------------------------
X_train = np.array([get_edge_feature(u,v) for u,v in train_pos+neg_train])
y_train = np.array([1]*len(train_pos) + [0]*len(neg_train))

X_val = np.array([get_edge_feature(u,v) for u,v in val_pos+neg_val])
y_val = np.array([1]*len(val_pos) + [0]*len(neg_val))

X_test = np.array([get_edge_feature(u,v) for u,v in test_pos+neg_test])
y_test = np.array([1]*len(test_pos) + [0]*len(neg_test))

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# ---------------------------- 11. Define and Train MLP -----------------------
class EdgeMLP(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(in_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    def forward(self, x):
        return self.fc(x).squeeze(-1)

print("Training MLP …")
model = EdgeMLP(2*EMB_DIM)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR_MLP)

X_train, y_train = X_train.to(device), y_train.to(device)
X_val, y_val = X_val.to(device), y_val.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

for epoch in range(1, MLP_EPOCHS+1):
    model.train()
    optimizer.zero_grad()
    logits = model(X_train)
    loss = loss_fn(logits, y_train)
    loss.backward()
    optimizer.step()
    if epoch % 5 == 0:
        model.eval()
        with torch.no_grad():
            val_logits = model(X_val)
            val_loss = loss_fn(val_logits, y_val)
            print(f"Epoch {epoch:02d} | Train Loss: {loss.item():.4f} | Val Loss: {val_loss.item():.4f}")

# ---------------------------- 12. Evaluate -----------------------------------
print("Evaluating …")
model.eval()
with torch.no_grad():
    pred = torch.sigmoid(model(X_test)).cpu().numpy()
    auc = roc_auc_score(y_test.cpu().numpy(), pred)
    ap  = average_precision_score(y_test.cpu().numpy(), pred)

print(f"\nDeepWalk (torch) + MLP – Test Metrics:")
print(f"AUC: {auc:.4f}, AP: {ap:.4f}")


Loading edge list …
Total positive edges: 54,138
Splitting into train/val/test …
Train=37,896, Val=8,121, Test=8,121
Generating random walks …
Preparing skip-gram pairs …
Total training pairs: 1,159,400
Training Skip-gram …
Epoch 01 | Skip-gram Loss: 3.2600
Epoch 02 | Skip-gram Loss: 3.1947
Epoch 03 | Skip-gram Loss: 3.1296
Epoch 04 | Skip-gram Loss: 3.0649
Epoch 05 | Skip-gram Loss: 3.0004
Sampling negatives …
Training MLP …
Epoch 05 | Train Loss: 0.6734 | Val Loss: 0.6702
Epoch 10 | Train Loss: 0.6538 | Val Loss: 0.6519
Epoch 15 | Train Loss: 0.6363 | Val Loss: 0.6357
Epoch 20 | Train Loss: 0.6200 | Val Loss: 0.6207
Evaluating …

DeepWalk (torch) + MLP – Test Metrics:
AUC: 0.7386, AP: 0.7275


In [23]:
# LINE(Large-scale Information Network Embedding): first-order

import torch
import torch.nn as nn

# ---------------------------- Hyper-parameters -------------------------------
EDGE_CSV       = "ingredient_cooccur_graph.csv"  # path to CSV with columns: source,target,weight
EMB_DIM        = 64
EPOCHS         = 5
LR             = 0.01
NEG_SAMPLES    = 5   # number of negative samples per positive edge
SEED           = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---------------------------- 1. Load edge list ------------------------------
print("Loading edge list …")
edf = pd.read_csv(EDGE_CSV)
# if weight column not present, assume weight=1
if "weight" not in edf.columns:
    edf["weight"] = 1.0
edges = list(zip(edf["source"], edf["target"], edf["weight"]))
all_nodes = sorted(set(edf["source"]).union(edf["target"]))
print(f"Total edges: {len(edges):,}")

# ---------------------------- 2. Split data ----------------------------------
print("Splitting into train/val/test …")
pairs = [(u, v) for u, v, _ in edges]
train_pos, tmp = train_test_split(pairs, test_size=0.30, random_state=SEED)
pos_val, test_pos = train_test_split(tmp, test_size=0.50, random_state=SEED)
print(f"Train={len(train_pos):,}, Val={len(pos_val):,}, Test={len(test_pos):,}")

# ---------------------------- 3. Build training graph ------------------------
print("Building NetworkX training graph …")
G = nx.Graph()
G.add_nodes_from(all_nodes)
G.add_weighted_edges_from([(u, v, w) for u, v, w in edges if (u, v) in train_pos])

node_to_idx = {n: i for i, n in enumerate(all_nodes)}
num_nodes = len(all_nodes)

# ---------------------------- 4. Define LINE model ---------------------------
class LINEFirstOrder(nn.Module):
    def __init__(self, num_nodes, emb_dim):
        super().__init__()
        self.emb_u = nn.Embedding(num_nodes, emb_dim)
        self.emb_v = nn.Embedding(num_nodes, emb_dim)
        nn.init.xavier_uniform_(self.emb_u.weight)
        nn.init.xavier_uniform_(self.emb_v.weight)

    def forward(self, u_idx, v_idx, neg_v_idx):
        # positive score
        u_emb = self.emb_u(u_idx)
        v_emb = self.emb_v(v_idx)
        pos_score = torch.sum(u_emb * v_emb, dim=-1)
        # negative scores: broadcast
        neg_emb = self.emb_v(neg_v_idx)  # shape: [B, NEG, D]
        u_emb_neg = u_emb.unsqueeze(1)   # shape: [B, 1, D]
        neg_score = torch.bmm(neg_emb, u_emb_neg.transpose(1,2)).squeeze(-1)
        return pos_score, neg_score

model = LINEFirstOrder(num_nodes, EMB_DIM)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.BCEWithLogitsLoss()

# ---------------------------- 5. Negative Sampling ---------------------------
nodes = all_nodes
all_idx = list(range(num_nodes))

print("Starting LINE training …")
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    random.shuffle(train_pos)
    for u, v in train_pos:
        u_idx = torch.tensor([node_to_idx[u]], dtype=torch.long)
        v_idx = torch.tensor([node_to_idx[v]], dtype=torch.long)
        # sample negatives
        neg_vs = random.choices(all_idx, k=NEG_SAMPLES)
        neg_v_idx = torch.tensor([neg_vs], dtype=torch.long)
        # forward
        pos_score, neg_score = model(u_idx, v_idx, neg_v_idx)
        # labels
        pos_label = torch.ones_like(pos_score)
        neg_label = torch.zeros_like(neg_score)
        # compute loss
        loss = loss_fn(pos_score, pos_label)
        loss += loss_fn(neg_score, neg_label).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}/{EPOCHS} – Loss: {total_loss/len(train_pos):.4f}")

# ---------------------------- 6. Extract embeddings -------------------------
model.eval()
with torch.no_grad():
    emb_u = model.emb_u.weight.data.cpu().numpy()
    emb_v = model.emb_v.weight.data.cpu().numpy()
# for first-order, you can average emb_u and emb_v or just use emb_u
emb = (emb_u + emb_v) / 2.0  # final node embedding matrix

# ---------------------------- 7. Evaluate link prediction -------------------
print("Evaluating LINE embeddings …")
def edge_score(u, v):
    ui, vi = node_to_idx[u], node_to_idx[v]
    return float(np.dot(emb[ui], emb[vi]))

# prepare test negatives
forbidden = set(pairs)
neg_test = []
while len(neg_test) < len(test_pos):
    u, v = random.sample(all_nodes, 2)
    if (u, v) not in forbidden and (v, u) not in forbidden and not G.has_edge(u, v):
        neg_test.append((u, v))

test_pairs = test_pos + neg_test
test_labels = [1]*len(test_pos) + [0]*len(neg_test)

scores = [edge_score(u, v) for u, v in test_pairs]
auc = roc_auc_score(test_labels, scores)
ap = average_precision_score(test_labels, scores)
print(f"LINE – Test AUC: {auc:.4f}, AP: {ap:.4f}")

# ---------------------------- 8. Rare subset --------------------------------
deg = dict(G.degree())
thresh = np.percentile(list(deg.values()), 25)
rare_pos = [e for e in test_pos if deg[e[0]]<=thresh and deg[e[1]]<=thresh]
rare_neg = [e for e in neg_test if deg[e[0]]<=thresh and deg[e[1]]<=thresh]

if rare_pos and rare_neg:
    rare_pairs = rare_pos + rare_neg
    rare_labels = [1]*len(rare_pos) + [0]*len(rare_neg)
    rare_scores = [edge_score(u, v) for u, v in rare_pairs]
    auc_r = roc_auc_score(rare_labels, rare_scores)
    ap_r  = average_precision_score(rare_labels, rare_scores)
    print(f"LINE – Rare AUC: {auc_r:.4f}, AP: {ap_r:.4f}")
else:
    print("Not enough rare-node test pairs.")



Loading edge list …
Total edges: 54,138
Splitting into train/val/test …
Train=37,896, Val=8,121, Test=8,121
Building NetworkX training graph …
Starting LINE training …
Epoch 1/5 – Loss: 2.3991
Epoch 2/5 – Loss: 2.6548
Epoch 3/5 – Loss: 2.6645
Epoch 4/5 – Loss: 2.7025
Epoch 5/5 – Loss: 2.7545
Evaluating LINE embeddings …
LINE – Test AUC: 0.4238, AP: 0.4266
LINE – Rare AUC: 0.4506, AP: 0.0041

Finished LINE baseline ✨


In [26]:
# HOPE (High-Order Proximity preserved Embedding)

import networkx as nx
from sklearn.decomposition import TruncatedSVD

# ---------------------------- Hyper-parameters -------------------------------
EDGE_CSV = "ingredient_cooccur_graph.csv"  # must contain source,target,weight
EMB_DIM = 64
TEST_RATIO = 0.3
VAL_RATIO = 0.5
BETA = 0.01    # decay factor for Katz
SEED = 42

random.seed(SEED)
np.random.seed(SEED)

# ---------------------------- 1. Load data -----------------------------------
print("Loading edge list …")
edf = pd.read_csv(EDGE_CSV)
if "weight" not in edf.columns:
    edf["weight"] = 1.0
edges = list(zip(edf["source"], edf["target"], edf["weight"]))
nodes = sorted(set(edf["source"]).union(edf["target"]))
node_to_idx = {n: i for i, n in enumerate(nodes)}
n = len(nodes)
print(f"Nodes: {n}, Edges: {len(edges)}")

# ---------------------------- 2. Train/Val/Test split ------------------------
pairs = [(u, v) for u, v, _ in edges]
train_pos, tmp = train_test_split(pairs, test_size=TEST_RATIO, random_state=SEED)
val_pos, test_pos = train_test_split(tmp, test_size=VAL_RATIO, random_state=SEED)
print(f"Train pos: {len(train_pos)}, Val pos: {len(val_pos)}, Test pos: {len(test_pos)}")

# ---------------------------- 3. Build adjacency -----------------------------
print("Building adjacency matrix …")
A = np.zeros((n, n))
for u, v, w in edges:
    if (u, v) in train_pos or (v, u) in train_pos:
        i, j = node_to_idx[u], node_to_idx[v]
        A[i, j] = w
        A[j, i] = w

# ---------------------------- 4. Compute Katz proximity -----------------------
print("Computing Katz index matrix (S) …")
I = np.eye(n)
# S = (I - beta*A)^{-1} - I
M = I - BETA * A
# use pseudo-inverse if singular
try:
    Minv = np.linalg.inv(M)
except np.linalg.LinAlgError:
    Minv = np.linalg.pinv(M)
S = Minv - I
# replace NaNs and infinite values with zero
S = np.nan_to_num(S, nan=0.0, posinf=0.0, neginf=0.0)

# ---------------------------- 5. Truncated SVD -------------------------------- Truncated SVD --------------------------------
print(f"Performing Truncated SVD for {EMB_DIM} dims …")
svd = TruncatedSVD(n_components=EMB_DIM, random_state=SEED)
embeddings = svd.fit_transform(S)
# normalize embeddings with safe division
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
# avoid division by zero
norms[norms == 0] = 1.0
embeddings = embeddings / norms
# ensure no NaNs
embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)

# ---------------------------- Helper: edge score -----------------------------
def edge_score(u, v):
    i, j = node_to_idx[u], node_to_idx[v]
    return float(np.dot(embeddings[i], embeddings[j]))

# ---------------------------- 6. Negative sampling ---------------------------
print("Generating negative test edges …")
forbidden = set(pairs)
neg_test = set()
all_nodes = list(nodes)
while len(neg_test) < len(test_pos):
    u, v = random.sample(all_nodes, 2)
    if (u, v) not in forbidden and (v, u) not in forbidden:
        neg_test.add((u, v))
neg_test = list(neg_test)

# ---------------------------- 7. Evaluate on original ------------------------
print("Evaluating on original test set …")
test_pairs = test_pos + neg_test
test_labels = [1]*len(test_pos) + [0]*len(neg_test)
scores = [edge_score(u, v) for u, v in test_pairs]
auc = roc_auc_score(test_labels, scores)
ap = average_precision_score(test_labels, scores)
print(f"HOPE – Original AUC: {auc:.4f}, AP: {ap:.4f}")

# ---------------------------- 8. Rare subset ----------------------------------
print("Evaluating on rare-node subset …")
G_train = nx.Graph()
G_train.add_nodes_from(nodes)
G_train.add_edges_from(train_pos)
deg = dict(G_train.degree())
thresh = np.percentile(list(deg.values()), 25)
rare_pos = [e for e in test_pos if deg[e[0]]<=thresh and deg[e[1]]<=thresh]
rare_neg = [e for e in neg_test if deg[e[0]]<=thresh and deg[e[1]]<=thresh]
if rare_pos and rare_neg:
    pairs_r = rare_pos + rare_neg
    labels_r = [1]*len(rare_pos) + [0]*len(rare_neg)
    scores_r = [edge_score(u, v) for u, v in pairs_r]
    auc_r = roc_auc_score(labels_r, scores_r)
    ap_r = average_precision_score(labels_r, scores_r)
    print(f"HOPE – Rare AUC: {auc_r:.4f}, AP: {ap_r:.4f}")
else:
    print("Not enough rare-node pairs for evaluation.")


Loading edge list …
Nodes: 686, Edges: 54138
Train pos: 37896, Val pos: 8121, Test pos: 8121
Building adjacency matrix …
Computing Katz index matrix (S) …
Performing Truncated SVD for 64 dims …
Generating negative test edges …
Evaluating on original test set …
HOPE – Original AUC: 0.4760, AP: 0.4637
Evaluating on rare-node subset …
HOPE – Rare AUC: 0.8460, AP: 0.2547
