In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, ndcg_score
from collections import defaultdict
import matplotlib.pyplot as plt

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load entity embeddings from pretrained .vec file
def load_entity_embedding(path):
    embedding_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            key = parts[0]
            vec = np.array([float(x) for x in parts[1:]], dtype=np.float32)
            embedding_dict[key] = vec
    return embedding_dict

# One-hot encode a pandas Series
def one_hot_encode(series, mapping=None):
    if mapping is None:
        unique_vals = sorted(series.dropna().unique())
        mapping = {val: idx for idx, val in enumerate(unique_vals)}
    dim = len(mapping)
    mat = np.zeros((len(series), dim), dtype=np.float32)
    for i, val in enumerate(series):
        if val in mapping:
            mat[i, mapping[val]] = 1.0
    return mat, mapping

# Combine multiple content-based features into a dense vector for each news item
def compute_news_embeddings(news_df,
                            entity_embeddings,
                            ent_dim,
                            tfidf_title=None,
                            tfidf_abs=None,
                            cat_mapping=None,
                            subcat_mapping=None):
    # Encode categorical features
    cat_vecs, cat_mapping = one_hot_encode(news_df['Category'], mapping=cat_mapping)
    subcat_vecs, subcat_mapping = one_hot_encode(news_df['SubCategory'], mapping=subcat_mapping)

    # Encode textual features using TF-IDF (title & abstract)
    if tfidf_title is None:
        tfidf_title = TfidfVectorizer(max_features=100)
        title_mat = tfidf_title.fit_transform(news_df['Title'].fillna('')).toarray()
    else:
        title_mat = tfidf_title.transform(news_df['Title'].fillna('')).toarray()

    if tfidf_abs is None:
        tfidf_abs = TfidfVectorizer(max_features=100)
        abs_mat = tfidf_abs.fit_transform(news_df['Abstract'].fillna('')).toarray()
    else:
        abs_mat = tfidf_abs.transform(news_df['Abstract'].fillna('')).toarray()

    # Feature dimensions
    cat_dim = cat_vecs.shape[1]
    subcat_dim = subcat_vecs.shape[1]
    title_dim = title_mat.shape[1]
    abs_dim = abs_mat.shape[1]
    total_dim = ent_dim + cat_dim + subcat_dim + title_dim + abs_dim

    # Build embedding for each news article
    news_features = {}
    for idx, row in news_df.iterrows():
        ent_ids = str(row['Entity']).split(';') if pd.notna(row['Entity']) else []
        vecs = [entity_embeddings[e] for e in ent_ids if e in entity_embeddings]
        ent_vec = np.mean(vecs, axis=0) if vecs else np.zeros(ent_dim, dtype=np.float32)

        full = np.concatenate([
            ent_vec,
            cat_vecs[idx],
            subcat_vecs[idx],
            title_mat[idx],
            abs_mat[idx]
        ]).astype(np.float32)

        news_features[row['NewsID']] = full

    return news_features, total_dim, tfidf_title, tfidf_abs, cat_mapping, subcat_mapping

# Aggregate clicked news embeddings into test-time user embeddings
def compute_user_history_embedding(behaviors_path, news_embed_path):
    # Load precomputed news vectors
    news_vec = {}
    with open(news_embed_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            news_id = parts[0]
            vec = np.array([float(x) for x in parts[1:]], dtype=np.float32)
            news_vec[news_id] = vec

    # Parse click logs
    behaviors = pd.read_csv(behaviors_path, sep='\t', header=None,
        names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])
    user_clicks = defaultdict(list)
    for _, row in behaviors.iterrows():
        for imp in row['Impressions'].split():
            nid, label = imp.split('-')
            if int(label) == 1:
                user_clicks[row['UserID']].append(nid)

    # Average clicked news vectors per user
    user_embed = {}
    for uid, nids in user_clicks.items():
        vecs = [news_vec[nid] for nid in nids if nid in news_vec]
        if vecs:
            user_embed[uid] = np.mean(vecs, axis=0)
    return user_embed

# Build user-news bipartite graph for GNN input
def build_graph(news_path,
                behaviors_path,
                entity_embedding_path,
                tfidf_title=None,
                tfidf_abs=None,
                cat_mapping=None,
                subcat_mapping=None,
                user_history_embedding=None):

    # Load raw data
    behaviors = pd.read_csv(behaviors_path, sep='\t', header=None,
        names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])
    news = pd.read_csv(news_path, sep='\t', header=None,
        names=['NewsID','Category','SubCategory','Title','Abstract',
               'URL','Entity','AbstractEntities'])

    # Compute embeddings
    entity_embeddings = load_entity_embedding(entity_embedding_path)
    ent_dim = len(next(iter(entity_embeddings.values())))
    news_embeds, feat_dim, tfidf_title, tfidf_abs, \
        cat_mapping, subcat_mapping = compute_news_embeddings(
            news, entity_embeddings, ent_dim,
            tfidf_title, tfidf_abs,
            cat_mapping, subcat_mapping
        )

    # Assign unique node indices
    user2id = {u: i for i, u in enumerate(behaviors['UserID'].unique())}
    news2id = {n: i for i, n in enumerate(news['NewsID'].unique())}

    # Create edges: user → news (offset by user count)
    edges, labels, pairs = [], [], []
    for _, row in behaviors.iterrows():
        u = user2id[row['UserID']]
        for imp in row['Impressions'].split():
            nid, lab = imp.split('-')
            if nid not in news2id:
                continue
            v = news2id[nid] + len(user2id)
            edges.append((u, v))
            labels.append(int(lab))
            pairs.append((u, v))

    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    y = torch.tensor(labels, dtype=torch.float32, device=device)

    # Node feature matrix
    total_nodes = len(user2id) + len(news2id)
    x = torch.zeros((total_nodes, feat_dim), dtype=torch.float32)

    # Initialize user features
    for uid, idx in user2id.items():
        if user_history_embedding and uid in user_history_embedding:
            x[idx] = torch.tensor(user_history_embedding[uid], dtype=torch.float32)
        else:
            x[idx] = torch.randn(feat_dim) * 0.01  # random init for train users

    # Initialize news features
    for nid, idx in news2id.items():
        vec = news_embeds.get(nid, np.zeros(feat_dim, dtype=np.float32))
        x[len(user2id) + idx] = torch.tensor(vec)

    data = Data(x=x.to(device), edge_index=edge_index.to(device), y=y)
    data.user_news_pairs = pairs
    return data, feat_dim, tfidf_title, tfidf_abs, cat_mapping, subcat_mapping

# Define 3-layer GraphSAGE model + MLP classifier
def GNNRecommender(in_dim, hidden_dim):
    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.gnn1 = SAGEConv(in_dim, hidden_dim)
            self.gnn2 = SAGEConv(hidden_dim, hidden_dim)
            self.gnn3 = SAGEConv(hidden_dim, hidden_dim)
            self.classifier = nn.Sequential(
                nn.Linear(hidden_dim * 2, 64),
                nn.ReLU(),
                nn.Linear(64, 1)
            )

        def forward(self, data):
            x = torch.relu(self.gnn1(data.x, data.edge_index))
            x = torch.relu(self.gnn2(x, data.edge_index))
            x = self.gnn3(x, data.edge_index)
            return x

        def predict_logits(self, x, edge_index):
            h_u = x[edge_index[0]]
            h_v = x[edge_index[1]]
            return self.classifier(torch.cat([h_u, h_v], dim=1)).squeeze()

    return Model().to(device)

# Training loop with full-batch graph training
def train_model(train_data, test_data, model, epochs=10, lr=1e-3):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([10.0], device=device))
    model.train()
    for ep in range(epochs):
        optimizer.zero_grad()
        x = model(train_data)
        logits = model.predict_logits(x, train_data.edge_index)
        loss = loss_fn(logits, train_data.y)
        loss.backward()
        optimizer.step()
        print(f"Epoch {ep+1}/{epochs}  Loss: {loss.item():.4f}")

        # Evaluate for overfitting
        print("→ Train Eval", end=' | ')
        auc_t, ndcg_t = evaluate_model(train_data, model, k=5, silent=True)
        print("→ Test Eval", end=' | ')
        auc_v, ndcg_v = evaluate_model(test_data, model, k=5, silent=True)

        train_auc_list.append(auc_t)
        test_auc_list.append(auc_v)
        train_ndcg_list.append(ndcg_t)
        test_ndcg_list.append(ndcg_v)

# Per-user evaluation: AUC + ranking metrics
def evaluate_model(data, model, k=5, silent=False):
    model.eval()
    with torch.no_grad():
        x = model(data)
        probs = torch.sigmoid(model.predict_logits(x, data.edge_index)).cpu().numpy()
        labels = data.y.cpu().numpy()
        pairs = data.user_news_pairs

        user_dict = defaultdict(list)
        for i, (u, v) in enumerate(pairs):
            user_dict[u].append((labels[i], probs[i]))

        ndcg_list, prec_list, rec_list, hit_list = [], [], [], []
        y_true_all, y_score_all = [], []
        for u, items in user_dict.items():
            items = sorted(items, key=lambda x: x[1], reverse=True)
            topk = items[:k]
            y_true = [t for t,_ in items]
            y_score = [s for _,s in items]
            y_topk = [t for t,_ in topk]
            if sum(y_true)==0: continue
            ndcg_list.append(ndcg_score([y_true], [y_score], k=k))
            prec_list.append(sum(y_topk)/k)
            rec_list.append(sum(y_topk)/sum(y_true))
            hit_list.append(1.0 if sum(y_topk)>0 else 0.0)
            y_true_all.extend(y_true)
            y_score_all.extend(y_score)

        auc = roc_auc_score(y_true_all, y_score_all)
        if not silent:
            print(f"AUC: {auc:.4f}, nDCG@{k}: {np.mean(ndcg_list):.4f}, "
                  f"Precision@{k}: {np.mean(prec_list):.4f}, "
                  f"Recall@{k}: {np.mean(rec_list):.4f}, "
                  f"HitRate@{k}: {np.mean(hit_list):.4f}")
        return auc, np.mean(ndcg_list)

# Main execution flow
def main():
    user_embed = compute_user_history_embedding(
        'train/behaviors.tsv',
        'train/entity_embedding.vec'
    )

    print("=== Building TRAIN graph ===")
    train_data, feat_dim, tfidf_title, tfidf_abs, cat_map, subcat_map = build_graph(
        'train/news.tsv',
        'train/behaviors.tsv',
        'train/entity_embedding.vec',
        user_history_embedding=None
    )

    print("=== Building TEST graph ===")
    test_data, _, _, _, _, _ = build_graph(
        'test/news.tsv',
        'test/behaviors.tsv',
        'test/entity_embedding.vec',
        tfidf_title=tfidf_title,
        tfidf_abs=tfidf_abs,
        cat_mapping=cat_map,
        subcat_mapping=subcat_map,
        user_history_embedding=user_embed
    )
    print(feat_dim)
    model = GNNRecommender(feat_dim, hidden_dim=64).to(device)
    train_model(train_data, test_data, model, epochs=50, lr=1e-3)

    print("\n=== Final Evaluation on TEST set ===")
    evaluate_model(test_data, model)

    # Plot learning curves
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_auc_list, label='Train AUC')
    plt.plot(test_auc_list, label='Test AUC')
    plt.xlabel('Epoch')
    plt.ylabel('AUC')
    plt.title('AUC over Epochs')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_ndcg_list, label='Train nDCG@5')
    plt.plot(test_ndcg_list, label='Test nDCG@5')
    plt.xlabel('Epoch')
    plt.ylabel('nDCG@5')
    plt.title('nDCG@5 over Epochs')
    plt.legend()

    plt.tight_layout()
    plt.show()

if __name__ == '__main__':
    main()

Using device: cuda
=== Building TRAIN graph ===
=== Building TEST graph ===
581


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.65 GiB. GPU 0 has a total capacity of 6.00 GiB of which 4.49 GiB is free. Of the allocated memory 594.43 MiB is allocated by PyTorch, and 7.57 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)