<a href="https://colab.research.google.com/github/vedant75/News-Recommender-System/blob/main/MIND_GNN_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch-geometric torch-scatter torch-sparse

Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-sparse
  Downloading torch_sparse-0.6.18.tar.gz (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.0/210.0 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.8 MB/s[0

In [None]:
import os
import requests
import zipfile
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, GATConv, LGConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from sklearn.metrics import roc_auc_score
from torch.nn import ModuleList

# ==========================================
# 1. Configuration & Helper Functions
# ==========================================
# Run-wide settings.  Built with keyword arguments so each entry reads like an
# assignment; the keys and their insertion order match what the script reads.
CONFIG = dict(
    # Location of the MIND-small training archive.
    dataset_url='https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip',
    # Local directory that receives the archive and the extracted TSV files.
    data_dir='./mind_data',
    # Width of the node embeddings and of every GNN layer.
    hidden_channels=64,
    # Number of stacked message-passing layers.
    num_layers=2,
    # Kept small for demo speed (increase to 10-20 for better results).
    epochs=5,
    # Batch size for negative sampling.
    batch_size=1024,
    # Optimizer learning rate.
    lr=0.001,
    # Backbone selector.  Options: 'SAGE', 'GAT', 'LG'.
    model_type='GAT',
    # Use the GPU when one is visible to torch, otherwise the CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

def download_and_extract(url, extract_to):
    """Download the MIND-small archive and unpack it into ``extract_to``.

    Nothing is downloaded when both ``behaviors.tsv`` and ``news.tsv`` are
    already present in ``extract_to``.

    Args:
        url: HTTP(S) location of the zip archive.
        extract_to: Directory that receives the archive and its contents.
            It is created (including parents) when missing.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.RequestException: The connection failed or timed out.
        zipfile.BadZipFile: The downloaded file is not a valid zip archive.
    """
    os.makedirs(extract_to, exist_ok=True)

    # Check if files already exist to avoid re-downloading
    required_files = ('behaviors.tsv', 'news.tsv')
    if all(os.path.exists(os.path.join(extract_to, name)) for name in required_files):
        print("Dataset already exists. Skipping download.")
        return

    zip_path = os.path.join(extract_to, 'MINDsmall_train.zip')

    print(f"Downloading dataset from {url}...")
    # The context manager releases the connection even if writing fails, and
    # the timeout (connect, read) keeps a stalled server from hanging the run.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        # Fail here on 4xx/5xx instead of saving an error page as the "zip"
        # and hitting a confusing BadZipFile during extraction.
        response.raise_for_status()
        with open(zip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print("Download and extraction complete.")

# ==========================================
# 2. Data Preprocessing
# ==========================================
def load_mind_data(data_dir):
    """Build a bipartite user-news click graph from the MIND TSV files.

    Reads ``news.tsv`` and ``behaviors.tsv`` from ``data_dir``, assigns
    consecutive integer ids to users, news items and categories (in order of
    first appearance), and turns every news id listed in a user's ``history``
    column into one ``('user', 'clicks', 'news')`` edge.

    Each (user, news) pair yields exactly one edge.  ``behaviors.tsv`` has one
    row per impression and every row carries the user's history, so without
    de-duplication the same click would appear many times; a random link split
    could then put copies of one click into both the training graph and the
    held-out labels.

    Args:
        data_dir: Directory containing ``news.tsv`` and ``behaviors.tsv``.

    Returns:
        A tuple ``(data, num_users, num_news, user_id_map, news_id_map)``:
        ``data`` is the ``HeteroData`` graph (``news`` nodes carry a one-hot
        category matrix ``x``; both node types carry ``node_id``), the two
        counts are the number of distinct users / news ids, and the maps
        translate raw string ids to integer node indices.
    """
    print("Loading raw data...")
    # Load News Data
    news_df = pd.read_csv(
        os.path.join(data_dir, 'news.tsv'),
        sep='\t',
        header=None,
        names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
    )

    # Load Behaviors Data
    behaviors_df = pd.read_csv(
        os.path.join(data_dir, 'behaviors.tsv'),
        sep='\t',
        header=None,
        names=['impression_id', 'user_id', 'time', 'history', 'impressions']
    )

    # --- Mappings ---
    # Map News IDs to Integers
    unique_news = news_df['news_id'].unique()
    news_id_map = {nid: i for i, nid in enumerate(unique_news)}
    num_news = len(unique_news)

    # Map User IDs to Integers (users with an empty history keep their index;
    # they simply end up without edges)
    unique_users = behaviors_df['user_id'].unique()
    user_id_map = {uid: i for i, uid in enumerate(unique_users)}
    num_users = len(unique_users)

    # Map Categories to Integers (for features)
    unique_categories = news_df['category'].unique()
    category_map = {cat: i for i, cat in enumerate(unique_categories)}
    num_categories = len(unique_categories)

    # --- Node Features ---
    # News Features: One-Hot Encoding of Category, written with a single
    # indexed assignment instead of a per-row DataFrame loop.
    news_features = torch.zeros((num_news, num_categories))
    row_idx = torch.tensor([news_id_map[nid] for nid in news_df['news_id']], dtype=torch.long)
    col_idx = torch.tensor([category_map[cat] for cat in news_df['category']], dtype=torch.long)
    news_features[row_idx, col_idx] = 1.0

    # --- Edge Construction ---
    # Edges come from the 'history' column (news the user clicked in the past).
    # The 'impressions' column is not parsed here.
    print("Processing interaction history to build graph...")
    behaviors_df = behaviors_df.dropna(subset=['history'])

    # Identical (user, history) rows add nothing new, so parse each one once.
    user_histories = behaviors_df[['user_id', 'history']].drop_duplicates()

    src = []
    dst = []
    seen_edges = set()
    for user_id, history in zip(user_histories['user_id'], user_histories['history']):
        u_idx = user_id_map[user_id]
        for news_id in str(history).split():
            n_idx = news_id_map.get(news_id)
            if n_idx is None:
                # History may mention news ids that are absent from news.tsv.
                continue
            edge = (u_idx, n_idx)
            if edge in seen_edges:
                continue
            seen_edges.add(edge)
            src.append(u_idx)
            dst.append(n_idx)

    # reshape keeps the (2, num_edges) layout even when there are no edges.
    edge_index = torch.tensor([src, dst], dtype=torch.long).reshape(2, -1)

    # --- Create HeteroData Object ---
    data = HeteroData()

    # Users have no static features; the model looks up a learnable embedding
    # through node_id.
    data['user'].num_nodes = num_users
    data['user'].node_id = torch.arange(num_users)

    data['news'].num_nodes = num_news
    data['news'].x = news_features  # Category one-hot
    data['news'].node_id = torch.arange(num_news)

    # Add Edges
    data['user', 'clicks', 'news'].edge_index = edge_index

    return data, num_users, num_news, user_id_map, news_id_map

# ==========================================
# 3. Model Architecture
# ==========================================

class GNN(nn.Module):
    """Dictionary-based GNN backbone; only the LightGCN ('LG') path is runnable.

    For ``conv_type == "LG"`` the user and news embeddings are stacked into one
    homogeneous node set, propagated through ``num_layers`` ``LGConv`` layers
    over a bidirectional copy of the ``('user', 'clicks', 'news')`` edges, and
    split back per node type.

    'SAGE' and 'GAT' layers can still be constructed (for backward
    compatibility), but ``forward`` does not implement them: those backbones
    take plain ``(x, edge_index)`` inputs and are meant to be wrapped with
    ``to_hetero`` instead.

    Args:
        hidden_channels: Input and output width of the SAGE/GAT layers.
        num_layers: Number of convolution layers.
        conv_type: One of ``"SAGE"``, ``"GAT"`` or ``"LG"``.

    Raises:
        ValueError: If ``conv_type`` is not one of the supported names.
    """

    def __init__(self, hidden_channels, num_layers, conv_type):
        super().__init__()
        self.num_layers = num_layers
        self.conv_type = conv_type

        if conv_type == "SAGE":
            self.convs = ModuleList(SAGEConv(hidden_channels, hidden_channels) for _ in range(num_layers))
        elif conv_type == "GAT":
            # add_self_loops=False is important for bipartite graphs in some PyG versions
            self.convs = ModuleList(GATConv(hidden_channels, hidden_channels, heads=2, concat=False, add_self_loops=False) for _ in range(num_layers))
        elif conv_type == "LG":
            self.convs = ModuleList(LGConv() for _ in range(num_layers))
        else:
            # Previously an unknown name left `self.convs` undefined and only
            # failed much later with an AttributeError.
            raise ValueError(
                f"Unsupported conv_type {conv_type!r}; expected 'SAGE', 'GAT' or 'LG'."
            )

    def forward(self, x_dict, edge_index_dict):
        """Propagate embeddings with LightGCN.

        Args:
            x_dict: Mapping with ``'user'`` and ``'news'`` embedding tensors.
            edge_index_dict: Mapping that contains the
                ``('user', 'clicks', 'news')`` edge index.

        Returns:
            Dict with the propagated ``'user'`` and ``'news'`` embeddings.

        Raises:
            NotImplementedError: If the module was built with a conv type
                other than ``"LG"``.
        """
        if self.conv_type != "LG":
            # This branch used to fall through and return None silently.
            raise NotImplementedError(
                f"GNN.forward only implements 'LG'; got {self.conv_type!r}. "
                "Use a plain (x, edge_index) GNN wrapped with to_hetero for SAGE/GAT."
            )

        x_user = x_dict['user']
        x_news = x_dict['news']

        # Virtual homogeneous node set: users occupy [0, num_users),
        # news occupy [num_users, num_users + num_news).
        num_users = x_user.size(0)
        x = torch.cat([x_user, x_news], dim=0)

        edge_index_user_news = edge_index_dict[('user', 'clicks', 'news')]
        src = edge_index_user_news[0]
        dst = edge_index_user_news[1] + num_users  # shift news into the joint index space

        # Bi-directional edges for LightGCN propagation
        edge_index = torch.cat([
            torch.stack([src, dst], dim=0),
            torch.stack([dst, src], dim=0)
        ], dim=1)

        # No activation between layers.
        for conv in self.convs:
            x = conv(x, edge_index)

        # Split back into the two node types.
        return {'user': x[:num_users], 'news': x[num_users:]}

class StandardGNN(nn.Module):
    """Homogeneous SAGE/GAT stack intended to be converted by ``to_hetero``.

    Args:
        hidden_channels: Input and output width of every layer.
        num_layers: Number of convolution layers.
        conv_type: ``"SAGE"`` or ``"GAT"``.

    Raises:
        ValueError: If ``conv_type`` is anything else.
    """

    def __init__(self, hidden_channels, num_layers, conv_type):
        super().__init__()
        self.conv_type = conv_type
        self.num_layers = num_layers
        if conv_type == "SAGE":
            self.convs = ModuleList(SAGEConv(hidden_channels, hidden_channels) for _ in range(num_layers))
        elif conv_type == "GAT":
            self.convs = ModuleList(GATConv(hidden_channels, hidden_channels, heads=2, concat=False, add_self_loops=False) for _ in range(num_layers))
        else:
            # Without this, an unsupported name produced a module that has no
            # `convs` and crashed only on the first forward pass.
            raise ValueError(
                f"Unsupported conv_type {conv_type!r}; StandardGNN handles 'SAGE' and 'GAT'."
            )

    def forward(self, x, edge_index):
        """Apply every layer in turn, with ReLU between (not after) layers."""
        last = self.num_layers - 1
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i < last:
                x = F.relu(x)
        return x

class Classifier(nn.Module):
    """Score candidate user-news pairs by the dot product of their embeddings."""

    def forward(self, x_user: torch.Tensor, x_news: torch.Tensor, edge_label_index: torch.Tensor) -> torch.Tensor:
        """Return one score per column of ``edge_label_index``.

        Row 0 of ``edge_label_index`` indexes into ``x_user`` and row 1 into
        ``x_news``.
        """
        user_rows = edge_label_index[0]
        news_rows = edge_label_index[1]
        pairwise = torch.mul(x_user[user_rows], x_news[news_rows])
        return torch.sum(pairwise, dim=-1)

class Model(nn.Module):
    """Link-prediction model: learnable node embeddings, a GNN backbone and a
    dot-product scorer.

    Args:
        hidden_channels: Embedding / hidden width.
        data: Graph whose ``'news'`` store provides ``x`` and ``num_nodes`` and
            whose ``'user'`` store provides ``num_nodes``; its ``metadata()``
            is used when converting the SAGE/GAT backbone with ``to_hetero``.
        num_layers: Number of GNN layers.
        conv_type: ``"LG"`` selects the dictionary-based ``GNN``; any other
            value is passed to ``StandardGNN`` and wrapped with ``to_hetero``.
    """

    def __init__(self, hidden_channels, data, num_layers=2, conv_type="SAGE"):
        super().__init__()
        self.conv_type = conv_type

        news_store = data['news']
        user_store = data['user']

        # News start from a projection of the category one-hot plus a per-item
        # embedding; users start from a per-user embedding only.
        self.news_lin = nn.Linear(news_store.x.size(1), hidden_channels)
        self.news_emb = nn.Embedding(news_store.num_nodes, hidden_channels)
        self.user_emb = nn.Embedding(user_store.num_nodes, hidden_channels)

        # Backbone: LightGCN works on the dicts directly, everything else is a
        # homogeneous stack converted to a heterogeneous one.
        self.gnn = (
            GNN(hidden_channels, num_layers, conv_type)
            if conv_type == "LG"
            else to_hetero(
                StandardGNN(hidden_channels, num_layers, conv_type),
                data.metadata(),
                aggr='mean',
            )
        )

        self.classifier = Classifier()

    def forward(self, data):
        """Score the pairs in ``data['user', 'clicks', 'news'].edge_label_index``."""
        node_embeddings = self.get_embeddings(data)
        target_pairs = data['user', 'clicks', 'news'].edge_label_index
        return self.classifier(
            node_embeddings['user'],
            node_embeddings['news'],
            target_pairs,
        )

    def get_embeddings(self, data):
        """Return the backbone output for the initial user/news embeddings."""
        news_store = data['news']
        initial = {
            'user': self.user_emb(data['user'].node_id),
            'news': self.news_lin(news_store.x) + self.news_emb(news_store.node_id),
        }
        # Both backbone variants share the same call signature.
        return self.gnn(initial, data.edge_index_dict)

def bpr_loss(user_emb, news_emb, pos_edge_index, neg_edge_index):
    """Bayesian Personalized Ranking loss over paired positive/negative edges.

    The i-th positive edge is compared with the i-th negative edge; scores are
    dot products of the indexed user and news embeddings.

    Args:
        user_emb: User embedding matrix, indexed by row 0 of the edge indices.
        news_emb: News embedding matrix, indexed by row 1 of the edge indices.
        pos_edge_index: ``[2, P]`` index tensor of observed edges.
        neg_edge_index: ``[2, N]`` index tensor of sampled negative edges.

    Returns:
        Scalar tensor ``mean(-log(sigmoid(pos_score - neg_score)))``.  When the
        two edge sets differ in length, only the first ``min(P, N)`` pairs are
        used.  With no pairs at all the mean of an empty tensor (NaN) is
        returned.
    """
    # Truncate to a common length so pairs line up if the batches differ.
    num_pairs = min(pos_edge_index.size(1), neg_edge_index.size(1))
    pos_edge_index = pos_edge_index[:, :num_pairs]
    neg_edge_index = neg_edge_index[:, :num_pairs]

    pos_scores = (user_emb[pos_edge_index[0]] * news_emb[pos_edge_index[1]]).sum(dim=-1)
    neg_scores = (user_emb[neg_edge_index[0]] * news_emb[neg_edge_index[1]]).sum(dim=-1)

    # logsigmoid is the numerically stable form of log(sigmoid(x)).  The
    # previous log(sigmoid(x) + 1e-15) capped the loss near 34.5 and produced a
    # zero gradient once sigmoid underflowed for badly ranked pairs.
    return -F.logsigmoid(pos_scores - neg_scores).mean()

def random_negative_sampling(edge_index, num_news, num_neg_samples=None):
    """Draw random (user, news) pairs to serve as negative edges.

    News indices are sampled uniformly from ``[0, num_news)``.  Sampled pairs
    are NOT checked against the observed edges, so a "negative" can
    occasionally coincide with a real click.

    Args:
        edge_index: ``[2, E]`` tensor of positive edges; row 0 holds the users.
        num_news: Number of news nodes to sample from.
        num_neg_samples: Number of negatives to return.  Defaults to ``E``, in
            which case the i-th negative reuses the user of the i-th positive.
            For any other value the users are drawn at random (with
            replacement) from the users of the positive edges.

    Returns:
        ``[2, num_neg_samples]`` long tensor on the same device as
        ``edge_index``.

    Raises:
        ValueError: If negatives are requested but ``edge_index`` has no edges
            to take users from.
    """
    users = edge_index[0]
    num_pos = users.size(0)
    if num_neg_samples is None:
        num_neg_samples = num_pos

    # Randomly sample news items as negatives
    neg_news = torch.randint(0, num_news, (num_neg_samples,), device=edge_index.device)

    if num_neg_samples != num_pos:
        # Stacking `users` directly would fail on the length mismatch, so pick
        # a source user for every requested negative instead.
        if num_pos == 0:
            raise ValueError("Cannot sample negative edges: edge_index contains no edges.")
        picks = torch.randint(0, num_pos, (num_neg_samples,), device=edge_index.device)
        users = users[picks]

    return torch.stack([users, neg_news], dim=0)

# ==========================================
# 4. Main Execution
# ==========================================
def main():
    """Run the whole pipeline end to end.

    Steps: download the dataset, build the user-news graph, add reverse edges,
    split the click edges into train/val/test, train with a BPR loss on
    randomly sampled negatives (one full-graph step per epoch), report ROC-AUC
    on the validation and test splits, and print the top-K scored news titles
    for the first ten users.
    """
    print("--- 1. Downloading Data ---")
    download_and_extract(CONFIG['dataset_url'], CONFIG['data_dir'])

    print("--- 2. Processing Data ---")
    data, _num_users, num_news, _user_map, news_map = load_mind_data(CONFIG['data_dir'])

    # Add the reverse ('news', 'rev_clicks', 'user') edges so messages flow in
    # both directions.  Without them user nodes are never a message
    # destination in the to_hetero model, and the `rev_edge_types` handed to
    # RandomLinkSplit below would refer to an edge type that does not exist.
    data = ToUndirected()(data)

    print("--- 3. Splitting Data (Train/Val/Test) ---")
    # RandomLinkSplit splits the 'edge_index' into train/val/test sets
    # and creates 'edge_label_index' for prediction.
    transform = RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        neg_sampling_ratio=1.0, # Generates 1 negative for every positive in val/test
        add_negative_train_samples=False, # We do this manually in training for BPR
        edge_types=[('user', 'clicks', 'news')],
        rev_edge_types=[('news', 'rev_clicks', 'user')]
    )

    train_data, val_data, test_data = transform(data)

    device = CONFIG['device']
    train_data = train_data.to(device)
    val_data = val_data.to(device)
    test_data = test_data.to(device)

    print(f"--- 4. Initializing {CONFIG['model_type']} Model ---")
    model = Model(
        hidden_channels=CONFIG['hidden_channels'],
        data=data,
        num_layers=CONFIG['num_layers'],
        conv_type=CONFIG['model_type']
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['lr'])

    print("--- 5. Training Loop ---")
    for epoch in range(1, CONFIG['epochs'] + 1):
        model.train()

        # Get embeddings
        x_dict = model.get_embeddings(train_data)

        # Positive edges from training set
        pos_edge_index = train_data['user', 'clicks', 'news'].edge_label_index

        # One random news item per positive edge.  The samples are not checked
        # against real clicks, so some may be false negatives.
        neg_edge_index = random_negative_sampling(
            pos_edge_index,
            num_news,
            num_neg_samples=pos_edge_index.size(1)
        )

        # Compute Loss (BPR)
        loss = bpr_loss(x_dict['user'], x_dict['news'], pos_edge_index, neg_edge_index)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # --- Validation ---
        model.eval()
        with torch.no_grad():
            # Get predictions for validation set (which includes pos and neg edges from LinkSplit)
            val_preds = model(val_data)
            val_labels = val_data['user', 'clicks', 'news'].edge_label

            auc = roc_auc_score(val_labels.cpu().numpy(), val_preds.sigmoid().cpu().numpy())

        print(f"Epoch {epoch:02d} | Loss: {loss.item():.4f} | Val AUC: {auc:.4f}")

    print("--- 6. Final Evaluation (Test Set) ---")
    model.eval()
    with torch.no_grad():
        test_preds = model(test_data)
        test_labels = test_data['user', 'clicks', 'news'].edge_label
        auc = roc_auc_score(test_labels.cpu().numpy(), test_preds.sigmoid().cpu().numpy())

    print(f"Test AUC: {auc:.4f}")

    # --- Top-K recommendations for a few users ---
    print("--- 7. Generating Top-K Recommendations ---")
    k = 10
    # Inference only: skip autograd bookkeeping for the full embedding pass.
    with torch.no_grad():
        user_indices = test_data['user'].node_id[:10] # Sample 10 users
        embeddings = model.get_embeddings(test_data)

        # Score only the sampled users against every news item
        # (a full user x news matrix would be large).
        sample_user_emb = embeddings['user'][user_indices]
        scores = torch.matmul(sample_user_emb, embeddings['news'].t())

        # topk raises when k exceeds the number of candidates.
        _, top_indices = torch.topk(scores, k=min(k, scores.size(1)))

    # load_mind_data does not return the news table, so read it again to map
    # integer node index -> news id -> title.
    news_df = pd.read_csv(
        os.path.join(CONFIG['data_dir'], 'news.tsv'),
        sep='\t',
        header=None,
        names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
    )
    int_to_newsid = {idx: news_id for news_id, idx in news_map.items()}
    newsid_to_title = pd.Series(news_df.title.values, index=news_df.news_id).to_dict()

    print(f"\nTop {k} Recommendations for sample users:")
    for user_idx, recs in zip(user_indices.tolist(), top_indices.tolist()):
        print(f"\nUser Index: {user_idx}")
        for news_idx in recs:
            nid = int_to_newsid.get(news_idx, "Unknown")
            # str() guards against a missing (NaN) title, which cannot be sliced.
            title = str(newsid_to_title.get(nid, "Unknown Title"))
            print(f" - [{nid}] {title[:50]}...")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()