<a href="https://colab.research.google.com/github/vedant75/News-Recommender-System/blob/main/MIND_GNN_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torch-geometric
# !pip install torch-scatter

Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


In [5]:
!pip install torch-sparse

Collecting torch-sparse
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-sparse
  Building wheel for torch-sparse (setup.py) ... [?25l[?25hdone
  Created wheel for torch-sparse: filename=torch_sparse-0.6.18-cp312-cp312-linux_x86_64.whl size=3039787 sha256=7752a64e3ab178ddc0d388dc10dd9f67ce50c0503e6d24732cc9e42afbfc4b73
  Stored in directory: /root/.cache/pip/wheels/71/fa/21/bd1d78ce1629aec4ecc924a63b82f6949dda484b6321eac6f2
Successfully built torch-sparse
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18


In [3]:
import os
import requests
import zipfile
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, GATConv, LGConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from sklearn.metrics import roc_auc_score
from torch.nn import ModuleList

# ==========================================
# 1. Configuration & Helper Functions
# ==========================================
CONFIG = {
    'dataset_url': 'https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip',
    'data_dir': './mind_data',
    'hidden_channels': 64,
    'num_layers': 2,
    'epochs': 20,         # Increased to 20 to match blog convergence
    'batch_size': 1024,   # Batch size for negative sampling
    'lr': 0.001,
    'weight_decay': 1e-5,
    'model_type': 'GAT',  # Blog winner: GAT
    'heads': 4,           # Blog specified 4 heads for GAT
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu')
}

def download_and_extract(url, extract_to):
    """Downloads and extracts the MIND-small dataset."""
    if not os.path.exists(extract_to):
        os.makedirs(extract_to)

    zip_path = os.path.join(extract_to, 'MINDsmall_train.zip')

    # Check if files already exist to avoid re-downloading
    if os.path.exists(os.path.join(extract_to, 'behaviors.tsv')) and \
       os.path.exists(os.path.join(extract_to, 'news.tsv')):
        print("Dataset already exists. Skipping download.")
        return

    print(f"Downloading dataset from {url}...")
    response = requests.get(url, stream=True)
    # Check if the download was successful
    response.raise_for_status()
    with open(zip_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print("Download and extraction complete.")

# ==========================================
# 2. Data Preprocessing
# ==========================================
def load_mind_data(data_dir):
    print("Loading raw data...")
    # Load News Data
    news_df = pd.read_csv(
        os.path.join(data_dir, 'news.tsv'),
        sep='\t',
        header=None,
        names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
    )

    # Load Behaviors Data
    behaviors_df = pd.read_csv(
        os.path.join(data_dir, 'behaviors.tsv'),
        sep='\t',
        header=None,
        names=['impression_id', 'user_id', 'time', 'history', 'impressions']
    )

    # --- Mappings ---
    # Map News IDs to Integers
    unique_news = news_df['news_id'].unique()
    news_id_map = {nid: i for i, nid in enumerate(unique_news)}
    num_news = len(unique_news)

    # Map User IDs to Integers
    unique_users = behaviors_df['user_id'].unique()
    user_id_map = {uid: i for i, uid in enumerate(unique_users)}
    num_users = len(unique_users)

    # Map Categories to Integers (for features)
    unique_categories = news_df['category'].unique()
    category_map = {cat: i for i, cat in enumerate(unique_categories)}
    num_categories = len(unique_categories)

    # --- Node Features ---
    # News Features: One-Hot Encoding of Category
    news_features = torch.zeros((num_news, num_categories))
    for _, row in news_df.iterrows():
        if row['news_id'] in news_id_map:
            nid_idx = news_id_map[row['news_id']]
            cat_idx = category_map[row['category']]
            news_features[nid_idx, cat_idx] = 1.0

    # --- Edge Construction ---
    src = []
    dst = []

    print("Processing interaction history to build graph...")
    behaviors_df = behaviors_df.dropna(subset=['history'])

    for _, row in behaviors_df.iterrows():
        u_idx = user_id_map[row['user_id']]
        history_str = str(row['history'])
        clicked_news_ids = history_str.split()

        for news_id in clicked_news_ids:
            if news_id in news_id_map:
                n_idx = news_id_map[news_id]
                src.append(u_idx)
                dst.append(n_idx)

    edge_index = torch.tensor([src, dst], dtype=torch.long)

    # --- Create HeteroData Object ---
    data = HeteroData()
    data['user'].num_nodes = num_users
    data['user'].node_id = torch.arange(num_users)
    data['news'].num_nodes = num_news
    data['news'].x = news_features
    data['news'].node_id = torch.arange(num_news)
    data['user', 'clicks', 'news'].edge_index = edge_index

    return data, num_users, num_news, user_id_map, news_id_map

# ==========================================
# 3. Model Architecture
# ==========================================

class GNN(nn.Module):
    def __init__(self, hidden_channels, num_layers, conv_type):
        super().__init__()
        self.num_layers = num_layers
        self.conv_type = conv_type

        if conv_type == "SAGE":
            self.convs = ModuleList(SAGEConv(hidden_channels, hidden_channels) for _ in range(num_layers))
        elif conv_type == "GAT":
            # Match blog: heads=4, concat=False (averaging heads usually better for embedding size stability)
            self.convs = ModuleList(GATConv(hidden_channels, hidden_channels, heads=CONFIG['heads'], concat=False, add_self_loops=False) for _ in range(num_layers))
        elif conv_type == "LG":
            self.convs = ModuleList(LGConv() for _ in range(num_layers))

    def forward(self, x_dict, edge_index_dict):
        if self.conv_type == "LG":
            # Manual Homogeneous conversion for LightGCN
            x_user = x_dict['user']
            x_news = x_dict['news']
            x = torch.cat([x_user, x_news], dim=0)

            edge_index_user_news = edge_index_dict[('user', 'clicks', 'news')]
            num_users = x_user.size(0)
            src = edge_index_user_news[0]
            dst = edge_index_user_news[1] + num_users

            edge_index = torch.cat([
                torch.stack([src, dst], dim=0),
                torch.stack([dst, src], dim=0)
            ], dim=1)

            for i in range(self.num_layers):
                x = self.convs[i](x, edge_index)

            x_user_out = x[:num_users]
            x_news_out = x[num_users:]
            return {'user': x_user_out, 'news': x_news_out}
        else:
            # Placeholder for to_hetero (not called directly)
            pass

class StandardGNN(nn.Module):
    """Standard GNN Wrapper for SAGE/GAT to be converted by to_hetero"""
    def __init__(self, hidden_channels, num_layers, conv_type):
        super().__init__()
        self.conv_type = conv_type
        self.num_layers = num_layers
        if conv_type == "SAGE":
            self.convs = ModuleList(SAGEConv(hidden_channels, hidden_channels) for _ in range(num_layers))
        elif conv_type == "GAT":
             self.convs = ModuleList(GATConv(hidden_channels, hidden_channels, heads=CONFIG['heads'], concat=False, add_self_loops=False) for _ in range(num_layers))

    def forward(self, x, edge_index):
        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            if i < self.num_layers - 1:
                x = F.relu(x)
        return x

class Classifier(nn.Module):
    def forward(self, x_user: torch.Tensor, x_news: torch.Tensor, edge_label_index: torch.Tensor) -> torch.Tensor:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_news = x_news[edge_label_index[1]]
        return (edge_feat_user * edge_feat_news).sum(dim=-1)

class Model(nn.Module):
    def __init__(self, hidden_channels, data, num_layers=2, conv_type="SAGE"):
        super().__init__()
        self.conv_type = conv_type

        self.news_lin = nn.Linear(data['news'].x.size(1), hidden_channels)
        self.news_emb = nn.Embedding(data['news'].num_nodes, hidden_channels)
        self.user_emb = nn.Embedding(data['user'].num_nodes, hidden_channels)

        if conv_type == "LG":
            self.gnn = GNN(hidden_channels, num_layers, conv_type)
        else:
            base_gnn = StandardGNN(hidden_channels, num_layers, conv_type)
            self.gnn = to_hetero(base_gnn, data.metadata(), aggr='mean')

        self.classifier = Classifier()

    def forward(self, data):
        x_dict = self.get_embeddings(data)
        pred = self.classifier(
            x_dict['user'],
            x_dict['news'],
            data['user', 'clicks', 'news'].edge_label_index
        )
        return pred

    def get_embeddings(self, data):
        x_news = self.news_lin(data['news'].x) + self.news_emb(data['news'].node_id)
        x_user = self.user_emb(data['user'].node_id)
        x_dict = {'user': x_user, 'news': x_news}

        # Apply GNN
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        return x_dict

# Updated BPR Loss to match blog (Added regularization)
def bpr_loss(user_emb, news_emb, pos_edge_index, neg_edge_index, lambda_reg=1e-4):
    pos_scores = (user_emb[pos_edge_index[0]] * news_emb[pos_edge_index[1]]).sum(dim=-1)

    min_len = min(pos_scores.size(0), neg_edge_index.size(1))
    pos_scores = pos_scores[:min_len]
    neg_edge_index = neg_edge_index[:, :min_len]

    neg_scores = (user_emb[neg_edge_index[0]] * news_emb[neg_edge_index[1]]).sum(dim=-1)

    # BPR Loss + Regularization
    loss_bpr = -torch.log(torch.sigmoid(pos_scores - neg_scores) + 1e-15).mean()
    loss_reg = lambda_reg * (user_emb[pos_edge_index[0]].norm(2).pow(2) +
                             news_emb[pos_edge_index[1]].norm(2).pow(2)) / float(min_len)

    return loss_bpr + loss_reg

def random_negative_sampling(edge_index, num_news, num_neg_samples=None):
    users = edge_index[0]
    if num_neg_samples is None:
        num_neg_samples = users.size(0)
    neg_news = torch.randint(0, num_news, (num_neg_samples,), device=edge_index.device)
    neg_edge_index = torch.stack([users, neg_news], dim=0)
    return neg_edge_index

# ==========================================
# 4. Main Execution
# ==========================================
def main():
    print("--- 1. Downloading Data ---")
    download_and_extract(CONFIG['dataset_url'], CONFIG['data_dir'])

    print("--- 2. Processing Data ---")
    data, num_users, num_news, user_map, news_map = load_mind_data(CONFIG['data_dir'])

    print("--- 3. Splitting Data (Train/Val/Test) ---")
    transform = RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        neg_sampling_ratio=1.0,
        add_negative_train_samples=False,
        edge_types=[('user', 'clicks', 'news')],
        rev_edge_types=[('news', 'rev_clicks', 'user')]
    )

    train_data, val_data, test_data = transform(data)

    device = CONFIG['device']
    train_data = train_data.to(device)
    val_data = val_data.to(device)
    test_data = test_data.to(device)

    print(f"--- 4. Initializing {CONFIG['model_type']} Model (Heads={CONFIG['heads']}) ---")
    model = Model(
        hidden_channels=CONFIG['hidden_channels'],
        data=data,
        num_layers=CONFIG['num_layers'],
        conv_type=CONFIG['model_type']
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['lr'], weight_decay=CONFIG['weight_decay'])

    print(f"--- 5. Training Loop ({CONFIG['epochs']} Epochs) ---")
    for epoch in range(1, CONFIG['epochs'] + 1):
        model.train()

        x_dict = model.get_embeddings(train_data)
        pos_edge_index = train_data['user', 'clicks', 'news'].edge_label_index
        neg_edge_index = random_negative_sampling(
            pos_edge_index,
            num_news,
            num_neg_samples=pos_edge_index.size(1)
        )

        loss = bpr_loss(x_dict['user'], x_dict['news'], pos_edge_index, neg_edge_index)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            val_preds = model(val_data)
            val_labels = val_data['user', 'clicks', 'news'].edge_label
            auc = roc_auc_score(val_labels.cpu().numpy(), val_preds.sigmoid().cpu().numpy())

        print(f"Epoch {epoch:02d} | Loss: {loss.item():.4f} | Val AUC: {auc:.4f}")

    print("--- 6. Final Evaluation (Test Set) ---")
    model.eval()
    with torch.no_grad():
        test_preds = model(test_data)
        test_labels = test_data['user', 'clicks', 'news'].edge_label
        auc = roc_auc_score(test_labels.cpu().numpy(), test_preds.sigmoid().cpu().numpy())

    print(f"Test AUC: {auc:.4f}")

    # --- Recall@1000 Calculation (Batched) ---
    print("--- 7. Generating Top-K Recommendations (Recall@1000) ---")
    k = 1000
    # Calculate for a subset of users to demonstrate functionality
    # Calculating for ALL 50k users might timeout on free Colab
    num_eval_users = 100
    user_indices = test_data['user'].node_id[:num_eval_users]

    embeddings = model.get_embeddings(test_data)
    user_emb = embeddings['user'][user_indices]
    news_emb = embeddings['news']

    recall_hits = 0

    # Batch processing for similarity to avoid OOM
    batch_size = 10 # Process 10 users at a time
    print(f"Calculating Recall@{k} for {num_eval_users} users...")

    for i in range(0, num_eval_users, batch_size):
        end = min(i + batch_size, num_eval_users)
        batch_user_emb = user_emb[i:end]

        # Matrix multiplication: [Batch, Hidden] @ [Hidden, News] -> [Batch, News]
        scores = torch.matmul(batch_user_emb, news_emb.t())

        # Get Top K
        _, top_indices = torch.topk(scores, k=k)

        # Check against ground truth (Test Edges)
        # Note: In a real rigorous recall test, we must group ground truth by user.
        # This is a simplified check against the test edge set provided by LinkSplit
        # For a full Recall metric, we need a dictionary of {user: [ground_truth_items]}
        # We will build that quickly for these users

        current_user_ids = user_indices[i:end].cpu().numpy()

        # Filter test edges for these users
        test_edges = test_data['user', 'clicks', 'news'].edge_label_index
        test_labels = test_data['user', 'clicks', 'news'].edge_label
        # We only care about positive edges in test set
        pos_mask = test_labels == 1
        pos_test_edges = test_edges[:, pos_mask]

        for idx, u_id in enumerate(current_user_ids):
            # Get true items for this user in test set
            # (Note: This search is slow, in production use a pre-built dict)
            mask = pos_test_edges[0] == u_id
            true_items = pos_test_edges[1][mask].cpu().numpy()

            if len(true_items) == 0:
                continue

            # Check overlap
            recs = top_indices[idx].cpu().numpy()
            hits = np.intersect1d(true_items, recs)
            if len(hits) > 0:
                recall_hits += 1

    recall_score = recall_hits / num_eval_users
    print(f"Recall@{k} (Sampled): {recall_score:.4f}")

    # Show actual recommendations for the first user in the batch
    news_df = pd.read_csv(
        os.path.join(CONFIG['data_dir'], 'news.tsv'),
        sep='\t',
        header=None,
        names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
    )
    int_to_newsid = {v: k for k, v in news_map.items()}
    newsid_to_title = pd.Series(news_df.title.values, index=news_df.news_id).to_dict()

    print(f"\nTop 10 Recommendations for User {user_indices[0].item()}:")
    recs = top_indices[0].cpu().numpy()[:10]
    for news_idx in recs:
        nid = int_to_newsid.get(news_idx, "Unknown")
        title = newsid_to_title.get(nid, "Unknown Title")
        print(f" - [{nid}] {title[:60]}...")

if __name__ == "__main__":
    main()

--- 1. Downloading Data ---
Downloading dataset from https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip...


HTTPError: 409 Client Error: Public access is not permitted on this storage account. for url: https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip