In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import __main__
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

In [2]:
# Load news and behavior data
news_columns = ["News ID", "Category", "SubCategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities"]
news_df = pd.read_csv('./MINDsmall_train/news.tsv', sep='\t', names=news_columns, header=0).fillna('')
news_df['text'] = news_df['Title'] + " [SEP] " + news_df['Abstract']  # Combine title and abstract

behaviors_df = pd.read_csv('./MINDsmall_train/behaviors.tsv', sep='\t', 
                          names=["Impression ID", "User ID", "Time", "History", "Impressions"], header=0)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 64

def encode_text(text):
    return tokenizer.encode(text, truncation=True, padding='max_length', max_length=MAX_LEN)

news_df['tokens'] = news_df['text'].apply(encode_text)
news_dict = dict(zip(news_df['News ID'], news_df['tokens']))

In [3]:
news_df.head(3)

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities,text,tokens
0,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",50 Worst Habits For Belly Fat [SEP] These seem...,"[101, 2753, 5409, 14243, 2005, 7579, 6638, 102..."
1,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",The Cost of Trump's Aid Freeze in the Trenches...,"[101, 1996, 3465, 1997, 8398, 1005, 1055, 4681..."
2,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",I Was An NBA Wife. Here's How It Affected My M...,"[101, 1045, 2001, 2019, 6452, 2564, 1012, 2182..."


In [4]:
behaviors_df.head(3)

Unnamed: 0,Impression ID,User ID,Time,History,Impressions
0,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
1,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
2,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0


In [5]:
# Dataset Preparation
def parse_behaviors(behaviors):
    samples = []
    for _, row in behaviors.iterrows():
        history = row['History'].split() if pd.notna(row['History']) else []
        impressions = row['Impressions'].split()
        for impression in impressions:
            nid, label = impression.split('-')
            samples.append((history, nid, int(label)))
    return samples

samples = parse_behaviors(behaviors_df)


In [6]:
# First define NewsDataset in its own cell (or at top of script)
class NewsDataset(Dataset):
    def __init__(self, samples, news_dict):
        self.samples = samples
        self.news_dict = news_dict

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        history_ids, candidate_id, label = self.samples[idx]
        history_tokens = [self.news_dict[nid] for nid in history_ids if nid in self.news_dict]
        candidate_tokens = self.news_dict.get(candidate_id, [0]*MAX_LEN)
        
        max_history_len = 50
        if len(history_tokens) > max_history_len:
            history_tokens = history_tokens[-max_history_len:]
        else:
            history_tokens.extend([[0]*MAX_LEN]*(max_history_len - len(history_tokens)))

        return {
            'history': torch.tensor(history_tokens),
            'candidate': torch.tensor(candidate_tokens),
            'label': torch.tensor(label, dtype=torch.float)
        }


In [7]:
# Simplified model
class NRMS(nn.Module):
    def __init__(self, embedding_dim=300):
        super().__init__()
        self.embedding = nn.Embedding(tokenizer.vocab_size, embedding_dim, padding_idx=0)
        self.news_encoder = nn.Sequential(
            nn.Linear(embedding_dim, embedding_dim),
            nn.ReLU(),
            nn.LayerNorm(embedding_dim)
        )
        self.final_layer = nn.Sequential(
            nn.Linear(embedding_dim, 1),
            nn.Sigmoid()
        )
    
    def encode_news(self, x):
        if len(x.shape) == 3:  # History (batch_size, hist_len, seq_len)
            batch_size, hist_len, seq_len = x.shape
            x = x.view(-1, seq_len)  # (batch_size*hist_len, seq_len)
            x = self.embedding(x)  # (batch_size*hist_len, seq_len, embed_dim)
            x = x.mean(dim=1)  # (batch_size*hist_len, embed_dim)
            x = x.view(batch_size, hist_len, -1)  # (batch_size, hist_len, embed_dim)
            x = self.news_encoder(x)  # (batch_size, hist_len, embed_dim)
            x = x.mean(dim=1)  # (batch_size, embed_dim)
        else:  # Candidate (batch_size, seq_len)
            x = self.embedding(x)  # (batch_size, seq_len, embed_dim)
            x = x.mean(dim=1)  # (batch_size, embed_dim)
            x = self.news_encoder(x)  # (batch_size, embed_dim)
        return x
    
    def forward(self, history, candidate):
        user_vec = self.encode_news(history)  # (batch_size, embed_dim)
        news_vec = self.encode_news(candidate)  # (batch_size, embed_dim)
        interaction = user_vec * news_vec  # (batch_size, embed_dim)
        return self.final_layer(interaction).squeeze()  # (batch_size,)


In [8]:
def calculate_mrr(y_true, y_pred):
    """Calculate MRR using scikit-learn style implementation"""
    # For binary classification
    if len(y_true.shape) == 1:
        y_true = y_true.reshape(-1, 1)
        y_pred = y_pred.reshape(-1, 1)
    
    # Get ranking positions of positive samples
    ranking_positions = []
    for i in range(len(y_true)):
        if np.sum(y_true[i]) > 0:  # Only calculate for queries with positive samples
            pred_rank = np.argsort(-y_pred[i])  # Sort descending
            pos_rank = np.where(pred_rank == np.argmax(y_true[i]))[0][0] + 1
            ranking_positions.append(1.0 / pos_rank)
    
    return np.mean(ranking_positions) if ranking_positions else 0.0

def calculate_ndcg(y_true, y_pred, k):
    """Calculate NDCG@k using scikit-learn style implementation"""
    # For binary classification
    if len(y_true.shape) == 1:
        y_true = y_true.reshape(-1, 1)
        y_pred = y_pred.reshape(-1, 1)
    
    ndcg_scores = []
    for i in range(len(y_true)):
        # Sort predictions and get top k
        order = np.argsort(-y_pred[i])[:k]
        # Calculate DCG
        dcg = np.sum(y_true[i][order] / np.log2(np.arange(2, k+2)))
        # Calculate IDCG
        ideal_order = np.argsort(-y_true[i])[:k]
        idcg = np.sum(y_true[i][ideal_order] / np.log2(np.arange(2, k+2)))
        ndcg_scores.append(dcg / idcg if idcg > 0 else 0.0)
    
    return np.mean(ndcg_scores)

def evaluate(model, loader):
    model.eval()
    all_labels, all_preds = [], []
    
    with torch.no_grad():
        for batch in loader:
            outputs = model(batch['history'], batch['candidate'])
            all_labels.append(batch['label'].cpu().numpy())
            all_preds.append(outputs.cpu().numpy())
    
    y_true = np.concatenate(all_labels)
    y_pred = np.concatenate(all_preds)
    
    # Handle binary classification case
    if len(y_true.shape) == 1:
        y_true = y_true.reshape(-1, 1)
        y_pred = y_pred.reshape(-1, 1)

    return (
        roc_auc_score(y_true, y_pred),
        calculate_mrr(y_true, y_pred),
        calculate_ndcg(y_true, y_pred, 5),
        calculate_ndcg(y_true, y_pred, 10)
    )

In [9]:
# Check label distribution
print(f"Positive samples: {sum(samples[i][2] for i in range(len(samples)))} out of {len(samples)}")

# Verify tokenization
sample_idx = random.randint(0, len(samples)-1)
print(f"Sample tokens: {news_dict[samples[sample_idx][1]]}")

Positive samples: 236343 out of 5843442
Sample tokens: [101, 2662, 10558, 6869, 2000, 1016, 11573, 2273, 2915, 2007, 12563, 1024, 3189, 102, 2048, 11573, 2273, 2542, 2012, 2019, 4372, 26468, 3672, 2379, 2624, 3799, 2020, 2915, 2007, 12563, 2220, 4465, 1999, 2019, 2886, 2008, 2001, 2025, 6721, 1010, 1037, 3189, 2056, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [10]:
__main__.NewsDataset = NewsDataset  # Critical for Jupyter

device = 'cpu'
model = NRMS().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.BCELoss()

# Create dataset and loader
dataset = NewsDataset(random.sample(samples, int(len(samples)*0.2)), news_dict)
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)
print(len(tqdm(loader)))
for epoch in range(2):
    model.train()
    total_loss = 0
    for i, batch in enumerate(tqdm(loader)):
        history = batch['history'].to(device)
        candidate = batch['candidate'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(history, candidate)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        if i % 100 == 0:
            print(f"Batch {i}: Loss {total_loss/(i+1):.4f}")
    
    auc, mrr, ndcg5, ndcg10 = evaluate(model, loader)
    print(f"\nEpoch {epoch+1} Metrics:")
    print(f"AUC: {auc:.4f} | MRR: {mrr:.4f} | NDCG@5: {ndcg5:.4f} | NDCG@10: {ndcg10:.4f}")
    print("-"*50)

  0%|          | 0/36522 [00:00<?, ?it/s]

36522


  0%|          | 0/36522 [00:00<?, ?it/s]

Batch 0: Loss 0.7283
Batch 100: Loss 0.2018
Batch 200: Loss 0.1897
Batch 300: Loss 0.1862
Batch 400: Loss 0.1853
Batch 500: Loss 0.1842
Batch 600: Loss 0.1803
Batch 700: Loss 0.1788
Batch 800: Loss 0.1771
Batch 900: Loss 0.1762
Batch 1000: Loss 0.1747
Batch 1100: Loss 0.1741
Batch 1200: Loss 0.1728
Batch 1300: Loss 0.1735
Batch 1400: Loss 0.1734
Batch 1500: Loss 0.1733
Batch 1600: Loss 0.1718
Batch 1700: Loss 0.1703
Batch 1800: Loss 0.1709
Batch 1900: Loss 0.1706
Batch 2000: Loss 0.1704
Batch 2100: Loss 0.1701
Batch 2200: Loss 0.1696
Batch 2300: Loss 0.1691
Batch 2400: Loss 0.1688
Batch 2500: Loss 0.1688
Batch 2600: Loss 0.1683
Batch 2700: Loss 0.1678
Batch 2800: Loss 0.1673
Batch 2900: Loss 0.1674
Batch 3000: Loss 0.1673
Batch 3100: Loss 0.1672
Batch 3200: Loss 0.1665
Batch 3300: Loss 0.1666
Batch 3400: Loss 0.1660
Batch 3500: Loss 0.1663
Batch 3600: Loss 0.1664
Batch 3700: Loss 0.1659
Batch 3800: Loss 0.1663
Batch 3900: Loss 0.1665
Batch 4000: Loss 0.1668
Batch 4100: Loss 0.1669
Batc

  0%|          | 0/36522 [00:00<?, ?it/s]

Batch 0: Loss 0.2341
Batch 100: Loss 0.1655
Batch 200: Loss 0.1658
Batch 300: Loss 0.1667
Batch 400: Loss 0.1636
Batch 500: Loss 0.1623
Batch 600: Loss 0.1624
Batch 700: Loss 0.1604
Batch 800: Loss 0.1579
Batch 900: Loss 0.1580
Batch 1000: Loss 0.1570
Batch 1100: Loss 0.1569
Batch 1200: Loss 0.1573
Batch 1300: Loss 0.1572
Batch 1400: Loss 0.1569
Batch 1500: Loss 0.1566
Batch 1600: Loss 0.1567
Batch 1700: Loss 0.1567
Batch 1800: Loss 0.1572
Batch 1900: Loss 0.1575
Batch 2000: Loss 0.1575
Batch 2100: Loss 0.1571
Batch 2200: Loss 0.1572
Batch 2300: Loss 0.1574
Batch 2400: Loss 0.1575
Batch 2500: Loss 0.1577
Batch 2600: Loss 0.1578
Batch 2700: Loss 0.1568
Batch 2800: Loss 0.1562
Batch 2900: Loss 0.1563
Batch 3000: Loss 0.1557
Batch 3100: Loss 0.1555
Batch 3200: Loss 0.1559
Batch 3300: Loss 0.1565
Batch 3400: Loss 0.1568
Batch 3500: Loss 0.1569
Batch 3600: Loss 0.1568
Batch 3700: Loss 0.1567
Batch 3800: Loss 0.1566
Batch 3900: Loss 0.1562
Batch 4000: Loss 0.1564
Batch 4100: Loss 0.1569
Batc

In [26]:
# Save the entire model
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': total_loss/len(loader),
}, 'nrms_model_checkpoint.pth')

# Additionally save just the model weights (smaller file)
torch.save(model.state_dict(), 'nrms_model_weights.pth')

print("Model saved successfully!")

Model saved successfully!


In [28]:
import torch
from transformers import BertTokenizer
from torch import nn

MAX_LEN = 64

def load_model(checkpoint_path='nrms_model_checkpoint.pth'):
    # Initialize model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = NRMS()
    
    # Load saved state
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    
    return model, tokenizer

In [None]:
from model_utils import load_model

# Load the model
model, tokenizer = load_model('nrms_model_checkpoint.pth')
model.eval()  # Set to evaluation mode

# Example usage
def predict(history_news_ids, candidate_news_id):
    # Tokenize input
    history_tokens = [news_dict.get(nid, [0]*MAX_LEN) for nid in history_news_ids]
    candidate_tokens = news_dict.get(candidate_news_id, [0]*MAX_LEN)
    
    # Pad/truncate history
    if len(history_tokens) > 50:
        history_tokens = history_tokens[-50:]
    else:
        history_tokens.extend([[0]*MAX_LEN]*(50 - len(history_tokens)))
    
    # Convert to tensors
    history_tensor = torch.tensor([history_tokens])
    candidate_tensor = torch.tensor([candidate_tokens])
    
    # Predict
    with torch.no_grad():
        score = model(history_tensor, candidate_tensor).item()
    
    return score

In [46]:
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

device = 'cpu'

def compute_dcg(relevances, k):
    relevances = np.array(relevances)[:k]
    if len(relevances) > 0:
        return np.sum(relevances / np.log2(np.arange(2, len(relevances)+2)))
    return 0.0

def evaluate_nrms(model, behaviors_df, news_dict, show_progress=True):
    auc_scores, mrr_scores, ndcg5_scores, ndcg10_scores = [], [], [], []
    
    # Create progress bar if requested
    iterator = behaviors_df.iterrows()
    if show_progress:
        iterator = tqdm(iterator, total=len(behaviors_df), desc="Evaluating")
    
    for _, row in iterator:
        # Parse impressions
        impressions = []
        for item in row['Impressions'].strip().split():
            parts = item.split('-')
            if len(parts) == 2:
                impressions.append((parts[0], int(parts[1])))
        
        if not impressions:
            continue
            
        news_ids, labels = zip(*impressions)
        labels = np.array(labels)
        
        # Get model predictions
        with torch.no_grad():
            scores = []
            for nid in news_ids:
                # Create dummy history if empty
                history = row['History'].split() if pd.notna(row['History']) else [nid]
                
                # Prepare input tensors
                history_tokens = [news_dict[nid] for nid in history if nid in news_dict]
                if not history_tokens:
                    history_tokens = [[0]*MAX_LEN]
                    
                candidate_tokens = news_dict.get(nid, [0]*MAX_LEN)
                
                # Pad/truncate history
                if len(history_tokens) > 50:
                    history_tokens = history_tokens[-50:]
                else:
                    history_tokens.extend([[0]*MAX_LEN]*(50 - len(history_tokens)))
                
                # Convert to tensors
                history_tensor = torch.tensor([history_tokens]).to(device)
                candidate_tensor = torch.tensor([candidate_tokens]).to(device)
                
                # Get prediction
                score = model(history_tensor, candidate_tensor).item()
                scores.append(score)
        
        scores = np.array(scores)
        
        # Calculate metrics
        if 0 < sum(labels) < len(labels):
            auc_scores.append(roc_auc_score(labels, scores))
        
        order = np.argsort(-scores)
        
        # MRR
        for rank, idx in enumerate(order):
            if labels[idx] == 1:
                mrr_scores.append(1.0/(rank+1))
                break
        else:
            mrr_scores.append(0.0)
        
        # NDCG
        sorted_labels = labels[order]
        ideal_labels = sorted(labels, reverse=True)
        
        dcg5 = compute_dcg(sorted_labels, 5)
        dcg10 = compute_dcg(sorted_labels, 10)
        idcg5 = compute_dcg(ideal_labels, 5)
        idcg10 = compute_dcg(ideal_labels, 10)
        
        ndcg5_scores.append(dcg5/idcg5 if idcg5 > 0 else 0.0)
        ndcg10_scores.append(dcg10/idcg10 if idcg10 > 0 else 0.0)
    
    # Calculate final metrics
    metrics = (
        np.mean(auc_scores) if auc_scores else 0,
        np.mean(mrr_scores) if mrr_scores else 0,
        np.mean(ndcg5_scores) if ndcg5_scores else 0,
        np.mean(ndcg10_scores) if ndcg10_scores else 0
    )
    
    return metrics

# Load the model
model.eval()  # Set to evaluation mode

# Sample a tenth of the dataset for evaluation
evaluation_subset = behaviors_df.sample(frac=1, random_state=42)  # random_state for reproducibility

print(f"Evaluating on {len(evaluation_subset)} samples (100% of full dataset)")
nrms_metrics = evaluate_nrms(model, evaluation_subset, news_dict)

print("\nFinal Evaluation Metrics (100% subset):")
print(f"AUC: {nrms_metrics[0]:.4f} | MRR: {nrms_metrics[1]:.4f} | NDCG@5: {nrms_metrics[2]:.4f} | NDCG@10: {nrms_metrics[3]:.4f}")

Evaluating on 156964 samples (100% of full dataset)


Evaluating: 100%|█████████████████████| 156964/156964 [1:47:56<00:00, 24.24it/s]



Final Evaluation Metrics (100% subset):
AUC: 0.6825 | MRR: 0.3923 | NDCG@5: 0.3686 | NDCG@10: 0.4250
