In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [1]:
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import random
import logging
import os
import torch.optim as optim
import torch.distributed as dist
from torch.utils.data import DataLoader
from pathlib import Path



In [79]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class Args:
    nGPU: int = 1
    seed: int = 0
    prepare: bool = True
    mode: str = "train"
    train_data_dir: str = "/home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/MINDsmall_train"
    test_data_dir: str = "/home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/MINDsmall_dev"
    train_abstract_dir: str = '/home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/genAbs0.json'
    # "/content/drive/MyDrive/Colab Notebooks/NewsRecommendation/data/train_gen_abs.json"
    test_abstract_dir: str = '/home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/genAbs0.json'
    # "/content/drive/MyDrive/Colab Notebooks/NewsRecommendation/data/Dev_gen_abs.json"
    model_dir: str = '/content/model'
    batch_size: int = 32
    npratio: int = 4
    enable_gpu: bool = False
    filter_num: int = 3
    log_steps: int = 100
    epochs: int = 5
    lr: float = 0.0003
    num_words_title: int = 20
    num_words_abstract: int = 50
    user_log_length: int = 50
    word_embedding_dim: int = 300
    glove_embedding_path: str = '/home/vinmike/Downloads/glove.840B.300d.txt'
    freeze_embedding: bool = False
    news_dim: int = 300
    news_query_vector_dim: int = 200
    user_query_vector_dim: int = 200
    num_attention_heads: int = 15
    user_log_mask: bool = False
    drop_rate: float = 0.2
    save_steps: int = 10000
    start_epoch: int = 0
    load_ckpt_name: Optional[str] = None
    use_category: bool = True
    use_subcategory: bool = True
    use_abstract: bool = True
    use_custom_abstract: bool = True
    category_emb_dim: int = 100

def parse_args():
  return Args()


**Dataset.py**

In [3]:
from torch.utils.data import IterableDataset, Dataset
import numpy as np
import random


class DatasetTrain(IterableDataset):
    def __init__(self, filename, news_index, news_combined, args):
        super(DatasetTrain).__init__()
        self.filename = filename
        self.news_index = news_index
        self.news_combined = news_combined
        self.args = args

    def trans_to_nindex(self, nids):
        return [self.news_index[i] if i in self.news_index else 0 for i in nids]

    def pad_to_fix_len(self, x, fix_length, padding_front=True, padding_value=0):
        if padding_front:
            pad_x = [padding_value] * (fix_length - len(x)) + x[-fix_length:]
            mask = [0] * (fix_length - len(x)) + [1] * min(fix_length, len(x))
        else:
            pad_x = x[-fix_length:] + [padding_value] * (fix_length - len(x))
            mask = [1] * min(fix_length, len(x)) + [0] * (fix_length - len(x))
        return pad_x, np.array(mask, dtype='float32')

    def line_mapper(self, line):
        line = line.strip().split('\t')
        click_docs = line[3].split()
        sess_pos = line[4].split()
        sess_neg = line[5].split()

        click_docs, log_mask = self.pad_to_fix_len(self.trans_to_nindex(click_docs), self.args.user_log_length)
        user_feature = self.news_combined[click_docs]

        pos = self.trans_to_nindex(sess_pos)
        neg = self.trans_to_nindex(sess_neg)

        label = random.randint(0, self.args.npratio)
        sample_news = neg[:label] + pos + neg[label:]
        sample_news, log_mask = self.pad_to_fix_len(sample_news, self.args.user_log_length)

        news_feature = self.news_combined[sample_news]
        
        return user_feature, news_feature, label

    def __iter__(self):
        file_iter = open(self.filename)
        return map(self.line_mapper, file_iter)



class DatasetTest(DatasetTrain):
    def __init__(self, filename, news_index, news_scoring, args):
        super(DatasetTrain).__init__()
        self.filename = filename
        self.news_index = news_index
        self.news_scoring = news_scoring
        self.args = args

    def line_mapper(self, line):
        line = line.strip().split('\t')
        click_docs = line[3].split()
        click_docs, log_mask = self.pad_to_fix_len(self.trans_to_nindex(click_docs), self.args.user_log_length)
        user_feature = self.news_scoring[click_docs]

        candidate_news = self.trans_to_nindex([i.split('-')[0] for i in line[4].split()])
        labels = np.array([int(i.split('-')[1]) for i in line[4].split()])
        news_feature = self.news_scoring[candidate_news]

        return user_feature, log_mask, news_feature, labels

    def __iter__(self):
        file_iter = open(self.filename)
        return map(self.line_mapper, file_iter)


class NewsDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return self.data.shape[0]


**Metric.py**

In [4]:
from sklearn.metrics import roc_auc_score
import numpy as np


def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2**y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10):
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best


def mrr_score(y_true, y_score):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order)
    rr_score = y_true / (np.arange(len(y_true)) + 1)
    return np.sum(rr_score) / np.sum(y_true)


def ctr_score(y_true, y_score, k=1):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    return np.mean(y_true)

def acc(y_true, y_hat):
    y_hat = torch.argmax(y_hat, dim=-1)
    tot = y_true.shape[0]
    hit = torch.sum(y_true == y_hat)
    return hit.data.float() * 1.0 / tot



**Ultis.py**

In [5]:
import logging
import argparse
import sys

def setuplogger():
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter("[%(levelname)s %(asctime)s] %(message)s")
    handler.setFormatter(formatter)
    root.addHandler(handler)


def dump_args(args):
    for arg in dir(args):
        if not arg.startswith("_"):
            logging.info(f"args[{arg}]={getattr(args, arg)}")

def load_matrix(embedding_file_path, word_dict, word_embedding_dim):
    embedding_matrix = np.zeros(shape=(len(word_dict) + 1, word_embedding_dim))
    have_word = []
    if embedding_file_path is not None:
        with open(embedding_file_path, 'rb') as f:
            while True:
                line = f.readline()
                if len(line) == 0:
                    break
                line = line.split()
                word = line[0].decode()
                if word in word_dict:
                    index = word_dict[word]
                    tp = [float(x) for x in line[1:]]
                    embedding_matrix[index] = np.array(tp)
                    have_word.append(word)
    return embedding_matrix, have_word


def get_checkpoint(directory, ckpt_name):
    ckpt_path = os.path.join(directory, ckpt_name)
    if os.path.exists(ckpt_path):
        return ckpt_path
    else:
        return None


**preprocess.py**

In [6]:
from collections import Counter
from tqdm import tqdm
import numpy as np
from nltk.tokenize import word_tokenize
import json


def update_dict(dict, key, value=None):
    if key not in dict:
        if value is None:
            dict[key] = len(dict) + 1
        else:
            dict[key] = value


def read_custom_abstract(news_file, custom_abstract_dict):
    news = {}
    news_index = {}
    category_dict = {}
    subcategory_dict = {}
    word_cnt = {}

    with open(news_file, 'r', encoding='utf-8') as f:
        for line in f:
            splited = line.strip('\n').split('\t')
            doc_id, category, subcategory, title, abstract, url, entity_title, entity_abstract = splited
            if doc_id in custom_abstract_dict:
                abstract = custom_abstract_dict[doc_id]
            news[doc_id] = [title.split(' '), category, subcategory, abstract.split(' ')]
            news_index[doc_id] = len(news_index) + 1
            for word in title.split(' '):
                if word not in word_cnt:
                    word_cnt[word] = 0
                word_cnt[word] += 1
            for word in abstract.split(' '):
                if word not in word_cnt:
                    word_cnt[word] = 0
                word_cnt[word] += 1
            if category not in category_dict:
                category_dict[category] = len(category_dict) + 1
            if subcategory not in subcategory_dict:
                subcategory_dict[subcategory] = len(subcategory_dict) + 1

    return news, news_index, category_dict, subcategory_dict, word_cnt

def read_news(news_path, abstract_path, args, mode='train'):
    news = {}
    category_dict = {}
    subcategory_dict = {}
    news_index = {}
    word_cnt = Counter()
    if args.use_custom_abstract:
      with open(abstract_path, 'r') as f:
          abs = json.load(f)
    with open(news_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            splited = line.strip('\n').split('\t')
            doc_id, category, subcategory, title, abstract, url, _, _ = splited
            update_dict(news_index, doc_id)

            title = title.lower()
            title = word_tokenize(title, language='english', preserve_line=True)

            update_dict(news, doc_id, [title, category, subcategory, abs[doc_id] if doc_id in abs else abstract])
            if mode == 'train':
                if args.use_category:
                    update_dict(category_dict, category)
                if args.use_subcategory:
                    update_dict(subcategory_dict, subcategory)
                word_cnt.update(title)

    if mode == 'train':
        word = [k for k, v in word_cnt.items() if v > args.filter_num]
        word_dict = {k: v for k, v in zip(word, range(1, len(word) + 1))}
        return news, news_index, category_dict, subcategory_dict, word_dict
    elif mode == 'test':
        return news, news_index
    else:
        assert False, 'Wrong mode!'


def get_doc_input(news, news_index, category_dict, subcategory_dict, word_dict, args):
    news_num = len(news) + 1
    news_title = np.zeros((news_num, args.num_words_title), dtype='int32')
    news_category = np.zeros((news_num, 1), dtype='int32') if args.use_category else None
    news_subcategory = np.zeros((news_num, 1), dtype='int32') if args.use_subcategory else None
    news_abstract = np.zeros((news_num, args.num_words_abstract), dtype='int32') if args.use_abstract else None

    for key in tqdm(news):
        title, category, subcategory, abstract = news[key]
        doc_index = news_index[key]

        for word_id in range(min(args.num_words_title, len(title))):
            if title[word_id] in word_dict:
                news_title[doc_index, word_id] = word_dict[title[word_id]]

        if args.use_category:
            news_category[doc_index, 0] = category_dict[category] if category in category_dict else 0
        if args.use_subcategory:
            news_subcategory[doc_index, 0] = subcategory_dict[subcategory] if subcategory in subcategory_dict else 0
        if args.use_abstract:
            for word_id in range(min(args.num_words_abstract, len(abstract))):
                if abstract[word_id] in word_dict:
                    news_abstract[doc_index, word_id] = word_dict[abstract[word_id]]

    return news_title, news_category, news_subcategory, news_abstract

**prepare_data.py**

In [7]:
import os
from tqdm import tqdm
import random
import logging


def get_sample(all_elements, num_sample):
    if num_sample > len(all_elements):
        return random.sample(all_elements * (num_sample // len(all_elements) + 1), num_sample)
    else:
        return random.sample(all_elements, num_sample)


def prepare_training_data(train_data_dir, nGPU, npratio, seed):
    random.seed(seed)
    behaviors = []

    behavior_file_path = os.path.join(train_data_dir, 'behaviors.tsv')
    with open(behavior_file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            iid, uid, time, history, imp = line.strip().split('\t')
            impressions = [x.split('-') for x in imp.split(' ')]
            pos, neg = [], []
            for news_ID, label in impressions:
                if label == '0':
                    neg.append(news_ID)
                elif label == '1':
                    pos.append(news_ID)
            if len(pos) == 0 or len(neg) == 0:
                continue
            for pos_id in pos:
                neg_candidate = get_sample(neg, npratio)
                neg_str = ' '.join(neg_candidate)
                new_line = '\t'.join([iid, uid, time, history, pos_id, neg_str]) + '\n'
                behaviors.append(new_line)

    random.shuffle(behaviors)

    behaviors_per_file = [[] for _ in range(nGPU)]
    for i, line in enumerate(behaviors):
        behaviors_per_file[i % nGPU].append(line)

    logging.info('Writing files...')
    for i in range(nGPU):
        processed_file_path = os.path.join(train_data_dir, f'behaviors_np{npratio}_{i}.tsv')
        with open(processed_file_path, 'w') as f:
            f.writelines(behaviors_per_file[i])

    return len(behaviors)


def prepare_testing_data(test_data_dir, nGPU):
    behaviors = [[] for _ in range(nGPU)]

    behavior_file_path = os.path.join(test_data_dir, 'behaviors.tsv')
    with open(behavior_file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f)):
            behaviors[i % nGPU].append(line)

    logging.info('Writing files...')
    for i in range(nGPU):
        processed_file_path = os.path.join(test_data_dir, f'behaviors_{i}.tsv')
        with open(processed_file_path, 'w') as f:
            f.writelines(behaviors[i])

    return sum([len(x) for x in behaviors])


In [80]:

    import subprocess
    setuplogger()
    args = parse_args()
    dump_args(args)
    random.seed(args.seed)





[INFO 2025-03-03 14:19:05,978] args[batch_size]=32
[INFO 2025-03-03 14:19:05,978] args[batch_size]=32
[INFO 2025-03-03 14:19:05,978] args[batch_size]=32
[INFO 2025-03-03 14:19:05,978] args[batch_size]=32
[INFO 2025-03-03 14:19:05,978] args[batch_size]=32
[INFO 2025-03-03 14:19:05,978] args[batch_size]=32
[INFO 2025-03-03 14:19:05,978] args[batch_size]=32
[INFO 2025-03-03 14:19:05,978] args[batch_size]=32
[INFO 2025-03-03 14:19:05,986] args[category_emb_dim]=100
[INFO 2025-03-03 14:19:05,986] args[category_emb_dim]=100
[INFO 2025-03-03 14:19:05,986] args[category_emb_dim]=100
[INFO 2025-03-03 14:19:05,986] args[category_emb_dim]=100
[INFO 2025-03-03 14:19:05,986] args[category_emb_dim]=100
[INFO 2025-03-03 14:19:05,986] args[category_emb_dim]=100
[INFO 2025-03-03 14:19:05,986] args[category_emb_dim]=100
[INFO 2025-03-03 14:19:05,986] args[category_emb_dim]=100
[INFO 2025-03-03 14:19:05,996] args[drop_rate]=0.2
[INFO 2025-03-03 14:19:05,996] args[drop_rate]=0.2
[INFO 2025-03-03 14:19:05,

# **NRMS**

In [81]:
import torch
import torch.nn as nn
class DotProductClickPredictor(torch.nn.Module):
    def __init__(self):
        super(DotProductClickPredictor, self).__init__()

    def forward(self, candidate_news_vector, user_vector):
        """
        Args:
            candidate_news_vector: batch_size, candidate_size, X
            user_vector: batch_size, X
        Returns:
            (shape): batch_size
        """
        # batch_size, candidate_size
        probability = torch.bmm(candidate_news_vector,
                                user_vector.unsqueeze(dim=-1)).squeeze(dim=-1)
        return probability
class AdditiveAttention(torch.nn.Module):
    """
    A general additive attention module.
    Originally for NAML.
    """
    def __init__(self,
                 query_vector_dim,
                 candidate_vector_dim,
                 writer=None,
                 tag=None,
                 names=None):
        super(AdditiveAttention, self).__init__()
        self.linear = nn.Linear(candidate_vector_dim, query_vector_dim)
        self.attention_query_vector = nn.Parameter(
            torch.empty(query_vector_dim).uniform_(-0.1, 0.1))
        # For tensorboard
        self.writer = writer
        self.tag = tag
        self.names = names
        self.local_step = 1

    def forward(self, candidate_vector):
        """
        Args:
            candidate_vector: batch_size, candidate_size, candidate_vector_dim
        Returns:
            (shape) batch_size, candidate_vector_dim
        """
        # batch_size, candidate_size, query_vector_dim
        temp = torch.tanh(self.linear(candidate_vector))
        # batch_size, candidate_size
        candidate_weights = F.softmax(torch.matmul(
            temp, self.attention_query_vector),
                                      dim=1)
        if self.writer is not None:
            assert candidate_weights.size(1) == len(self.names)
            if self.local_step % 10 == 0:
                self.writer.add_scalars(
                    self.tag, {
                        x: y
                        for x, y in zip(self.names,
                                        candidate_weights.mean(dim=0))
                    }, self.local_step)
            self.local_step += 1
        # batch_size, candidate_vector_dim
        target = torch.bmm(candidate_weights.unsqueeze(dim=1),
                           candidate_vector).squeeze(dim=1)
        return target
    
class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super(ScaledDotProductAttention, self).__init__()
        self.d_k = d_k

    def forward(self, Q, K, V, attn_mask=None):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(self.d_k)
        scores = torch.exp(scores)
        if attn_mask is not None:
            scores = scores * attn_mask
        attn = scores / (torch.sum(scores, dim=-1, keepdim=True) + 1e-8)

        context = torch.matmul(attn, V)
        return context, attn



class MultiHeadSelfAttention(nn.Module):
    def __init__(self, d_model, num_attention_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.d_model = d_model
        self.num_attention_heads = num_attention_heads
        assert d_model % num_attention_heads == 0
        self.d_k = d_model // num_attention_heads
        self.d_v = d_model // num_attention_heads

        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)

        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight, gain=1)

    def forward(self, Q, K=None, V=None, length=None):
        if K is None:
            K = Q
        if V is None:
            V = Q
        batch_size = Q.size(0)

        q_s = self.W_Q(Q).view(batch_size, -1, self.num_attention_heads,
                               self.d_k).transpose(1, 2)
        k_s = self.W_K(K).view(batch_size, -1, self.num_attention_heads,
                               self.d_k).transpose(1, 2)
        v_s = self.W_V(V).view(batch_size, -1, self.num_attention_heads,
                               self.d_v).transpose(1, 2)

        if length is not None:
            maxlen = Q.size(1)
            attn_mask = torch.arange(maxlen).expand(
                batch_size, maxlen) < length.view(-1, 1)
            attn_mask = attn_mask.unsqueeze(1).expand(batch_size, maxlen,
                                                      maxlen)
            attn_mask = attn_mask.unsqueeze(1).repeat(1,
                                                      self.num_attention_heads,
                                                      1, 1)
        else:
            attn_mask = None

        context, attn = ScaledDotProductAttention(self.d_k)(q_s, k_s, v_s,
                                                            attn_mask)
        context = context.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_attention_heads * self.d_v)
        return context

In [83]:
import torch
from torch import nn
import torch.nn.functional as F



import torch
import torch.nn as nn
import torch.nn.functional as F

class NewsEncoder(nn.Module):
    def __init__(self, args, embedding_matrix, num_category, num_subcategory):
        super(NewsEncoder, self).__init__()
        self.embedding_matrix = embedding_matrix
        self.drop_rate = args.drop_rate
        self.num_words_title = args.num_words_title
        self.num_words_abstract = args.num_words_abstract
        self.news_dim = args.news_dim
        self.word_embedding_dim = args.word_embedding_dim

        self.multi_head_self_attn = MultiHeadSelfAttention(args.word_embedding_dim, args.num_attention_heads)
        self.attn = AdditiveAttention(args.news_query_vector_dim, args.word_embedding_dim)

        # Category & Subcategory Embeddings
        self.use_category = args.use_category
        self.use_subcategory = args.use_subcategory
        self.use_abstract = args.use_abstract

        self.category_emb = nn.Embedding(num_category + 1, args.category_emb_dim, padding_idx=0) if self.use_category else None
        self.subcategory_emb = nn.Embedding(num_subcategory + 1, args.category_emb_dim, padding_idx=0) if self.use_subcategory else None

        # Projection layers to match news_dim
        self.category_dense = nn.Linear(args.category_emb_dim, self.news_dim) if self.use_category else None
        self.subcategory_dense = nn.Linear(args.category_emb_dim, self.news_dim) if self.use_subcategory else None
        
        if self.use_abstract:
            self.abstract_multi_head_self_attn = MultiHeadSelfAttention(args.word_embedding_dim, args.num_attention_heads)
            self.abstract_attn = AdditiveAttention(args.news_query_vector_dim, args.word_embedding_dim)

        # Adjusting feature multiplier for correct shape
        feature_multiplier = 1 + int(self.use_category) + int(self.use_subcategory) + int(self.use_abstract)
        print("Feature Multiplier:", feature_multiplier)
        self.fusion_layer = nn.Linear(1200, self.news_dim)

    def forward(self, x, mask=None):
        all_vecs = []

        # Encode Title
        title = x[:, :self.num_words_title].long()
        word_vecs = F.dropout(self.embedding_matrix(title), p=self.drop_rate, training=self.training)
        multihead_text_vecs = self.multi_head_self_attn(word_vecs, word_vecs, word_vecs, mask)
        multihead_text_vecs = F.dropout(multihead_text_vecs, p=self.drop_rate, training=self.training)
        title_vecs = self.attn(multihead_text_vecs)
        all_vecs.append(title_vecs)
        
        start = self.num_words_title

        # Encode Category
        if self.use_category:
            category = x[:, start].long()
            category_vecs = F.relu(self.category_dense(self.category_emb(category)))
            all_vecs.append(category_vecs)
            start += 1

        # Encode Subcategory
        if self.use_subcategory:
            subcategory = x[:, start].long()
            subcategory_vecs = F.relu(self.subcategory_dense(self.subcategory_emb(subcategory)))
            all_vecs.append(subcategory_vecs)
            start += 1

        # Encode Abstract
        if self.use_abstract:
            abstract = x[:, start:start + self.num_words_abstract].long()
            abstract_word_vecs = F.dropout(self.embedding_matrix(abstract), p=self.drop_rate, training=self.training)
            abstract_multihead_text_vecs = self.abstract_multi_head_self_attn(abstract_word_vecs, abstract_word_vecs, abstract_word_vecs, mask)
            abstract_multihead_text_vecs = F.dropout(abstract_multihead_text_vecs, p=self.drop_rate, training=self.training)
            abstract_vecs = self.abstract_attn(abstract_multihead_text_vecs)
            all_vecs.append(abstract_vecs)
        
        # Concatenate Features and Apply Final Fusion
        news_vecs = torch.cat(all_vecs, dim=-1)
        print(news_vecs.shape) 
        news_vecs = self.fusion_layer(news_vecs)
        
        return news_vecs



    
class UserEncoder(nn.Module):
    def __init__(self, args):
        super(UserEncoder, self).__init__()
        self.args = args
        self.dim_per_head = args.news_dim // args.num_attention_heads
        self.multi_head_self_attn = MultiHeadSelfAttention(args.word_embedding_dim, args.num_attention_heads)
        self.attn = AdditiveAttention(args.user_query_vector_dim, args.word_embedding_dim)
        self.pad_doc = nn.Parameter(torch.empty(1, args.news_dim).uniform_(-1, 1)).type(torch.FloatTensor)

    def forward(self, user_vector):
        """
        Args:
            user_vector: batch_size, num_clicked_news_a_user, word_embedding_dim
        Returns:
            (shape) batch_size, word_embedding_dim
        """
        # batch_size, num_clicked_news_a_user, word_embedding_dim
        multihead_user_vector = self.multi_head_self_attn(user_vector)
        # batch_size, word_embedding_dim
        final_user_vector = self.attn(multihead_user_vector)
        return final_user_vector
    


class NRMS(torch.nn.Module):
    """
    NRMS network.
    Input 1 + K candidate news and a list of user clicked news, produce the click probability.
    """
    def __init__(self, config, embedding_matrix, num_category, num_subcategory):
        super(NRMS, self).__init__()
        self.config = config
        word_embedding = torch.from_numpy(embedding_matrix).float()
        pretrained_word_embedding = nn.Embedding.from_pretrained(word_embedding,
                                                      freeze=args.freeze_embedding,
                                                      padding_idx=0)
 
        self.news_encoder = NewsEncoder(config, pretrained_word_embedding, num_category, num_subcategory)
        self.user_encoder = UserEncoder(config)
        self.click_predictor = DotProductClickPredictor()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, clicked_news, candidate_news, label):
        
        # batch_size, 1 + K, word_embedding_dim
        candidate_news_vector = torch.stack(
            [self.news_encoder(x) for x in candidate_news])
        
        # batch_size, num_clicked_news_a_user, word_embedding_dim
        clicked_news_vector = torch.stack(
            [self.news_encoder(x) for x in clicked_news])
        
        # batch_size, word_embedding_dim
        user_vector = self.user_encoder(clicked_news_vector)

        # batch_size, 1 + K
        click_probability = self.click_predictor(candidate_news_vector,
                                                 user_vector)
        loss = self.loss_fn(click_probability, label)
        # loss = 0.5
        return loss, click_probability


In [12]:
args.mode = 'train'

In [10]:
if 'train' in args.mode:
    if args.prepare:
        logging.info('Preparing training data...')
        total_sample_num = prepare_training_data(args.train_data_dir, args.nGPU, args.npratio, args.seed)
    else:
        total_sample_num = 0
        for i in range(args.nGPU):
            data_file_path = os.path.join(args.train_data_dir, f'behaviors_np{args.npratio}_{i}.tsv')
            print(data_file_path)
            if not os.path.exists(data_file_path):
                logging.error(f'Splited training data {data_file_path} for GPU {i} does not exist. Please set the parameter --prepare as True and rerun the code.')
                exit()
            result = subprocess.getoutput(f'wc -l {data_file_path}')
            total_sample_num += int(result.split(' ')[0])
        logging.info('Skip training data preparation.')
    logging.info(f'{total_sample_num} training samples, {total_sample_num // args.batch_size // args.nGPU} batches in total.')



[INFO 2025-03-03 13:23:02,746] Preparing training data...


0it [00:00, ?it/s]

156965it [00:06, 24355.24it/s]


[INFO 2025-03-03 13:23:09,622] Writing files...
[INFO 2025-03-03 13:23:10,348] 236344 training samples, 7385 batches in total.


In [11]:
rank = 0
news, news_index, category_dict, subcategory_dict, word_dict = read_news(
		os.path.join(args.train_data_dir, 'news.tsv'), args.train_abstract_dir, args, mode='train')

news_title, news_category, news_subcategory, news_abstract = get_doc_input(
    news, news_index, category_dict, subcategory_dict, word_dict, args)
news_combined = np.concatenate([x for x in [news_title, news_category, news_subcategory, news_abstract] if x is not None], axis=-1)

if rank == 0:
    logging.info('Initializing word embedding matrix...')

embedding_matrix, have_word = load_matrix(args.glove_embedding_path,
                                                word_dict,
                                                args.word_embedding_dim)
if rank == 0:
    logging.info(f'Word dict length: {len(word_dict)}')
    logging.info(f'Have words: {len(have_word)}')
    logging.info(f'Missing rate: {(len(word_dict) - len(have_word)) / len(word_dict)}')

51282it [00:08, 6072.64it/s]
100%|██████████| 51282/51282 [00:00<00:00, 82586.01it/s]

[INFO 2025-03-03 13:23:31,969] Initializing word embedding matrix...





[INFO 2025-03-03 13:24:40,261] Word dict length: 12519
[INFO 2025-03-03 13:24:40,262] Have words: 11960
[INFO 2025-03-03 13:24:40,263] Missing rate: 0.0446521287642783


In [82]:
import torch.optim as optim

model = NRMS(args, embedding_matrix, len(category_dict), len(subcategory_dict))
is_distributed = False
if args.load_ckpt_name is not None:
	ckpt_path = get_checkpoint(args.model_dir, args.load_ckpt_name)
	checkpoint = torch.load(ckpt_path, map_location='cpu')
	model.load_state_dict(checkpoint['model_state_dict'])
	logging.info(f"Model loaded from {ckpt_path}.")

optimizer = optim.Adam(model.parameters(), lr=args.lr)

if args.enable_gpu:
	model = model.cuda(rank)

if is_distributed:
	model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])

data_file_path = os.path.join(args.train_data_dir, f'behaviors_np{args.npratio}_{rank}.tsv')

dataset = DatasetTrain(data_file_path, news_index, news_combined, args)
dataloader = DataLoader(dataset, batch_size=args.batch_size)

logging.info('Training...')
for ep in range(args.start_epoch, args.epochs):
	loss = 0.0
	accuary = 0.0
	for cnt, (log_ids, input_ids, targets) in enumerate(dataloader):
		if args.enable_gpu:
			log_ids = log_ids.cuda(rank, non_blocking=True)
			# log_mask = log_mask.cuda(rank, non_blocking=True)
			input_ids = input_ids.cuda(rank, non_blocking=True)
			targets = targets.cuda(rank, non_blocking=True)
			
		
		bz_loss, y_hat = model(log_ids, input_ids, targets)
		loss += bz_loss.data.float()
		accuary += acc(targets, y_hat)
		optimizer.zero_grad()
		bz_loss.backward()
		optimizer.step()

		if cnt % args.log_steps == 0:
			logging.info(
				'[{}] Ed: {}, train_loss: {:.5f}, acc: {:.5f}'.format(
					rank, cnt * args.batch_size, loss.data / cnt, accuary / cnt)
			)

		if rank == 0 and cnt != 0 and cnt % args.save_steps == 0:
			ckpt_path = os.path.join(args.model_dir, f'epoch-{ep+1}-{cnt}.pt')
			torch.save(
				{
					'model_state_dict':
						{'.'.join(k.split('.')[1:]): v for k, v in model.state_dict().items()}
						if is_distributed else model.state_dict(),
					'category_dict': category_dict,
					'word_dict': word_dict,
					'subcategory_dict': subcategory_dict
				}, ckpt_path)
			logging.info(f"Model saved to {ckpt_path}.")

	logging.info('Training finish.')

	if rank == 0:
		ckpt_path = os.path.join(args.model_dir, f'epoch-{ep+1}.pt')
		torch.save(
			{
				'model_state_dict':
					{'.'.join(k.split('.')[1:]): v for k, v in model.state_dict().items()}
					if is_distributed else model.state_dict(),
				'category_dict': category_dict,
				'subcategory_dict': subcategory_dict,
				'word_dict': word_dict,
			}, ckpt_path)
		logging.info(f"Model saved to {ckpt_path}.")



Feature Multiplier: 4
[INFO 2025-03-03 14:19:11,715] Training...
[INFO 2025-03-03 14:19:11,715] Training...
[INFO 2025-03-03 14:19:11,715] Training...
[INFO 2025-03-03 14:19:11,715] Training...
[INFO 2025-03-03 14:19:11,715] Training...
[INFO 2025-03-03 14:19:11,715] Training...
[INFO 2025-03-03 14:19:11,715] Training...
[INFO 2025-03-03 14:19:11,715] Training...


torch.Size([50, 1200])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (50x1200 and 1400x300)

In [22]:
args.mode = 'test'
args.user_log_mask=True
args.batch_size=128
args.load_ckpt_name= 'epoch-5.pt'
rank = 0
checkpoint = torch.load('/home/vinmike/Downloads/epoch-5.pt', map_location='cpu')

category_dict = checkpoint['category_dict']
subcategory_dict = checkpoint['subcategory_dict']
word_dict = checkpoint['word_dict']

dummy_embedding_matrix = np.zeros((len(word_dict) + 1, args.word_embedding_dim))
model = NRMS(args, dummy_embedding_matrix)
model.load_state_dict(checkpoint['model_state_dict'])
logging.info(f"Model loaded from {ckpt_path}")
model.eval()
news, news_index, category_dict, subcategory_dict, word_dict = read_news(
  os.path.join(args.test_data_dir, 'news.tsv'), args.train_abstract_dir, args, mode='train')

news_title, news_category, news_subcategory, news_abstract = get_doc_input(
    news, news_index, category_dict, subcategory_dict, word_dict, args)

news_combined = np.concatenate([x for x in [news_title, news_category, news_subcategory] if x is not None], axis=-1)

 
news_dataset = NewsDataset(news_combined)
news_dataloader = DataLoader(news_dataset,
                                batch_size=args.batch_size,
                                num_workers=4)

news_scoring = []
with torch.no_grad():
    for input_ids in tqdm(news_dataloader):
        input_ids = input_ids
        news_vec = model.news_encoder(input_ids)
        news_vec = news_vec.to(torch.device("cpu")).detach().numpy()
        news_scoring.extend(news_vec)

news_scoring = np.array(news_scoring)
logging.info("news scoring num: {}".format(news_scoring.shape[0]))

if rank == 0:
    doc_sim = 0
    for _ in tqdm(range(1000000)):
        i = random.randrange(1, len(news_scoring))
        j = random.randrange(1, len(news_scoring))
        if i != j:
            doc_sim += np.dot(news_scoring[i], news_scoring[j]) / (np.linalg.norm(news_scoring[i]) * np.linalg.norm(news_scoring[j]))
    logging.info(f'News doc-sim: {doc_sim / 1000000}')

data_file_path = os.path.join(args.test_data_dir, f'behaviors_{rank}.tsv')

def collate_fn(tuple_list):
    log_vecs = torch.FloatTensor([x[0] for x in tuple_list])
    for i, x in enumerate(tuple_list):
        print(f"Sample {i}: candidate_news_feature shape = {x[2].shape}")

    log_mask = torch.FloatTensor([x[1] for x in tuple_list])
    news_vecs = [x[2] for x in tuple_list]
    labels = [x[3] for x in tuple_list]
    return (log_vecs, log_mask, news_vecs, labels)

dataset = DatasetTest(data_file_path, news_index, news_scoring, args)
dataloader = DataLoader(dataset, batch_size=args.batch_size, collate_fn=collate_fn)

42416it [00:02, 15393.26it/s]
100%|██████████| 42416/42416 [00:00<00:00, 188532.42it/s]


In [None]:
args.mode = 'test'
args.user_log_mask=True
args.batch_size=128
args.load_ckpt_name= 'epoch-5.pt'
args.prepare=True
if 'test' in args.mode:
        if args.prepare:
            logging.info('Preparing testing data...')
            total_sample_num = prepare_testing_data(args.test_data_dir, args.nGPU)
        else:
            total_sample_num = 0
            for i in range(args.nGPU):
                data_file_path = os.path.join(args.test_data_dir, f'behaviors_{i}.tsv')
                if not os.path.exists(data_file_path):
                    logging.error(f'Splited testing data {data_file_path} for GPU {i} does not exist. Please set the parameter --prepare as True and rerun the code.')
                    exit()
                result = subprocess.getoutput(f'wc -l {data_file_path}')
                total_sample_num += int(result.split(' ')[0])
            logging.info('Skip testing data preparation.')
        logging.info(f'{total_sample_num} testing samples in total.')

        test(0, args)

# Test


In [39]:
label = random.randint(0, 4)
label

2

In [40]:
!mkdir data
!cd data

# Dowload GloVe pre-trained word embedding and unzip
!wget https://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip glove.840B.300d.zip

# Download MIND-small dataset and unzip
!wget https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip
!wget https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip
!unzip MINDsmall_train.zip -d MINDsmall_train
!unzip MINDsmall_dev.zip -d MINDsmall_dev

!rm glove.840B.300d.zip
!rm MINDsmall_train.zip
!rm MINDsmall_dev.zip

--2025-02-16 04:36:04--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2025-02-16 04:36:04--  https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip]
Saving to: ‘glove.840B.300d.zip’


2025-02-16 04:42:54 (5.07 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     
--2025-02-16 04:43:48--  https://mind201910small.blob.core.windows.net/release/MIND