## Using author's libraries (NOT RUNNING)

In [17]:
from merlin.io import Dataset
from nvtabular import ops
import nvtabular as nvt
from merlin.schema.tags import Tags

from merlin.schema import Schema, ColumnSchema, Tags

import polars as pl
import numpy as np
import pyarrow as pa

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_READ_TIME_COL
)


from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)

from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

from collections import OrderedDict

## Create Schema

In [2]:

# Initialize an empty schema
schema = Schema()

article_id_fixed_col = ColumnSchema(
    name="article_id_fixed",
    dtype="int32",
    tags=[Tags.LIST, Tags.CATEGORICAL, Tags.ITEM_ID, Tags.ITEM],
    is_list=True,
    is_ragged=True,
).with_properties({'domain': {'min': 0, 'max': 125541}, 'value_count': {'min': 1, 'max': 500}})

read_time_fixed_col = ColumnSchema(
    name="read_time_fixed",
    dtype="float32",
    tags=[Tags.LIST, Tags.CONTINUOUS],
    is_list=True,
    is_ragged=True
)

# Add columns to the schema
columns = [article_id_fixed_col, read_time_fixed_col]

for col in columns:
    schema[col.name] = col

In [3]:
schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.value_count.min,properties.value_count.max,properties.domain.min,properties.domain.max
0,article_id_fixed,"(Tags.ID, Tags.LIST, Tags.CATEGORICAL, Tags.ITEM)","DType(name='int32', element_type=<ElementType....",True,True,1,500.0,0.0,125541.0
1,read_time_fixed,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float32', element_type=<ElementTyp...",True,True,0,,,


## Load Pre-Trained Embeddings

In [4]:
import torch
import polars as pl

pretrained_embeds_df = pl.read_parquet("../data/eb_contrastive_vector/contrastive_vector.parquet")

In [5]:
article_to_index = {
    art_id: num + 1 for num, art_id in enumerate(pretrained_embeds_df['article_id'].to_list())
}

In [6]:
import numpy as np

pretrained_embeds = pretrained_embeds_df['contrastive_vector'].to_list()
pretrained_embeds = np.vstack([np.zeros(768,)] + [np.array(vec) for vec in pretrained_embeds])

pretrained_embeds = torch.from_numpy(pretrained_embeds)

In [8]:
split: str = "large"
history_size: int = 20

In [20]:
from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import NDCGAt, RecallAt

max_sequence_length, d_model = history_size, 64
emb_dims = {"article_id_fixed": 768}
infer_embedding_sizes = True

inputs = tr.TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=max_sequence_length,
        continuous_projection=d_model,
        aggregation="concat",
        d_output=100,
        masking='causal',
        infer_embedding_sizes=infer_embedding_sizes,
        embedding_dims=emb_dims
)

In [21]:
schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.value_count.min,properties.value_count.max,properties.domain.min,properties.domain.max
0,article_id_fixed,"(Tags.ID, Tags.LIST, Tags.CATEGORICAL, Tags.ITEM)","DType(name='int32', element_type=<ElementType....",True,True,1,500.0,0.0,125541.0
1,read_time_fixed,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float32', element_type=<ElementTyp...",True,True,0,,,


In [22]:
tr.TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=max_sequence_length,
        continuous_projection=d_model,
        aggregation="concat",
        d_output=100,
        masking='causal',
        infer_embedding_sizes=True,
        embedding_dims={"article_id_fixed": 768}
)

TabularSequenceFeatures(
  (_aggregation): ConcatFeatures()
  (to_merge): ModuleDict(
    (continuous_module): SequentialBlock(
      (0): ContinuousFeatures(
        (filter_features): FilterFeatures()
        (_aggregation): ConcatFeatures()
      )
      (1): SequentialBlock(
        (0): DenseBlock(
          (0): Linear(in_features=1, out_features=64, bias=True)
          (1): ReLU(inplace=True)
        )
      )
      (2): AsTabular()
    )
    (categorical_module): SequenceEmbeddingFeatures(
      (filter_features): FilterFeatures()
      (embedding_tables): ModuleDict(
        (article_id_fixed): Embedding(125542, 768, padding_idx=0)
      )
    )
    (pretrained_embedding_module): PretrainedEmbeddingFeatures(
      (filter_features): FilterFeatures()
    )
  )
  (projection_module): SequentialBlock(
    (0): DenseBlock(
      (0): Linear(in_features=832, out_features=100, bias=True)
      (1): ReLU(inplace=True)
    )
  )
  (_masking): CausalLanguageModeling()
)

In [23]:
with torch.no_grad():
    inputs.categorical_module.embedding_tables["article_id_fixed"].weight.copy_(
        pretrained_embeds
    )

inputs.categorical_module.embedding_tables["article_id_fixed"].requires_grad = False
inputs.categorical_module.embedding_tables[
    "article_id_fixed"
].weight.requires_grad = False

In [24]:
# from transformers4rec.torch.ranking_metric import Precision,Recall,Accuracy

# Define XLNetConfig class and set default parameters for HF XLNet config  
transformer_config = tr.XLNetConfig.build(
    d_model=d_model, n_head=4, n_layer=2, total_seq_length=max_sequence_length
)
# Define the model block including: inputs, masking, projection and transformer block.
body = tr.SequentialBlock(
    inputs, tr.MLPBlock([64]), tr.TransformerBlock(transformer_config, masking=inputs.masking)
)

# Define the evaluation top-N metrics and the cut-offs
metrics = [NDCGAt(top_ks=[20, 40], labels_onehot=True),  
           RecallAt(top_ks=[20, 40], labels_onehot=True)]

# Define a head related to next item prediction task 
head = tr.Head(
    body,
    tr.NextItemPredictionTask(
        weight_tying=True,
        metrics=metrics
    )
    ,inputs=inputs
)

# Get the end-to-end Model class 
model = tr.Model(head)

Projecting inputs of NextItemPredictionTask to'768' As weight tying requires the input dimension '64' to be equal to the item-id embedding dimension '768'


### Train Model

In [25]:
per_device_train_batch_size: int = 128
per_device_eval_batch_size: int = 32

In [31]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer
# Set hyperparameters for training 
train_args = T4RecTrainingArguments(data_loader_engine='merlin', 
                                    dataloader_drop_last = True,
                                    gradient_accumulation_steps = 1,
                                    per_device_train_batch_size = per_device_train_batch_size, 
                                    per_device_eval_batch_size = per_device_eval_batch_size,
                                    output_dir = "./tmp-large", 
                                    learning_rate=0.0005,
                                    lr_scheduler_type='cosine', 
                                    learning_rate_num_cosine_cycles_by_epoch=1.5,
                                    num_train_epochs=1,
                                    max_sequence_length=20, 
                                    report_to = [],
                                    logging_steps=200,
                                    save_steps=10000,
                                    no_cuda=False)

In [32]:
# Instantiate the T4Rec Trainer, which manages training and evaluation for the PyTorch API
trainer = Trainer(
    model=model,
    args=train_args,
    schema=schema,
    compute_metrics=True,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [33]:
eval_paths = f"/home/scur1565/News-Recommender-Recsys24/data/ebnerd_processed/validation_data_small.parquet"

In [34]:
train_paths = [
    f"/home/scur1565/News-Recommender-Recsys24/data/ebnerd_processed/train_data_large_0.parquet",
    f"/home/scur1565/News-Recommender-Recsys24/data/ebnerd_processed/train_data_large_1.parquet",
    f"/home/scur1565/News-Recommender-Recsys24/data/ebnerd_processed/train_data_large_2.parquet",
    f"/home/scur1565/News-Recommender-Recsys24/data/ebnerd_processed/train_data_large_3.parquet",
    f"/home/scur1565/News-Recommender-Recsys24/data/ebnerd_processed/train_data_large_4.parquet",
    f"/home/scur1565/News-Recommender-Recsys24/data/ebnerd_processed/train_data_large_5.parquet",
    f"/home/scur1565/News-Recommender-Recsys24/data/ebnerd_processed/train_data_large_6.parquet"
]

for train_path in train_paths:
    trainer.train_dataset_or_path = train_paths
    trainer.reset_lr_scheduler()
    trainer.train()
    trainer.state.global_step += 1
print('Finished training')

# trainer.eval_dataset_or_path = eval_paths
# train_metrics = trainer.evaluate(metric_key_prefix='eval')

# for key in sorted(train_metrics.keys()):
#     print(" %s = %s" % (key, str(train_metrics[key]))) 
    
# from transformers4rec.torch.utils.examples_utils import wipe_memory
# wipe_memory()



Step,Training Loss
200,8.4742
400,7.8673
600,7.4401
800,6.9984
1000,6.625
1200,6.2995
1400,5.9727
1600,5.6753
1800,5.4787
2000,5.2589


KeyboardInterrupt: 

In [35]:
import pandas as pd
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from typing import List, Tuple
from tqdm import tqdm

In [43]:
validation_df = pd.read_parquet(f"/home/scur1565/News-Recommender-Recsys24/data/ebnerd_processed/validation_data_small.parquet", engine="pyarrow")
all_list_ids_inview: List[List[int]] = validation_df["article_ids_inview"].to_list()
all_list_ids_clicked: List[List[int]] = validation_df["article_ids_clicked"].to_list()

trainer.eval_dataset_or_path = eval_paths
dlv = trainer.get_eval_dataloader()


In [44]:
print(f"Ensure that Validation Dataloader is deterministic: {not dlv.shuffle}")

Ensure that Validation Dataloader is deterministic: True


In [45]:
def get_inview_articles_score(
    ids_inview: List[int], 
    ids_clicked: List[int], 
    prob_list: List[float]
) -> List[float]:

    inview_scores: List[float] = []
    
    for inview_id in ids_inview:
       inview_scores.append(prob_list[inview_id]) 

    return inview_scores

In [46]:
all_inview_scores: List[List[float]] = []
index: int = 0
# Sample one from data loader
for sp_batch in tqdm(dlv):
    # pass through body - encoding
    body_out = body(sp_batch[0])
    # pass encoding through the prediction head
    head_out = head(body_out)
    # extract probabilities
    list_of_probs = head_out['next-item'][0].cpu().tolist()
    # get inview prob scores
    for sample in head_out:
        ids_clicked = all_list_ids_clicked[index]
        ids_inview = all_list_ids_inview[index]
        inview_scores = get_inview_articles_score(ids_inview, ids_clicked, list_of_probs)

        all_inview_scores.append(inview_scores)

        index += 1

  0%|          | 0/7645 [00:00<?, ?it/s]

 32%|███▏      | 2432/7645 [00:27<00:52, 99.26it/s] 

In [None]:
metrics = MetricEvaluator(
    labels=validation_df["labels"].to_list(),
    predictions=all_inview_scores,
    metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)],
)
metrics.evaluate()

KeyError: 'labels'

In [64]:
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

In [65]:
validation_df["scores"] = all_inview_scores

ValueError: Length of values (7645) does not match length of index (244647)

In [None]:
validation_df = validation_df.with_columns(
    pl.col("scores")
    .map_elements(lambda x: list(rank_predictions_by_score(x)))
    .alias("ranked_scores")
)
validation_df.head(2)