In [2]:
import gc
import argparse
import pickle
import os
from typing import Any, List

import numpy as np
import pandas as pd
import polars as pl
import torch
from pydantic import BaseModel
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_logger, init_seed

import wandb


class CFG:
    wandb = False
    debug = False
    use_saved_dataset = False
    model_name = "gru4rec"
    MAX_ITEM = 30
    candidates_num = 20


class ItemHistory(BaseModel):
    sequence: List[str]
    topk: int


class RecommendedItems(BaseModel):
    score_list: List[float]
    item_list: List[str]


def pred_user_to_item(item_history: ItemHistory, dataset: Any, model: Any):
    item_history_dict = item_history.dict()
    item_sequence = item_history_dict["sequence"]
    item_length = len(item_sequence)
    pad_length = CFG.MAX_ITEM  # pre-defined by recbole

    padded_item_sequence = torch.nn.functional.pad(
        torch.tensor(dataset.token2id(dataset.iid_field, item_sequence)),
        (0, pad_length - item_length),
        "constant",
        0,
    )

    input_interaction = Interaction(
        {
            "aid_list": padded_item_sequence.reshape(1, -1),
            "item_length": torch.tensor([item_length]),
        }
    )
    scores = model.full_sort_predict(input_interaction.to(model.device))
    scores = scores.view(-1, dataset.item_num)
    scores[:, 0] = -np.inf  # pad item score -> -inf
    topk_score, topk_iid_list = torch.topk(scores, item_history_dict["topk"])

    predicted_score_list = topk_score.tolist()[0]
    predicted_item_list = dataset.id2token(dataset.iid_field, topk_iid_list.tolist()).tolist()

    recommended_items = {
        "score_list": predicted_score_list,
        "item_list": predicted_item_list,
    }
    return recommended_items

In [3]:
pred_df = pd.read_csv("output/gru4rec/avid-cloud-284/cv/pred_df.csv")

In [5]:
prediction_dfs = []
for st in ["clicks", "carts", "orders"]:
    modified_predictions = pred_df.copy()
    modified_predictions["type"] = st
    prediction_dfs.append(modified_predictions)
prediction_dfs = pd.concat(prediction_dfs).reset_index(drop=True)

In [6]:
pred_df = prediction_dfs

In [7]:
pred_df.shape

(227400, 4)

In [8]:
score_potential = 0
score_20 = 0
weights = {"clicks": 0.10, "carts": 0.30, "orders": 0.60}

In [9]:
t="clicks"
output_dir = "."

In [10]:
sub = pred_df.loc[pred_df["type"] == t].copy()

In [11]:
sub = sub.groupby("session")["aid"].apply(list)
test_labels = pd.read_parquet("./input/otto-validation/test_labels.parquet")
test_labels = test_labels.loc[test_labels["type"] == t]
test_labels = test_labels.merge(sub, how="left", on=["session"])
test_labels.shape

(1755534, 4)

In [13]:
test_labels = test_labels[test_labels["aid"].notnull()]
test_labels.shape

(3735, 4)

In [14]:
test_labels["hits"] = test_labels.apply(lambda df: len(set(df["ground_truth"]).intersection(set(df["aid"]))), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["hits"] = test_labels.apply(lambda df: len(set(df["ground_truth"]).intersection(set(df["aid"]))), axis=1)


In [16]:
test_labels = pd.read_parquet("./input/otto-validation/test_labels.parquet")
test_labels = test_labels.loc[test_labels["type"] == t]
test_labels = test_labels.merge(sub, how="left", on=["session"])

In [17]:
test_labels.shape

(1755534, 4)

In [46]:
type(test_labels.loc[0]["aid"]) == list

True

In [47]:
def calc_hits(x):
    if type(x["aid"]) != list:
        return 0
    else:
        return len(set(x["ground_truth"]).intersection(set(x["aid"])))

In [48]:
test_labels["hits"] = test_labels.apply(lambda df: calc_hits(df), axis=1)

In [49]:
test_labels["gt_count"] = test_labels.ground_truth.str.len().clip(0, 20)
test_labels["recall"] = test_labels["hits"] / test_labels["gt_count"]
recall = test_labels["hits"].sum() / test_labels["gt_count"].sum()

In [50]:
recall

0.0009797588653936637