In [1]:
# Switch the working directory to the parent folder (presumably the project
# root, so that the `src.` package imported below resolves -- TODO confirm).
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves up one more directory each time.
%cd ../

# Load the autoreload extension and use mode 2, which reloads modules
# automatically before executing code, so edits to local modules are picked up.
%load_ext autoreload
%autoreload 2

/Users/hoangle/Uni/Thesis


In [3]:
from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from recbole.config import Config
from recbole.data import (
    create_dataset,
    data_preparation,
)
from tqdm.notebook import tqdm
from recbole.utils import init_seed, ModelType

from src.real_temporal import SimulatedOnlineDataset, SimulatedOnlineSequentialDataset

In [4]:
# Global plotting defaults: seaborn-flavoured style sheet with a small base font.
plt.style.use('seaborn-v0_8')
plt.rcParams['font.size'] = 8

# Get suitable cutoff

In [5]:
def get_suitable_cutoff(ds_name: str) -> tuple:
    """Find the timestamp at which the number of active users peaks.

    A user is counted as active from the timestamp of their first interaction
    up to, but not including, the timestamp of their last interaction. Users
    whose first and last timestamps coincide therefore never count as active.
    When several timestamps share the peak, the earliest one is returned.

    Args:
        ds_name (str): dataset name

    Returns:
        tuple: the selected timestamp and the number of active users at it
    """

    # Load the interactions with raw (non-normalised) timestamps.
    recbole_options = {
        'normalize_all': False,
        'load_col': {"inter": ['user_id', 'item_id', 'timestamp']},
        'train_neg_sample_args': None,
        'device': 'cpu',
        'use_gpu': False,
        'eval_args': {
            "order": "TO",
            "split": {"LS": "valid_and_test"},
            "group_by": None,
            'mode': 'full',
        },
    }
    config = Config(model='NPE', dataset=ds_name, config_dict=recbole_options)
    init_seed(config["seed"], config["reproducibility"])
    interactions = create_dataset(config).inter_feat.copy()

    # First and last interaction timestamp of every user, in one pass.
    user_span = interactions.groupby('user_id')['timestamp'].agg(['min', 'max'])

    # Sweep-line deltas: a user enters at their first timestamp and leaves at
    # their last one.
    delta = defaultdict(int)
    for first_ts, last_ts in zip(user_span['min'], user_span['max']):
        delta[first_ts] += 1
        delta[last_ts] -= 1

    # Running total over chronologically ordered timestamps gives the number
    # of active users at each timestamp.
    active_users = pd.Series(delta).sort_index().cumsum()

    peak_ts = active_users.idxmax()
    return peak_ts, active_users[peak_ts]

# Dataset to analyse; swap in one of the commented alternatives to inspect
# another dataset.
ds = "ml-100k"
# ds = "ml-1m"
# ds = "amazon-digital-music"
# Prints "<dataset name>: (<timestamp>, <number of active users>)".
print(f"{ds}: {get_suitable_cutoff(ds)}")

ml-100k: (884471835.0, 141)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=feat[field].mean(), inplace=True)


# Test with online simulated evaluation scheme

In [39]:
seed = 42

use_cutoff = True
reproducible = True

model_name = "NPE"
dataset_name = "amazon-digital-music"
loss_type = "CE"
init_seed(seed, reproducible)

cutoff_time = 976324045.0


config_dict = {
    # For model 
    'model': model_name,
    'loss_type': loss_type,

    # For data
    'dataset': dataset_name, 
    'load_col': {"inter": ['user_id', 'item_id', 'timestamp']},
    'use_cutoff': use_cutoff,

    # For training
    'train_batch_size': 4096,
    'eval_batch_size': 4096,
    "train_neg_sample_args": None,

    # Environment
    "seed": seed,
    "reproducibility": reproducible,
    'device': 'cpu',
    'use_gpu': False,

    # Evaluation
    "eval_args": {
        "order": "TO",
        "split": {"CO": cutoff_time},
        "group_by": 'user_id',
        'mode': 'full'
    }
}
config = Config(
    model_name,
    dataset_name,
    config_dict=config_dict,
)
if config["use_cutoff"] is True:
    match config["MODEL_TYPE"]:
        case ModelType.GENERAL | ModelType.TRADITIONAL:
            dataset = SimulatedOnlineDataset(config)
        case ModelType.SEQUENTIAL:
            dataset = SimulatedOnlineSequentialDataset(config)

else:
    dataset = create_dataset(config)

cutoff_dates = [
    dataset.inter_feat['timestamp'].quantile(i / 100).item()
    for i in np.linspace(70, 99, num=20)
]

size_LOO_train = 266800.0
size_LOO_val = 34157.0
size_LOO_test = 56814.0

# print(f"train_dataset: {len(train_data.dataset)}")
# print(f"valid_dataset: {len(valid_data.dataset)}")
# print(f"test_dataset : {len(test_data.dataset)}")


In [40]:
results = []

for cutoff_date in tqdm(cutoff_dates):
    config_dict['eval_args']['split']['CO'] = cutoff_date
    config = Config(
        model_name,
        dataset_name,
        config_dict=config_dict,
    )

    dataset = TimeCutoffDataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)
    size_simo_train = len(train_data.dataset)
    size_simo_val = len(valid_data.dataset)
    size_simo_test = len(test_data.dataset)

    sum_sq = np.abs(size_simo_train - size_LOO_train) / size_LOO_train\
        + np.abs(size_simo_val - size_LOO_val) / size_LOO_val\
        + np.abs(size_simo_test - size_LOO_test) /size_LOO_test
    
    results.append({
        'cutoff_date': cutoff_date,
        'train': size_simo_train,
        'val': size_simo_val,
        'test': size_simo_test,
        'sum_sq': sum_sq
    })

pd.DataFrame.from_records(results).sort_values('sum_sq')

  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,cutoff_date,train,val,test,sum_sq
0,1376179000.0,154965,37220,13755,1.26674
1,1377562000.0,158853,38190,13632,1.282731
2,1379117000.0,162966,39220,13438,1.300884
3,1380758000.0,167092,40234,13249,1.318432
4,1382486000.0,171353,41239,13116,1.334225
5,1384301000.0,175609,42212,12789,1.352515
6,1386202000.0,180112,43276,12364,1.374268
7,1387757000.0,183962,44182,11940,1.393825
8,1388966000.0,187818,45121,11626,1.41239
9,1390176000.0,191708,46039,11212,1.431973


In [43]:
# Cutoff timestamp of the sweep entry at position 18 (the 19th candidate).
results[18]['cutoff_date']

1403568000.0