In [2]:
from collections import defaultdict

from recbole.quick_start import run_recbole

from recbole.config import Config
from recbole.data import (
    create_dataset,
    data_preparation,
)
from recbole.data.transform import construct_transform
from recbole.utils import (
    init_logger,
    get_model,
    get_trainer,
    init_seed,
    set_color,
    get_flops,
    get_environment,
)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply the 'seaborn-v0_8' matplotlib style sheet, then set the base font size to 8.
plt.style.use('seaborn-v0_8')
plt.rcParams['font.size'] = 8

In [2]:
model = 'BPR'
# The dataset *name* lives in its own variable. `dataset` is bound to the loaded
# Dataset object further down, and run_recbole must be given the name string,
# not that object.
dataset_name = 'ml-100k'

config_dict = {
    # Keep raw field values so the timestamps shown below are not normalized.
    'normalize_all': False,
    'train_batch_size': 1024,
    'eval_batch_size': 1024,
    'load_col': {"inter": ['user_id', 'item_id', 'timestamp']},
    'train_neg_sample_args': {
        'distribution': 'uniform',
        'sample_num': 30,
        'dynamic': False,
        'candidate_num': 0,
    },
    # NOTE(review): presumably "TO" = time-ordered and "LS" = leave-one-out in
    # RecBole's eval_args vocabulary -- confirm against the RecBole docs.
    'eval_args': {
        "order": "TO",
        "split": { "LS": "valid_and_test" },
        "group_by": None,
        'mode': "full"
    },
}

config = Config(
    model=model,
    dataset=dataset_name,
    config_dict=config_dict,
)

# Loaded here only so the raw interaction timestamps can be inspected at the
# end of the cell; the training run below is given names + config_dict instead.
dataset = create_dataset(config)

run_recbole(model=model, dataset=dataset_name, config_dict=config_dict)

dataset.inter_feat['timestamp']

0        881250949.0
1        891717742.0
2        878887116.0
3        880606923.0
4        886397596.0
            ...     
99995    880175444.0
99996    879795543.0
99997    874795795.0
99998    882399156.0
99999    879959583.0
Name: timestamp, Length: 100000, dtype: float64

In [9]:
def get_suitable_cutoff(ds_name: str) -> tuple:
    """Find the timestamp at which the number of active users peaks.

    A user counts as active at timestamp ``t`` when their first interaction is
    at or before ``t`` and their last interaction is strictly after ``t``.
    When several timestamps share the peak, the earliest one is returned.

    Args:
        ds_name (str): dataset name

    Returns:
        tuple: the chosen timestamp and the number of active users at it
    """

    # Build the dataset with normalization disabled so timestamps stay raw
    settings = {
        'normalize_all': False,
        'load_col': {"inter": ['user_id', 'item_id', 'timestamp']},
        'train_neg_sample_args': None,
        'eval_args': {
            "order": "TO",
            "split": { "LS": "valid_and_test" },
            "group_by": None,
            'mode': 'full'
        },
    }
    config = Config(model='NPE', dataset=ds_name, config_dict=settings)
    init_seed(config["seed"], config["reproducibility"])
    interactions = create_dataset(config).inter_feat.copy()

    # First and last interaction time of every user
    ts_by_user = interactions.groupby('user_id')['timestamp']
    first_seen = ts_by_user.min()
    last_seen = ts_by_user.max()

    # Sweep-line events: +1 at a user's first timestamp, -1 at their last one
    events = defaultdict(int)
    for step, stamps in ((1, first_seen), (-1, last_seen)):
        for ts in stamps:
            events[ts] += step

    # Running total over the time-ordered events = active users per timestamp
    active_users = pd.Series(events).sort_index().cumsum()

    suitable_ts = active_users.idxmax()
    return suitable_ts, active_users[suitable_ts]

# Cutoff timestamp and peak active-user count for the 'ml-1M' dataset
get_suitable_cutoff(ds_name='ml-1M')

(976324045.0, 1190)

In [10]:
# Cutoff timestamp and peak active-user count for the 'amazon-automotive' dataset
get_suitable_cutoff(ds_name='amazon-automotive')

(1373846400.0, 64126)