In [2]:
from collections import defaultdict

from recbole.quick_start import run_recbole

from recbole.config import Config
from recbole.data import (
    create_dataset,
    data_preparation,
)
from recbole.data.transform import construct_transform
from recbole.utils import (
    init_logger,
    get_model,
    get_trainer,
    init_seed,
    set_color,
    get_flops,
    get_environment,
)
import pandas as pd

In [3]:
# Experiment setup: the recommender to train and the dataset to train it on.
model = "BPR"
dataset = "ml-100k"

# Overrides passed to RecBole on top of its default configuration.
config_dict = {
    # Keep numeric fields un-normalised (no per-field override is given).
    "normalize_all": False,

    # Device selection.
    "use_gpu": True,
    "gpu_id": 0,

    # Batch sizes for the training and evaluation loaders.
    "train_batch_size": 1024,
    "eval_batch_size": 1024,

    # Only these columns of the interaction file are loaded.
    "load_col": {"inter": ["user_id", "item_id", "timestamp"]},

    # Negative sampling is disabled for training.
    # An alternative tried earlier was uniform sampling:
    #   distribution="uniform", sample_num=30, dynamic=False, candidate_num=0
    # NOTE(review): BPR is normally trained on (positive, negative) pairs --
    # confirm that it really trains with sampling switched off.
    "train_neg_sample_args": None,

    # Evaluation protocol. NOTE(review): presumably "TO" = order by time,
    # "LS" = leave-one-out split, "full" = rank against all items -- confirm
    # against the RecBole evaluation-settings documentation.
    "eval_args": {
        "order": "TO",
        "split": {"LS": "valid_and_test"},
        "group_by": None,
        "mode": "full",
    },
}

# Run the full RecBole pipeline with the overrides above. The call is the
# last expression of the cell, so its return value is displayed.
run_recbole(model=model, dataset=dataset, config_dict=config_dict)

03 Jul 23:03    INFO  ['/Users/macos/miniforge3/envs/py/lib/python3.10/site-packages/ipykernel_launcher.py', '--f=/Users/macos/Library/Jupyter/runtime/kernel-v2-12367F6ZQzYVulNJz.json']
03 Jul 23:03    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/ml-1m
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 300
train_batch_size = 1024
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'LS': 'valid_and_test'}, 'order': 'TO', 'group_by': None, 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = True
metric

KeyboardInterrupt: 

In [3]:
def _peak_active_timestamp(interactions: pd.DataFrame) -> tuple:
    """Find the timestamp at which the number of active users peaks.

    A user counts as active at timestamp ``ts`` when their first interaction
    is at or before ``ts`` and their last interaction is strictly after it.
    Users whose first and last timestamps coincide therefore never count.

    Args:
        interactions (pd.DataFrame): interaction table with at least the
            columns ``user_id`` and ``timestamp``

    Returns:
        tuple: the peak timestamp (the earliest one on ties) and the number
            of active users at that timestamp

    Raises:
        ValueError: if ``interactions`` has no rows
    """
    if interactions.empty:
        raise ValueError(
            "cannot pick a cutoff: the interaction table is empty"
        )

    # First and last interaction time of every user, in a single pass.
    user_span = interactions.groupby('user_id')['timestamp'].agg(['min', 'max'])

    # Sweep line: each user contributes +1 at their first timestamp and -1 at
    # their last one. Integer values keep the resulting counts integral.
    events = pd.concat([
        pd.Series(1, index=user_span['min'].to_numpy()),
        pd.Series(-1, index=user_span['max'].to_numpy()),
    ])

    # Net change per distinct timestamp; groupby returns the keys sorted
    # ascending, so the running sum is the active-user count over time.
    active_users = events.groupby(level=0).sum().cumsum()

    # idxmax returns the first (i.e. earliest) label holding the maximum.
    return active_users.idxmax(), active_users.max()


def get_suitable_cutoff(ds_name: str) -> tuple:
    """Get suitable cutoff timestamp: at which there are the most active users

    The dataset is loaded through RecBole with normalisation disabled, so the
    returned timestamp is on the dataset's original scale. A user is active
    at a timestamp when it lies in ``[first interaction, last interaction)``.

    Args:
        ds_name (str): dataset name

    Returns:
        tuple: suitable timestamp and the number of active users

    Raises:
        ValueError: if the loaded dataset has no interactions
    """

    # Get dataset without normalizing the timestamp
    config_dict = {
        'normalize_all': False,

        'load_col': {"inter": ['user_id', 'item_id', 'timestamp']},
        'train_neg_sample_args': None,

        'eval_args': {
            "order": "TO",
            "split": { "LS": "valid_and_test" },
            "group_by": None,
            'mode': 'full'
        },
    }
    # No model is built in this function; the model name is only handed to
    # Config. NOTE(review): presumably any valid model name would do here.
    config = Config(
        model='NPE',
        dataset=ds_name,
        config_dict=config_dict,
    )
    init_seed(config["seed"], config["reproducibility"])

    # The table is only read, never mutated, so no defensive copy is needed.
    interactions = create_dataset(config).inter_feat

    return _peak_active_timestamp(interactions)

# Peak-activity cutoff for the 'ml-100k' dataset; the returned
# (timestamp, active-user count) tuple is shown as the cell output.
get_suitable_cutoff(ds_name='ml-100k')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=feat[field].mean(), inplace=True)


(884471835.0, 141)

In [10]:
# Peak-activity cutoff for the 'amazon-automotive' dataset; the returned
# (timestamp, active-user count) tuple is shown as the cell output.
get_suitable_cutoff(ds_name='amazon-automotive')

(1373846400.0, 64126)