In [1]:
# Run the rest of the notebook from the parent directory (presumably the project
# root, so that the `src` package is importable -- confirm).
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves up one more directory level each time.
%cd ../

# Load the autoreload extension; mode 2 re-imports modules before each cell
# execution so edits to local source files are picked up without a restart.
%load_ext autoreload
%autoreload 2

/home/hoanghu/projects/Thesis


In [2]:
import warnings
# Suppress every FutureWarning for the rest of the session so that deprecation
# notices do not clutter the cell outputs below.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [3]:
from collections import defaultdict
import datetime

import pandas as pd
import yaml

from recbole.config import Config
from recbole.data import (
    create_dataset,
    data_preparation,
)
from recbole.utils import init_seed

from src.utils import Paths

# Get suitable cutoff

In [35]:
def _peak_active_cutoff(first_seen: pd.Series, last_seen: pd.Series) -> tuple:
    """Return the timestamp at which the most users are active, and that user count.

    A user is counted as active at ``ts`` when ``first_seen <= ts < last_seen``,
    i.e. the user has at least one interaction at or before ``ts`` and at least
    one strictly after it.

    Args:
        first_seen (pd.Series): first interaction timestamp of every user
        last_seen (pd.Series): last interaction timestamp of every user

    Returns:
        tuple: (timestamp, number of active users); on ties the earliest
            timestamp is returned
    """
    # Sweep line: +1 where a user's activity span opens, -1 where it closes.
    # fill_value=0 covers timestamps that only open or only close spans.
    delta = (
        first_seen.value_counts()
        .sub(last_seen.value_counts(), fill_value=0)
        .sort_index()
    )

    # Running sum over the sorted timestamps = users active at each timestamp.
    # The alignment above produces floats, so cast back to integer counts.
    active = delta.cumsum().astype("int64")

    # idxmax returns the first (earliest) position of the maximum
    peak_ts = active.idxmax()
    return peak_ts, active.loc[peak_ts]


def get_suitable_cutoff(ds_name: str, model_name: str = "S3Rec", scheme: str = "so", n_min_inter: int = 2) -> tuple:
    """Get suitable cutoff timestamp: at which there are the most active users

    A user is "active" at a timestamp when they have interactions both at or
    before it and strictly after it. Progress information (time range, chosen
    cutoff, refined user count) is printed along the way.

    Args:
        ds_name (str): dataset name
        model_name (str): model name used to build the config and resolve the
            dataset config path. Defaults to "S3Rec".
        scheme (str): scheme identifier forwarded to ``Paths``. Defaults to "so".
        n_min_inter (int): minimum number of interactions a user must have at or
            before the cutoff to be kept in the refined count. Defaults to 2.

    Returns:
        tuple: ``(suitable_ts, max_active_user, n_active_users_refined)`` --
            the cutoff timestamp, the number of active users at that cutoff, and
            the number of active users left after dropping users with fewer than
            ``n_min_inter`` interactions at or before the cutoff

    Raises:
        ValueError: if the loaded dataset contains no interactions
    """

    # Get dataset without normalizing the timestamp
    config_dict = {
        'normalize_all': False,
        # interval filters: only users/items with at least 5 interactions
        'user_inter_num_interval': "[5,inf)",
        'item_inter_num_interval': "[5,inf)",
        'device': 'cpu',
        'use_gpu': False,

        'train_neg_sample_args': None,

        'eval_args': {
            "order": "TO",
            "split": { "LS": "valid_and_test" },
            "group_by": None,
            'mode': 'pop100'
        },
    }

    # Get additional dataset-specific config
    paths = Paths(model_name, ds_name, scheme)

    with open(paths.get_path_dataset_conf()) as f:
        # NOTE(review): FullLoader can build more than plain YAML types; if these
        # config files only hold plain mappings/scalars, prefer yaml.safe_load.
        data = yaml.load(f, Loader=yaml.FullLoader)
    # An empty YAML file loads as None; treat it as "no overrides"
    config_dict.update(data or {})

    config = Config(
        model=model_name,
        dataset=ds_name,
        config_dict=config_dict,
    )
    init_seed(config["seed"], config["reproducibility"])
    df = create_dataset(config).inter_feat.copy()

    if df.empty:
        raise ValueError(f"Dataset '{ds_name}' contains no interactions after filtering")

    print(f"max_ts: {datetime.datetime.fromtimestamp(df['timestamp'].max(), datetime.timezone.utc)}")
    print(f"min_ts: {datetime.datetime.fromtimestamp(df['timestamp'].min(), datetime.timezone.utc)}")

    # First/last interaction timestamp of every user, in a single groupby pass
    user_span = df.groupby('user_id')['timestamp'].agg(['min', 'max'])

    suitable_ts, max_active_user = _peak_active_cutoff(user_span['min'], user_span['max'])

    print(f"cutoff: {datetime.datetime.fromtimestamp(suitable_ts, datetime.timezone.utc)}")

    # Find no.active users after removing invalid users
    inters_before = df[df['timestamp'] <= suitable_ts]
    inters_after = df[df['timestamp'] > suitable_ts]

    # Users with enough history at or before the cutoff ...
    n_inter_before = inters_before.groupby('user_id')['timestamp'].count()
    users_before = set(n_inter_before.index[n_inter_before >= n_min_inter])
    # ... who also interact strictly after it
    users_after = set(inters_after['user_id'].unique())

    n_active_users_refined = len(users_before & users_after)
    print(f"No. active users after removing invalid users (with n_min_inter = {n_min_inter}): {n_active_users_refined}")

    return suitable_ts, max_active_user, n_active_users_refined

# Pick the dataset to analyse by leaving exactly one `ds = ...` line uncommented;
# the commented-out lines are the alternative dataset names.
# ds = "ml-100k"
# ds = "amazon-digital-music"

# ds = "ml-1m"
# ds = "amazon-beauty"
ds = "yelp"
# ds = "steam"
# Prints the dataset name followed by the tuple returned by get_suitable_cutoff
# (the function prints its own progress lines before this one).
print(f"{ds}: {get_suitable_cutoff(ds)}")

max_ts: 2019-12-13 07:50:49+00:00
min_ts: 2004-10-18 18:46:40+00:00
cutoff: 2017-06-06 21:04:50+00:00
No. active users after removing invalid users (with n_min_inter = 2): 166376
yelp: (1496783090.0, 184613)
