In [1]:
# Move the working directory one level up so that project-relative imports
# (e.g. `src.*`) and relative paths resolve from there.
# NOTE(review): a relative `%cd` is not idempotent -- re-running this cell
# climbs one more directory each time; restart the kernel before re-running.
%cd ../

# Load the autoreload extension and use mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

/home/hoanghu/projects/Thesis


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [3]:
import json
from logging import getLogger

import yaml
from recbole.config import Config
from recbole.data import data_preparation, create_dataset
from recbole.trainer import HyperTuning
from recbole.utils import (
    get_model,
    get_trainer,
    init_seed,
    ModelType
)

import src.utils as utils
from src.real_temporal import SimulatedOnlineSequentialDataset, SimulatedOnlineDataset

In [4]:
# Random seed forwarded to the RecBole configuration.
seed = 42

# Experiment switches.
#   use_cutoff    -- selects the cutoff-based split/dataset path instead of the
#                    leave-one-out split.
#   test_inactive -- experiment flag (not read by the cells visible here).
use_cutoff, test_inactive = False, True

# Model to run and the loss it is trained with.
model_name, loss_type = "NPE", "CE"

# Dataset and the cutoff timestamp (kept as a string) that goes with it.
# Alternative pair: "amazon-digital-music" with cutoff "1403568000".
dataset_name, cutoff_time = "ml-1m", "976324045"

In [5]:
# Project helper that hands out every filesystem location used below
# (raw/processed data, checkpoints, dataloaders, config files) for this
# model / dataset / split combination.
paths = utils.Paths(model_name, dataset_name, use_cutoff)

In [6]:
# Assemble the RecBole overrides one section at a time; insertion order of the
# keys is the same as a single literal would produce.
config_dict = {}

# Model
config_dict.update({
    "model": model_name,
    "loss_type": loss_type,
})

# Data
config_dict.update({
    "dataset": dataset_name,
    "load_col": {"inter": ["user_id", "item_id", "timestamp"]},
    "use_cutoff": use_cutoff,
    "normalize_all": False,
    "user_inter_num_interval": "[10,inf)",
})

# Training
config_dict.update({
    "epochs": 20,
    "train_batch_size": 4096,
    "eval_step": 1,
    "stopping_step": 3,
    "learning_rate": 1e-3,
})

# Evaluation
config_dict.update({
    "eval_batch_size": 4096,
    "metrics": ["NDCG", "Precision", "Recall", "MRR", "Hit", "MAP"],
    "topk": 10,
    "valid_metric": "NDCG@10",
})

# Environment (devices, seeding, and every path handed out by `paths`)
config_dict.update({
    "gpu_id": 0,
    "seed": seed,
    "reproducibility": True,
    "device": "cuda",
    "use_gpu": True,
    "data_path": paths.get_path_data_raw(),
    "checkpoint_dir": paths.get_path_dir_ckpt(),
    "show_progress": True,
    "save_dataset": True,
    "dataset_save_path": paths.get_path_data_processed(),
    "save_dataloaders": True,
    "dataloaders_save_path": paths.get_path_dataloader(),
})

# Evaluation protocol: time-ordered in both cases; the cutoff variant splits at
# `cutoff_time` grouped by user, the default variant uses leave-one-out for
# both validation and test.
config_dict["eval_args"] = {
    "order": "TO",
    "split": {"CO": cutoff_time} if use_cutoff is True else {"LS": "valid_and_test"},
    "group_by": "user_id" if use_cutoff is True else None,
    "mode": "full",
}

# Cross-entropy training uses no sampled negatives; any other loss draws one
# uniformly sampled negative per positive.
config_dict["train_neg_sample_args"] = (
    None
    if loss_type == "CE"
    else {
        "distribution": "uniform",
        "sample_num": 1,
        # "dynamic": False,
        # "candidate_num": 0,
    }
)

# Merge the overrides with the per-model parameter file.
config = Config(
    model_name,
    dataset_name,
    config_file_list=[paths.get_path_param_conf()],
    config_dict=config_dict,
)

# Keep a YAML snapshot of the externally supplied settings next to the run.
with open(paths.get_path_conf(), "w+") as conf_file:
    yaml.dump(config.external_config_dict, conf_file, allow_unicode=True)

# Seed the run and set up logging for it.
init_seed(config["seed"], config["reproducibility"])
utils.init_logger(config, paths)

In [7]:
# Define data related things
if use_cutoff is True:
    match (config["MODEL_TYPE"]):
        case ModelType.GENERAL | ModelType.CONTEXT | ModelType.TRADITIONAL:
            ds = "SimulatedOnlineDataset"
        case ModelType.SEQUENTIAL:
            ds = "SimulatedOnlineSequentialDataset"
        case _:
            print(f"model type: {config['MODEL_TYPE']}")
            raise NotImplementedError()

    dataset = eval(ds)(config)
else:
    dataset = create_dataset(config)

# if separate_activeness is True:
#     utils.remove_inactive(dataset, cutoff=cutoff_time)



06 Aug 14:18    INFO  Saving filtered dataset into [logs/Aug06_141802_NPE_ml-1m_usecutoff_False/ckpts/ml-1m-SequentialDataset.pth]


In [25]:
# Cutoff as a number so it can be compared with the interaction timestamps.
# `float()` accepts both the string form used in the config and a float.
cutoff = float(cutoff_time)

feat = dataset.inter_feat

# First and last interaction timestamp of every user, computed in a single
# grouped aggregation (one row per user: user_id, min_ts, max_ts).
user = (
    feat.groupby("user_id")["timestamp"]
    .agg(min_ts="min", max_ts="max")
    .reset_index()
)

# A user counts as active when the cutoff lies inside the closed interval
# [first interaction, last interaction]; everyone else is inactive.
condition_active_user = (user["min_ts"] <= cutoff) & (cutoff <= user["max_ts"])
user_inactive = user.loc[~condition_active_user, "user_id"]
user_active = user.loc[condition_active_user, "user_id"]

In [31]:
# Interactions of active users, and a dataset built around them.
# The frame is copied so the subset does not alias rows of `feat`.
feat_active = feat.loc[feat["user_id"].isin(user_active)].copy()
dataset_active = dataset.copy(feat_active)

# Same for the inactive users.
feat_inactive = feat.loc[feat["user_id"].isin(user_inactive)].copy()
dataset_inactive = dataset.copy(feat_inactive)

In [34]:
# Train/validation loaders come from the full dataset; this call is also the
# one allowed to persist the split dataloaders to disk.
train_data, valid_data, _ = data_preparation(config, dataset)

# The active/inactive subsets are only needed for their test loaders.
# Saving is switched off while they are built: every call writes to the same
# dataloader file, so leaving it on would overwrite the full split saved above
# with a subset.
save_dataloaders = config["save_dataloaders"]
config["save_dataloaders"] = False
try:
    _, _, test_data_active = data_preparation(config, dataset_active)
    _, _, test_data_inactive = data_preparation(config, dataset_inactive)
finally:
    # Restore the original setting even if a split fails.
    config["save_dataloaders"] = save_dataloaders

06 Aug 14:34    INFO  Saving split dataloaders into: [logs/Aug06_141802_NPE_ml-1m_usecutoff_False/ckpts/ml-1m-for-NPE-dataloader.pth]
06 Aug 14:34    INFO  [Training]: train_batch_size = [4096] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
06 Aug 14:34    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'LS': 'valid_and_test'}, 'order': 'TO', 'group_by': None, 'mode': {'valid': 'full', 'test': 'full'}}]
06 Aug 14:34    INFO  Saving split dataloaders into: [logs/Aug06_141802_NPE_ml-1m_usecutoff_False/ckpts/ml-1m-for-NPE-dataloader.pth]
06 Aug 14:34    INFO  [Training]: train_batch_size = [4096] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
06 Aug 14:34    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'LS': 'valid_and_test'}, 'order': 'TO', 'group_by': None, 'mode': {'valid': 'full', 'tes

In [66]:
# Sanity check: the users served by the "active" test loader should be exactly
# the users classified as active.

# Collect the user ids of every evaluation batch (first element of each batch).
users = [
    uid
    for batch in test_data_active
    for uid in batch[0]["user_id"].tolist()
]

# Number of collected ids, duplicates included.
print(len(users))

users_active = set(user_active.tolist())
users = set(users)

# Both differences are expected to be empty.
print(users - users_active)
print(users_active - users)

1190
set()
set()


In [11]:
# # Define model
# model_name = config['model']
# model = get_model(model_name)(config, train_data._dataset).to(config['device'])

# # Define trainer
# trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

06 Aug 14:15    INFO  Saving split dataloaders into: [logs/Aug06_141311_NPE_ml-1m_usecutoff_False/ckpts/ml-1m-for-NPE-dataloader.pth]
06 Aug 14:15    INFO  [Training]: train_batch_size = [4096] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
06 Aug 14:15    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'LS': 'valid_and_test'}, 'order': 'TO', 'group_by': None, 'mode': {'valid': 'full', 'test': 'full'}}]
