In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's src/ package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Switch the working directory to the parent folder; the relative paths used
# later in this notebook are resolved from there.
%cd ../

/Users/hoangle/Uni/Thesis


In [None]:
import warnings

# Install an "ignore" filter for FutureWarning so such messages are not shown.
warnings.simplefilter("ignore", FutureWarning)

In [2]:
import json
from logging import getLogger

import yaml
from recbole.config import Config
from recbole.data import data_preparation, create_dataset
from recbole.trainer import HyperTuning
from recbole.utils import (
    get_model,
    get_trainer,
    init_seed,
    ModelType
)

import src.utils as utils
from src.real_temporal import SimulatedOnlineSequentialDataset, SimulatedOnlineDataset

# 1. Declarations & Definitions

## 1.1. Define flags and global variables

In [3]:
# Random seed; forwarded to RecBole through config_dict["seed"].
seed = 42

# True  -> split by `cutoff_time` ("CO" split) and build a SimulatedOnline* dataset.
# False -> use the {"LS": "valid_and_test"} split with RecBole's create_dataset.
use_cutoff = False
# True -> utils.get_loader is unpacked into separate active / inactive test loaders.
separate_activeness = True

# Model and loss selection; loss_type "CE" sets train_neg_sample_args to None
# when the configuration dictionary is assembled.
model_name = "NPE"
loss_type = "CE"
# Alternative dataset preset (disabled):
# dataset_name = "amazon-digital-music"
# cutoff_time = "1403568000"

# Active dataset preset.
# NOTE(review): cutoff_time is presumably a Unix timestamp in seconds, kept as a string — confirm.
dataset_name = "ml-1m"
cutoff_time = "991854688"

## 1.2. Define configurations

Configuration for data, model, training and evaluation

In [4]:
# Path helper for this (model, dataset, use_cutoff) combination; later cells ask
# it for the raw/processed data, checkpoint, config, dataloader and tuning paths.
paths = utils.Paths(model_name, dataset_name, use_cutoff)

In [5]:
# Settings handed to RecBole's Config together with the parameter file returned
# by paths.get_path_param_conf().
config_dict = {
    # For model
    'model': model_name,
    'loss_type': loss_type,

    # For data
    'dataset': dataset_name,
    'load_col': {"inter": ['user_id', 'item_id', 'timestamp']},
    # Stored in the config so that code which rebuilds a Config from the dumped
    # YAML (objective_function reads config["use_cutoff"]) sees the same flag.
    "use_cutoff": use_cutoff,
    "separate_activeness": separate_activeness,
    "cutoff_time": cutoff_time,
    'normalize_all': False,
    'user_inter_num_interval': "[10,inf)",

    # For training
    'epochs': 20,
    'train_batch_size': 4096,
    'eval_step': 1,
    'stopping_step': 3,
    'learning_rate': 1e-3,

    # For evaluation
    'eval_batch_size': 4096,
    'metrics': ["NDCG", "Precision", "Recall", "MRR", "Hit", "MAP"],
    'topk': 10,
    'valid_metric': 'NDCG@10',

    # Environment
    'gpu_id': 0,
    "seed": seed,
    "reproducibility": True,
    'device': 'cuda',
    'use_gpu': True,
    'data_path': paths.get_path_data_raw(),
    "checkpoint_dir": paths.get_path_dir_ckpt(),
    "show_progress": True,
    'save_dataset': True,
    'dataset_save_path': paths.get_path_data_processed(),
    'save_dataloaders': True,
    'dataloaders_save_path': paths.get_path_dataloader(),
}

# Evaluation split: cutoff-time split grouped by user, or the "LS" split.
if use_cutoff is True:
    config_dict['eval_args'] = {
        "order": "TO",
        "split": {"CO": cutoff_time},
        "group_by": 'user_id',
        'mode': 'full'
    }
else:
    config_dict['eval_args'] = {
        "order": "TO",
        "split": { "LS": "valid_and_test" },
        "group_by": None,
        'mode': 'full'
    }

# The "CE" loss is configured without training negative samples; every other
# loss type gets one uniformly drawn negative per positive.
if loss_type == "CE":
    config_dict["train_neg_sample_args"] = None
else:
    config_dict["train_neg_sample_args"] = {
        "distribution": "uniform",
        "sample_num": 1,
        # "dynamic": False,
        # "candidate_num": 0,
    }

config = Config(
    model_name,
    dataset_name,
    config_dict=config_dict,
    config_file_list=[paths.get_path_param_conf()],
)

# Persist the externally supplied settings; the tuning section passes this file
# to HyperTuning as a fixed config. The dump uses allow_unicode=True, so the
# file is opened as UTF-8 explicitly instead of relying on the platform default.
with open(paths.get_path_conf(), 'w', encoding='utf-8') as f:
    yaml.dump(config.external_config_dict, f, allow_unicode=True)

init_seed(config["seed"], config["reproducibility"])
utils.init_logger(config, paths)

# 2. Train

## 2.1. Declare necessary components for training

In [6]:
# Define data related things
if use_cutoff is True:
    match (config["MODEL_TYPE"]):
        case ModelType.GENERAL | ModelType.CONTEXT | ModelType.TRADITIONAL:
            ds = "SimulatedOnlineDataset"
        case ModelType.SEQUENTIAL:
            ds = "SimulatedOnlineSequentialDataset"
        case _:
            print(f"model type: {config['MODEL_TYPE']}")
            raise NotImplementedError()

    dataset = eval(ds)(config)
else:
    dataset = create_dataset(config)

if separate_activeness is True:
    train_data, valid_data, test_data_active, test_data_inactive = utils.get_loader(dataset, config, True, cutoff_time)
else:
    train_data, valid_data, test_data = utils.get_loader(dataset, config, False, None)

# Define model
model_name = config['model']
model = get_model(model_name)(config, train_data._dataset).to(config['device'])

# Define trainer
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=feat[field].mean(), inplace=True)
29 Jul 21:54    INFO  Saving filtered dataset into [logs/Jul29_215443_NPE_ml-1m_usecutoff_False/ckpts/ml-1m-SequentialDataset.pth]


In [8]:
logger = getLogger()

logger.info(config)
# logger.info(dataset)
# logger.info(model)

# (message prefix, loader) pairs; the prefixes carry the padding that keeps the
# colons aligned in the log output.
size_report = [
    ("train_dataset         : ", train_data),
    ("valid_dataset         : ", valid_data),
]
if separate_activeness is True:
    size_report += [
        ("test_dataset_active   : ", test_data_active),
        ("test_dataset_inactive : ", test_data_inactive),
    ]
else:
    size_report.append(("test_dataset          : ", test_data))

for prefix, loader in size_report:
    logger.info(f"{prefix}{len(loader.dataset)}")

29 Jul 21:54    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 42
state = INFO
reproducibility = True
data_path = data/raw/ml-1m
checkpoint_dir = logs/Jul29_215443_NPE_ml-1m_usecutoff_False/ckpts
show_progress = True
save_dataset = True
dataset_save_path = data/processed/ml-1m.pth
save_dataloaders = True
dataloaders_save_path = data/dataloader/NPE-ml-1m.pth
log_wandb = False

Training Hyper Parameters:
epochs = 20
train_batch_size = 4096
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 3
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'LS': 'valid_and_test'}, 'order': 'TO', 'group_by': None, 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = True
metrics = ['NDCG', 'Precision', 'Recall', 'MRR', 'Hit', 'MAP']
topk = [10]
valid_metric = NDCG@10
valid_metr

In [9]:
# Train on the training loader while validating on the validation loader;
# fit() hands back the best validation score and the matching metric mapping.
best_valid_score, best_valid_result = trainer.fit(
    train_data,
    valid_data,
    verbose=True,
    show_progress=config["show_progress"],
)

logger.info("** Validation result")
logger.info("best_valid_score: {:.4f}".format(best_valid_score))
for metric_name, metric_value in best_valid_result.items():
    logger.info("{:<15}: {:.4f}".format(metric_name, metric_value))

Train     0:   0%|                                                           | 0/68 [00:00<?, ?it/s]:   1%|▊                                                  | 1/68 [00:00<00:11,  5.92it/s]:   4%|██▎                                                | 3/68 [00:00<00:07,  9.22it/s]:   7%|███▊                                               | 5/68 [00:00<00:06, 10.34it/s]:  10%|█████▎                                             | 7/68 [00:00<00:05, 10.85it/s]:  13%|██████▊                                            | 9/68 [00:00<00:05, 11.04it/s]:  16%|████████                                          | 11/68 [00:01<00:05, 11.29it/s]:  19%|█████████▌                                        | 13/68 [00:01<00:04, 11.44it/s]:  22%|███████████                                       | 15/68 [00:01<00:05, 10.26it/s]:  25%|████████████▌                                     | 17/68 [00:01<00:05,  9.70it/s]:  28%|█████████████▉                                    | 19/68 [00:01<00:04, 10.12it/s]:  31%|███

KeyboardInterrupt: 

## 2.3. Start testing

In [None]:
# Collect (log header, loader) pairs so both modes share a single evaluation loop.
if separate_activeness is True:
    test_runs = [
        ("** Test result: Inactive", test_data_inactive),
        ("** Test result: Active", test_data_active),
    ]
else:
    test_runs = [("** Test result", test_data)]

# The loop variable is intentionally not named `test_data`: rebinding that
# notebook-level name would leave a stale loader behind for later re-runs.
for header, loader in test_runs:
    test_result = trainer.evaluate(loader)

    logger.info(header)
    for metric, val in test_result.items():
        logger.info(f"{metric:<15}: {val:.4f}")

27 Jul 03:01    INFO  Loading model structure and parameters from logs/Jul27_004230_ItemKNN_amazon-digital-music_usecutoff_True/ckpts/ItemKNN-Jul-27-2024_02-42-08.pth
27 Jul 03:01    INFO  ** Test result
27 Jul 03:01    INFO  ndcg@10        : 0.0005
27 Jul 03:01    INFO  precision@10   : 0.0001
27 Jul 03:01    INFO  recall@10      : 0.0012
27 Jul 03:01    INFO  mrr@10         : 0.0003
27 Jul 03:01    INFO  hit@10         : 0.0012
27 Jul 03:01    INFO  map@10         : 0.0003


# 3. Tune hyper params

## 3.1. Define hyper params and objective function

In [None]:
def objective_function(config_dict=None, config_file_list=None):
    config = Config(
        config_dict=config_dict,
        config_file_list=config_file_list,
    )

    init_seed(config["seed"], config["reproducibility"])

    # Define data related things
    # Define data related things
    if config["use_cutoff"] is True:
        match (config["MODEL_TYPE"]):
            case ModelType.GENERAL | ModelType.CONTEXT | ModelType.TRADITIONAL:
                ds = "SimulatedOnlineDataset"
            case ModelType.SEQUENTIAL:
                ds = "SimulatedOnlineSequentialDataset"
            case _:
                print(f"model type: {config['MODEL_TYPE']}")
                raise NotImplementedError()

        dataset = eval(ds)(config)
    else:
        dataset = create_dataset(config)

    if separate_activeness is True:
        train_data, valid_data, test_data_active, test_data_inactive = utils.get_loader(dataset, config, True, cutoff_time)
    else:
        train_data, valid_data, test_data = utils.get_loader(dataset, config, False, None)

    # Define model
    model_name = config['model']
    model = get_model(model_name)(config, train_data._dataset).to(config['device'])

    # Define trainer
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # Start training
    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, verbose=True)

    results = {
        'model': model_name,
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
    }

    # Start testing
    if separate_activeness is True:
        pairs = [
            ("inactive", test_data_inactive),
            ("active", test_data_active),
        ]

        for tag, test_data in pairs:
            test_result = trainer.evaluate(test_data)

            results[f'test_result_{tag}'] = test_result
    else:
        test_result = trainer.evaluate(test_data)

        results['test_result'] = test_result

    return results

## 3.2. Start tuning

In [None]:
# Search settings passed to the tuner.
tuning_algo = "bayes"
early_stop = 3
max_evals = 5

# Config files shared by every trial, and the file handed over as `params_file`.
fixed_config_files = [paths.get_path_conf(), paths.get_path_param_conf()]
tuning_params_file = paths.get_path_tuning_conf()

hp = HyperTuning(
    objective_function,
    params_file=tuning_params_file,
    fixed_config_file_list=fixed_config_files,
    algo=tuning_algo,
    max_evals=max_evals,
    early_stop=early_stop,
)

hp.run()

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

27 Jul 03:01    INFO  build_posterior_wrapper took 0.001099 seconds
27 Jul 03:01    INFO  TPE using 0 trials


running parameters:                                  
{'k': 200, 'shrink': 0.0}                            
  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

27 Jul 03:01    ERROR  job exception: local variable 'ds' referenced before assignment


  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]


UnboundLocalError: local variable 'ds' referenced before assignment

## 3.3. Export tuning result

In [None]:
# print best parameters
logger.info('best params: ')
logger.info(hp.best_params)

# print best result
logger.info('best result: ')
logger.info(hp.params2result[hp.params2str(hp.best_params)])

# export to JSON file
tune_result = {
    'best_params': hp.best_params,
    'best_result': hp.params2result[hp.params2str(hp.best_params)]
}
with open(paths.get_path_tuning_log(), "w+") as f:
    json.dump(tune_result, f, indent=2, ensure_ascii=False)

07 Jul 22:36    INFO  best params: 
07 Jul 22:36    INFO  {'learning_rate': 0.01}
07 Jul 22:36    INFO  best result: 
07 Jul 22:36    INFO  {'model': 'BPR', 'best_valid_score': 0.0641, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('ndcg@10', 0.0641), ('precision@10', 0.0121), ('recall@10', 0.1209), ('mrr@10', 0.0472), ('hit@10', 0.1209), ('map@10', 0.0472)]), 'test_result': OrderedDict([('ndcg@10', 0.0256), ('precision@10', 0.0043), ('recall@10', 0.0429), ('mrr@10', 0.0206), ('hit@10', 0.0429), ('map@10', 0.0206)])}
