In [1]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Move the working directory one level up from the notebook's location.
# NOTE(review): this is relative to the *current* directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
%cd ../

/Users/hoangle/Uni/Thesis


In [2]:
import json
from logging import getLogger

import yaml
from recbole.config import Config
from recbole.data import data_preparation, create_dataset
from recbole.trainer import HyperTuning
from recbole.utils import (
    get_model,
    get_trainer,
    init_seed,
    ModelType
)

import src.utils as utils
from src.real_temporal import SimulatedOnlineSequentialDataset, SimulatedOnlineDataset

# 1. Declarations & Definitions

## 1.1. Define flags and global variables

In [3]:
# Reproducibility settings (forwarded to RecBole via the config dict and init_seed).
seed, reproducible = 42, True

# Which model / loss / dataset this run uses.
model_name, loss_type = "ItemKNN", "BPR"
dataset_name = "amazon-digital-music"

# Temporal split: when enabled, interactions are split at `cutoff_time`
# (kept as a string; it is passed verbatim as the "CO" split argument).
use_cutoff = True
cutoff_time = "1403568000"
# Alternative cutoff kept for reference (presumably for another dataset -- TODO confirm):
# cutoff_time = "884471835"

## 1.2. Define configurations

Configuration for data, model, training and evaluation

In [4]:
# Project helper from src.utils; the cells below query it for the raw/processed
# data locations, checkpoint directory, config files and tuning log path.
paths = utils.Paths(model_name, dataset_name, use_cutoff)

In [5]:
# The configuration is assembled section by section and merged in the same
# key order as a single literal would produce.

# --- Model ---
model_section = {
    "model": model_name,
    "loss_type": loss_type,
}

# --- Data ---
data_section = {
    "dataset": dataset_name,
    "load_col": {"inter": ["user_id", "item_id", "timestamp"]},
    "use_cutoff": use_cutoff,
    "normalize_all": False,
}

# --- Training ---
training_section = {
    "epochs": 20,
    "train_batch_size": 4096,
    "eval_step": 1,
    "stopping_step": 3,
    "learning_rate": 1e-3,
}

# --- Evaluation ---
evaluation_section = {
    "eval_batch_size": 4096,
    "metrics": ["NDCG", "Precision", "Recall", "MRR", "Hit", "MAP"],
    "topk": 10,
    "valid_metric": "NDCG@10",
}

# --- Environment (device, seeding, on-disk locations) ---
environment_section = {
    "gpu_id": 0,
    "seed": seed,
    "reproducibility": reproducible,
    "device": "cuda",
    "use_gpu": True,
    "data_path": paths.get_path_data_raw(),
    "checkpoint_dir": paths.get_path_dir_ckpt(),
    "show_progress": True,
    "save_dataset": True,
    "dataset_save_path": paths.get_path_data_processed(),
    "save_dataloaders": True,
    "dataloaders_save_path": paths.get_path_dataloader(),
}

config_dict = {
    **model_section,
    **data_section,
    **training_section,
    **evaluation_section,
    **environment_section,
}

# Split strategy: a timestamp cutoff ("CO") grouped by user when `use_cutoff`
# is on, otherwise the "LS" split with no grouping.
if use_cutoff is True:
    split_spec, group_field = {"CO": cutoff_time}, "user_id"
else:
    split_spec, group_field = {"LS": "valid_and_test"}, None

config_dict["eval_args"] = {
    "order": "TO",
    "split": split_spec,
    "group_by": group_field,
    "mode": "full",
}

# No negative sampling for the "CE" loss; otherwise one uniform negative per
# positive. (Optional keys left at their defaults: "dynamic", "candidate_num".)
config_dict["train_neg_sample_args"] = (
    None
    if loss_type == "CE"
    else {"distribution": "uniform", "sample_num": 1}
)

# Merge the dict above with the per-model parameter file.
param_conf_files = [paths.get_path_param_conf()]
config = Config(
    model_name,
    dataset_name,
    config_dict=config_dict,
    config_file_list=param_conf_files,
)

# Persist the externally supplied configuration; the tuning step later passes
# this file to HyperTuning as a fixed config file.
with open(paths.get_path_conf(), 'w+') as conf_file:
    yaml.dump(config.external_config_dict, conf_file, allow_unicode=True)

init_seed(config["seed"], config["reproducibility"])
utils.init_logger(config, paths)

# 2. Train

## 2.1. Declare necessary components for training

In [7]:
# Define data related things
if use_cutoff is True:
    match (config["MODEL_TYPE"]):
        case ModelType.GENERAL | ModelType.CONTEXT | ModelType.TRADITIONAL:
            ds = "SimulatedOnlineDataset"
        case ModelType.SEQUENTIAL:
            ds = "SimulatedOnlineSequentialDataset"
        case _:
            print(f"model type: {config['MODEL_TYPE']}")
            raise NotImplementedError()
    

    dataset = eval(ds)(config)
else:
    dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

# Define model
model_name = config['model']
model = get_model(model_name)(config, train_data._dataset).to(config['device'])

# Define trainer
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

27 Jul 00:43    INFO  Saving split dataloaders into: [logs/Jul27_004230_ItemKNN_amazon-digital-music_usecutoff_True/ckpts/amazon-digital-music-for-ItemKNN-dataloader.pth]
27 Jul 00:43    INFO  [Training]: train_batch_size = [4096] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
27 Jul 00:43    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'CO': '1403568000'}, 'order': 'TO', 'group_by': 'user_id', 'mode': {'valid': 'full', 'test': 'full'}}]


In [8]:
# getLogger() with no name returns the root logger; `logger` is reused by the
# training, testing and tuning-export cells below.
logger = getLogger()

# Record the fully resolved configuration in the run log.
logger.info(config)
# Optional, more verbose dumps (disabled):
# logger.info(dataset)
# logger.info(model)

27 Jul 02:42    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 42
state = INFO
reproducibility = True
data_path = data/raw/amazon-digital-music
checkpoint_dir = logs/Jul27_004230_ItemKNN_amazon-digital-music_usecutoff_True/ckpts
show_progress = True
save_dataset = True
dataset_save_path = data/processed/amazon-digital-music.pth
save_dataloaders = True
dataloaders_save_path = data/dataloader/ItemKNN-amazon-digital-music.pth
log_wandb = False

Training Hyper Parameters:
epochs = 20
train_batch_size = 4096
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 3
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'CO': '1403568000'}, 'order': 'TO', 'group_by': 'user_id', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = False
metrics = ['NDCG', 'Precision', 'Recall', 'M

## 2.2. Start training

In [9]:
# Fit the model; the trainer returns the best validation score together with
# the full metric dict for that best epoch.
fit_outcome = trainer.fit(
    train_data,
    valid_data,
    verbose=True,
    show_progress=config["show_progress"],
)
best_valid_score, best_valid_result = fit_outcome

# Report the validation metrics, one aligned line per metric.
logger.info("** Validation result")
logger.info(f"best_valid_score: {best_valid_score:.4f}")
for metric in best_valid_result:
    val = best_valid_result[metric]
    logger.info(f"{metric:<15}: {val:.4f}")

Train     0:   0%|                                                          | 0/170 [00:00<?, ?it/s]:  22%|██████████▋                                     | 38/170 [00:00<00:00, 374.69it/s]:  58%|███████████████████████████▉                    | 99/170 [00:00<00:00, 508.47it/s]:  94%|████████████████████████████████████████████▏  | 160/170 [00:00<00:00, 553.89it/s]: 100%|███████████████████████████████████████████████| 170/170 [00:00<00:00, 516.62it/s]
27 Jul 02:42    INFO  epoch 0 training [time: 0.38s, train loss: 0.0000]
Evaluate   :   0%|                                                       | 0/116395 [00:00<?, ?it/s]:   0%|                                             | 67/116395 [00:00<02:55, 662.07it/s]:   0%|                                            | 134/116395 [00:00<02:55, 664.27it/s]:   0%|                                            | 201/116395 [00:00<02:59, 645.69it/s]:   0%|                                            | 267/116395 [00:00<02:58, 650.93it/s]:   0%|▏      

## 2.3. Start testing

In [10]:
# Evaluate the trained model on the held-out test split.
test_result = trainer.evaluate(test_data)

# Report the test metrics, one aligned line per metric.
logger.info("** Test result")
for metric in test_result:
    val = test_result[metric]
    logger.info(f"{metric:<15}: {val:.4f}")

27 Jul 03:01    INFO  Loading model structure and parameters from logs/Jul27_004230_ItemKNN_amazon-digital-music_usecutoff_True/ckpts/ItemKNN-Jul-27-2024_02-42-08.pth
27 Jul 03:01    INFO  ** Test result
27 Jul 03:01    INFO  ndcg@10        : 0.0005
27 Jul 03:01    INFO  precision@10   : 0.0001
27 Jul 03:01    INFO  recall@10      : 0.0012
27 Jul 03:01    INFO  mrr@10         : 0.0003
27 Jul 03:01    INFO  hit@10         : 0.0012
27 Jul 03:01    INFO  map@10         : 0.0003


# 3. Tune hyper params

## 3.1. Define hyper params and objective function

In [11]:
def objective_function(config_dict=None, config_file_list=None):
    config = Config(
        config_dict=config_dict,
        config_file_list=config_file_list,
    )

    init_seed(config["seed"], config["reproducibility"])

    # Define data related things
    if use_cutoff is True:
        match (config["MODEL_TYPE"]):
            case ModelType.GENERAL | ModelType.CONTEXT:
                ds = "SimulatedOnlineDataset"
            case ModelType.SEQUENTIAL:
                ds = "SimulatedOnlineSequentialDataset"

        dataset = eval(ds)(config)
    else:
        dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)

    # Define model
    model_name = config['model']
    model = get_model(model_name)(config, train_data._dataset).to(config['device'])

    # Define trainer
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # Start training
    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, verbose=True)

    # Start evaluating
    test_result = trainer.evaluate(test_data)

    return {
        'model': model_name,
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }

## 3.2. Start tuning

In [12]:
# Search settings for the tuner.
tuning_algo, early_stop, max_evals = "bayes", 3, 5

# Configs held fixed across trials: the dumped run config plus the per-model
# parameter file. The search space itself comes from the tuning config file.
fixed_configs = [paths.get_path_conf(), paths.get_path_param_conf()]
search_space_file = paths.get_path_tuning_conf()

hp = HyperTuning(
    objective_function=objective_function,
    algo=tuning_algo,
    early_stop=early_stop,
    max_evals=max_evals,
    fixed_config_file_list=fixed_configs,
    params_file=search_space_file,
)

hp.run()

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

27 Jul 03:01    INFO  build_posterior_wrapper took 0.001099 seconds
27 Jul 03:01    INFO  TPE using 0 trials


running parameters:                                  
{'k': 200, 'shrink': 0.0}                            
  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

27 Jul 03:01    ERROR  job exception: local variable 'ds' referenced before assignment


  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]


UnboundLocalError: local variable 'ds' referenced before assignment

## 3.3. Export tuning result

In [None]:
# print best parameters
logger.info('best params: ')
logger.info(hp.best_params)

# print best result
logger.info('best result: ')
logger.info(hp.params2result[hp.params2str(hp.best_params)])

# export to JSON file
tune_result = {
    'best_params': hp.best_params,
    'best_result': hp.params2result[hp.params2str(hp.best_params)]
}
with open(paths.get_path_tuning_log(), "w+") as f:
    json.dump(tune_result, f, indent=2, ensure_ascii=False)

07 Jul 22:36    INFO  best params: 
07 Jul 22:36    INFO  {'learning_rate': 0.01}
07 Jul 22:36    INFO  best result: 
07 Jul 22:36    INFO  {'model': 'BPR', 'best_valid_score': 0.0641, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('ndcg@10', 0.0641), ('precision@10', 0.0121), ('recall@10', 0.1209), ('mrr@10', 0.0472), ('hit@10', 0.1209), ('map@10', 0.0472)]), 'test_result': OrderedDict([('ndcg@10', 0.0256), ('precision@10', 0.0043), ('recall@10', 0.0429), ('mrr@10', 0.0206), ('hit@10', 0.0429), ('map@10', 0.0206)])}
