# Create config file

In [14]:
import yaml
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from logging import getLogger
from recbole.config import Config
from recbole.utils import init_seed, init_logger
from recbole.data import create_dataset, data_preparation
from recbole.utils import get_model, get_trainer
from recbole.trainer import HyperTuning
from recbole.quick_start import objective_function

# Random seed shared by the whole notebook; it is written into the RecBole
# config below under the 'seed' key.
SEED = 2022

In [15]:
# Fixed (non-tuned) RecBole settings shared by every run in this notebook.
# The dict is serialized to YAML below and later handed to RecBole through
# `config_file_list` / `fixed_config_file_list`.
parameter_dict = {

    # environment
    'seed': SEED,
    'reproducibility': True,
    'data_path': 'dataset/collections/',
    'checkpoint_dir': 'saved/',
    'show_progress': True,
    'save_dataset': False,
    'log_wandb': True,
    'save_dataloaders': True,
    'dataloaders_save_path': 'dataloader/',

    # data
    'field_separator': '\t',
    'seq_separator': ' ',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'RATING_FIELD': 'rating',
    'item_inter_num_interval': '[0,inf)',

    # training
    'epochs': 50,
    'train_batch_size': 2048,
    'learner': 'adam',
    'learning_rate': 0.001,
    'train_neg_sample_args': {'distribution': 'popularity',
                              'sample_num': 5,
                              'dynamic': False,
                              'candidate_num': 0},
    'eval_step': 1,
    # NOTE(review): 3000 exceeds `epochs` (50), so early stopping presumably
    # never triggers; the original trailing comment suggests 15 was an
    # earlier value -- confirm which one is intended.
    'stopping_step': 3000,
    'loss_decimal_place': 4,

    # evaluation
    'eval_args': {'group_by': 'user',
                  'order': 'RO',
                  'split': {'RS': [8, 1, 1]},
                  'mode': 'pop100'},
    'metrics': ['Recall', 'MRR', 'NDCG', 'Hit', 'MAP', 'Precision', 'GAUC'],
    'topk': [1, 2, 5, 10, 20],
    'valid_metric': 'NDCG@20',  # metric monitored for early stopping
    'eval_batch_size': 4096,
    'metric_decimal_place': 4,

}

# Convert parameter_dict to a YAML file. The target directory is created first
# so the cell also works on a fresh checkout where `config/` does not exist.
config_path = 'config/fixed_config_baseline.yaml'
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, 'w') as file:
    # yaml.dump writes straight to the stream and returns None in that mode,
    # so there is nothing worth binding to a name.
    yaml.dump(parameter_dict, file)

# Example: Run models

In [18]:
# Experiment grid for the plain training example.
MODEL_names = ['BPR'] #['BPR', 'DMF', 'NeuMF', 'NGCF', 'LightGCN'] 
DATASET_names = ['bayc']
ITEM_CUT_list = [3]

# Directory that receives the per-model result CSVs. `exist_ok=True` makes the
# cell safely re-runnable without a separate (racy) existence check.
result_path = './result/'
os.makedirs(result_path, exist_ok=True)

In [19]:
# Example run: for every (MODEL, DATASET, ITEM_CUT) combination, build a
# RecBole config from the fixed YAML file, create the dataset and split it
# into train/valid/test dataloaders.
# Model construction, training and evaluation are currently disabled (see the
# commented-out section at the end of this cell), so the cell only exercises
# the data pipeline.
if __name__ == '__main__':
    
    for MODEL in tqdm(MODEL_names):
        # Only consumed by the disabled evaluation code at the end of the cell.
        test_result_list = []
        for DATASET in DATASET_names:
            for ITEM_CUT in ITEM_CUT_list:
            
                config = Config(model=MODEL, dataset=DATASET, config_file_list=['config/fixed_config_baseline.yaml'])
                # Despite its name, ITEM_CUT is written to the *user*
                # interaction-count interval, overriding the file config.
                config['user_inter_num_interval'] = f'[{ITEM_CUT},inf)'
                
                # Initialize the random seed from the config.
                init_seed(config['seed'], config['reproducibility'])

                # Initialize logging, then fetch the root logger.
                init_logger(config)
                logger = getLogger()

                # Optionally write the full config into the log:
                # logger.info(config) # print config info

                # Create the dataset (atomic files -> Dataset).
                dataset = create_dataset(config)
                logger.info(dataset) # print dataset info

                # Split the dataset (Dataset -> train/valid/test dataloaders).
                train_data, valid_data, test_data = data_preparation(config, dataset)

        # ------------------------------------------------------------------
        # Disabled: model training and evaluation.
        # NOTE(review): `get_last_file` used below is not defined or imported
        # anywhere in the visible notebook; provide it before re-enabling.
        # ------------------------------------------------------------------
        #         # model loading and initialization
        #         model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
        #         logger.info(model)

        #         # trainer loading and initialization
        #         trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)
                

        #         """ (1) training """

        #         # # resume from break point
        #         # checkpoint_file = 'checkpoint.pth'
        #         # trainer.resume_checkpoint(checkpoint_file)
                
        #         # model training
        #         best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)


        #         """ (2) testing """

        #         # When calculate ItemCoverage metrics, we need to run this code for set item_nums in eval_collector.
        #         trainer.eval_collector.data_collect(train_data)

        #         # model evaluation
        #         checkpoint_file = get_last_file('./saved/')
        #         print(checkpoint_file)
        #         test_result = trainer.evaluate(test_data, model_file=checkpoint_file)
        #         print('FINAL TEST RESULT')
        #         print(test_result)
        #         test_result_list.append(pd.DataFrame.from_dict(test_result, orient='index', columns=[DATASET+'_'+str(ITEM_CUT)]))
                
        # pd.concat(test_result_list, axis=1).to_csv(result_path + f'{MODEL}.csv', index=True)

20 Feb 19:28    INFO  bayc
The number of users: 529
Average actions of users: 5.087121212121212
The number of items: 1958
Average actions of items: 1.372508942258559
The number of inters: 2686
The sparsity of the dataset: 99.74067902319213%
Remain Fields: ['user_id', 'item_id']
20 Feb 19:28    INFO  Saving split dataloaders into: [saved/bayc-for-BPR-dataloader.pth]
20 Feb 19:28    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'candidate_num': 0, 'distribution': 'popularity', 'dynamic': False, 'sample_num': 5, 'alpha': 1.0}]
20 Feb 19:28    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'group_by': 'user', 'mode': 'pop100', 'order': 'RO', 'split': {'RS': [8, 1, 1]}}]
100%|██████████| 1/1 [00:00<00:00,  6.45it/s]


# Example: Hyperparameter optimization (HPO)

In [20]:
def objective_function(config_dict=None, config_file_list=None):
    """Train and evaluate one hyper-parameter configuration for `HyperTuning`.

    This definition shadows the `objective_function` imported from
    `recbole.quick_start` at the top of the notebook.

    The model, dataset and interaction cut are NOT parameters: they are read
    from the module-level names `MODEL`, `DATASET` and `ITEM_CUT`, which the
    HPO driver loop binds before `HyperTuning` calls this function.

    Args:
        config_dict: Parameter overrides for this trial (forwarded to `Config`).
        config_file_list: Fixed config files (forwarded to `Config`).

    Returns:
        dict with the keys 'model', 'best_valid_score', 'valid_score_bigger',
        'best_valid_result' and 'test_result'.
    """
    config = Config(model=MODEL, dataset=DATASET, config_dict=config_dict, config_file_list=config_file_list)
    # Apply the same interaction-count filter as the plain run example.
    # Without it every ITEM_CUT iteration of the HPO loop tunes on identical
    # data while its outputs are labelled per cut.
    config['user_inter_num_interval'] = f'[{ITEM_CUT},inf)'

    init_seed(config['seed'], config['reproducibility'])
    dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)

    model_name = config['model']
    model = get_model(model_name)(config, train_data.dataset).to(config['device'])
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # (1) training
    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, verbose=False)

    # (2) testing
    test_result = trainer.evaluate(test_data)

    return {
        'model': model_name,
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }

In [21]:
# Experiment grid for the hyper-parameter search (overrides the grid used by
# the plain training example).
MODEL_names = ['NGCF']
DATASET_names = ['meebits']
ITEM_CUT_list = [3]

# Create the result folder; `exist_ok=True` keeps the cell re-runnable without
# a separate (racy) existence check.
result_path = './result/'
os.makedirs(result_path, exist_ok=True)

In [1]:
# Exhaustive hyper-parameter search for every (MODEL, DATASET, ITEM_CUT).
# The loop variables are deliberately module-level names: `objective_function`
# reads MODEL and DATASET as globals, so they must not be renamed.
# Requires the settings cell (MODEL_names, DATASET_names, ITEM_CUT_list,
# result_path) and the `objective_function` cell to have been run first.
# NOTE(review): confirm that `objective_function` actually applies ITEM_CUT to
# the config; otherwise every cut tunes on the same data.
for MODEL in MODEL_names:
    for DATASET in tqdm(DATASET_names):
        HPO_test_result_list = []
        for ITEM_CUT in ITEM_CUT_list:
            # Common stem for all artefacts of this single search.
            run_tag = f'{MODEL}_{DATASET}_{ITEM_CUT}'

            hp = HyperTuning(objective_function=objective_function, algo='exhaustive',
                             max_evals=50, params_file=f'hyper/{MODEL}.hyper',
                             fixed_config_file_list=['config/fixed_config_baseline.yaml'])

            # run
            hp.run()
            # export result to the file
            hp.export_result(output_file=f'hyper/{run_tag}.result')
            # print best parameters
            print('best params: ', hp.best_params)
            # save best parameters
            with open(f'hyper/{run_tag}.best_params', 'w') as file:
                yaml.dump(hp.best_params, file)
            # print best result
            best_result = hp.params2result[hp.params2str(hp.best_params)]
            print('best result: ')
            print(best_result)

            # One column per (dataset, cut) holding the test metrics of the best trial.
            HPO_test_result_list.append(pd.DataFrame.from_dict(best_result['test_result'], orient='index', columns=[f'{DATASET}_{ITEM_CUT}']))

        # The CSV aggregates ALL cuts, so name it after the whole cut list
        # rather than the leaked last loop value. For a single cut the file
        # name is unchanged. Skip the write when there is nothing to
        # concatenate (pd.concat raises on an empty list).
        if HPO_test_result_list:
            cut_label = '_'.join(str(cut) for cut in ITEM_CUT_list)
            pd.concat(HPO_test_result_list, axis=1).to_csv(result_path + f'{MODEL}_{DATASET}_{cut_label}.csv', index=True)

NameError: name 'MODEL_names' is not defined