In [1]:
%cd ../../../

/Users/nseverin/MyData/Projects/Science/LLM/sasrec-bert4rec-recsys23


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import os
from itertools import product
from copy import deepcopy
import os

from src.experiment_tools.utils import read_text, save_text, read_json


def generate_config_params():
    cur_main_params = deepcopy(CONFIG_PARAMS)
    final_combinations = []
    keys, values = zip(*cur_main_params.items())
    all_combinations = [dict(zip(keys, v)) for v in product(*values)]
    for combination in all_combinations:
        if MODE == 'BASELINE':
            final_combinations.append(combination)
        else:
            if combination['use_down_scale'] and combination['use_upscale']:
                continue
            if combination['use_down_scale'] == False and combination['use_upscale'] == False and (
                    combination['user_profile_embeddings_files'] is not None and
                    not ('-256.json' in combination['user_profile_embeddings_files'] or
                     '-128.json' in combination['user_profile_embeddings_files'] or
                     '-64.json' in combination['user_profile_embeddings_files'])):
                continue
            if combination['multi_profile']:
                if isinstance(combination['user_profile_embeddings_files'], list):
                    emb_name = FILE_TO_EMB_NAME[tuple(combination['user_profile_embeddings_files'])]
                else:
                    emb_name = FILE_TO_EMB_NAME[combination['user_profile_embeddings_files']]
                if not emb_name.endswith('_multiple'):
                    continue
                if combination['multi_profile_aggr_scheme'] == 'attention' and combination['use_upscale']:
                    continue
            else:
                if combination['multi_profile_aggr_scheme'] != 'mean':
                    continue
                if isinstance(combination['user_profile_embeddings_files'], list):
                    emb_name = FILE_TO_EMB_NAME[tuple(combination['user_profile_embeddings_files'])]
                else:
                    emb_name = FILE_TO_EMB_NAME[combination['user_profile_embeddings_files']]
                if emb_name.endswith('_multiple'):
                    continue
                
            final_combinations.append(combination)
    return final_combinations


def create_list_of_lists(k, N):
    result = []
    start = 0
    while start < N:
        end = min(start + k - 1, N)
        result.append([start, end])
        start += k
    return result


def create_config_files(generated_configs):
    def fill_by_config_dict(config_base, config_vals):
        for field in config_vals:
            cur_val = config_vals[field]
            if field not in config_base:
                print(field)
                raise Exception()
            config_base = config_base.replace('{' + field + '}', f'{cur_val}')
        return config_base

    for config in generated_configs:
        config_base = read_text(CONFIG_TEMPLATE_PATH)
        config_base = fill_by_config_dict(config_base, PREDEFINED_PARAMS)
        config_base = fill_by_config_dict(config_base, config)
        config_base_for_file_save = {k:v for k,v in config.items()}
        if 'user_profile_embeddings_files' in config_base_for_file_save:
            if config_base_for_file_save['user_profile_embeddings_files'] is None:
                emb_name = None
            elif isinstance(config_base_for_file_save['user_profile_embeddings_files'], list):
                emb_name = FILE_TO_EMB_NAME[tuple(config_base_for_file_save['user_profile_embeddings_files'])]
            else:
                emb_name = FILE_TO_EMB_NAME[config_base_for_file_save['user_profile_embeddings_files']]
            config_base_for_file_save['user_profile_embeddings_files'] = emb_name
        save_path = SAVE_CONFIG_FILE.format(**config_base_for_file_save)
        save_text(save_path, config_base)


def create_bash(start, end):
    base_bash = read_text(BASH_TEMPLATE_PATH)
    base_bash = ((base_bash.replace('{start}', str(start))
                 .replace('{end}', str(end)))
                 .replace('{folder_path}', CONFIG_PATH))
    save_text(os.path.join(BASH_PATH, f'{start}-{end}.sh'), base_bash)


def create_bash_files(generated_confs, n_confs_per_bash):
    ind_lst = create_list_of_lists(n_confs_per_bash, len(generated_confs))
    print(len(ind_lst))
    for start, end in ind_lst:
        create_bash(start, end)


def create_sbatch_files():
    sbatch_temp = read_text(SBATCH_TEMPLATE_PATH)

    for file in os.listdir(BASH_PATH):
        cur_sbatch = (sbatch_temp.
                      replace('{model_name}', MODEL_NAME).
                      replace('{dataset}', DATASET).
                      replace('{time_to_take}', TIME_TO_TAKE).
                      replace('{artefact_path}', ARTEFACT_PATH).
                      replace('{experiment_name}', EXPERIMENT_NAME).
                      replace('{seed_folder}', SEED_FOLDER).
                      replace('{bash_name}', file))
        if file[0] == '.':
            continue
        save_text(os.path.join(SBATCH_PATH, file.replace('.sh', '.sbatch')), cur_sbatch)


SBATCH_TEMPLATE_PATH = 'experiments-2_0/sbatch/BASE.sbatch'
BASH_TEMPLATE_PATH = 'experiments-2_0/bash/BASE.sh'
FILE_MAPPING_PATH = 'experiments-2_0/configs/file_mapping'

In [4]:
# BASELINE MODE

MODEL_NAME = 'sasrec'
DATASET = 'kion_en'
EXPERIMENT_NAME = 'baseline'
SEED_FOLDER = 'single_seed'
SPLIT_NAME = 'general'
MODE = 'BASELINE'

PROFILE_PATH_NAMES = None

PREDEFINED_PARAMS = {'epochs': 100}

CONFIG_PARAMS = {
    'hidden_units': [64, 128, 256],
    'num_blocks': [2,4],
    'num_heads': [2,4,8],
    'dropout_rate': [0.2, 0.3],
    'learning_rate': [0.001, 0.0005, 0.00075],
    'seed': [42] if SEED_FOLDER == 'single_seed' else [1, 256],
}

CONFIG_TEMPLATE_PATH = f'experiments-2_0/configs/{MODEL_NAME}/template/baseline.yaml'
SAVE_FILE_TEMPLATE = '{hidden_units}-{num_blocks}-{num_heads}-{dropout_rate}-{learning_rate}-{seed}.yaml'


In [3]:
# BASELINE MODE

MODEL_NAME = 'sasrec'
DATASET = 'amazon_m2'
EXPERIMENT_NAME = 'baseline'
SEED_FOLDER = 'single_seed'
SPLIT_NAME = 'general'
MODE = 'BASELINE'

PROFILE_PATH_NAMES = None

PREDEFINED_PARAMS = {'epochs': 25}

CONFIG_PARAMS = {
    'hidden_units': [64],
    'num_blocks': [2,4],
    'num_heads': [2,4,8],
    'dropout_rate': [0.2, 0.3],
    'learning_rate': [0.001, 0.0005, 0.00075],
    'seed': [42] if SEED_FOLDER == 'single_seed' else [1, 256],
}

CONFIG_TEMPLATE_PATH = f'experiments-2_0/configs/{MODEL_NAME}/template/baseline_less_bs.yaml'
SAVE_FILE_TEMPLATE = '{hidden_units}-{num_blocks}-{num_heads}-{dropout_rate}-{learning_rate}-{seed}.yaml'
SBATCH_TEMPLATE_PATH = 'experiments-2_0/sbatch/BASE_BIG.sbatch'

In [10]:
# LLM MODE

MODEL_NAME = 'sasrec'
DATASET = 'kion_en'
EXPERIMENT_NAME = 'KION_INITIAL'
SEED_FOLDER = 'single_seed'
SPLIT_NAME = 'general'
MODE = 'LLM'

PROFILE_PATH_NAMES = ['gemma_short_large_single', 'gemma_long_large_single', "gemma_long_large_umap_single", 'gemma_short_large_umap_single']

PREDEFINED_PARAMS = {
    'epochs': 20,
    'hidden_units': 64,
    'num_blocks': 4,
    'num_heads': 4,
    'dropout_rate': 0.2,
    'learning_rate': 0.00075,
}

CONFIG_PARAMS = {
    'weighting_scheme': ['mean', 'exponential', 'attention'],
    'alpha': [0.8, 0.5, 0.65],
    'fine_tune_epoch': [4, 6, 8],
    'reconstruct_loss': ['MSE', 'RMSE'],
    'reconstruction_layer': [1,2],
    'weight_scale': [0.1],
    'use_down_scale': [True, False],
    'use_upscale': [True, False],
    'multi_profile': [False],
    'multi_profile_aggr_scheme': ['mean'],
    'scale_guide_loss': [True, False],
    'seed': [42] if SEED_FOLDER == 'single_seed' else [1, 256],
}

CONFIG_TEMPLATE_PATH = f'experiments-2_0/configs/{MODEL_NAME}/template/llm.yaml'
SAVE_FILE_TEMPLATE = '{weighting_scheme}-{alpha}-{fine_tune_epoch}-{reconstruct_loss}-{reconstruction_layer}-{weight_scale}-{use_down_scale}-{use_upscale}-{multi_profile}-{multi_profile_aggr_scheme}-{scale_guide_loss}-{user_profile_embeddings_files}-{seed}.yaml'

In [9]:
# LLM MODE

MODEL_NAME = 'sasrec'
DATASET = 'ml20m'
EXPERIMENT_NAME = 'ML20M_INITIAL'
SEED_FOLDER = 'single_seed'
SPLIT_NAME = 'general'
MODE = 'LLM'

PROFILE_PATH_NAMES = ['gemma_short_large_single', 'gemma_long_large_single', "gemma_long_large_umap_single", 'gemma_short_large_umap_single']

PREDEFINED_PARAMS = {
    'epochs': 100,
    'hidden_units': 256,
    'num_blocks': 2,
    'num_heads': 8,
    'dropout_rate': 0.3,
    'learning_rate': 0.001,
}

CONFIG_PARAMS = {
    'weighting_scheme': ['mean', 'exponential', 'attention'],
    'alpha': [0.8, 0.5, 0.65],
    'fine_tune_epoch': [30, 50, 70],
    'reconstruct_loss': ['MSE', 'RMSE'],
    'reconstruction_layer': [1],
    'weight_scale': [0.1],
    'use_down_scale': [True, False],
    'use_upscale': [False],
    'multi_profile': [False],
    'multi_profile_aggr_scheme': ['mean'],
    'scale_guide_loss': [True, False],
    'seed': [42] if SEED_FOLDER == 'single_seed' else [1, 256],
}

CONFIG_TEMPLATE_PATH = f'experiments-2_0/configs/{MODEL_NAME}/template/llm.yaml'
SAVE_FILE_TEMPLATE = '{weighting_scheme}-{alpha}-{fine_tune_epoch}-{reconstruct_loss}-{reconstruction_layer}-{weight_scale}-{use_down_scale}-{use_upscale}-{multi_profile}-{multi_profile_aggr_scheme}-{scale_guide_loss}-{user_profile_embeddings_files}-{seed}.yaml'

In [2]:
# LLM MODE

MODEL_NAME = 'sasrec'
DATASET = 'beauty'
EXPERIMENT_NAME = 'BEAUTY_INITIAL'
SEED_FOLDER = 'single_seed'
SPLIT_NAME = 'general'
MODE = 'LLM'

PROFILE_PATH_NAMES = ['gemma_long_small_single', 'gemma_long_large_single', "gemma_long_large_umap_single", 'gemma_short_large_umap_single', 'gemma__large_multiple']

PREDEFINED_PARAMS = {
    'epochs': 25,
    'hidden_units': 64,
    'num_blocks': 4,
    'num_heads': 4,
    'dropout_rate': 0.2,
    'learning_rate': 0.0005,
}

CONFIG_PARAMS = {
    'weighting_scheme': ['mean', 'attention'],
    'alpha': [0.6, 0.5, 0.7],
    'fine_tune_epoch': [6, 12],
    'reconstruct_loss': ['MSE', 'RMSE'],
    'reconstruction_layer': [1,2],
    'weight_scale': [0.1],
    'use_down_scale': [True, False],
    'use_upscale': [False],
    'multi_profile': [True, False],
    'multi_profile_aggr_scheme': ['mean', 'attention'],
    'scale_guide_loss': [True, False],
    'seed': [42] if SEED_FOLDER == 'single_seed' else [1, 256],
}

CONFIG_TEMPLATE_PATH = f'experiments-2_0/configs/{MODEL_NAME}/template/llm.yaml'
SAVE_FILE_TEMPLATE = '{weighting_scheme}-{alpha}-{fine_tune_epoch}-{reconstruct_loss}-{reconstruction_layer}-{weight_scale}-{use_down_scale}-{use_upscale}-{multi_profile}-{multi_profile_aggr_scheme}-{scale_guide_loss}-{user_profile_embeddings_files}-{seed}.yaml'

In [3]:
# LLM MODE

MODEL_NAME = 'sasrec'
DATASET = 'beauty'
EXPERIMENT_NAME = 'BEAUTY_INITIAL_MULTI'
SEED_FOLDER = 'single_seed'
SPLIT_NAME = 'general'
MODE = 'LLM'

PROFILE_PATH_NAMES = ['gemma__large_multiple']

PREDEFINED_PARAMS = {
    'epochs': 25,
    'hidden_units': 256,
    'num_blocks': 2,
    'num_heads': 2,
    'dropout_rate': 0.2,
    'learning_rate': 0.0005,
}

CONFIG_PARAMS = {
    'weighting_scheme': ['mean', 'attention'],
    'alpha': [0.6, 0.5, 0.7],
    'fine_tune_epoch': [6, 12],
    'reconstruct_loss': ['MSE', 'RMSE'],
    'reconstruction_layer': [1,2],
    'weight_scale': [0.1],
    'use_down_scale': [True, False],
    'use_upscale': [False],
    'multi_profile': [True],
    'multi_profile_aggr_scheme': ['mean', 'attention'],
    'scale_guide_loss': [True, False],
    'seed': [42] if SEED_FOLDER == 'single_seed' else [1, 256],
}

CONFIG_TEMPLATE_PATH = f'experiments-2_0/configs/{MODEL_NAME}/template/llm.yaml'
SAVE_FILE_TEMPLATE = '{weighting_scheme}-{alpha}-{fine_tune_epoch}-{reconstruct_loss}-{reconstruction_layer}-{weight_scale}-{use_down_scale}-{use_upscale}-{multi_profile}-{multi_profile_aggr_scheme}-{scale_guide_loss}-{user_profile_embeddings_files}-{seed}.yaml'

In [4]:
ARTEFACT_PATH = f'results/{MODEL_NAME}/{DATASET}/{EXPERIMENT_NAME}/{SEED_FOLDER}'
CONFIG_PATH = f'experiments-2_0/configs/{MODEL_NAME}/{DATASET}/{EXPERIMENT_NAME}/{SEED_FOLDER}'
BASH_PATH = f'experiments-2_0/bash/{MODEL_NAME}/{DATASET}/{EXPERIMENT_NAME}/{SEED_FOLDER}'
SBATCH_PATH = f'experiments-2_0/sbatch/{MODEL_NAME}/{DATASET}/{EXPERIMENT_NAME}/{SEED_FOLDER}'

SAVE_CONFIG_FILE = os.path.join(CONFIG_PATH, SAVE_FILE_TEMPLATE)

# creation of folders
os.makedirs(ARTEFACT_PATH, exist_ok=True)
os.makedirs(CONFIG_PATH, exist_ok=True)
os.makedirs(BASH_PATH, exist_ok=True)
os.makedirs(SBATCH_PATH, exist_ok=True)

FILE_MAPPING = read_json(os.path.join(FILE_MAPPING_PATH, f'{DATASET}.json'))
FILE_TO_EMB_NAME = {}
for emb_name, emb_file in FILE_MAPPING[SPLIT_NAME]['user_profile_embeddings_files'].items():
    if isinstance(emb_file, str):
        FILE_TO_EMB_NAME[emb_file] = emb_name
    elif isinstance(emb_file, list):
        FILE_TO_EMB_NAME[tuple(emb_file)] = emb_name
    else:
        print(emb_file)
        raise Exception()
    

# add to predefined params the values about dataset
PREDEFINED_PARAMS.update({
    'profile_train_sequences': FILE_MAPPING[SPLIT_NAME]['profile_train_sequences'],
    'finetune_train_sequences': FILE_MAPPING[SPLIT_NAME]['finetune_train_sequences'],
    'valid_sequences': FILE_MAPPING[SPLIT_NAME]['valid_sequences'],
    'test_sequences': FILE_MAPPING[SPLIT_NAME]['test_sequences'],
    'mappings': FILE_MAPPING[SPLIT_NAME]['mappings'],
    'counts': FILE_MAPPING[SPLIT_NAME]['counts'],
    'experiment_name': EXPERIMENT_NAME,
})

# add different profiles
CONFIG_PARAMS.update({
    'user_profile_embeddings_files': [None] if PROFILE_PATH_NAMES is None else
        [FILE_MAPPING[SPLIT_NAME]['user_profile_embeddings_files'][profile_name] for profile_name in PROFILE_PATH_NAMES]
})

#--------------------

In [5]:
all_config_params = generate_config_params()
N_CONFS_PER_BASH = 5
TIME_TO_TAKE = '2-0:0'

print('Number of configs:', len(all_config_params))
print(len(create_list_of_lists(N_CONFS_PER_BASH, len(all_config_params))))

Number of configs: 36
8


In [6]:
create_config_files(all_config_params)
create_bash_files(all_config_params, n_confs_per_bash=N_CONFS_PER_BASH)
create_sbatch_files()

8


In [7]:
for file in os.listdir(SBATCH_PATH):
    print('sbatch', SBATCH_PATH+'/'+file)

sbatch experiments-2_0/sbatch/sasrec/amazon_m2/baseline/single_seed/15-19.sbatch
sbatch experiments-2_0/sbatch/sasrec/amazon_m2/baseline/single_seed/10-14.sbatch
sbatch experiments-2_0/sbatch/sasrec/amazon_m2/baseline/single_seed/30-34.sbatch
sbatch experiments-2_0/sbatch/sasrec/amazon_m2/baseline/single_seed/25-29.sbatch
sbatch experiments-2_0/sbatch/sasrec/amazon_m2/baseline/single_seed/35-36.sbatch
sbatch experiments-2_0/sbatch/sasrec/amazon_m2/baseline/single_seed/0-4.sbatch
sbatch experiments-2_0/sbatch/sasrec/amazon_m2/baseline/single_seed/20-24.sbatch
sbatch experiments-2_0/sbatch/sasrec/amazon_m2/baseline/single_seed/5-9.sbatch


In [33]:
all_config_params

[{'weighting_scheme': 'mean',
  'alpha': 0.6,
  'fine_tune_epoch': 6,
  'reconstruct_loss': 'MSE',
  'reconstruction_layer': 1,
  'weight_scale': 0.1,
  'use_down_scale': True,
  'use_upscale': False,
  'multi_profile': True,
  'multi_profile_aggr_scheme': 'mean',
  'scale_guide_loss': True,
  'seed': 42,
  'user_profile_embeddings_files': ['/home/nseverin/generate_user_profiles/recsys-user-profiles/data/amazon_beauty/gemma-several-e5-type-1-embs.json',
   '/home/nseverin/generate_user_profiles/recsys-user-profiles/data/amazon_beauty/gemma-several-e5-type-2-embs.json',
   '/home/nseverin/generate_user_profiles/recsys-user-profiles/data/amazon_beauty/gemma-several-e5-type-3-embs.json']},
 {'weighting_scheme': 'mean',
  'alpha': 0.6,
  'fine_tune_epoch': 6,
  'reconstruct_loss': 'MSE',
  'reconstruction_layer': 1,
  'weight_scale': 0.1,
  'use_down_scale': True,
  'use_upscale': False,
  'multi_profile': True,
  'multi_profile_aggr_scheme': 'mean',
  'scale_guide_loss': False,
  'seed': 