In [None]:
%pip install transformers datasets optuna
%pip install accelerate -U

This notebook requires a tokenizer trained on the BabyLM 10M corpus.
Use the code in the following repository to train one, or load an already trained tokenizer: https://github.com/upunaprosk/BabyBERTa.

In [None]:
# Path to the serialized tokenizer file; it is passed as `tokenizer_file`
# when the RobertaTokenizerFast instance is built further down.
path_tokenizer_config = 'trained-tokenizer/custom_tokenizer.json'

Reference for the Optuna parameter types (the `suggest_*` methods): https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.FixedTrial.html

In [None]:
from transformers import RobertaTokenizerFast,AdamW
from datasets import load_dataset
from pathlib import Path
from transformers import DataCollatorForLanguageModeling,RobertaConfig,RobertaForMaskedLM,TrainingArguments,Trainer
import optuna
import logging
import sys
import math

def tokenize_function(examples):
    """Tokenize a batch of raw text lines with the module-level ``tokenizer``.

    Lines that are empty or consist only of whitespace are dropped first, and
    the filtered list is written back to ``examples["text"]``. The remaining
    lines are encoded with padding enabled and truncation at 128 tokens.

    Args:
        examples: Batch mapping whose ``"text"`` entry is a list of strings.

    Returns:
        Whatever the tokenizer call returns for the filtered lines, including
        the special-tokens mask.
    """
    kept_lines = []
    for line in examples["text"]:
        # Skip blank lines: either zero-length or whitespace-only.
        if len(line) == 0 or line.isspace():
            continue
        kept_lines.append(line)
    examples["text"] = kept_lines

    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": 128,
        # DataCollatorForLanguageModeling works more efficiently when the
        # `special_tokens_mask` is already present in its input.
        "return_special_tokens_mask": True,
    }
    return tokenizer(examples["text"], **encode_options)

# Build the tokenizer from the serialized tokenizer file only; no separate
# vocab/merges files are supplied.
tokenizer = RobertaTokenizerFast(
    vocab_file=None,
    merges_file=None,
    tokenizer_file=path_tokenizer_config,
)

# The training split is restricted to this explicit list of files rather than
# every *.train file in the directory. (A directory glob assigned to `files`
# right before this list was overwritten by it and has been removed as dead code.)
files = [
    './babylm_data/babylm_10M/wikipedia.train',
    './babylm_data/babylm_10M/gutenberg.train',
    './babylm_data/babylm_10M/cbt.train',
]
dataset = load_dataset('text', data_files={'train': files})

# Tokenize in batches across 4 worker processes; the raw 'text' column is
# dropped so that only the tokenizer outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=['text'],
    load_from_cache_file=True,
)
print(f'Length of train data={len(tokenized_datasets["train"])}')

# Masked-language-modelling collator with a masking probability of 0.135.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.135
)

In [None]:
# The validation split is restricted to this explicit list of files rather
# than every *.dev file in the directory. (A directory glob assigned to `files`
# right before this list was overwritten by it and has been removed as dead code.)
files = [
    './babylm_data/babylm_dev/wikipedia.dev',
    './babylm_data/babylm_dev/gutenberg.dev',
    './babylm_data/babylm_dev/cbt.dev',
]
dataset_dev = load_dataset('text', data_files={'validation': files})

# Tokenize in batches across 4 worker processes; the raw 'text' column is
# dropped so that only the tokenizer outputs remain.
tokenized_datasets_dev = dataset_dev.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=['text'],
    load_from_cache_file=True,
)
print(f'Length of validation data={len(tokenized_datasets_dev["validation"])}')

# Masked-language-modelling collator with a masking probability of 0.135.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.135
)

In [None]:
# Default parameters, based on the RoBERTa-base configuration
# (https://arxiv.org/abs/1907.11692v1).
# Layout: top-level run settings plus two single-element lists that hold the
# model and the optimizer settings.
DEFAULT_PARAMS = {
    "model_name": "roberta-base",
    # "train_epochs": 5,
    "batch_size": 16,

    "model_parameters": [
        {
            "vocab_size": 30522,
            "hidden_size": 768,
            "num_hidden_layers": 6,
            "num_attention_heads": 12,
            "intermediate_size": 3072,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            # NOTE(review): the key ends in "_prog"; presumably
            # "attention_probs_dropout_prob" was meant. Confirm that nothing
            # reads this exact key before renaming it.
            "attention_probs_dropout_prog": 0.1,
            "max_position_embeddings": 128,
            "type_vocab_size": 1,
            "initializer_range": 0.02,
            "layer_norm_eps": 1e-12,
            "gradient_checkpointing": False,
            "position_embedding_type": "absolute",
            "use_cache": True,
        },
    ],

    "optimizer_parameters": [
        {
            "lr": 1e-5,
            "beta_one": 0.9,
            "beta_two": 0.999,
            "eps": 1e-6,
            "warmup_steps": 24000,
            "weight_decay": 0.01,
        },
    ],
}

In [None]:
# Create the output directories. `exist_ok=True` keeps the cell re-runnable:
# a plain `mkdir` fails with "File exists" once the directories are present.
Path('results').mkdir(exist_ok=True)
Path('logs').mkdir(exist_ok=True)

mkdir: cannot create directory ‘results’: File exists
mkdir: cannot create directory ‘logs’: File exists


In [None]:
def objective(trial: optuna.Trial):
    """
    Build a RoBERTa masked-LM from sampled hyperparameters, train it and score it.

    The architecture hyperparameters are sampled from ``trial``; the optimizer
    settings and the batch size are read from ``DEFAULT_PARAMS``. The model is
    trained for 10 epochs on ``tokenized_datasets['train']`` and evaluated on
    ``tokenized_datasets_dev['validation']``.

    Note: this function never calls ``trial.report`` or ``trial.should_prune``,
    so it does not take part in pruning.

    Args:
        trial: Optuna trial used to sample the model hyperparameters.

    Returns:
        The validation perplexity, ``exp(eval_loss)``. Lower is better.
    """
    model_parameters = {
        # Sampled per-head width; it is multiplied by the number of heads below
        # so that the final hidden size is always divisible by the head count.
        'hidden_size': trial.suggest_int('hidden_size_multiplier', 1, 100),
        'num_hidden_layers': trial.suggest_int('hidden_layers', 1, 12),
        'num_attention_heads': trial.suggest_int('attention_heads', 1, 13),
        'intermediate_size': trial.suggest_int('intermediate_size', 1, 3072),
        'hidden_act': trial.suggest_categorical('hidden_act', [
            'gelu',
            'relu',
            'silu',
            'gelu_new'
        ]),
        # `suggest_float` replaces the deprecated `suggest_uniform`.
        'hidden_dropout_prob': trial.suggest_float('hidden_dropout_prob', 0.1, 0.5),
        # The Optuna parameter name keeps its original spelling so that trials
        # already stored in an existing study remain comparable.
        'attention_probs_dropout_prob': trial.suggest_float('attention_prob_dropout_prog', 0.1, 0.5),
        'max_position_embeddings': 128,
        'type_vocab_size': 1,
        'initializer_range': 0.02,
        'layer_norm_eps': 1e-12,
        'gradient_checkpointing': False,
        'position_embedding_type': trial.suggest_categorical('position_embedding_type', [
            'absolute',
            'relative_key',
            'relative_key_query'
        ]),
        'use_cache': True,
    }
    _config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=model_parameters['hidden_size'] * model_parameters['num_attention_heads'],
        num_hidden_layers=model_parameters['num_hidden_layers'],
        num_attention_heads=model_parameters['num_attention_heads'],
        intermediate_size=model_parameters['intermediate_size'],
        hidden_act=model_parameters['hidden_act'],
        hidden_dropout_prob=model_parameters['hidden_dropout_prob'],
        # Fixed keyword: the former spelling `attention_probs_dropout_prog` is not
        # a RobertaConfig parameter, so the sampled value was never applied.
        attention_probs_dropout_prob=model_parameters['attention_probs_dropout_prob'],
        # The config is given twice the base value (256 positions for 128).
        max_position_embeddings=model_parameters['max_position_embeddings'] * 2,
        type_vocab_size=model_parameters['type_vocab_size'],
        initializer_range=model_parameters['initializer_range'],
        layer_norm_eps=model_parameters['layer_norm_eps'],
        gradient_checkpointing=model_parameters['gradient_checkpointing'],
        position_embedding_type=model_parameters['position_embedding_type'],
        use_cache=model_parameters['use_cache'],
    )
    model = RobertaForMaskedLM(config=_config)
    model.resize_token_embeddings(len(tokenizer))

    opt_param = DEFAULT_PARAMS['optimizer_parameters'][0]
    optimizer = AdamW(
        params=model.parameters(),
        lr=opt_param['lr'],
        betas=(opt_param['beta_one'], opt_param['beta_two']),
        eps=opt_param['eps'],
        weight_decay=opt_param['weight_decay'],
    )
    # No explicit scheduler is supplied alongside the optimizer.
    scheduler = None
    training_args = TrainingArguments(
        output_dir='./opt_results/',
        num_train_epochs=10,
        per_device_train_batch_size=DEFAULT_PARAMS['batch_size'],
        per_device_eval_batch_size=DEFAULT_PARAMS['batch_size'],
        warmup_steps=opt_param['warmup_steps'],
        weight_decay=opt_param['weight_decay'],
        logging_dir='./opt_logs/',
        eval_accumulation_steps=10
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets_dev['validation'],
        data_collator=data_collator,
        optimizers=(optimizer, scheduler)
    )

    trainer.train()
    eval_results = trainer.evaluate()
    # Perplexity is the exponential of the evaluation loss.
    ppl = math.exp(eval_results['eval_loss'])

    return ppl

In [None]:
# Mirror Optuna's log messages to stdout. The check keeps the cell idempotent:
# without it, every re-run adds another handler and each message is printed
# once more per re-run.
_optuna_logger = optuna.logging.get_logger("optuna")
if not any(
    isinstance(_handler, logging.StreamHandler)
    and getattr(_handler, "stream", None) is sys.stdout
    for _handler in _optuna_logger.handlers
):
    _optuna_logger.addHandler(logging.StreamHandler(sys.stdout))

study_name = "mlm-parameter-search-2"
# Trials are persisted in a local SQLite file named after the study, so an
# existing study with this name is resumed instead of being recreated.
storage_name = "sqlite:///{}.db".format(study_name)
# Other possible pruners here: https://optuna.readthedocs.io/en/stable/reference/pruners.html
study_resume = optuna.create_study(study_name=study_name, load_if_exists=True,
                                   storage=storage_name, direction="minimize",
                                   pruner=optuna.pruners.MedianPruner())
# No n_trials or timeout is passed, so the search is not bounded by this call;
# use e.g. `optimize(objective, n_trials=200)` to limit it.
study_resume.optimize(objective)

In [None]:
# print(study_resume.best_value)
# print(study_resume.best_params)
# print(study_resume.best_trial)

In [None]:
## Check failed runs
# for trial in study_resume.trials:
#     if trial.state == optuna.trial.TrialState.FAIL:
#         print("FAILED")
#         print(trial.params)
## study_resume.enqueue_trial(params=default_fail)