# Pre-training Tiny-RoBERTa on BabyLM data

In [None]:
# Notebook dependencies, installed through the %pip magic.
%pip install -q evaluate
%pip install -q "optimum-graphcore>=0.5, <0.6" # Version must be under < 0.6
%pip install huggingface_hub==0.10.1

In [None]:
!apt install git-lfs

In [None]:
import yaml
import random
from typing import List, Dict, Any, Tuple
from pathlib import Path
import os
from tokenizers import Tokenizer
from datasets import load_dataset
from optimum.graphcore import IPUConfig, IPUTrainer, IPUTrainingArguments
from transformers import RobertaTokenizerFast,DataCollatorForLanguageModeling,AutoConfig, AutoModelForMaskedLM
from transformers import RobertaConfig, RobertaForMaskedLM,pipeline,AutoTokenizer
from huggingface_hub import notebook_login

Pre-training RoBERTa on BabyLM requires:

* The BabyLM data, which can be downloaded with:  
```
git clone https://github.com/upunaprosk/small-language-models.git
cd small-language-models
bash download_data.sh
```

* A RoBERTa tokenizer pre-trained on the BabyLM data using [this code](https://github.com/upunaprosk/BabyBERTa).

In [None]:
# Runtime settings; each one can be overridden through an environment variable.
n_ipu = int(os.getenv("NUM_AVAILABLE_IPU", 4))
# os.path.join avoids the doubled slash ("/tmp/exe_cache//roberta") that plain
# string concatenation produces when the base directory already ends with "/".
executable_cache_dir = os.path.join(os.getenv("POPLAR_EXECUTABLE_CACHE_DIR", "/tmp/exe_cache/"), "roberta")
pod_type = os.getenv("GRAPHCORE_POD_TYPE", "pod4")
path_tokenizer_config = 'trained-tokenizer/custom_tokenizer.json' # Path to pre-trained tokenizer
n_ipu,pod_type

(4, 'pod4')

In [None]:
def load_tokenizer(config_path: Path,
                   max_input_length: int,
                   ) -> Tokenizer:
    """Load a serialized `tokenizers` tokenizer and enable truncation on it.

    Args:
        config_path: Location of the tokenizer JSON file; it is converted with
            ``str()`` before being handed to ``Tokenizer.from_file``.
        max_input_length: Value passed as ``max_length`` to ``enable_truncation``.

    Returns:
        The loaded tokenizer with truncation enabled.
    """
    loaded = Tokenizer.from_file(str(config_path))
    loaded.enable_truncation(max_length=max_input_length)
    return loaded

# NOTE: `tokenizer` is rebound to a RobertaTokenizerFast in the following cell.
tokenizer = load_tokenizer(config_path=path_tokenizer_config, max_input_length=128)

In [None]:
# Wrap the pre-trained tokenizer file in the transformers fast-tokenizer API.
# No separate vocab/merges files are supplied; only `tokenizer_file` is given.
tokenizer = RobertaTokenizerFast(
    tokenizer_file=path_tokenizer_config,
    vocab_file=None,
    merges_file=None,
)

In [None]:
data_path = Path('./babylm_data/babylm_10M/')
# Path.glob yields entries in arbitrary (filesystem-dependent) order; sorting keeps
# the file lists, and therefore the dataset row order, reproducible across runs.
files = sorted(p.as_posix() for p in data_path.glob('*.train'))
files_dev = sorted(p.as_posix() for p in data_path.glob('*.dev'))
files_test = sorted(p.as_posix() for p in data_path.glob('*.test'))
files_test

['data/qed.test',
 'data/switchboard.test',
 'data/cbt.test',
 'data/simple_wikipedia.test',
 'data/aochildes.test',
 'data/children_stories.test',
 'data/gutenberg.test',
 'data/wikipedia.test',
 'data/bnc_spoken.test',
 'data/open_subtitles.test']

In [None]:
# Each split is loaded as its own DatasetDict. Note that all three (train, dev and
# test) expose their rows under the 'train' key, which later cells rely on.
d, d_dev, d_test = (
    load_dataset('text', data_files={'train': list(paths)})
    for paths in (files, files_dev, files_test)
)

In [None]:
# Chunk length (in tokens) used by `group_texts` when splitting concatenated text.
block_size = 128

In [None]:
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of raw text lines for masked-language-model training.

    Args:
        examples: Batch dict holding a "text" list of strings, as passed by
            ``Dataset.map(..., batched=True)``. The batch is not modified.
        max_length: Fixed token length every sequence is truncated/padded to.

    Returns:
        The tokenizer output for the non-empty lines of the batch.
    """
    # Drop empty / whitespace-only lines without mutating the incoming batch.
    lines = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
    return tokenizer(
        lines,
        # Pad every sequence to one fixed length. `padding=True` only pads to the
        # longest line of each batch, so the sequence length would vary between
        # batches, whereas the IPU runs a statically-shaped compiled graph.
        padding="max_length",
        truncation=True,
        max_length=max_length,
        # `return_special_tokens_mask=True` could be passed here so that
        # DataCollatorForLanguageModeling receives the mask directly; it is
        # currently left disabled.
    )
# Tokenize the three splits with identical settings; the raw "text" column is
# removed so only the tokenizer outputs remain.
_map_options = dict(batched=True, num_proc=4, remove_columns=["text"])
tokenized_datasets = d.map(tokenize_function, **_map_options)
tokenized_datasets_dev = d_dev.map(tokenize_function, **_map_options)
tokenized_datasets_test = d_test.map(tokenize_function, **_map_options)

print(f'Length of train data={len(tokenized_datasets["train"])}')

Length of train data=1015494


In [None]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of token lists and re-split it into fixed-size chunks.

    Args:
        examples: Batch dict mapping each column name to a list of lists
            (e.g. "input_ids", "attention_mask").
        chunk_size: Length of each output chunk. When ``None`` (the default), the
            notebook-level ``block_size`` is used.

    Returns:
        A dict with the same keys, each holding a list of ``chunk_size``-long
        chunks, plus a "labels" entry that is a copy of the "input_ids" chunk list.
        The trailing remainder shorter than ``chunk_size`` is dropped.
    """
    from itertools import chain

    size = block_size if chunk_size is None else chunk_size
    # Concatenate all texts. chain.from_iterable is linear, whereas
    # sum(list_of_lists, []) re-copies the accumulated list on every step.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it
    # instead of this drop, you can customize this part to your needs.
    total_length = (total_length // size) * size
    # Split by chunks of `size` tokens.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Display the tokenizer's vocabulary size (the same value is used as the model's vocab_size).
tokenizer.vocab_size

8192

In [None]:
# Masked-language-modelling collator: each token is selected for masking with
# probability 0.135.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.135,
)
# Name of the Graphcore IPU configuration used as the base for this model.
ipu_config_name = "Graphcore/roberta-base-ipu"

In [None]:
# Default RoBERTa configuration values (per the name), kept for comparison with
# the smaller model configured further down.
DEFAULT_ROBERTA_CONFIG = dict(
    attention_probs_dropout_prob=0.1,
    bos_token_id=0,
    classifier_dropout=None,
    eos_token_id=2,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    layer_norm_eps=1e-12,
    max_position_embeddings=512,
    model_type="roberta",
    num_attention_heads=12,
    num_hidden_layers=12,
    pad_token_id=1,
    position_embedding_type="absolute",
    transformers_version="4.20.1",
    type_vocab_size=2,
    use_cache=True,
    vocab_size=30522,
)

In [None]:
# param_id  trial_id  param_name                   param_value  distribution_json
# 318       40        hidden_size_multiplier       87.000000    {"name": "IntDistribution", "attributes": {"log": false, "step": 1, "low": 1, "high": 100}}                           1
# 319       40        hidden_layers                9.000000     {"name": "IntDistribution", "attributes": {"log": false, "step": 1, "low": 1, "high": 12}}                            1
# 320       40        attention_heads              12.000000    {"name": "IntDistribution", "attributes": {"log": false, "step": 1, "low": 1, "high": 18}}                            1
# 321       40        intermediate_size            2048.000000  {"name": "IntDistribution", "attributes": {"log": false, "step": 1, "low": 1, "high": 3072}}                          1
# 322       40        hidden_act                   3.000000     {"name": "CategoricalDistribution", "attributes": {"choices": ["gelu", "relu", "silu", "gelu_new"]}}                  1
# 323       40        hidden_dropout_prob          0.146995     {"name": "FloatDistribution", "attributes": {"step": null, "low": 0.1, "high": 1.0, "log": false}}                    1
# 324       40        attention_prob_dropout_prog  0.995935     {"name": "FloatDistribution", "attributes": {"step": null, "low": 0.1, "high": 1.0, "log": false}}                    1
# 325       40        position_embedding_type      2.000000     {"name": "CategoricalDistribution", "attributes": {"choices": ["absolute", "relative_key", "relative_key_query"]}}

In [None]:
# Meaning of hidden_size_multiplier:
#   hidden_size = model_parameters['hidden_size_multiplier'] * model_parameters['num_attention_heads']
#   e.g. for the trial above: 87 * 12 = 1044
# By default: "hidden_size": 768 with 12 attention heads, i.e. hidden_size_multiplier = 64
# Optimal value found:
#   hidden_size_multiplier = 70

In [None]:
# Hyper-parameters of the tiny RoBERTa model. Special-token ids and the vocabulary
# size are read from the tokenizer so that model and tokenizer agree.
opt_roberta_config = dict(
    pad_token_id=tokenizer.convert_tokens_to_ids('<pad>'),
    bos_token_id=tokenizer.convert_tokens_to_ids('<s>'),
    eos_token_id=tokenizer.convert_tokens_to_ids('</s>'),
    attention_probs_dropout_prob=0.3,
    hidden_act="gelu_new",
    hidden_dropout_prob=0.15,
    hidden_size=70 * 8,  # hidden_size_multiplier (70) * num_attention_heads (8)
    initializer_range=0.02,
    intermediate_size=1412,
    layer_norm_eps=1e-12,
    max_position_embeddings=128,
    model_type="roberta",
    num_attention_heads=8,
    num_hidden_layers=4,
    position_embedding_type="relative_key_query",
    type_vocab_size=1,
    vocab_size=tokenizer.vocab_size,
)

In [None]:
# Build the model configuration from the hyper-parameter dict defined above.
config = RobertaConfig(**opt_roberta_config)

In [None]:
# Randomly initialised model built from `config` (no pre-trained weights are loaded).
model = RobertaForMaskedLM(config)

# One encoder layer per IPU. `ipus_per_replica` is derived from this list so that
# the pipeline split and the requested IPU count cannot disagree.
layers_per_ipu = [1, 1, 1, 1]
ipu_config = IPUConfig.from_pretrained(
    ipu_config_name,
    # Use the cache directory configured at the top of the notebook (honours
    # POPLAR_EXECUTABLE_CACHE_DIR) instead of a hard-coded path.
    executable_cache_dir=executable_cache_dir,
    ipus_per_replica=len(layers_per_ipu),
    layers_per_ipu=layers_per_ipu,
    inference_layers_per_ipu=[-1]
)

Setting replicated_tensor_sharding to False when replication_factor=1


In [None]:
!mkdir roberta

mkdir: cannot create directory ‘roberta_toddler’: File exists


In [None]:
# Training hyper-parameters. `do_train=True` matches how the notebook is used:
# `trainer1.train()` is called below.
training_args = IPUTrainingArguments(output_dir="roberta",
                                     do_train=True,
                                     do_eval=True,
                                     per_device_train_batch_size=4,
                                     per_device_eval_batch_size=4,
                                     gradient_accumulation_steps=128,
                                     learning_rate=1e-4,
                                     num_train_epochs=5,
                                     dataloader_num_workers=52,
                                     weight_decay=0.1,
                                     dataloader_drop_last=True,
                                     prediction_loss_only=True,
                                     save_strategy="epoch",
                                     auto_loss_scaling=True,
                                     overwrite_output_dir=True,
                                     # Options tried during development and left disabled:
                                     # fp32=True,
                                     # logging_steps=25,
                                     # resume_from_checkpoint="./roberta/checkpoint-142803/",
                                     # pad_on_batch_axis=True,
                                     # pod_type=pod_type,
                                     # lamb=True,
                                     # report_to="none",
                                    )

In [None]:
# Assemble the trainer: model and IPU placement, hyper-parameters, then the data.
# The dev split is stored under the 'train' key of its DatasetDict.
trainer1 = IPUTrainer(
    model=model,
    args=training_args,
    ipu_config=ipu_config,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets_dev['train'],
)

Overriding IPU config: gradient_accumulation_steps=128,auto_loss_scaling=True
-------------------- Device Allocation --------------------
Embedding  --> IPU 0
Encoder 0  --> IPU 1
Encoder 1  --> IPU 1
Encoder 2  --> IPU 1
Encoder 3  --> IPU 2
LM Head    --> IPU 0
-----------------------------------------------------------


In [None]:
# Inspect the tokenized training data (columns and row count).
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1015494
    })
})

In [None]:
# Run pre-training with the arguments configured above (save_strategy="epoch").
trainer1.train()

Compiling Model...
Graph compilation: 100%|██████████| 100/100 [02:28<00:00]
Compiled/Loaded model in 192.6328711300157 secs
***** Running training *****
  Num examples = 1015494
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 64
  Total optimization steps = 79335


  0%|          | 0/79335 [00:00<?, ?it/s]

{'loss': 2.7565, 'learning_rate': 9.936976113947186e-05, 'epoch': 0.03}
{'loss': 2.7209, 'learning_rate': 9.873952227894372e-05, 'epoch': 0.06}
{'loss': 2.7275, 'learning_rate': 9.810928341841559e-05, 'epoch': 0.09}
{'loss': 2.6101, 'learning_rate': 9.747904455788744e-05, 'epoch': 0.13}
{'loss': 2.7243, 'learning_rate': 9.68488056973593e-05, 'epoch': 0.16}
{'loss': 2.7339, 'learning_rate': 9.621856683683115e-05, 'epoch': 0.19}
{'loss': 2.7137, 'learning_rate': 9.558832797630302e-05, 'epoch': 0.22}
{'loss': 2.5916, 'learning_rate': 9.495808911577488e-05, 'epoch': 0.25}
{'loss': 2.6203, 'learning_rate': 9.432785025524674e-05, 'epoch': 0.28}
{'loss': 2.5641, 'learning_rate': 9.369761139471859e-05, 'epoch': 0.32}
{'loss': 2.4523, 'learning_rate': 9.306737253419046e-05, 'epoch': 0.35}
{'loss': 2.5155, 'learning_rate': 9.243713367366232e-05, 'epoch': 0.38}
{'loss': 2.3701, 'learning_rate': 9.180689481313419e-05, 'epoch': 0.41}
{'loss': 2.6104, 'learning_rate': 9.117665595260604e-05, 'epoch':

Saving model checkpoint to roberta_5/checkpoint-15867
-------------------- Device Allocation --------------------
Embedding  --> IPU 0
Encoder 0  --> IPU 1
Encoder 1  --> IPU 1
Encoder 2  --> IPU 1
Encoder 3  --> IPU 2
LM Head    --> IPU 0
-----------------------------------------------------------
Configuration saved in roberta_5/checkpoint-15867/ipu_config.json


{'loss': 2.1439, 'learning_rate': 7.983235646309952e-05, 'epoch': 1.01}
{'loss': 2.2581, 'learning_rate': 7.920211760257138e-05, 'epoch': 1.04}
{'loss': 2.1662, 'learning_rate': 7.857187874204323e-05, 'epoch': 1.07}
{'loss': 2.1791, 'learning_rate': 7.794163988151509e-05, 'epoch': 1.1}
{'loss': 2.184, 'learning_rate': 7.731140102098696e-05, 'epoch': 1.13}
{'loss': 2.0694, 'learning_rate': 7.668116216045881e-05, 'epoch': 1.17}
{'loss': 2.3353, 'learning_rate': 7.605092329993067e-05, 'epoch': 1.2}
{'loss': 2.2295, 'learning_rate': 7.542068443940254e-05, 'epoch': 1.23}
{'loss': 2.1829, 'learning_rate': 7.47904455788744e-05, 'epoch': 1.26}
{'loss': 2.125, 'learning_rate': 7.416020671834627e-05, 'epoch': 1.29}
{'loss': 2.0657, 'learning_rate': 7.352996785781812e-05, 'epoch': 1.32}
{'loss': 2.4687, 'learning_rate': 7.289972899728998e-05, 'epoch': 1.36}
{'loss': 2.226, 'learning_rate': 7.226949013676185e-05, 'epoch': 1.39}
{'loss': 2.3861, 'learning_rate': 7.16392512762337e-05, 'epoch': 1.42}

Saving model checkpoint to roberta_5/checkpoint-31734
-------------------- Device Allocation --------------------
Embedding  --> IPU 0
Encoder 0  --> IPU 1
Encoder 1  --> IPU 1
Encoder 2  --> IPU 1
Encoder 3  --> IPU 2
LM Head    --> IPU 0
-----------------------------------------------------------
Configuration saved in roberta_5/checkpoint-31734/ipu_config.json


{'loss': 2.1167, 'learning_rate': 5.966471292619903e-05, 'epoch': 2.02}
{'loss': 2.2098, 'learning_rate': 5.90344740656709e-05, 'epoch': 2.05}
{'loss': 1.9882, 'learning_rate': 5.8404235205142755e-05, 'epoch': 2.08}
{'loss': 2.2684, 'learning_rate': 5.777399634461461e-05, 'epoch': 2.11}
{'loss': 2.0564, 'learning_rate': 5.714375748408647e-05, 'epoch': 2.14}
{'loss': 2.1961, 'learning_rate': 5.651351862355832e-05, 'epoch': 2.17}
{'loss': 2.0022, 'learning_rate': 5.588327976303019e-05, 'epoch': 2.21}
{'loss': 2.0053, 'learning_rate': 5.525304090250205e-05, 'epoch': 2.24}
{'loss': 2.1097, 'learning_rate': 5.462280204197391e-05, 'epoch': 2.27}
{'loss': 2.0055, 'learning_rate': 5.399256318144577e-05, 'epoch': 2.3}
{'loss': 2.0297, 'learning_rate': 5.336232432091763e-05, 'epoch': 2.33}
{'loss': 1.922, 'learning_rate': 5.2732085460389493e-05, 'epoch': 2.36}
{'loss': 2.0509, 'learning_rate': 5.210184659986135e-05, 'epoch': 2.39}
{'loss': 1.9075, 'learning_rate': 5.1471607739333206e-05, 'epoch'

Saving model checkpoint to roberta_5/checkpoint-47601
-------------------- Device Allocation --------------------
Embedding  --> IPU 0
Encoder 0  --> IPU 1
Encoder 1  --> IPU 1
Encoder 2  --> IPU 1
Encoder 3  --> IPU 2
LM Head    --> IPU 0
-----------------------------------------------------------
Configuration saved in roberta_5/checkpoint-47601/ipu_config.json


{'loss': 2.0404, 'learning_rate': 3.9497069389298545e-05, 'epoch': 3.03}
{'loss': 2.0917, 'learning_rate': 3.88668305287704e-05, 'epoch': 3.06}
{'loss': 1.8917, 'learning_rate': 3.8236591668242264e-05, 'epoch': 3.09}
{'loss': 1.9019, 'learning_rate': 3.760635280771413e-05, 'epoch': 3.12}
{'loss': 2.0237, 'learning_rate': 3.697611394718599e-05, 'epoch': 3.15}
{'loss': 1.9214, 'learning_rate': 3.6345875086657846e-05, 'epoch': 3.18}
{'loss': 2.0049, 'learning_rate': 3.571563622612971e-05, 'epoch': 3.21}
{'loss': 2.0326, 'learning_rate': 3.5085397365601565e-05, 'epoch': 3.25}
{'loss': 2.0372, 'learning_rate': 3.445515850507343e-05, 'epoch': 3.28}
{'loss': 1.9194, 'learning_rate': 3.3824919644545283e-05, 'epoch': 3.31}
{'loss': 1.7669, 'learning_rate': 3.3194680784017146e-05, 'epoch': 3.34}
{'loss': 1.8913, 'learning_rate': 3.2564441923489e-05, 'epoch': 3.37}
{'loss': 2.037, 'learning_rate': 3.1934203062960865e-05, 'epoch': 3.4}
{'loss': 2.0412, 'learning_rate': 3.130396420243272e-05, 'epoc

Saving model checkpoint to roberta_5/checkpoint-63468
-------------------- Device Allocation --------------------
Embedding  --> IPU 0
Encoder 0  --> IPU 1
Encoder 1  --> IPU 1
Encoder 2  --> IPU 1
Encoder 3  --> IPU 2
LM Head    --> IPU 0
-----------------------------------------------------------
Configuration saved in roberta_5/checkpoint-63468/ipu_config.json


{'loss': 2.0347, 'learning_rate': 1.9959664712926198e-05, 'epoch': 4.0}
{'loss': 2.0159, 'learning_rate': 1.9329425852398057e-05, 'epoch': 4.03}
{'loss': 1.8853, 'learning_rate': 1.869918699186992e-05, 'epoch': 4.07}
{'loss': 1.7826, 'learning_rate': 1.806894813134178e-05, 'epoch': 4.1}
{'loss': 1.8884, 'learning_rate': 1.743870927081364e-05, 'epoch': 4.13}
{'loss': 1.9225, 'learning_rate': 1.68084704102855e-05, 'epoch': 4.16}
{'loss': 2.2672, 'learning_rate': 1.6178231549757358e-05, 'epoch': 4.19}
{'loss': 2.0474, 'learning_rate': 1.5547992689229217e-05, 'epoch': 4.22}
{'loss': 1.7927, 'learning_rate': 1.4917753828701079e-05, 'epoch': 4.25}
{'loss': 1.9031, 'learning_rate': 1.4287514968172938e-05, 'epoch': 4.29}
{'loss': 2.0115, 'learning_rate': 1.3657276107644797e-05, 'epoch': 4.32}
{'loss': 1.9207, 'learning_rate': 1.3027037247116659e-05, 'epoch': 4.35}
{'loss': 1.9431, 'learning_rate': 1.2396798386588518e-05, 'epoch': 4.38}
{'loss': 2.0146, 'learning_rate': 1.1766559526060377e-05, 

Saving model checkpoint to roberta_5/checkpoint-79335
-------------------- Device Allocation --------------------
Embedding  --> IPU 0
Encoder 0  --> IPU 1
Encoder 1  --> IPU 1
Encoder 2  --> IPU 1
Encoder 3  --> IPU 2
LM Head    --> IPU 0
-----------------------------------------------------------
Configuration saved in roberta_5/checkpoint-79335/ipu_config.json


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 13996.3023, 'train_samples_per_second': 362.77, 'train_steps_per_second': 5.668, 'train_loss': 2.1173090456263393, 'epoch': 5.0}


TrainOutput(global_step=79335, training_loss=2.1173090456263393, metrics={'train_runtime': 13996.3023, 'train_samples_per_second': 362.77, 'train_steps_per_second': 5.668, 'train_loss': 2.1173090456263393, 'epoch': 5.0})

In [None]:
# Evaluate on the dataset passed as `eval_dataset` (the dev files).
trainer1.evaluate()

In [None]:
# Save the trained model under the "roberta" directory.
trainer1.save_model("roberta")

In [None]:
# Store the tokenizer alongside the model; both are reloaded from "roberta" below.
tokenizer.save_pretrained("roberta")

In [None]:
# Interactive Hugging Face Hub login widget; run before the push_to_hub calls below.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Reload the artifacts saved under "roberta" as plain transformers objects, then
# upload both (model first, tokenizer second) to the Hub repository "roberta".
tokenizer = AutoTokenizer.from_pretrained("roberta")
model = AutoModelForMaskedLM.from_pretrained("roberta")
for artifact in (model, tokenizer):
    artifact.push_to_hub("roberta")

In [None]:
# Build a fill-mask pipeline from the saved model and tokenizer.
fill_mask = pipeline("fill-mask", model="roberta", tokenizer="roberta")

# Example prompt: "The sun is <mask>."
# " great" is among the top predictions shown in the output.
fill_mask("The sun is <mask>.")

Saving model checkpoint to roberta_bebeshka
-------------------- Device Allocation --------------------
Embedding  --> IPU 0
Encoder 0  --> IPU 0
Encoder 1  --> IPU 1
Encoder 2  --> IPU 2
Encoder 3  --> IPU 3
LM Head    --> IPU 0
-----------------------------------------------------------
Configuration saved in roberta_bebeshka/ipu_config.json


[{'score': 0.05490488559007645,
  'token': 228,
  'token_str': ' it',
  'sequence': 'the sun is it.'},
 {'score': 0.02775665931403637,
  'token': 708,
  'token_str': ' great',
  'sequence': 'the sun is great.'},
 {'score': 0.024511652067303658,
  'token': 415,
  'token_str': ' here',
  'sequence': 'the sun is here.'},
 {'score': 0.020964989438652992,
  'token': 437,
  'token_str': ' right',
  'sequence': 'the sun is right.'},
 {'score': 0.019934361800551414,
  'token': 503,
  'token_str': ' good',
  'sequence': 'the sun is good.'}]