# Setup

In [1]:
from os import path

# `_dh` is looked up in the notebook's globals (presumably IPython's directory
# history — confirm); its first entry is used as the notebook directory.
__DIR__ = globals()["_dh"][0]

# Shared data folder one level above the notebook directory, expressed as a
# path relative to the current working directory.
data_dir = path.relpath(path.join(__DIR__, path.pardir, "_data"))

In [2]:
# --- Settings: switches that control how the notebook runs ---
_testing = True            # swap in the test dataset and a short training run
_colab_install = True      # attempt the Colab install step when google.colab is importable
_pm_log_sections = False   # print a banner at the start of each section

# --- Parameters: inputs ---
dataset = path.join(data_dir, "wiki", "20220301.en.1gb")
text_col = "text"

# --- Parameters: tokenizer ---
base_model = "bert-base-cased"
max_length = 128
vocab_size = 20_000

tokenize_params = {"batched": True}
tokenizer_dir = path.join(data_dir, "pretrain", "tokenizer")

# --- Parameters: masked-language-model training ---
mlm_probability = 0.15
bert_config = {}           # extra keyword arguments forwarded to BertConfig
training_args = {          # keyword arguments forwarded to TrainingArguments
    "optim": "adamw_torch",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 128,
    "save_steps": 500,
}
model_dir = path.join(data_dir, "pretrain", "model")

In [5]:
# Process settings / parameters
from pprint import pprint

# The implementation is picked once, when this cell runs, from the
# `_pm_log_sections` setting; later changes to the flag have no effect
# unless the cell is re-run.
if not _pm_log_sections:
    def pm_log_section(message):
        """Section logging is disabled: accept the message and do nothing."""
        return None
else:
    def pm_log_section(message):
        """Print `message` as a section banner surrounded by blank lines."""
        banner = f"\n[===== {message} =====]\n"
        print(banner)

# Optional install step, attempted only when the `_colab_install` setting is on.
if _colab_install:
    try:
        # The import doubles as an environment probe: if `google.colab` is
        # missing, ModuleNotFoundError is raised and the rest of the `try`
        # body is skipped.
        import google.colab
        colab_install_script = path.join(__DIR__, "..", "colab_install.sh")
        # NOTE(review): the shell line below has no arguments and
        # `colab_install_script` is never used in the visible code — the
        # command looks truncated. Presumably it was meant to run the install
        # script; confirm and restore the intended command.
        !pip 

    except ModuleNotFoundError:
        # `google.colab` is not importable (presumably not running on Colab):
        # silently skip the install step.
        pass

# Testing mode: replace the dataset path and the training arguments wholesale.
if _testing:
    pm_log_section("Running on testing mode")

    # Use the `.test` variant of the wiki dataset directory
    dataset = path.join(data_dir, "wiki", "20220301.en.test")

    # This dict replaces (does not merge with) the default `training_args`:
    # the run is capped at three optimisation steps, logging every step and
    # using the "steps" evaluation strategy.
    training_args = {
        "optim": "adamw_torch",
        "per_device_train_batch_size": 128,
        "max_steps": 3,
        "logging_steps": 1,
        "evaluation_strategy": "steps",
    }

# Gather the effective settings (after any testing-mode overrides above) into
# one dict so they can be displayed together and looked up later by key.
config = dict(
    dataset = dataset,
    text_col = text_col,

    base_model = base_model,
    max_length = max_length,
    vocab_size = vocab_size,

    tokenize_params = tokenize_params,
    tokenizer_dir = tokenizer_dir,

    mlm_probability = mlm_probability,
    bert_config = bert_config,
    training_args = training_args,
    model_dir = model_dir,
)

# The separating space lives inside the conditional prefix so that the header
# has no stray leading space when testing mode is off.
print(f"{'TESTING ' if _testing else ''}Parameters:")
# sort_dicts=False keeps the keys in the order they were declared above
pprint(config, indent=2, sort_dicts=False)

TESTING Parameters:
{ 'dataset': '../_data/wiki/20220301.en.test',
  'text_col': 'text',
  'base_model': 'bert-base-cased',
  'max_length': 128,
  'vocab_size': 20000,
  'tokenize_params': {'batched': True},
  'tokenizer_dir': '../_data/pretrain/tokenizer',
  'mlm_probability': 0.15,
  'bert_config': {},
  'training_args': { 'optim': 'adamw_torch',
                     'per_device_train_batch_size': 128,
                     'max_steps': 3,
                     'logging_steps': 1,
                     'evaluation_strategy': 'steps'},
  'model_dir': '../_data/pretrain/model'}


# Load dataset

In [6]:
from datasets import Dataset, load_dataset

pm_log_section("Loading dataset")

# Take the dataset directory from `config` rather than from `dataset`: this
# cell rebinds `dataset` from that path string to a dict of splits, so reading
# the path back out of `dataset` would raise inside `path.join` on a second
# run of the cell. `config["dataset"]` keeps holding the path string.
ds_dir = config["dataset"]

dataset = dict()
for split in ["train", "val", "test"]:
    data_file = path.join(ds_dir, f"{split}_data.json")
    # Fall back to the ".gz" file name when the plain JSON file is absent
    if not path.isfile(data_file):
        data_file += ".gz"
    # Records are read from the file's top-level "data" field; the loaded
    # result is indexed with "train" to get the dataset for this split.
    dataset[split] = load_dataset("json", data_files=data_file, field="data")["train"]

Using custom data configuration default-76618ebe809bedea
Reusing dataset json (/Users/yenson/.cache/huggingface/datasets/json/default-76618ebe809bedea/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-29cf15927ba28352
Reusing dataset json (/Users/yenson/.cache/huggingface/datasets/json/default-29cf15927ba28352/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-d702a3f3dcca03ab
Reusing dataset json (/Users/yenson/.cache/huggingface/datasets/json/default-d702a3f3dcca03ab/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

# Tokenization

In [7]:
from transformers import BertTokenizerFast

# Announce the tokenization stage (a no-op unless `_pm_log_sections` is enabled)
pm_log_section("Tokenizing")

In [8]:
# Load the tokenizer published for `base_model`, then train a new tokenizer
# from it on the training split's text column with a `vocab_size` vocabulary.
# The text column is passed inline so no extra reference to it is kept.
pretrained_tokenizer = BertTokenizerFast.from_pretrained(base_model)
tokenizer = pretrained_tokenizer.train_new_from_iterator(
    dataset["train"][text_col],
    vocab_size,
)
tokenizer.model_max_length = max_length

# Persist the trained tokenizer; the trailing semicolon suppresses the cell's
# echo of the call's return value.
tokenizer.save_pretrained(tokenizer_dir);






In [9]:
def tokenize_function(ex):
    """Run the trained tokenizer over the text column of `ex` with truncation enabled."""
    return tokenizer(ex[text_col], truncation=True)


# Tokenize every split, dropping all of the split's original columns so that
# only the tokenizer's outputs remain.
tokenized_dataset = {}
for split_name, split_ds in dataset.items():
    tokenized_dataset[split_name] = split_ds.map(
        tokenize_function,
        remove_columns=list(split_ds.features),
        **tokenize_params,
    )

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Train masked language model

In [11]:
from transformers import (BertConfig,
                          BertForMaskedLM,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments)

pm_log_section("Training MLM")

# Collator that applies masked-language-model masking with the configured
# probability when batches are assembled.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer,
                                                mlm_probability = mlm_probability)

# `bert_config` and `training_args` start out as plain dicts of keyword
# arguments and are rebound below to library objects. The keyword arguments
# are therefore read from `config`, which still holds the original dicts, so
# that re-running this cell does not try to unpack the already-built objects.
bert_config = BertConfig(vocab_size = tokenizer.vocab_size,
                         **config["bert_config"])
# The model is built from the config alone (no pretrained weights are loaded here)
model = BertForMaskedLM(config = bert_config)

training_args = TrainingArguments(output_dir = model_dir,
                                  overwrite_output_dir = True,
                                  **config["training_args"])

trainer = Trainer(model = model,
                  args = training_args,
                  data_collator = data_collator,
                  train_dataset = tokenized_dataset["train"],
                  eval_dataset = tokenized_dataset["val"])

trainer.train()
# Write the final model to `model_dir` once training has finished
trainer.save_model(model_dir)

max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 8788
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 3


  0%|          | 0/3 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
***** Running Evaluation *****
  Num examples = 998
  Batch size = 8


{'loss': 10.0299, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.01}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 9.616159439086914, 'eval_runtime': 33.0368, 'eval_samples_per_second': 30.209, 'eval_steps_per_second': 3.784, 'epoch': 0.01}


***** Running Evaluation *****
  Num examples = 998
  Batch size = 8


{'loss': 9.5617, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.03}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 9.430152893066406, 'eval_runtime': 20.5777, 'eval_samples_per_second': 48.499, 'eval_steps_per_second': 6.075, 'epoch': 0.03}


***** Running Evaluation *****
  Num examples = 998
  Batch size = 8


{'loss': 9.4715, 'learning_rate': 0.0, 'epoch': 0.04}


  0%|          | 0/125 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ../_data/pretrain/model
Configuration saved in ../_data/pretrain/model/config.json


{'eval_loss': 9.366676330566406, 'eval_runtime': 21.3607, 'eval_samples_per_second': 46.721, 'eval_steps_per_second': 5.852, 'epoch': 0.04}
{'train_runtime': 114.1717, 'train_samples_per_second': 3.363, 'train_steps_per_second': 0.026, 'train_loss': 9.687678019205729, 'epoch': 0.04}


Model weights saved in ../_data/pretrain/model/pytorch_model.bin
