# Setup

In [1]:
from os import path
from typing import Optional

# `_dh` is the directory-history list that IPython places in the notebook's
# globals; its first entry is taken as the notebook's own directory.
__DIR__ = globals()["_dh"][0]

# Data folder that sits next to the notebook's parent directory, expressed
# relative to the current working directory.
data_dir = path.relpath(
    path.join(__DIR__, path.pardir, "_data")
)

In [2]:
# Settings
_colab_install = True       # attempt the Colab install step in the setup cell
_pm_log_sections = False    # print "[===== ... =====]" section banners
_testing = True             # switch to the small test dataset / short run

# Parameters
# Directory holding the `{split}_data.json[.gz]` files.
dataset = path.join(data_dir, "wiki", "20220301.en.1gb")

base_model = "bert-base-cased"   # tokenizer to start from
max_length = 128                 # assigned to tokenizer.model_max_length
vocab_size = 20_000              # vocabulary size for the retrained tokenizer

# Extra keyword arguments forwarded to `Dataset.map` when tokenizing.
tokenize_params = {"batched": True, "num_proc": 4}
tokenizer_dir = path.join(data_dir, "pretrain", "tokenizer")

mlm_probability = 0.15
# Extra keyword arguments for `BertConfig` (none by default).
bert_config = {}
# Keyword arguments for `TrainingArguments`.
training_args = {
    "optim": "adamw_torch",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 64,
    "eval_accumulation_steps": 10,
    "evaluation_strategy": "steps",
    "logging_steps": 5000,
    "save_steps": 5000,
    "save_total_limit": 5,
}
# Upper bound on rows kept for non-training splits; None keeps everything.
max_eval_samples: Optional[int] = 2000
model_dir = path.join(data_dir, "pretrain", "model")

In [3]:
# Parameters
# Overrides the `_pm_log_sections = False` default from the settings cell, so
# section banners are printed for this run.
# NOTE(review): this looks like a papermill-injected parameters cell — confirm
# before editing it by hand.
_pm_log_sections = True


In [4]:
# Testing mode: point at the small test dataset, cut training down to a few
# steps with per-step logging, and shrink the evaluation subset.
if _testing:
    dataset = path.join(data_dir, "wiki", "20220301.en.test")
    training_args.update(max_steps=3, logging_steps=1)
    max_eval_samples = 1000

## Process settings / parameters

In [5]:
from pprint import pprint
from collections import OrderedDict

# Optional Colab setup: when enabled and `google.colab` can be imported, make
# sure the install script exists (downloading it if necessary) and run it.
if _colab_install:
    try:
        # The import doubles as the environment probe: a ModuleNotFoundError
        # raised here is caught below and the whole step is skipped.
        import google.colab
        
        colab_install_script = path.join(__DIR__, "..", "colab_install.sh")

        # Download the script only when there is no copy at the expected path.
        if not path.isfile(colab_install_script):
            script_url = "https://raw.githubusercontent.com/yenson-lau/pii-remediation/main/colab_install.sh"
            !wget $script_url -O $colab_install_script

        !bash $colab_install_script

    except ModuleNotFoundError:
        # A module could not be imported (presumably not running on Colab):
        # nothing to install.
        pass

def _make_section_logger(enabled):
    """Return the section logger; `enabled` is captured once, at creation."""
    def pm_log_section(message):
        """Print `message` as a section banner, or do nothing when disabled."""
        if enabled:
            print(f"\n[===== {message} =====]\n")

    return pm_log_section


# The flag is read here, once; later changes to `_pm_log_sections` do not
# affect the logger that has already been built.
pm_log_section = _make_section_logger(_pm_log_sections)

if _testing:
    pm_log_section("Running on testing mode")

# Snapshot of the effective parameters, in display order. The dict-valued
# entries are the same objects as the variables they are taken from.
config = OrderedDict([
    ("dataset", dataset),

    ("base_model", base_model),
    ("max_length", max_length),
    ("vocab_size", vocab_size),

    ("tokenize_params", tokenize_params),
    ("tokenizer_dir", tokenizer_dir),

    ("mlm_probability", mlm_probability),
    ("bert_config", bert_config),
    ("training_args", training_args),
    ("max_eval_samples", max_eval_samples),
    ("model_dir", model_dir),
])

# Show the parameters, flagging test runs in the heading.
print(("TESTING " if _testing else "") + "Parameters:")
pprint(config, indent=2)


[===== Running on testing mode =====]

TESTING Parameters:
OrderedDict([ ('dataset', '../_data/wiki/20220301.en.test'),
              ('base_model', 'bert-base-cased'),
              ('max_length', 128),
              ('vocab_size', 20000),
              ('tokenize_params', {'batched': True, 'num_proc': 4}),
              ('tokenizer_dir', '../_data/pretrain/tokenizer'),
              ('mlm_probability', 0.15),
              ('bert_config', {}),
              ( 'training_args',
                { 'eval_accumulation_steps': 10,
                  'evaluation_strategy': 'steps',
                  'logging_steps': 1,
                  'max_steps': 3,
                  'num_train_epochs': 3,
                  'optim': 'adamw_torch',
                  'per_device_train_batch_size': 64,
                  'save_steps': 5000,
                  'save_total_limit': 5}),
              ('max_eval_samples', 1000),
              ('model_dir', '../_data/pretrain/model')])


# Load dataset

In [6]:
from datasets import Dataset, load_dataset

pm_log_section("Loading dataset")

# Take the dataset directory from `config` instead of from `dataset`: the name
# `dataset` is rebound to the dict of splits just below, so reading the path
# from it would fail as soon as this cell is executed a second time.
ds_dir = config["dataset"]
dataset = dict()
for split in ["train", "val"]:
    data_file = path.join(ds_dir, f"{split}_data.json")
    # Fall back to the gzip-compressed file when the plain JSON is absent.
    if not path.isfile(data_file):
        data_file += ".gz"
    # Records are read from the "data" field of the JSON file; a single data
    # file is exposed by `load_dataset` under the "train" key.
    dataset[split] = load_dataset("json", data_files=data_file, field="data")["train"]

    # Only non-training splits are capped, and only when a cap is configured
    # and the split is actually larger than it.
    if ((split != "train")
        and (max_eval_samples is not None)
        and (len(dataset[split]) > max_eval_samples)):

        dataset[split] = dataset[split].select(range(max_eval_samples))

  from .autonotebook import tqdm as notebook_tqdm



[===== Loading dataset =====]



Using custom data configuration default-e5574e295c9a56a3


Reusing dataset json (/Users/yenson/.cache/huggingface/datasets/json/default-e5574e295c9a56a3/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 825.16it/s]




Using custom data configuration default-e7bfbf373754ac09


Reusing dataset json (/Users/yenson/.cache/huggingface/datasets/json/default-e7bfbf373754ac09/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 1069.70it/s]




# Tokenization

In [7]:
from transformers import BertTokenizerFast

# Announce the tokenization stage (prints nothing when section logging is off).
pm_log_section("Tokenizing")


[===== Tokenizing =====]



In [8]:
def _iter_text_batches(ds, batch_size=1000):
    """Yield the "text" column of `ds` as lists of at most `batch_size` rows.

    Feeding the tokenizer trainer from this generator avoids building one
    Python list that holds every training text at once.
    """
    for start in range(0, len(ds), batch_size):
        yield ds[start:start + batch_size]["text"]


# Retrain the base model's tokenizer on the training texts with the
# configured vocabulary size.
tokenizer = (BertTokenizerFast
                .from_pretrained(base_model)
                .train_new_from_iterator(_iter_text_batches(dataset["train"]), vocab_size))
tokenizer.model_max_length = max_length

# Trailing semicolon suppresses the notebook echo of the returned value.
tokenizer.save_pretrained(tokenizer_dir);








In [9]:
def tokenize_function(ex):
    """Tokenize the "text" field of `ex` with truncation enabled."""
    return tokenizer(ex["text"], truncation=True)


# Tokenize every split, dropping all original columns so that only the
# tokenizer's outputs remain.
tokenized_dataset = {
    split_name: split_ds.map(
        tokenize_function,
        remove_columns=list(split_ds.features),
        **tokenize_params,
    )
    for split_name, split_ds in dataset.items()
}

#0:   0%|          | 0/3 [00:00<?, ?ba/s]




#1:   0%|          | 0/3 [00:00<?, ?ba/s]

[A






#3:   0%|          | 0/3 [00:00<?, ?ba/s]

[A[A[A





#2:   0%|          | 0/3 [00:00<?, ?ba/s]

[A[A

#0:  67%|██████▋   | 2/3 [00:00<00:00, 13.38ba/s]

#0: 100%|██████████| 3/3 [00:00<00:00, 18.28ba/s]







#1:  67%|██████▋   | 2/3 [00:00<00:00, 13.66ba/s]

[A






#3:  67%|██████▋   | 2/3 [00:00<00:00, 14.17ba/s]

[A[A[A





#2:  67%|██████▋   | 2/3 [00:00<00:00, 13.96ba/s]

[A[A

#1: 100%|██████████| 3/3 [00:00<00:00, 18.31ba/s]




#2: 100%|██████████| 3/3 [00:00<00:00, 17.54ba/s]




#3: 100%|██████████| 3/3 [00:00<00:00, 16.64ba/s]




#0:   0%|          | 0/1 [00:00<?, ?ba/s]




#1:   0%|          | 0/1 [00:00<?, ?ba/s]

[A





#2:   0%|          | 0/1 [00:00<?, ?ba/s]

[A[A

#0: 100%|██████████| 1/1 [00:00<00:00, 38.74ba/s]









#3:   0%|          | 0/1 [00:00<?, ?ba/s]

[A[A[A

#1: 100%|██████████| 1/1 [00:00<00:00, 40.66ba/s]




#2: 100%|██████████| 1/1 [00:00<00:00, 40.51ba/s]




#3: 100%|██████████| 1/1 [00:00<00:00, 34.85ba/s]




# Train masked language model

In [10]:
import numpy as np
from transformers import (BertConfig,
                          BertForMaskedLM,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments)

pm_log_section("Training MLM")

# Collator that applies masking for the MLM objective at the configured rate.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer,
                                                mlm_probability = mlm_probability)

# The keyword dicts are expanded from `config` rather than from the variables
# `bert_config` / `training_args`: those names are rebound to library objects
# here, so expanding them directly would raise if this cell were run again.
bert_config = BertConfig(vocab_size = tokenizer.vocab_size, **config["bert_config"])
# Model is built from the configuration only (no pretrained weights loaded).
model = BertForMaskedLM(config = bert_config)

training_args = TrainingArguments(output_dir = model_dir,
                                  overwrite_output_dir = True,
                                  **config["training_args"])

# def compute_metrics(eval_preds):
#     idxs0, idxs1 = np.where(eval_preds.label_ids!=-100)

#     preds = np.argmax(eval_preds.predictions[idxs0, idxs1, :], axis=-1)
#     labels = eval_preds.label_ids[idxs0, idxs1]

#     acc = (preds==labels).sum()/len(preds)

#     return {"accuracy": acc}


[===== Training MLM =====]



In [11]:
# Wire the model, arguments, collator and tokenized splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
)

# Train, then write the final model to `model_dir`.
trainer.train()
trainer.save_model(model_dir)

max_steps is given, it will override any value given in num_train_epochs


***** Running training *****


  Num examples = 8788


  Num Epochs = 1


  Instantaneous batch size per device = 64


  Total train batch size (w. parallel, distributed & accumulation) = 64


  Gradient Accumulation steps = 1


  Total optimization steps = 3


  0%|          | 0/3 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


 33%|███▎      | 1/3 [00:04<00:08,  4.30s/it]

                                             



 33%|███▎      | 1/3 [00:04<00:08,  4.30s/it]

***** Running Evaluation *****


  Num examples = 998


  Batch size = 8


{'loss': 9.9926, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.01}





  0%|          | 0/125 [00:00<?, ?it/s]

[A




  2%|▏         | 2/125 [00:00<00:09, 13.43it/s]

[A




  3%|▎         | 4/125 [00:00<00:14,  8.28it/s]

[A




  4%|▍         | 5/125 [00:00<00:15,  7.54it/s]

[A




  5%|▍         | 6/125 [00:00<00:16,  7.06it/s]

[A




  6%|▌         | 7/125 [00:00<00:17,  6.68it/s]

[A




  6%|▋         | 8/125 [00:01<00:18,  6.26it/s]

[A




  7%|▋         | 9/125 [00:01<00:19,  6.08it/s]

[A




  8%|▊         | 10/125 [00:01<00:18,  6.20it/s]

[A




  9%|▉         | 11/125 [00:01<00:17,  6.62it/s]

[A




 10%|▉         | 12/125 [00:01<00:17,  6.64it/s]

[A




 10%|█         | 13/125 [00:01<00:17,  6.33it/s]

[A




 11%|█         | 14/125 [00:02<00:17,  6.41it/s]

[A




 12%|█▏        | 15/125 [00:02<00:18,  5.94it/s]

[A




 13%|█▎        | 16/125 [00:02<00:19,  5.53it/s]

[A




 14%|█▎        | 17/125 [00:02<00:18,  5.99it/s]

[A




 14%|█▍        | 18/125 [00:02<00:17,  6.01it/s]

[A




 15%|█▌        | 19/125 [00:02<00:18,  5.69it/s]

[A




 16%|█▌        | 20/125 [00:03<00:17,  6.02it/s]

[A




 17%|█▋        | 21/125 [00:03<00:16,  6.22it/s]

[A




 18%|█▊        | 22/125 [00:03<00:14,  6.96it/s]

[A




 18%|█▊        | 23/125 [00:03<00:16,  6.31it/s]

[A




 19%|█▉        | 24/125 [00:03<00:16,  6.06it/s]

[A




 20%|██        | 25/125 [00:03<00:16,  6.03it/s]

[A




 21%|██        | 26/125 [00:04<00:15,  6.36it/s]

[A




 22%|██▏       | 27/125 [00:04<00:14,  6.85it/s]

[A




 22%|██▏       | 28/125 [00:04<00:15,  6.31it/s]

[A




 23%|██▎       | 29/125 [00:04<00:14,  6.44it/s]

[A




 24%|██▍       | 30/125 [00:04<00:13,  6.80it/s]

[A




 25%|██▍       | 31/125 [00:04<00:15,  6.12it/s]

[A




 26%|██▌       | 32/125 [00:04<00:14,  6.55it/s]

[A




 26%|██▋       | 33/125 [00:05<00:13,  6.68it/s]

[A




 27%|██▋       | 34/125 [00:05<00:13,  6.73it/s]

[A




 28%|██▊       | 35/125 [00:05<00:13,  6.79it/s]

[A




 29%|██▉       | 36/125 [00:05<00:14,  6.27it/s]

[A




 30%|██▉       | 37/125 [00:05<00:13,  6.64it/s]

[A




 30%|███       | 38/125 [00:05<00:12,  6.71it/s]

[A




 31%|███       | 39/125 [00:06<00:16,  5.18it/s]

[A




 32%|███▏      | 40/125 [00:06<00:15,  5.42it/s]

[A




 33%|███▎      | 41/125 [00:06<00:14,  5.86it/s]

[A




 34%|███▎      | 42/125 [00:06<00:13,  6.28it/s]

[A




 34%|███▍      | 43/125 [00:06<00:12,  6.42it/s]

[A




 35%|███▌      | 44/125 [00:06<00:12,  6.70it/s]

[A




 36%|███▌      | 45/125 [00:06<00:11,  6.97it/s]

[A




 37%|███▋      | 46/125 [00:07<00:10,  7.52it/s]

[A




 38%|███▊      | 47/125 [00:07<00:11,  6.97it/s]

[A




 38%|███▊      | 48/125 [00:07<00:11,  6.45it/s]

[A




 39%|███▉      | 49/125 [00:07<00:11,  6.44it/s]

[A




 40%|████      | 50/125 [00:07<00:13,  5.61it/s]

[A




 41%|████      | 51/125 [00:07<00:12,  6.06it/s]

[A




 42%|████▏     | 52/125 [00:08<00:12,  6.06it/s]

[A




 42%|████▏     | 53/125 [00:08<00:10,  6.60it/s]

[A




 43%|████▎     | 54/125 [00:08<00:10,  6.91it/s]

[A




 44%|████▍     | 55/125 [00:08<00:10,  6.51it/s]

[A




 45%|████▍     | 56/125 [00:08<00:11,  6.11it/s]

[A




 46%|████▌     | 57/125 [00:08<00:10,  6.46it/s]

[A




 46%|████▋     | 58/125 [00:09<00:09,  6.84it/s]

[A




 47%|████▋     | 59/125 [00:09<00:10,  6.53it/s]

[A




 48%|████▊     | 60/125 [00:09<00:09,  6.67it/s]

[A




 49%|████▉     | 61/125 [00:09<00:09,  7.00it/s]

[A




 50%|████▉     | 62/125 [00:09<00:08,  7.55it/s]

[A




 50%|█████     | 63/125 [00:09<00:09,  6.39it/s]

[A




 51%|█████     | 64/125 [00:10<00:11,  5.38it/s]

[A




 52%|█████▏    | 65/125 [00:10<00:10,  5.98it/s]

[A




 53%|█████▎    | 66/125 [00:10<00:09,  6.48it/s]

[A




 54%|█████▎    | 67/125 [00:10<00:09,  6.31it/s]

[A




 54%|█████▍    | 68/125 [00:10<00:09,  6.01it/s]

[A




 55%|█████▌    | 69/125 [00:10<00:09,  5.97it/s]

[A




 56%|█████▌    | 70/125 [00:10<00:09,  5.68it/s]

[A




 57%|█████▋    | 71/125 [00:11<00:09,  5.44it/s]

[A




 58%|█████▊    | 72/125 [00:11<00:09,  5.64it/s]

[A




 58%|█████▊    | 73/125 [00:11<00:08,  5.78it/s]

[A




 59%|█████▉    | 74/125 [00:11<00:08,  5.94it/s]

[A




 60%|██████    | 75/125 [00:11<00:08,  5.71it/s]

[A




 61%|██████    | 76/125 [00:12<00:08,  6.11it/s]

[A




 62%|██████▏   | 77/125 [00:12<00:07,  6.42it/s]

[A




 62%|██████▏   | 78/125 [00:12<00:07,  6.70it/s]

[A




 63%|██████▎   | 79/125 [00:12<00:06,  6.68it/s]

[A




 64%|██████▍   | 80/125 [00:12<00:06,  6.57it/s]

[A




 65%|██████▍   | 81/125 [00:12<00:06,  7.19it/s]

[A




 66%|██████▌   | 82/125 [00:12<00:05,  7.28it/s]

[A




 66%|██████▋   | 83/125 [00:12<00:06,  6.72it/s]

[A




 67%|██████▋   | 84/125 [00:13<00:06,  6.70it/s]

[A




 68%|██████▊   | 85/125 [00:13<00:06,  6.64it/s]

[A




 69%|██████▉   | 86/125 [00:13<00:05,  6.87it/s]

[A




 70%|██████▉   | 87/125 [00:13<00:06,  6.19it/s]

[A




 70%|███████   | 88/125 [00:13<00:05,  6.36it/s]

[A




 71%|███████   | 89/125 [00:13<00:05,  7.04it/s]

[A




 72%|███████▏  | 90/125 [00:14<00:05,  6.93it/s]

[A




 73%|███████▎  | 91/125 [00:14<00:04,  6.84it/s]

[A




 74%|███████▎  | 92/125 [00:14<00:04,  6.65it/s]

[A




 74%|███████▍  | 93/125 [00:14<00:04,  6.61it/s]

[A




 75%|███████▌  | 94/125 [00:14<00:04,  6.58it/s]

[A




 76%|███████▌  | 95/125 [00:14<00:04,  6.22it/s]

[A




 77%|███████▋  | 96/125 [00:14<00:04,  6.24it/s]

[A




 78%|███████▊  | 97/125 [00:15<00:04,  6.58it/s]

[A




 78%|███████▊  | 98/125 [00:15<00:03,  6.81it/s]

[A




 79%|███████▉  | 99/125 [00:15<00:04,  6.24it/s]

[A




 80%|████████  | 100/125 [00:15<00:03,  6.40it/s]

[A




 81%|████████  | 101/125 [00:15<00:04,  5.72it/s]

[A




 82%|████████▏ | 102/125 [00:15<00:03,  5.80it/s]

[A




 82%|████████▏ | 103/125 [00:16<00:03,  6.16it/s]

[A




 83%|████████▎ | 104/125 [00:16<00:04,  5.00it/s]

[A




 84%|████████▍ | 105/125 [00:16<00:04,  4.90it/s]

[A




 85%|████████▍ | 106/125 [00:16<00:04,  4.56it/s]

[A




 86%|████████▌ | 107/125 [00:17<00:03,  4.96it/s]

[A




 86%|████████▋ | 108/125 [00:17<00:03,  5.14it/s]

[A




 87%|████████▋ | 109/125 [00:17<00:03,  5.03it/s]

[A




 88%|████████▊ | 110/125 [00:17<00:02,  5.14it/s]

[A




 89%|████████▉ | 111/125 [00:17<00:02,  5.33it/s]

[A




 90%|████████▉ | 112/125 [00:17<00:02,  5.65it/s]

[A




 90%|█████████ | 113/125 [00:18<00:02,  5.45it/s]

[A




 91%|█████████ | 114/125 [00:18<00:02,  5.31it/s]

[A




 92%|█████████▏| 115/125 [00:18<00:01,  5.20it/s]

[A




 93%|█████████▎| 116/125 [00:18<00:01,  4.62it/s]

[A




 94%|█████████▎| 117/125 [00:19<00:01,  4.71it/s]

[A




 94%|█████████▍| 118/125 [00:19<00:01,  4.84it/s]

[A




 95%|█████████▌| 119/125 [00:19<00:01,  5.35it/s]

[A




 96%|█████████▌| 120/125 [00:19<00:01,  4.98it/s]

[A




 97%|█████████▋| 121/125 [00:19<00:00,  5.24it/s]

[A




 98%|█████████▊| 122/125 [00:20<00:00,  4.55it/s]

[A




 98%|█████████▊| 123/125 [00:20<00:00,  4.30it/s]

[A




 99%|█████████▉| 124/125 [00:20<00:00,  4.44it/s]

[A




100%|██████████| 125/125 [00:20<00:00,  5.28it/s]

[A

                                             






                                                 

[A

 33%|███▎      | 1/3 [00:25<00:08,  4.30s/it]




100%|██████████| 125/125 [00:20<00:00,  5.28it/s]

[A




                                                 

[A

{'eval_loss': 9.465776443481445, 'eval_runtime': 20.8791, 'eval_samples_per_second': 47.799, 'eval_steps_per_second': 5.987, 'epoch': 0.01}


 67%|██████▋   | 2/3 [00:28<00:16, 16.24s/it]

                                             



 67%|██████▋   | 2/3 [00:28<00:16, 16.24s/it]

***** Running Evaluation *****


  Num examples = 998


  Batch size = 8


{'loss': 9.5738, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.01}





  0%|          | 0/125 [00:00<?, ?it/s]

[A




  2%|▏         | 2/125 [00:00<00:08, 15.02it/s]

[A




  3%|▎         | 4/125 [00:00<00:13,  9.13it/s]

[A




  5%|▍         | 6/125 [00:00<00:15,  7.80it/s]

[A




  6%|▌         | 7/125 [00:00<00:15,  7.44it/s]

[A




  6%|▋         | 8/125 [00:01<00:16,  6.93it/s]

[A




  7%|▋         | 9/125 [00:01<00:18,  6.31it/s]

[A




  8%|▊         | 10/125 [00:01<00:18,  6.19it/s]

[A




  9%|▉         | 11/125 [00:01<00:18,  6.20it/s]

[A




 10%|▉         | 12/125 [00:01<00:18,  5.96it/s]

[A




 10%|█         | 13/125 [00:02<00:21,  5.18it/s]

[A




 11%|█         | 14/125 [00:02<00:22,  4.91it/s]

[A




 12%|█▏        | 15/125 [00:02<00:23,  4.72it/s]

[A




 13%|█▎        | 16/125 [00:02<00:23,  4.68it/s]

[A




 14%|█▎        | 17/125 [00:02<00:20,  5.18it/s]

[A




 14%|█▍        | 18/125 [00:03<00:20,  5.32it/s]

[A




 15%|█▌        | 19/125 [00:03<00:20,  5.21it/s]

[A




 16%|█▌        | 20/125 [00:03<00:18,  5.63it/s]

[A




 17%|█▋        | 21/125 [00:03<00:17,  5.98it/s]

[A




 18%|█▊        | 22/125 [00:03<00:15,  6.70it/s]

[A




 18%|█▊        | 23/125 [00:03<00:16,  6.12it/s]

[A




 19%|█▉        | 24/125 [00:03<00:17,  5.88it/s]

[A




 20%|██        | 25/125 [00:04<00:16,  5.91it/s]

[A




 21%|██        | 26/125 [00:04<00:15,  6.24it/s]

[A




 22%|██▏       | 27/125 [00:04<00:14,  6.73it/s]

[A




 22%|██▏       | 28/125 [00:04<00:15,  6.21it/s]

[A




 23%|██▎       | 29/125 [00:04<00:15,  6.33it/s]

[A




 24%|██▍       | 30/125 [00:04<00:14,  6.71it/s]

[A




 25%|██▍       | 31/125 [00:05<00:15,  6.19it/s]

[A




 26%|██▌       | 32/125 [00:05<00:14,  6.62it/s]

[A




 26%|██▋       | 33/125 [00:05<00:13,  6.60it/s]

[A




 27%|██▋       | 34/125 [00:05<00:13,  6.55it/s]

[A




 28%|██▊       | 35/125 [00:05<00:13,  6.65it/s]

[A




 29%|██▉       | 36/125 [00:05<00:14,  6.25it/s]

[A




 30%|██▉       | 37/125 [00:05<00:13,  6.41it/s]

[A




 30%|███       | 38/125 [00:06<00:13,  6.37it/s]

[A




 31%|███       | 39/125 [00:06<00:17,  4.94it/s]

[A




 32%|███▏      | 40/125 [00:06<00:15,  5.31it/s]

[A




 33%|███▎      | 41/125 [00:06<00:14,  5.72it/s]

[A




 34%|███▎      | 42/125 [00:06<00:13,  6.02it/s]

[A




 34%|███▍      | 43/125 [00:07<00:13,  6.11it/s]

[A




 35%|███▌      | 44/125 [00:07<00:12,  6.40it/s]

[A




 36%|███▌      | 45/125 [00:07<00:12,  6.66it/s]

[A




 37%|███▋      | 46/125 [00:07<00:11,  7.17it/s]

[A




 38%|███▊      | 47/125 [00:07<00:11,  6.67it/s]

[A




 38%|███▊      | 48/125 [00:07<00:12,  6.24it/s]

[A




 39%|███▉      | 49/125 [00:07<00:12,  6.27it/s]

[A




 40%|████      | 50/125 [00:08<00:13,  5.59it/s]

[A




 41%|████      | 51/125 [00:08<00:12,  5.96it/s]

[A




 42%|████▏     | 52/125 [00:08<00:12,  6.02it/s]

[A




 42%|████▏     | 53/125 [00:08<00:10,  6.57it/s]

[A




 43%|████▎     | 54/125 [00:08<00:10,  6.89it/s]

[A




 44%|████▍     | 55/125 [00:08<00:10,  6.53it/s]

[A




 45%|████▍     | 56/125 [00:09<00:11,  6.10it/s]

[A




 46%|████▌     | 57/125 [00:09<00:10,  6.45it/s]

[A




 46%|████▋     | 58/125 [00:09<00:09,  6.89it/s]

[A




 47%|████▋     | 59/125 [00:09<00:10,  6.58it/s]

[A




 48%|████▊     | 60/125 [00:09<00:09,  6.69it/s]

[A




 49%|████▉     | 61/125 [00:09<00:09,  7.03it/s]

[A




 50%|████▉     | 62/125 [00:09<00:08,  7.55it/s]

[A




 50%|█████     | 63/125 [00:10<00:09,  6.41it/s]

[A




 51%|█████     | 64/125 [00:10<00:11,  5.37it/s]

[A




 52%|█████▏    | 65/125 [00:10<00:09,  6.04it/s]

[A




 53%|█████▎    | 66/125 [00:10<00:08,  6.60it/s]

[A




 54%|█████▎    | 67/125 [00:10<00:09,  6.40it/s]

[A




 54%|█████▍    | 68/125 [00:10<00:09,  6.01it/s]

[A




 55%|█████▌    | 69/125 [00:11<00:09,  5.92it/s]

[A




 56%|█████▌    | 70/125 [00:11<00:10,  5.48it/s]

[A




 57%|█████▋    | 71/125 [00:11<00:10,  5.25it/s]

[A




 58%|█████▊    | 72/125 [00:11<00:09,  5.60it/s]

[A




 58%|█████▊    | 73/125 [00:11<00:08,  5.79it/s]

[A




 59%|█████▉    | 74/125 [00:12<00:08,  5.87it/s]

[A




 60%|██████    | 75/125 [00:12<00:09,  5.42it/s]

[A




 61%|██████    | 76/125 [00:12<00:08,  5.82it/s]

[A




 62%|██████▏   | 77/125 [00:12<00:07,  6.21it/s]

[A




 62%|██████▏   | 78/125 [00:12<00:07,  6.61it/s]

[A




 63%|██████▎   | 79/125 [00:12<00:06,  6.71it/s]

[A




 64%|██████▍   | 80/125 [00:12<00:06,  6.68it/s]

[A




 65%|██████▍   | 81/125 [00:13<00:05,  7.35it/s]

[A




 66%|██████▌   | 82/125 [00:13<00:05,  7.37it/s]

[A




 66%|██████▋   | 83/125 [00:13<00:06,  6.74it/s]

[A




 67%|██████▋   | 84/125 [00:13<00:06,  6.63it/s]

[A




 68%|██████▊   | 85/125 [00:13<00:06,  6.36it/s]

[A




 69%|██████▉   | 86/125 [00:13<00:05,  6.51it/s]

[A




 70%|██████▉   | 87/125 [00:14<00:06,  5.79it/s]

[A




 70%|███████   | 88/125 [00:14<00:06,  5.96it/s]

[A




 71%|███████   | 89/125 [00:14<00:05,  6.51it/s]

[A




 72%|███████▏  | 90/125 [00:14<00:05,  6.50it/s]

[A




 73%|███████▎  | 91/125 [00:14<00:05,  6.42it/s]

[A




 74%|███████▎  | 92/125 [00:14<00:05,  6.19it/s]

[A




 74%|███████▍  | 93/125 [00:14<00:05,  6.10it/s]

[A




 75%|███████▌  | 94/125 [00:15<00:05,  6.19it/s]

[A




 76%|███████▌  | 95/125 [00:15<00:05,  5.95it/s]

[A




 77%|███████▋  | 96/125 [00:15<00:04,  6.07it/s]

[A




 78%|███████▊  | 97/125 [00:15<00:04,  6.43it/s]

[A




 78%|███████▊  | 98/125 [00:15<00:04,  6.71it/s]

[A




 79%|███████▉  | 99/125 [00:15<00:04,  6.19it/s]

[A




 80%|████████  | 100/125 [00:16<00:03,  6.56it/s]

[A




 81%|████████  | 101/125 [00:16<00:04,  5.90it/s]

[A




 82%|████████▏ | 102/125 [00:16<00:03,  5.95it/s]

[A




 82%|████████▏ | 103/125 [00:16<00:03,  6.19it/s]

[A




 83%|████████▎ | 104/125 [00:16<00:04,  4.83it/s]

[A




 84%|████████▍ | 105/125 [00:17<00:04,  4.75it/s]

[A




 85%|████████▍ | 106/125 [00:17<00:04,  4.46it/s]

[A




 86%|████████▌ | 107/125 [00:17<00:03,  4.73it/s]

[A




 86%|████████▋ | 108/125 [00:17<00:03,  4.92it/s]

[A




 87%|████████▋ | 109/125 [00:17<00:03,  4.75it/s]

[A




 88%|████████▊ | 110/125 [00:18<00:03,  4.91it/s]

[A




 89%|████████▉ | 111/125 [00:18<00:02,  5.23it/s]

[A




 90%|████████▉ | 112/125 [00:18<00:02,  5.66it/s]

[A




 90%|█████████ | 113/125 [00:18<00:02,  5.53it/s]

[A




 91%|█████████ | 114/125 [00:18<00:02,  5.42it/s]

[A




 92%|█████████▏| 115/125 [00:19<00:01,  5.25it/s]

[A




 93%|█████████▎| 116/125 [00:19<00:01,  4.67it/s]

[A




 94%|█████████▎| 117/125 [00:19<00:01,  4.74it/s]

[A




 94%|█████████▍| 118/125 [00:19<00:01,  4.94it/s]

[A




 95%|█████████▌| 119/125 [00:19<00:01,  5.44it/s]

[A




 96%|█████████▌| 120/125 [00:20<00:00,  5.14it/s]

[A




 97%|█████████▋| 121/125 [00:20<00:00,  5.38it/s]

[A




 98%|█████████▊| 122/125 [00:20<00:00,  4.67it/s]

[A




 98%|█████████▊| 123/125 [00:20<00:00,  4.32it/s]

[A




 99%|█████████▉| 124/125 [00:20<00:00,  4.46it/s]

[A




100%|██████████| 125/125 [00:21<00:00,  5.29it/s]

[A

                                             






                                                 

[A

 67%|██████▋   | 2/3 [00:50<00:16, 16.24s/it]




100%|██████████| 125/125 [00:21<00:00,  5.29it/s]

[A




                                                 

[A

{'eval_loss': 9.401997566223145, 'eval_runtime': 21.3523, 'eval_samples_per_second': 46.74, 'eval_steps_per_second': 5.854, 'epoch': 0.01}


100%|██████████| 3/3 [00:56<00:00, 21.24s/it]

                                             



100%|██████████| 3/3 [00:56<00:00, 21.24s/it]

***** Running Evaluation *****


  Num examples = 998


  Batch size = 8


{'loss': 9.4368, 'learning_rate': 0.0, 'epoch': 0.02}





  0%|          | 0/125 [00:00<?, ?it/s]

[A




  2%|▏         | 2/125 [00:00<00:08, 14.96it/s]

[A




  3%|▎         | 4/125 [00:00<00:13,  9.05it/s]

[A




  5%|▍         | 6/125 [00:00<00:15,  7.78it/s]

[A




  6%|▌         | 7/125 [00:00<00:15,  7.44it/s]

[A




  6%|▋         | 8/125 [00:01<00:16,  6.98it/s]

[A




  7%|▋         | 9/125 [00:01<00:17,  6.61it/s]

[A




  8%|▊         | 10/125 [00:01<00:17,  6.46it/s]

[A




  9%|▉         | 11/125 [00:01<00:16,  6.90it/s]

[A




 10%|▉         | 12/125 [00:01<00:16,  7.00it/s]

[A




 10%|█         | 13/125 [00:01<00:16,  6.63it/s]

[A




 11%|█         | 14/125 [00:01<00:16,  6.55it/s]

[A




 12%|█▏        | 15/125 [00:02<00:18,  6.01it/s]

[A




 13%|█▎        | 16/125 [00:02<00:19,  5.71it/s]

[A




 14%|█▎        | 17/125 [00:02<00:17,  6.22it/s]

[A




 14%|█▍        | 18/125 [00:02<00:17,  6.28it/s]

[A




 15%|█▌        | 19/125 [00:02<00:17,  6.03it/s]

[A




 16%|█▌        | 20/125 [00:02<00:16,  6.40it/s]

[A




 17%|█▋        | 21/125 [00:03<00:15,  6.62it/s]

[A




 18%|█▊        | 22/125 [00:03<00:14,  7.26it/s]

[A




 18%|█▊        | 23/125 [00:03<00:15,  6.43it/s]

[A




 19%|█▉        | 24/125 [00:03<00:16,  6.02it/s]

[A




 20%|██        | 25/125 [00:03<00:16,  5.95it/s]

[A




 21%|██        | 26/125 [00:03<00:16,  6.17it/s]

[A




 22%|██▏       | 27/125 [00:04<00:14,  6.54it/s]

[A




 22%|██▏       | 28/125 [00:04<00:16,  5.99it/s]

[A




 23%|██▎       | 29/125 [00:04<00:16,  5.84it/s]

[A




 24%|██▍       | 30/125 [00:04<00:15,  6.00it/s]

[A




 25%|██▍       | 31/125 [00:04<00:17,  5.53it/s]

[A




 26%|██▌       | 32/125 [00:04<00:15,  5.94it/s]

[A




 26%|██▋       | 33/125 [00:05<00:15,  6.06it/s]

[A




 27%|██▋       | 34/125 [00:05<00:15,  5.89it/s]

[A




 28%|██▊       | 35/125 [00:05<00:14,  6.07it/s]

[A




 29%|██▉       | 36/125 [00:05<00:15,  5.86it/s]

[A




 30%|██▉       | 37/125 [00:05<00:14,  6.27it/s]

[A




 30%|███       | 38/125 [00:05<00:13,  6.41it/s]

[A




 31%|███       | 39/125 [00:06<00:17,  4.99it/s]

[A




 32%|███▏      | 40/125 [00:06<00:15,  5.31it/s]

[A




 33%|███▎      | 41/125 [00:06<00:14,  5.77it/s]

[A




 34%|███▎      | 42/125 [00:06<00:13,  6.17it/s]

[A




 34%|███▍      | 43/125 [00:06<00:13,  6.28it/s]

[A




 35%|███▌      | 44/125 [00:06<00:12,  6.66it/s]

[A




 36%|███▌      | 45/125 [00:07<00:11,  7.00it/s]

[A




 37%|███▋      | 46/125 [00:07<00:10,  7.48it/s]

[A




 38%|███▊      | 47/125 [00:07<00:11,  6.57it/s]

[A




 38%|███▊      | 48/125 [00:07<00:12,  6.03it/s]

[A




 39%|███▉      | 49/125 [00:07<00:12,  6.09it/s]

[A




 40%|████      | 50/125 [00:07<00:13,  5.47it/s]

[A




 41%|████      | 51/125 [00:08<00:12,  5.85it/s]

[A




 42%|████▏     | 52/125 [00:08<00:12,  5.77it/s]

[A




 42%|████▏     | 53/125 [00:08<00:11,  6.24it/s]

[A




 43%|████▎     | 54/125 [00:08<00:10,  6.52it/s]

[A




 44%|████▍     | 55/125 [00:08<00:11,  6.21it/s]

[A




 45%|████▍     | 56/125 [00:08<00:11,  5.87it/s]

[A




 46%|████▌     | 57/125 [00:09<00:10,  6.21it/s]

[A




 46%|████▋     | 58/125 [00:09<00:10,  6.66it/s]

[A




 47%|████▋     | 59/125 [00:09<00:10,  6.42it/s]

[A




 48%|████▊     | 60/125 [00:09<00:09,  6.61it/s]

[A




 49%|████▉     | 61/125 [00:09<00:09,  7.02it/s]

[A




 50%|████▉     | 62/125 [00:09<00:08,  7.54it/s]

[A




 50%|█████     | 63/125 [00:09<00:09,  6.40it/s]

[A




 51%|█████     | 64/125 [00:10<00:11,  5.34it/s]

[A




 52%|█████▏    | 65/125 [00:10<00:09,  6.01it/s]

[A




 53%|█████▎    | 66/125 [00:10<00:09,  6.54it/s]

[A




 54%|█████▎    | 67/125 [00:10<00:08,  6.45it/s]

[A




 54%|█████▍    | 68/125 [00:10<00:09,  6.17it/s]

[A




 55%|█████▌    | 69/125 [00:10<00:09,  6.10it/s]

[A




 56%|█████▌    | 70/125 [00:11<00:09,  5.74it/s]

[A




 57%|█████▋    | 71/125 [00:11<00:09,  5.59it/s]

[A




 58%|█████▊    | 72/125 [00:11<00:08,  5.96it/s]

[A




 58%|█████▊    | 73/125 [00:11<00:08,  6.07it/s]

[A




 59%|█████▉    | 74/125 [00:11<00:08,  6.12it/s]

[A




 60%|██████    | 75/125 [00:11<00:08,  5.73it/s]

[A




 61%|██████    | 76/125 [00:12<00:08,  6.09it/s]

[A




 62%|██████▏   | 77/125 [00:12<00:07,  6.47it/s]

[A




 62%|██████▏   | 78/125 [00:12<00:06,  6.85it/s]

[A




 63%|██████▎   | 79/125 [00:12<00:06,  6.89it/s]

[A




 64%|██████▍   | 80/125 [00:12<00:06,  6.98it/s]

[A




 66%|██████▌   | 82/125 [00:12<00:05,  7.80it/s]

[A




 66%|██████▋   | 83/125 [00:13<00:05,  7.27it/s]

[A




 67%|██████▋   | 84/125 [00:13<00:05,  6.96it/s]

[A




 68%|██████▊   | 85/125 [00:13<00:05,  6.88it/s]

[A




 69%|██████▉   | 86/125 [00:13<00:05,  7.04it/s]

[A




 70%|██████▉   | 87/125 [00:13<00:06,  6.27it/s]

[A




 70%|███████   | 88/125 [00:13<00:05,  6.33it/s]

[A




 71%|███████   | 89/125 [00:13<00:05,  7.01it/s]

[A




 72%|███████▏  | 90/125 [00:14<00:05,  6.93it/s]

[A




 73%|███████▎  | 91/125 [00:14<00:04,  6.86it/s]

[A




 74%|███████▎  | 92/125 [00:14<00:04,  6.64it/s]

[A




 74%|███████▍  | 93/125 [00:14<00:04,  6.46it/s]

[A




 75%|███████▌  | 94/125 [00:14<00:04,  6.44it/s]

[A




 76%|███████▌  | 95/125 [00:14<00:04,  6.14it/s]

[A




 77%|███████▋  | 96/125 [00:15<00:04,  6.19it/s]

[A




 78%|███████▊  | 97/125 [00:15<00:04,  6.52it/s]

[A




 78%|███████▊  | 98/125 [00:15<00:04,  6.72it/s]

[A




 79%|███████▉  | 99/125 [00:15<00:04,  6.15it/s]

[A




 80%|████████  | 100/125 [00:15<00:03,  6.51it/s]

[A




 81%|████████  | 101/125 [00:15<00:04,  5.97it/s]

[A




 82%|████████▏ | 102/125 [00:15<00:03,  6.14it/s]

[A




 82%|████████▏ | 103/125 [00:16<00:03,  6.40it/s]

[A




 83%|████████▎ | 104/125 [00:16<00:04,  5.04it/s]

[A




 84%|████████▍ | 105/125 [00:16<00:03,  5.01it/s]

[A




 85%|████████▍ | 106/125 [00:16<00:03,  4.83it/s]

[A




 86%|████████▌ | 107/125 [00:17<00:03,  5.28it/s]

[A




 86%|████████▋ | 108/125 [00:17<00:03,  5.50it/s]

[A




 87%|████████▋ | 109/125 [00:17<00:02,  5.39it/s]

[A




 88%|████████▊ | 110/125 [00:17<00:02,  5.45it/s]

[A




 89%|████████▉ | 111/125 [00:17<00:02,  5.70it/s]

[A




 90%|████████▉ | 112/125 [00:17<00:02,  6.09it/s]

[A




 90%|█████████ | 113/125 [00:18<00:02,  5.91it/s]

[A




 91%|█████████ | 114/125 [00:18<00:01,  5.76it/s]

[A




 92%|█████████▏| 115/125 [00:18<00:01,  5.62it/s]

[A




 93%|█████████▎| 116/125 [00:18<00:01,  4.89it/s]

[A




 94%|█████████▎| 117/125 [00:18<00:01,  4.90it/s]

[A




 94%|█████████▍| 118/125 [00:19<00:01,  5.02it/s]

[A




 95%|█████████▌| 119/125 [00:19<00:01,  5.53it/s]

[A




 96%|█████████▌| 120/125 [00:19<00:00,  5.17it/s]

[A




 97%|█████████▋| 121/125 [00:19<00:00,  5.41it/s]

[A




 98%|█████████▊| 122/125 [00:19<00:00,  4.71it/s]

[A




 98%|█████████▊| 123/125 [00:20<00:00,  4.38it/s]

[A




 99%|█████████▉| 124/125 [00:20<00:00,  4.51it/s]

[A




100%|██████████| 125/125 [00:20<00:00,  5.35it/s]

[A

                                             






                                                 

[A

100%|██████████| 3/3 [01:16<00:00, 21.24s/it]




100%|██████████| 125/125 [00:20<00:00,  5.35it/s]

[A




                                                 

[A



Training completed. Do not forget to share your model on huggingface.co/models =)




                                             



100%|██████████| 3/3 [01:16<00:00, 21.24s/it]

100%|██████████| 3/3 [01:16<00:00, 25.59s/it]


Saving model checkpoint to ../_data/pretrain/model


Configuration saved in ../_data/pretrain/model/config.json


{'eval_loss': 9.364686012268066, 'eval_runtime': 20.6742, 'eval_samples_per_second': 48.273, 'eval_steps_per_second': 6.046, 'epoch': 0.02}
{'train_runtime': 76.7765, 'train_samples_per_second': 2.501, 'train_steps_per_second': 0.039, 'train_loss': 9.667746543884277, 'epoch': 0.02}


Model weights saved in ../_data/pretrain/model/pytorch_model.bin


In [12]:
# Final banner marking the end of the run (prints nothing when section logging is off).
pm_log_section("Finished pretraining!")


[===== Finished pretraining! =====]

