# Setup

In [3]:
from os import path
import pandas as pd
from pprint import PrettyPrinter, pprint
from typing import Optional

# `_dh` is only present inside an IPython kernel; its first entry is taken as
# the notebook's directory (presumably the directory the kernel started in).
_notebook_dirs = globals()['_dh']
__DIR__ = _notebook_dirs[0]

# Data lives in the "_data" folder one level above the notebook directory,
# stored as a path relative to the current working directory.
_data_root = path.join(__DIR__, "..", "_data")
data_dir = path.relpath(_data_root)

# Shared pretty-printer used for dumping nested parameters and decoded text.
pp = PrettyPrinter(indent=2, width=120)

# Widen pandas' rendering so long text columns remain readable.
for _option_name, _option_value in (
    ('display.width', 120),
    ('display.max_colwidth', 90),
):
    pd.set_option(_option_name, _option_value)

In [4]:
# ---------------------------------------------------------------------------
# Settings: switches controlling how the notebook itself runs
# ---------------------------------------------------------------------------
_testing = False            # a later cell shrinks the run when this is True
_colab_install = True       # run the Colab install script when on Colab
_pm_log_sections = False    # print section banners via pm_log_section

# ---------------------------------------------------------------------------
# Parameters
# ---------------------------------------------------------------------------

# Data
dataset_dir = path.join(data_dir, "wiki", "20220301.en.1gb")
apply_pii_remediation = True

# Tokenizer
base_model = "bert-base-cased"
max_length = 128
vocab_size = 20_000

tokenizer_dir = path.join(data_dir, "pretrain", "tokenizer")
tokenize_params = {"batched": True, "num_proc": 4}

# Masked-language-model training
mlm_probability = 0.15
bert_config = {}
training_args = {
    "optim": "adamw_torch",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 64,
    "eval_accumulation_steps": 10,
    "evaluation_strategy": "steps",
    "logging_steps": 10000,
    "save_steps": 10000,
    "save_total_limit": 3,
}
# Upper bound on the number of validation examples kept at load time;
# None disables the cap.
max_eval_samples: Optional[int] = 5000
model_dir = path.join(data_dir, "pretrain", "model")

In [5]:
# Testing mode: swap in the small test dataset and cut the run down to a few
# steps so the whole notebook can be smoke-tested quickly.
if _testing:
    # Rebind `dataset_dir` (not `dataset`): it is the name that the parameter
    # summary and the dataset-loading cell read the location from.
    dataset_dir = path.join(data_dir, "wiki", "20220301.en.test")

    # Override the training schedule in place; the other training arguments
    # keep their configured values.
    training_args.update(dict(
        max_steps = 3,
        logging_steps = 1,
    ))

    max_eval_samples = 1000

## Process settings / parameters

In [6]:
# Colab
try:
    import google.colab
    
    # Wrap output text
    from IPython.display import HTML, display
    
    def set_css():
        display(HTML('''
        <style>
            pre {
                white-space: pre-wrap;
            }
        </style>
        '''))
        get_ipython().events.register('pre_run_cell', set_css)
    
    if _colab_install:
        colab_install_script = path.join(__DIR__, "..", "colab_install.sh")

        if not path.isfile(colab_install_script):
            script_url = ("https://raw.githubusercontent.com/"
                            "yenson-lau/pii-remediation/main/colab_install.sh")
            !wget $script_url -O $colab_install_script

        !bash $colab_install_script

        print()

except ModuleNotFoundError:
    pass

In [9]:
from collections import OrderedDict

# The logger variant is chosen once, when this cell runs, from the
# `_pm_log_sections` setting.
if not _pm_log_sections:
    def pm_log_section(message):
        """Section logging is disabled: accept the message and do nothing."""
        return None
else:
    def pm_log_section(message):
        """Print `message` as a banner line surrounded by blank lines."""
        banner = f"\n[===== {message} =====]\n"
        print(banner)

if _testing:
    pm_log_section("Running on testing mode")

# Effective parameters, gathered as ordered (name, value) pairs so that the
# summary below prints them in this order.
config = OrderedDict([
    ("dataset_dir", dataset_dir),

    ("base_model", base_model),
    ("max_length", max_length),
    ("vocab_size", vocab_size),

    ("tokenizer_dir", tokenizer_dir),
    ("tokenize_params", tokenize_params),

    ("mlm_probability", mlm_probability),
    ("bert_config", bert_config),
    ("training_args", training_args),
    ("max_eval_samples", max_eval_samples),
    ("model_dir", model_dir),
])

# Header is prefixed with "TESTING " when the notebook runs in testing mode.
print(f"{'TESTING ' if _testing else ''}Parameters:")
pp.pprint(config)

Parameters:
OrderedDict([ ('dataset_dir', '../_data/wiki/20220301.en.1gb'),
              ('base_model', 'bert-base-cased'),
              ('max_length', 128),
              ('vocab_size', 20000),
              ('tokenizer_dir', '../_data/pretrain/tokenizer'),
              ('tokenize_params', {'batched': True, 'num_proc': 4}),
              ('mlm_probability', 0.15),
              ('bert_config', {}),
              ( 'training_args',
                { 'eval_accumulation_steps': 10,
                  'evaluation_strategy': 'steps',
                  'logging_steps': 10000,
                  'num_train_epochs': 1,
                  'optim': 'adamw_torch',
                  'per_device_train_batch_size': 64,
                  'save_steps': 10000,
                  'save_total_limit': 3}),
              ('max_eval_samples', 5000),
              ('model_dir', '../_data/pretrain/model')])


# Load dataset

In [16]:
from datasets import Dataset, load_dataset

pm_log_section("Loading dataset")

# Load each split from "<split>_data.json" under `dataset_dir`, falling back
# to the ".gz" variant of that file name when the plain file is absent.
dataset = {}
for split in ("train", "val"):
    data_file = path.join(dataset_dir, f"{split}_data.json")
    if not path.isfile(data_file):
        data_file = data_file + ".gz"

    # A single data file is exposed by `load_dataset` under the "train" key,
    # whichever split it actually holds.
    loaded_split = load_dataset("json", data_files=data_file, field="data")["train"]

    # Non-training splits are capped at `max_eval_samples` rows (when set).
    is_eval_split = split != "train"
    if is_eval_split and max_eval_samples is not None:
        if len(loaded_split) > max_eval_samples:
            loaded_split = loaded_split.select(range(max_eval_samples))

    dataset[split] = loaded_split

# Preview the first ten training rows.
display(pd.DataFrame(dataset["train"][:10]))



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,article_id,text
0,14877816,Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as...
1,14877816,The myeloid cell nuclear differentiation antigen (MNDA) is detected only in nuclei of ...
2,14877816,A 200-amino acid region of human MNDA is strikingly similar to a region in the protein...
3,14877816,"The 1.8-kb MNDA mRNA, which contains an interferon-stimulated response element in the ..."
4,14877816,"MNDA is located within 2,200 kb of FCER1A, APCS, CRP, and SPTA1."
5,14877816,"In its pattern of expression and/or regulation, MNDA resembles IFI16, suggesting that ..."
6,4845938,"""Boris the Spider"" is a song written by the Who's bass guitarist, John Entwistle."
7,4845938,It appears as the second track of their 1966 album A Quick One.
8,4845938,"This song is claimed to be Entwistle's first composition, and became a staple of live ..."
9,4845938,"This song, along with ""My Wife"", ""Heaven and Hell"" and ""The Quiet One"", were Entwistle..."


In [17]:
# Optional PII remediation pass over every split.
# NOTE(review): the tokenizer-training and tokenization cells read `dataset`,
# not `anon_dataset`, so this output is not consumed by them as written.
if apply_pii_remediation:
    from pii import apply_anonymization

    def anon_function(ex):
        """Return `ex`'s "text" field after passing it through `apply_anonymization`.

        Accepts either a single example (``ex["text"]`` is a string) or a
        batch (``ex["text"]`` is a list of strings). A batch is joined with
        newlines, anonymized in one call, and split back into lines.

        NOTE(review): `splitlines()` splits on every line boundary, so a text
        that itself contains a newline would come back as several entries and
        the batch length would change. The `map` call below is not batched,
        so only the single-string branch is exercised here.
        """
        if isinstance(ex["text"], list):
            texts = "\n".join(ex["text"])
            return {"text": apply_anonymization(texts)["text"].splitlines()}
        else:
            return {"text": apply_anonymization(ex["text"])["text"]}

    # Cache loading is disabled so the anonymization is recomputed on each run.
    anon_dataset = {
        k: v.map(anon_function, load_from_cache_file=False)
        for k, v in dataset.items()
    }

    # Preview only when remediation ran: `anon_dataset` does not exist
    # otherwise, and displaying it unconditionally would raise NameError.
    display(pd.DataFrame(anon_dataset["train"][:10]))

  0%|          | 0/9403586 [00:00<?, ?ex/s]

KeyboardInterrupt: 

# Tokenization

In [7]:
from transformers import BertTokenizerFast

# Announce the tokenization stage through the notebook's section logger.
pm_log_section("Tokenizing")

In [8]:
# Start from the base model's fast tokenizer and train a new one, with the
# configured vocabulary size, on the training split's text column.
_base_tokenizer = BertTokenizerFast.from_pretrained(base_model)
tokenizer = _base_tokenizer.train_new_from_iterator(
    dataset["train"]["text"],
    vocab_size,
)

# Record the maximum sequence length on the tokenizer itself.
tokenizer.model_max_length = max_length

# Persist the tokenizer to disk (trailing `;` suppresses the cell output).
tokenizer.save_pretrained(tokenizer_dir);

In [9]:
def tokenize_function(ex):
    """Run the notebook's tokenizer over `ex["text"]` with truncation enabled."""
    return tokenizer(ex["text"], truncation=True)


# Tokenize every split, dropping all of the original columns so that only the
# tokenizer's outputs remain.
tokenized_dataset = {
    split_name: split_data.map(
        tokenize_function,
        remove_columns = list(split_data.features),
        **tokenize_params,
    )
    for split_name, split_data in dataset.items()
}

# Preview the first ten tokenized training rows.
display(pd.DataFrame(tokenized_dataset["train"][:10]))

     



 



  



     



 



 



 



Unnamed: 0,input_ids,token_type_ids,attention_mask
0,"[2, 1933, 17790, 212, 3796, 13993, 18314, 2658, 19885, 171, 214, 69, 6632, 254, 175, 7...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,"[2, 199, 2107, 17790, 212, 3796, 5792, 1508, 2658, 3544, 2935, 12, 49, 19394, 125, 13,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1..."
2,"[2, 37, 307, 17, 16755, 6995, 1973, 173, 2171, 49, 19394, 125, 214, 12751, 210, 1937, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1..."
3,"[2, 199, 21, 18, 28, 17, 79, 110, 49, 19394, 125, 81, 16405, 16, 329, 3311, 244, 15761...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1..."
4,"[2, 49, 19394, 125, 214, 1191, 1389, 22, 16, 307, 79, 110, 173, 3718, 6113, 144, 125, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1..."
5,"[2, 252, 416, 5305, 173, 7524, 179, 19, 284, 11471, 16, 49, 19394, 125, 18045, 11526, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1..."
6,"[2, 6, 14484, 162, 11497, 6, 214, 69, 855, 1727, 229, 162, 6340, 11, 87, 4210, 7321, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
7,"[2, 374, 3770, 216, 162, 731, 1424, 173, 411, 3510, 832, 37, 1764, 568, 1565, 18, 3]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
8,"[2, 636, 855, 214, 3652, 184, 235, 7195, 129, 12136, 11, 87, 377, 7003, 16, 179, 766, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
9,"[2, 636, 855, 16, 1166, 226, 6, 1933, 59, 763, 6, 16, 6, 11510, 179, 7462, 6, 179, 6, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1..."


# Train masked language model

In [None]:
import numpy as np
from transformers import (BertConfig,
                          BertForMaskedLM,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments)

pm_log_section("Training MLM")

# Collator configured for language modeling with the configured masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm_probability = mlm_probability,
)

# The model is constructed from a config (not loaded from a checkpoint), with
# its vocabulary size taken from the tokenizer and any extra `bert_config`
# overrides applied.
_model_config = BertConfig(vocab_size=tokenizer.vocab_size, **bert_config)
model = BertForMaskedLM(config=_model_config)

# Checkpoints and the final model go to `model_dir`.
train_args = TrainingArguments(
    output_dir = model_dir,
    overwrite_output_dir = True,
    **training_args,
)

# Token-accuracy metric over labelled positions; currently disabled.
# def compute_metrics(eval_preds):
#     idxs0, idxs1 = np.where(eval_preds.label_ids!=-100)

#     preds = np.argmax(eval_preds.predictions[idxs0, idxs1, :], axis=-1)
#     labels = eval_preds.label_ids[idxs0, idxs1]

#     acc = (preds==labels).sum()/len(preds)

#     return {"accuracy": acc}

trainer = Trainer(
    model = model,
    args = train_args,
    data_collator = data_collator,
    # compute_metrics=compute_metrics,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["val"],
)

using `logging_steps` to initialize `eval_steps` to 10000
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run the training loop configured above.
trainer.train()
# Write the final model to `model_dir`, the same directory used as the
# trainer's output_dir for checkpoints.
trainer.save_model(model_dir)

***** Running training *****
  Num examples = 9403586
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 146932
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
10000,5.9727,4.935154
20000,4.7029,4.35125
30000,4.2704,4.028666
40000,4.0094,3.825895
50000,3.8189,3.638723
60000,3.6626,3.555936
70000,3.5422,3.428748
80000,3.4416,3.319535
90000,3.3547,3.251397
100000,3.2839,3.193127


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to ../_data/pretrain/model/checkpoint-10000
Configuration saved in ../_data/pretrain/model/checkpoint-10000/config.json
Model weights saved in ../_data/pretrain/model/checkpoint-10000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to ../_data/pretrain/model/checkpoint-20000
Configuration saved in ../_data/pretrain/model/checkpoint-20000/config.json
Model weights saved in ../_data/pretrain/model/checkpoint-20000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to ../_data/pretrain/model/checkpoint-30000
Configuration saved in ../_data/pretrain/model/checkpoint-30000/config.json
Model weights saved in ../_data/pretrain/model/checkpoint-30000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to ../_data/pretra

Step,Training Loss,Validation Loss
10000,5.9727,4.935154
20000,4.7029,4.35125
30000,4.2704,4.028666
40000,4.0094,3.825895
50000,3.8189,3.638723
60000,3.6626,3.555936
70000,3.5422,3.428748
80000,3.4416,3.319535
90000,3.3547,3.251397
100000,3.2839,3.193127


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to ../_data/pretrain/model/checkpoint-140000
Configuration saved in ../_data/pretrain/model/checkpoint-140000/config.json
Model weights saved in ../_data/pretrain/model/checkpoint-140000/pytorch_model.bin
Deleting older checkpoint [../_data/pretrain/model/checkpoint-110000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ../_data/pretrain/model
Configuration saved in ../_data/pretrain/model/config.json
Model weights saved in ../_data/pretrain/model/pytorch_model.bin


# Evaluation
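Run the **Setup** section first, then run the cells below. This section reloads the tokenizer and the model from disk, so the training cells above do not need to be re-run.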

In [None]:
from datasets import load_dataset
from transformers import BertTokenizerFast


pm_log_section("Evaluating MLM")

# Locate the validation file under `dataset_dir` (the dataset location defined
# in Setup), falling back to the ".gz" variant when the plain file is absent.
data_file = path.join(dataset_dir, "val_data.json")
if not path.isfile(data_file):
    data_file += ".gz"

# A single data file is exposed by `load_dataset` under the "train" key.
val_dataset = load_dataset("json", data_files=data_file, field="data")["train"]

# Reload the tokenizer that was saved to `tokenizer_dir`.
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_dir)


def tokenize_function(ex):
    """Run the reloaded tokenizer over `ex["text"]` with truncation enabled."""
    return tokenizer(ex["text"], truncation=True)


# Tokenize the validation set, dropping all of its original columns.
tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    remove_columns=list(val_dataset.features),
    **tokenize_params
)



  0%|          | 0/1 [00:00<?, ?it/s]

     



 



 



 



In [None]:
import numpy as np
import torch
from transformers import (BertForMaskedLM,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments)


# Reload the trained model from the directory it was saved to.
model = BertForMaskedLM.from_pretrained(model_dir)

# Same collator configuration as in training: language-model masking with the
# configured probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm_probability = mlm_probability,
)

# The training arguments are reused as-is to configure the evaluation trainer.
train_args = TrainingArguments(
    output_dir = model_dir,
    overwrite_output_dir = True,
    **training_args,
)

# Evaluation-only trainer: no training dataset is attached.
trainer = Trainer(
    model = model,
    args = train_args,
    data_collator = data_collator,
    eval_dataset = tokenized_val_dataset,
)

In [None]:
# Evaluate on the tokenized validation set; the returned metrics are shown as
# the cell's output.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1174433
  Batch size = 8
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 2.975706100463867,
 'eval_runtime': 4467.8428,
 'eval_samples_per_second': 262.864,
 'eval_steps_per_second': 32.858}

## Random examples

In [None]:
# Seed NumPy for the sample selection. PyTorch is seeded as well because the
# collator's random masking presumably draws from torch's global RNG; without
# it the masked positions differ from run to run.
np.random.seed(0)
torch.manual_seed(0)

# Pick five validation examples at random.
sample_idxs = np.random.permutation(len(tokenized_val_dataset))[:5]
samples = [tokenized_val_dataset[int(i)] for i in sample_idxs]

# Collate the examples into one batch and move it to whichever device the
# model lives on, so the cell also works on machines without CUDA.
inputs = {k: v.to(model.device) for k, v in data_collator(samples).items()}

# Inference only: skip gradient tracking, then take the highest-scoring token
# id at every position.
with torch.no_grad():
    preds = torch.argmax(model(**inputs).logits.cpu(), dim=-1)

In [None]:
# Keep special tokens in the decoded text so that masked positions stay visible.
decode_kwargs = dict(
    skip_special_tokens=False
)

# For every sample print three lines: the original text, the collated input
# fed to the model, and the model's per-position prediction.
# The loop variable is named `masked_ids` rather than `input` so the builtin
# `input` is not shadowed in the notebook namespace.
for sample, masked_ids, pred in zip(samples, inputs["input_ids"], preds):
    len_sample = len(sample["input_ids"])
    # Skip the first and last token of the unpadded sample (presumably the
    # [CLS]/[SEP] markers) and everything past the sample's own length.
    body = slice(1, len_sample - 1)
    pp.pprint(tokenizer.decode(sample["input_ids"][body], **decode_kwargs))
    pp.pprint(tokenizer.decode(masked_ids[body], **decode_kwargs))
    pp.pprint(tokenizer.decode(pred[body], **decode_kwargs))
    print()

'Handball players at the 2016 Summer Olympics'
'[MASK]ball players at the 2016 Summer [MASK]'
'Handball players at the 2016 Summer Olympics'

('Of special interest is the six petal rosette derived from the " seven overlapping circles " pattern, also known as " '
 'Sun of the Alps " from its frequent use in alpine folk art in the 17th and 18th century.')
('Of special interest is [MASK] six [MASK]al [MASK]tte derived from the " seven overlapping [MASK] " [MASK], also '
 'known as " Sun of the Alps " Bark its frequent use in alpine folk art in the 17th and 18th century [MASK]')
('Of special interest is the six spiral palette derived from the " seven overlappings " style, also known as " Sun of '
 'the Alps ", its frequent use in alpine folk art in the 17th and 18th century.')

("For example, in Scholasticism, it was believed that God was capable of performing any miracle so long as it didn't "
 'lead to a logical contradiction.')
('For example, [MASK] [MASK] [MASK]ism, it was believed tha