In [1]:
# Set pd display width
import pandas as pd

# Use one width for both the overall repr and per-column truncation so that
# long sentences in the `text` column are shown with less clipping.
display_chars = 140
pd.set_option('display.width', display_chars)
pd.set_option('display.max_colwidth', display_chars)

## Load data

In [3]:
from datasets import load_dataset

# Gzipped JSON dump of the Wikipedia sample, relative to this notebook.
wiki_json_path = "../_data/wiki/20220301.en.1gb.json.gz"

# `field="train"` tells the JSON loader which top-level key holds the records;
# the loader returns a dict of splits, from which the "train" split is taken.
wiki_splits = load_dataset("json", data_files=wiki_json_path, field="train")
wiki = wiki_splits["train"]

Using custom data configuration default-a8ffa258544588d4
Reusing dataset json (/Users/yenson/.cache/huggingface/datasets/json/default-a8ffa258544588d4/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)
100%|██████████| 1/1 [00:00<00:00,  5.99it/s]


In [4]:
# Keep the first record around: later cells tokenize `example["text"]`.
example = wiki[0]
# Print the raw record to show which fields each row carries.
print(example)

{'article_id': 14877816, 'text': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.'}


In [5]:
# Each row's `text` holds a single sentence; rows from the same article share
# an `article_id`.
preview_rows = wiki[:10]
display(pd.DataFrame(preview_rows))

Unnamed: 0,article_id,text
0,14877816,Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.
1,14877816,The myeloid cell nuclear differentiation antigen (MNDA) is detected only in nuclei of cells of the granulocyte-monocyte lineage.
2,14877816,A 200-amino acid region of human MNDA is strikingly similar to a region in the proteins encoded by a family of interferon-inducible mous...
3,14877816,"The 1.8-kb MNDA mRNA, which contains an interferon-stimulated response element in the 5' UTR, was significantly upregulated in human mon..."
4,14877816,"MNDA is located within 2,200 kb of FCER1A, APCS, CRP, and SPTA1."
5,14877816,"In its pattern of expression and/or regulation, MNDA resembles IFI16, suggesting that these genes participate in blood cell-specific res..."
6,4845938,"""Boris the Spider"" is a song written by the Who's bass guitarist, John Entwistle."
7,4845938,It appears as the second track of their 1966 album A Quick One.
8,4845938,"This song is claimed to be Entwistle's first composition, and became a staple of live shows."
9,4845938,"This song, along with ""My Wife"", ""Heaven and Hell"" and ""The Quiet One"", were Entwistle's most popular songs to perform live."


## Train new instance of `BertTokenizerFast`

In [9]:
import os

from transformers import BertTokenizerFast

vocab_size = 20_000
tokenizer_output_dir = "../_data/tokenizer"
# Reuse a previously saved tokenizer when one is available.
# Set to False to force retraining even if a saved tokenizer exists.
use_pretrained = True


def iter_text_batches(dataset, batch_size = 1_000):
    """Yield lists of `text` values, `batch_size` rows at a time.

    Slicing the dataset in batches avoids building one Python list that
    holds the whole `text` column at once.
    """
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size]["text"]


# Only load from disk if the directory is actually there; on a fresh checkout
# nothing has been saved yet, so fall through to training instead of failing.
if use_pretrained and os.path.isdir(tokenizer_output_dir):
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_output_dir)
else:
    # Start from the `bert-base-cased` tokenizer's configuration and learn a
    # new vocabulary of `vocab_size` entries from the wiki sentences.
    tokenizer = (BertTokenizerFast
                    .from_pretrained("bert-base-cased")
                    .train_new_from_iterator(iter_text_batches(wiki), vocab_size))

    # Persist the result so later runs (and the evaluation pipeline, which
    # loads the tokenizer from this directory) can reuse it.
    tokenizer.save_pretrained(tokenizer_output_dir)

In [9]:
# Encode the sample sentence; the printed mapping shows the fields the
# tokenizer produces (input_ids, token_type_ids, attention_mask).
example_text = example["text"]
example_encoding = tokenizer(example_text)
print(example_encoding)

{'input_ids': [2, 1933, 17790, 212, 3796, 13993, 18314, 2658, 19885, 171, 214, 69, 6632, 254, 175, 7079, 214, 17251, 216, 49, 19394, 137, 8234, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
example_ids = example_encoding["input_ids"]

# First line: the individual word pieces, space separated.
word_pieces = tokenizer.convert_ids_to_tokens(example_ids)
print(" ".join(word_pieces))

# Second line: the same ids decoded back into readable text.
print(tokenizer.decode(example_ids))

[CLS] My ##elo ##id cell Nuclear Different ##iation Antig ##en is a protein that in humans is encoded as M ##ND ##A gene . [SEP]
[CLS] Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene. [SEP]


## Run pretraining on one example

In [10]:
def tokenize_function(examples):
    """Tokenize a batch of rows using their `text` column."""
    return tokenizer(examples["text"])


# Number of leading rows to pretrain on; this section deliberately uses one.
num_pretrain_examples = 1
pretrain_subset = wiki.select(range(num_pretrain_examples))

# Never request more worker processes than there are rows: asking for 4 on a
# one-row dataset only produces a "Reducing num_proc" warning.
tokenized_dataset = pretrain_subset.map(tokenize_function,
                                        batched = True,
                                        num_proc = min(4, len(pretrain_subset)),
                                        # Drop the raw columns so only the tokenizer's outputs remain.
                                        remove_columns = ["text", "article_id"])

# Sanity check: decoding the first row should give back the original sentence
# wrapped in the tokenizer's special tokens.
print(tokenizer.decode(tokenized_dataset[0]["input_ids"]))

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.
100%|██████████| 1/1 [00:00<00:00, 136.46ba/s]


[CLS] Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene. [SEP]


In [11]:
from transformers import (BertConfig,
                          BertForMaskedLM,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments,
                          set_seed)

# Seed the random number generators *before* the model is constructed:
# `BertForMaskedLM(config)` starts from randomly initialized weights, so
# without this the run is not reproducible from a fresh kernel.
seed = 42
set_seed(seed)

# input 1: tokenizer
# The collator builds masked-language-modeling batches; `mlm_probability` is
# the chance that a token is selected for masking.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer,
                                                mlm_probability = 0.5)

# Default BERT architecture, with the embedding table sized to the tokenizer.
bert_config = BertConfig(vocab_size = tokenizer.vocab_size)

# Built from a config only, i.e. trained from scratch rather than fine-tuned.
model = BertForMaskedLM(config = bert_config)

# input 2: tokenized_dataset
num_train_epochs = 100
pretrain_output_dir = "../_data/pretrain"

training_args = TrainingArguments(num_train_epochs = num_train_epochs,
                                  per_device_train_batch_size = 64,
                                  save_steps = 10_000,
                                  save_total_limit = 2,
                                  prediction_loss_only = True,
                                  output_dir = pretrain_output_dir,
                                  overwrite_output_dir = True,
                                  # Same seed as above, stated explicitly.
                                  seed = seed)

trainer = Trainer(model = model,
                  args = training_args,
                  data_collator = data_collator,
                  train_dataset = tokenized_dataset)

trainer.train()
# Write the final weights and config to `pretrain_output_dir`; the evaluation
# section loads the model back from this directory.
trainer.save_model(pretrain_output_dir)

***** Running training *****
  Num examples = 1
  Num Epochs = 100
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  0%|          | 0/100 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 100/100 [00:30<00:00,  3.46it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 100/100 [00:30<00:00,  3.33it/s]
Saving model checkpoint to ../_data/pretrain
Configuration saved in ../_data/pretrain/config.json


{'train_runtime': 30.0314, 'train_samples_per_second': 3.33, 'train_steps_per_second': 3.33, 'train_loss': 1.7977354431152344, 'epoch': 100.0}


Model weights saved in ../_data/pretrain/pytorch_model.bin


## Evaluation

In [12]:
from transformers import pipeline

# Build the pipeline from the saved directories (not the in-memory objects),
# so this also exercises the artifacts written to disk earlier.
fill_mask = pipeline(task = "fill-mask", model = pretrain_output_dir, tokenizer = tokenizer_output_dir)

loading configuration file ../_data/pretrain/config.json
Model config BertConfig {
  "_name_or_path": "../_data/pretrain",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 20000
}

loading configuration file ../_data/pretrain/config.json
Model config BertConfig {
  "_name_or_path": "../_data/pretrain",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 

In [13]:
# Training sentence, for reference:
# Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.
# Here the word "encoded" is replaced by [MASK].
masked_encoded_sentence = "Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is [MASK] as MNDA gene."

fill_mask(masked_encoded_sentence)

[{'score': 0.9344049096107483,
  'token': 17251,
  'token_str': 'encoded',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 8.077319216681644e-05,
  'token': 214,
  'token_str': 'is',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is is as MNDA gene.'},
 {'score': 3.899676084984094e-05,
  'token': 18,
  'token_str': '.',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is. as MNDA gene.'},
 {'score': 2.848868643923197e-05,
  'token': 175,
  'token_str': 'in',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is in as MNDA gene.'},
 {'score': 2.4256074539152905e-05,
  'token': 8234,
  'token_str': 'gene',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is gene as MNDA gene.'}]

In [14]:
# Training sentence, for reference:
# Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.
# Here the word "Differentiation" is replaced by a single [MASK].
masked_differentiation_sentence = "Myeloid cell Nuclear [MASK] Antigen is a protein that in humans is encoded as MNDA gene."

fill_mask(masked_differentiation_sentence)

[{'score': 0.948901891708374,
  'token': 18314,
  'token_str': 'Different',
  'sequence': 'Myeloid cell Nuclear Different Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 3.187018592143431e-05,
  'token': 175,
  'token_str': 'in',
  'sequence': 'Myeloid cell Nuclear in Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 1.589172643434722e-05,
  'token': 394,
  'token_str': '##ational',
  'sequence': 'Myeloid cell Nuclearational Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 1.528169923403766e-05,
  'token': 1569,
  'token_str': 'exper',
  'sequence': 'Myeloid cell Nuclear exper Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 1.4670657037640922e-05,
  'token': 19756,
  'token_str': 'Germanic',
  'sequence': 'Myeloid cell Nuclear Germanic Antigen is a protein that in humans is encoded as MNDA gene.'}]