In [1]:
# Set pd display width
import pandas as pd

# Both the overall display width and the per-column width are set to 140 characters.
for option_name in ("display.width", "display.max_colwidth"):
    pd.set_option(option_name, 140)

## Load data

In [2]:
from datasets import load_dataset

# Load the sentence-level wiki dump with the "json" loader, reading records from
# the file's "data" field, and keep the "train" split it returns.
wiki_data_file = "../_data/wiki/20220301.en.test/train_data.json"
wiki_splits = load_dataset("json", data_files=wiki_data_file, field="data")
wiki = wiki_splits["train"]

Using custom data configuration default-76618ebe809bedea


Reusing dataset json (/Users/yenson/.cache/huggingface/datasets/json/default-76618ebe809bedea/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Keep the first record around: later cells tokenize and encode `example["text"]`.
first_index = 0
example = wiki[first_index]
print(example)

{'article_id': 14877816, 'text': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.'}


In [4]:
# Each row's `text` is a single sentence taken from the article identified by `article_id`.
preview_rows = wiki[:10]
display(pd.DataFrame(preview_rows))

Unnamed: 0,article_id,text
0,14877816,Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.
1,14877816,The myeloid cell nuclear differentiation antigen (MNDA) is detected only in nuclei of cells of the granulocyte-monocyte lineage.
2,14877816,A 200-amino acid region of human MNDA is strikingly similar to a region in the proteins encoded by a family of interferon-inducible mous...
3,14877816,"The 1.8-kb MNDA mRNA, which contains an interferon-stimulated response element in the 5' UTR, was significantly upregulated in human mon..."
4,14877816,"MNDA is located within 2,200 kb of FCER1A, APCS, CRP, and SPTA1."
5,14877816,"In its pattern of expression and/or regulation, MNDA resembles IFI16, suggesting that these genes participate in blood cell-specific res..."
6,4845938,"""Boris the Spider"" is a song written by the Who's bass guitarist, John Entwistle."
7,4845938,It appears as the second track of their 1966 album A Quick One.
8,4845938,"This song is claimed to be Entwistle's first composition, and became a staple of live shows."
9,4845938,"This song, along with ""My Wife"", ""Heaven and Hell"" and ""The Quiet One"", were Entwistle's most popular songs to perform live."


## Train new instance of `BertTokenizerFast`

In [5]:
from transformers import BertTokenizerFast

vocab_size = 20_000
tokenizer_output_dir = "../_data/_pretrain/tokenizer"

# Start from the pretrained cased BERT tokenizer, then train a new tokenizer
# on the wiki sentences with the requested vocabulary size.
base_tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
tokenizer = base_tokenizer.train_new_from_iterator(wiki["text"], vocab_size)

# Last expression of the cell: the returned tuple of written file paths is displayed.
tokenizer.save_pretrained(tokenizer_output_dir)










('../_data/_pretrain/tokenizer/tokenizer_config.json',
 '../_data/_pretrain/tokenizer/special_tokens_map.json',
 '../_data/_pretrain/tokenizer/vocab.txt',
 '../_data/_pretrain/tokenizer/added_tokens.json',
 '../_data/_pretrain/tokenizer/tokenizer.json')

In [6]:
# Encode the sample sentence with the newly trained tokenizer and show the raw encoding.
sample_text = example["text"]
example_encoding = tokenizer(sample_text)
print(example_encoding)

{'input_ids': [2, 2291, 15948, 2073, 9946, 19614, 182, 3396, 6274, 6567, 205, 65, 9768, 241, 169, 13756, 205, 13402, 212, 8702, 6162, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [7]:
# Show the individual word pieces first, then the text decoded from the same ids.
example_input_ids = example_encoding["input_ids"]
example_tokens = tokenizer.convert_ids_to_tokens(example_input_ids)
print(" ".join(example_tokens))
print(tokenizer.decode(example_input_ids))

[CLS] My ##eloid cell Nuclear Differ ##ent ##iation Anti ##gen is a protein that in humans is encoded as MNDA gene . [SEP]


[CLS] Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene. [SEP]


## Run pretraining on one example

In [8]:
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch of dataset rows.

    Args:
        examples: Batch handed over by `Dataset.map(..., batched = True)`;
            only its "text" entry is read.

    Returns:
        The tokenizer's encoding of the batch (input ids, token type ids and
        attention mask, as printed for the single example earlier).
    """
    # Truncate over-long texts so they cannot exceed the model's position
    # embeddings (512 in the saved BertConfig). The limit used is the
    # tokenizer's own max length -- presumably 512, inherited from
    # "bert-base-cased"; TODO confirm.
    return tokenizer(examples["text"], truncation = True)


# Pretraining is run on the first sentence only.
single_example = wiki.select(range(1))

tokenized_dataset = single_example.map(tokenize_function,
                                       batched = True,
                                       # Never request more workers than rows; asking for 4 on a
                                       # one-row dataset only produces a "Reducing num_proc" warning.
                                       num_proc = min(4, len(single_example)),
                                       remove_columns = ["text", "article_id"])

print(tokenizer.decode(tokenized_dataset[0]["input_ids"]))

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


  0%|          | 0/1 [00:00<?, ?ba/s]

[CLS] Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene. [SEP]


In [9]:
from transformers import (BertConfig,
                          BertForMaskedLM,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments,
                          set_seed)

# Seed the random number generators *before* the model is created: the model's
# weights are randomly initialised here, and without an earlier seed the run is
# not reproducible under Restart & Run All.
seed = 42
set_seed(seed)

# input 1: tokenizer
# The collator masks tokens for the masked-language-modelling objective;
# half of the tokens are selected for masking (mlm_probability = 0.5).
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer,
                                                mlm_probability = 0.5)

# Default BERT architecture, with the embedding size matched to the new tokenizer.
bert_config = BertConfig(vocab_size = tokenizer.vocab_size)

# Fresh, randomly initialised model (no pretrained weights are loaded).
model = BertForMaskedLM(config = bert_config)

# input 2: tokenized_dataset
num_train_epochs = 100
pretrain_output_dir = "../_data/_pretrain/model"

# With one example and 100 epochs there are only 100 optimisation steps, so
# save_steps = 10_000 is never reached; the model is written out explicitly
# by `trainer.save_model` below.
training_args = TrainingArguments(num_train_epochs = num_train_epochs,
                                  per_device_train_batch_size = 64,
                                  save_steps = 10_000,
                                  save_total_limit = 2,
                                  prediction_loss_only = True,
                                  seed = seed,
                                  output_dir = pretrain_output_dir,
                                  overwrite_output_dir = True)

trainer = Trainer(model = model,
                  args = training_args,
                  data_collator = data_collator,
                  train_dataset = tokenized_dataset)

trainer.train()
trainer.save_model(pretrain_output_dir)

***** Running training *****


  Num examples = 1


  Num Epochs = 100


  Instantaneous batch size per device = 64


  Total train batch size (w. parallel, distributed & accumulation) = 64


  Gradient Accumulation steps = 1


  Total optimization steps = 100


  0%|          | 0/100 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




Training completed. Do not forget to share your model on huggingface.co/models =)




Saving model checkpoint to ../_data/_pretrain/model


Configuration saved in ../_data/_pretrain/model/config.json


{'train_runtime': 26.3551, 'train_samples_per_second': 3.794, 'train_steps_per_second': 3.794, 'train_loss': 1.8065670776367186, 'epoch': 100.0}


Model weights saved in ../_data/_pretrain/model/pytorch_model.bin


## Evaluation

In [10]:
from transformers import pipeline

# Build the fill-mask pipeline from the artifacts saved on disk by the earlier
# cells (model directory and tokenizer directory), not from in-memory objects.
fill_mask_sources = {"model": pretrain_output_dir,
                     "tokenizer": tokenizer_output_dir}

fill_mask = pipeline("fill-mask", **fill_mask_sources)

loading configuration file ../_data/_pretrain/model/config.json


Model config BertConfig {
  "_name_or_path": "../_data/_pretrain/model",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 20000
}



loading configuration file ../_data/_pretrain/model/config.json


Model config BertConfig {
  "_name_or_path": "../_data/_pretrain/model",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 20000
}



loading weights file ../_data/_pretrain/model/pytorch_model.bin


All model checkpoint weights were used when initializing BertForMaskedLM.



All the weights of BertForMaskedLM were initialized from the model checkpoint at ../_data/_pretrain/model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for predictions without further training.


loading file vocab.txt


loading file tokenizer.json


loading file added_tokens.json


loading file special_tokens_map.json


loading file tokenizer_config.json


In [11]:
# Training sentence, for reference:
# Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.
# Here the word "encoded" is replaced by [MASK].
masked_verb_sentence = "Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is [MASK] as MNDA gene."

fill_mask(masked_verb_sentence)

[{'score': 0.9725825190544128,
  'token': 13402,
  'token_str': 'encoded',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 1.790931128198281e-05,
  'token': 6162,
  'token_str': 'gene',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is gene as MNDA gene.'},
 {'score': 1.190628063341137e-05,
  'token': 65,
  'token_str': 'a',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is a as MNDA gene.'},
 {'score': 1.1821502994280308e-05,
  'token': 648,
  'token_str': '##ince',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans isince as MNDA gene.'},
 {'score': 1.0303525414201431e-05,
  'token': 1070,
  'token_str': 'foot',
  'sequence': 'Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is foot as MNDA gene.'}]

In [12]:
# Training sentence, for reference:
# Myeloid cell Nuclear Differentiation Antigen is a protein that in humans is encoded as MNDA gene.
# Here the word "Differentiation" is replaced by [MASK]. The tokenizer splits that
# word into three word pieces (Differ ##ent ##iation), so a single [MASK] can
# recover at most one of them.
masked_noun_sentence = "Myeloid cell Nuclear [MASK] Antigen is a protein that in humans is encoded as MNDA gene."

fill_mask(masked_noun_sentence)

[{'score': 0.9743815064430237,
  'token': 19614,
  'token_str': 'Differ',
  'sequence': 'Myeloid cell Nuclear Differ Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 1.3961661352368537e-05,
  'token': 13402,
  'token_str': 'encoded',
  'sequence': 'Myeloid cell Nuclear encoded Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 1.1701093171723187e-05,
  'token': 13951,
  'token_str': 'quantities',
  'sequence': 'Myeloid cell Nuclear quantities Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 9.511587450106163e-06,
  'token': 2291,
  'token_str': 'My',
  'sequence': 'Myeloid cell Nuclear My Antigen is a protein that in humans is encoded as MNDA gene.'},
 {'score': 9.252627933165058e-06,
  'token': 16475,
  'token_str': 'shops',
  'sequence': 'Myeloid cell Nuclear shops Antigen is a protein that in humans is encoded as MNDA gene.'}]