# Basic MLM

Links:  
https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c

In [1]:
import torch
import transformers

print('Torch Version: {}'.format(torch.__version__))
print('Transformer Version: {}'.format(transformers.__version__))

Torch Version: 1.10.0+cu102
Transformer Version: 4.19.2


In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import BertTokenizer, BertForMaskedLM

In [3]:
tokenizer_checkpoint = 'bert-base-uncased'
model_checkpoint = "distilroberta-base"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, do_lower_case=False, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [5]:
text = ("After Abraham Lincoln won the November 1860 presidential "
        "election on an anti-slavery platform, an initial seven "
        "slave states declared their secession from the country "
        "to form the Confederacy. War broke out in April 1861 "
        "when secessionist forces attacked Fort Sumter in South "
        "Carolina, just over a month after Lincoln's "
        "inauguration.")

### Tokenization

In [6]:
inputs = tokenizer(text, return_tensors='pt')
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [7]:
inputs

{'input_ids': tensor([[  101,   100,   100,   100,  2180,  1996,   100,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996,   100,  1012,   100,  3631,  2041,  1999,   100,  6863,  2043,
         22965,  2923,  2749,  4457,   100,   100,  1999,   100,   100,  1010,
          2074,  2058,  1037,  3204,  2044,   100,  1005,  1055, 17331,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

### Create Labels

In [8]:
inputs.input_ids.detach().clone()

tensor([[  101,   100,   100,   100,  2180,  1996,   100,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996,   100,  1012,   100,  3631,  2041,  1999,   100,  6863,  2043,
         22965,  2923,  2749,  4457,   100,   100,  1999,   100,   100,  1010,
          2074,  2058,  1037,  3204,  2044,   100,  1005,  1055, 17331,  1012,
           102]])

In [9]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs

{'input_ids': tensor([[  101,   100,   100,   100,  2180,  1996,   100,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996,   100,  1012,   100,  3631,  2041,  1999,   100,  6863,  2043,
         22965,  2923,  2749,  4457,   100,   100,  1999,   100,   100,  1010,
          2074,  2058,  1037,  3204,  2044,   100,  1005,  1055, 17331,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[  101,   100

### Masking

In [10]:
rand = torch.rand(inputs.input_ids.shape)
rand

tensor([[0.8400, 0.4681, 0.6857, 0.4110, 0.6420, 0.2889, 0.2432, 0.7263, 0.9982,
         0.4042, 0.0242, 0.8994, 0.5887, 0.1267, 0.3324, 0.5458, 0.3493, 0.1375,
         0.8353, 0.3522, 0.6388, 0.6152, 0.4932, 0.5298, 0.7013, 0.0383, 0.4491,
         0.7716, 0.0639, 0.4459, 0.8733, 0.2705, 0.4141, 0.4742, 0.0328, 0.2086,
         0.7248, 0.4617, 0.9138, 0.6629, 0.8560, 0.0909, 0.6473, 0.4355, 0.4463,
         0.1385, 0.7687, 0.4634, 0.1919, 0.3024, 0.5737, 0.5893, 0.5354, 0.0651,
         0.2776, 0.5156, 0.2981, 0.5315, 0.5661, 0.1806, 0.7637]])

In [11]:
mask_arr = rand < 0.15
mask_arr

tensor([[False, False, False, False, False, False, False, False, False, False,
          True, False, False,  True, False, False, False,  True, False, False,
         False, False, False, False, False,  True, False, False,  True, False,
         False, False, False, False,  True, False, False, False, False, False,
         False,  True, False, False, False,  True, False, False, False, False,
         False, False, False,  True, False, False, False, False, False, False,
         False]])

In [12]:
# special tokens such as CLS or SEP tokens (101 and 102 respectively)
(inputs.input_ids != 101) * (inputs.input_ids != 102)

tensor([[False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         False]])

In [13]:
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102)
mask_arr

tensor([[False, False, False, False, False, False, False, False, False, False,
          True, False, False,  True, False, False, False,  True, False, False,
         False, False, False, False, False,  True, False, False,  True, False,
         False, False, False, False,  True, False, False, False, False, False,
         False,  True, False, False, False,  True, False, False, False, False,
         False, False, False,  True, False, False, False, False, False, False,
         False]])

In [14]:
# Elements to mask
selection = torch.flatten((mask_arr[0]).nonzero()).tolist()
selection

[10, 13, 17, 25, 28, 34, 41, 45, 53]

In [15]:
inputs.input_ids[0, selection] = 103
inputs

{'input_ids': tensor([[  101,   100,   100,   100,  2180,  1996,   100,  7313,  4883,  2602,
           103,  2019,  3424,   103,  8864,  4132,  1010,   103,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,   103,  1996,  2406,   103,  2433,
          1996,   100,  1012,   100,   103,  2041,  1999,   100,  6863,  2043,
         22965,   103,  2749,  4457,   100,   103,  1999,   100,   100,  1010,
          2074,  2058,  1037,   103,  2044,   100,  1005,  1055, 17331,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[  101,   100

### Calculate Loss

In [16]:
outputs = model(**inputs)
outputs.keys()

odict_keys(['loss', 'logits'])

In [17]:
outputs.loss

tensor(4.1916, grad_fn=<NllLossBackward0>)

# Train with MLM

Link:  
https://www.youtube.com/watch?v=R6hcxMMOrPE&ab_channel=JamesBriggs

In [18]:
import torch
import transformers

from transformers import AutoTokenizer, AutoModelForMaskedLM

from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

In [19]:
tokenizer_checkpoint = 'bert-base-uncased'
model_checkpoint = 'distilroberta-base'

In [20]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, do_lower_case=False, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [21]:
len(tokenizer.vocab)

30522

In [22]:
with open('./clean.txt', 'r') as fp:
    text = fp.read().split('\n')
    
text[:5]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.',
 'From my great-grandfather, not to have frequented public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally.',
 "From my governor, to be neither of the green nor of the blue party at the games in the Circus, nor a partizan either of the Parmularius or the Scutarius at the gladiators' fights; from him too I learned endurance of labour, and to want little, and to work with my own hands, and not to meddle with other people's affairs, and not to be ready to listen to slander."]

In [23]:
inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs

{'input_ids': tensor([[  101,   100,  2026,  ...,     0,     0,     0],
        [  101,   100,  1996,  ...,     0,     0,     0],
        [  101,   100,  2026,  ...,     0,     0,     0],
        ...,
        [  101,   100,  2185,  ...,     0,     0,     0],
        [  101,   100, 15223,  ...,     0,     0,     0],
        [  101,   100,  3288,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [24]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [25]:
inputs

{'input_ids': tensor([[  101,   100,  2026,  ...,     0,     0,     0],
        [  101,   100,  1996,  ...,     0,     0,     0],
        [  101,   100,  2026,  ...,     0,     0,     0],
        ...,
        [  101,   100,  2185,  ...,     0,     0,     0],
        [  101,   100, 15223,  ...,     0,     0,     0],
        [  101,   100,  3288,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[  101,   100,  2026,  ...,     0,     0,     0],
        [  101,   100,  1996,  ...,     0,     0,     0],
        [  101,   100, 

In [26]:
# CLS 101, SEP 102, PAD 0
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        ...,
        [False,  True, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [27]:
mask_arr.shape

torch.Size([507, 512])

In [28]:
# Take indices of True values for each row
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )
    
selection[:5]

[[3, 6, 12, 15],
 [5, 10, 16],
 [1, 4, 5, 7, 14, 17, 24, 27, 31, 33, 36, 42],
 [4, 5, 10, 11, 12, 20],
 [3, 10, 24, 28, 35, 41, 47, 48, 51, 52, 53, 59, 73, 77]]

In [29]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103
    
inputs.input_ids

tensor([[ 101,  100, 2026,  ...,    0,    0,    0],
        [ 101,  100, 1996,  ...,    0,    0,    0],
        [ 101,  103, 2026,  ...,    0,    0,    0],
        ...,
        [ 101,  103, 2185,  ...,    0,    0,    0],
        [ 101,  100,  103,  ...,    0,    0,    0],
        [ 101,  100, 3288,  ...,    0,    0,    0]])

In [30]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        # Here key is
        # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)
    
# DataLoader requires these two methods __getitem__ and __len__
# __len__ to check length of dataset it's looking at
# __getitem__ Dictionary formatted batch of those items

In [31]:
dataset = CustomDataset(inputs)
dataset

<__main__.CustomDataset at 0x7f16e36bdd00>

In [32]:
loader = DataLoader(dataset, batch_size=16, shuffle=True)
loader

<torch.utils.data.dataloader.DataLoader at 0x7f16e36bd1c0>

In [33]:
optim = AdamW(model.parameters(), lr=5e-5)
optim

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 5e-05
    weight_decay: 0.01
)

In [34]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device

device(type='cuda')

In [35]:
# Activate train mode
model.train()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [36]:
vars(loader)

{'dataset': <__main__.CustomDataset at 0x7f16e36bdd00>,
 'num_workers': 0,
 'prefetch_factor': 2,
 'pin_memory': False,
 'timeout': 0,
 'worker_init_fn': None,
 '_DataLoader__multiprocessing_context': None,
 '_dataset_kind': 0,
 'batch_size': 16,
 'drop_last': False,
 'sampler': <torch.utils.data.sampler.RandomSampler at 0x7f16e36bd040>,
 'batch_sampler': <torch.utils.data.sampler.BatchSampler at 0x7f16e36bddc0>,
 'generator': None,
 'collate_fn': <function torch.utils.data._utils.collate.default_collate(batch)>,
 'persistent_workers': False,
 '_DataLoader__initialized': True,
 '_IterableDataset_len_called': None,
 '_iterator': None}

In [37]:
from tqdm import tqdm 

epochs = 2

for epoch in range(epochs):
    
    loop = tqdm(loader, leave=True)
    for batch in loop:
        
        optim.zero_grad() # Want gradients initialized and set to 0
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        
        loss = outputs.loss
        loss.backward()

        optim.step()
        
    print(f'Epoch {epoch + 1}')
    print(f'Loss {loss.item()}')
#         loop.set_description(f'Epoch {epoch}')
#         loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|█████████████████████████████████████████████████| 32/32 [00:14<00:00,  2.18it/s]


Epoch 1
Loss 0.20957337319850922


100%|█████████████████████████████████████████████████| 32/32 [00:14<00:00,  2.16it/s]

Epoch 2
Loss 0.3458373546600342





In [38]:
### Trainer

from transformers import Trainer
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir='output_files',
    per_device_train_batch_size=16,
    num_train_epochs=2
)

In [39]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset
)

In [40]:
trainer.train()


***** Running training *****
  Num examples = 507
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 64
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=64, training_loss=0.2169383466243744, metrics={'train_runtime': 30.0868, 'train_samples_per_second': 33.703, 'train_steps_per_second': 2.127, 'total_flos': 134478511884288.0, 'train_loss': 0.2169383466243744, 'epoch': 2.0})

In [41]:
len(tokenizer.vocab)

30522