# Language Model retraining using MLM task

## 0. Define Hyperparameters

In [1]:
AVAILABLE_GPU = 2 # Available GPU with 0% usage
HUGGINGFACE_MODEL = "dccuchile/bert-base-spanish-wwm-cased" # HuggingFace checkpoint model to train
MODEL_SAVE_PATH = "./output/old-spanish-beto-cased.pt" # Path to save the model

MASK_PROB = 0.15 # Probability of masking within a text
LR = 2e-5 # Learning rate for Adam
EPOCHS = 5 # Number of epochs to train
BATCH_SIZE = 32 # Batch size for training
MAX_TOKENIZER_LENGTH = 512 # Maximum length of the texts in the tokenizer

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = f"{AVAILABLE_GPU}"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
tf_device=f'/gpu:{AVAILABLE_GPU}'

from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.nn.parallel import DataParallel
from torch.optim import Adam
import nltk
import pandas as pd
from datasets import Dataset
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/historynlp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1. Load pre-trained LM for MLM and it's tokenizer

Define the tokenizer and the model, using the Model's `transformers` ID from Hugging Face. It's important to consider the possible differences between _cased_ and _uncased_ models.

In [3]:
tokenizer = AutoTokenizer.from_pretrained(HUGGINGFACE_MODEL)
model = AutoModelForMaskedLM.from_pretrained(HUGGINGFACE_MODEL)

In [4]:
CLS_TOKEN = tokenizer.cls_token_id
SEP_TOKEN = tokenizer.sep_token_id
PAD_TOKEN = tokenizer.pad_token_id
MASK_TOKEN = tokenizer.mask_token_id
CLS_TOKEN, SEP_TOKEN, PAD_TOKEN, MASK_TOKEN

(4, 5, 1, 0)

## 2. Load and pre-process dataset

For training the model, we'll take the training split from the full spanish corpus, after cleaning. Given after the preparation stage:
> This corpus is already chunked for no more than 512 tokens, so there will be no chunking required for models that has this maximum length

In [5]:
df = pd.read_csv("./data/old-spanish-corpus-chunked.tsv", sep="\t", usecols=["text", "source"])
#df = df[(df.source == "LatamXIX")]
#df.reset_index(drop=True, inplace=True)

dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['source', 'text'],
    num_rows: 1141490
})

A random sample of a text:

In [6]:
dataset[10000]["text"]

'462 el que ahora habla de los mismos sentimientos que animan ai sr. preopinante, y guiad* por idénticos principios, trata no mas de economizar en lavor del erario los capitales quo pudicndo producir tantos bienes, como su señoría ha indicado, se aplicarán tal vez fuera de tiempo á objetos infructuosos: que estas mismas ideas se confirman por la simple lectura de las proposiciones siguientes, que todas conspiran á la conservación de los caudales públicos, y que no solo no se oponen á que haya una fábrica, sino que se eccitc al gobierno para que las establezca si es po sible en todos los pueblos del Estado, ó á lo menos en aquellos que crea que pueden producir alguna utilidad á los fondos públicos. El sr. Vallaría dijo, que ante todas cosas debia ecsaminarse si convenia que la fábrica de labrados se pusiese en varios puntos del Estado.'

## 3. Tokenize and Mask the dataset

Now, it's possible to start the retraining process of the model for the MLM task. For that, we'll first tokenize the input texts:

In [7]:
%%capture tokenizer_output
%%time

# All the texts of our dataset are stored in dataset["text"]
inputs = tokenizer(dataset["text"], return_tensors='pt', max_length=MAX_TOKENIZER_LENGTH, truncation=True, padding='max_length')
inputs

In [8]:
tokenizer_output.show()

CPU times: user 17min 52s, sys: 3min 25s, total: 21min 18s
Wall time: 3min 43s


{'input_ids': tensor([[    4,  1148, 25143,  ...,     1,     1,     1],
        [    4, 26182,  1005,  ...,     1,     1,     1],
        [    4,  1148, 25143,  ...,     1,     1,     1],
        ...,
        [    4, 23796,  1213,  ...,     1,     1,     1],
        [    4,  1198,  4675,  ...,     1,     1,     1],
        [    4,   948,  1949,  ...,     1,     1,     1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

Here's a comparation between the masked sentence and the original one:

In [9]:
dataset[100]['text']

'Can sábanse los Españoles de tanto resistir, sin espe ranza de vencer ; y ya empezaba en ellos el valor á quejarse de las fuerzas, quando Hernán Cortés (que andaba en la batalla como soldado, sin traer embarazadas las atenciones de Capitán) descubrió una elevación del terreno, poco distante del cami no, que mandaba per todas partes la compaña, so-'

In [10]:
' '.join([tokenizer.decode(x).replace(' ', '') for x in inputs['input_ids'][100]])

'[CLS] Can sá ##ban ##se los Español ##es de tanto resistir , sin espe ran ##za de vencer ; y ya empe ##zaba en ellos el valor á queja ##rs ##e de las fuerzas , qu ##ando Hern ##án Cortés ( que anda ##ba en la batalla como soldado , sin traer embarazadas las aten ##ciones de Capitán ) descubrió una elevación del terreno , poco distante del cami no , que manda ##ba per todas partes la compañ ##a , so - [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

Then, the `labels` of the inputs are defined as a copy of the `input_ids`, and some percentage of the inputs are masked:

In [11]:
inputs['labels'] = inputs.input_ids.detach().clone()
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < MASK_PROB) * (inputs.input_ids != CLS_TOKEN) * (inputs.input_ids != SEP_TOKEN) * (inputs.input_ids != PAD_TOKEN)
mask_arr

tensor([[False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False]])

In [12]:
mask = mask_arr.bool()
indices = mask.nonzero(as_tuple=False)
inputs.input_ids[indices[:, 0], indices[:, 1]] = MASK_TOKEN 

inputs.input_ids

tensor([[    4,     0, 25143,  ...,     1,     1,     1],
        [    4, 26182,  1005,  ...,     1,     1,     1],
        [    4,     0, 25143,  ...,     1,     1,     1],
        ...,
        [    4, 23796,  1213,  ...,     1,     1,     1],
        [    4,  1198,  4675,  ...,     1,     1,     1],
        [    4,   948,     0,  ...,     1,     1,     1]])

## 4. Train with Optimizer

For the processing, and efficiency for training the model, the `inputs` object will be converted to a Dataset object, and there will be defined an optimizer for the training. Also, if there's available GPU, the model will be moved to the GPU:

In [13]:
class OldSpanishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return self.encodings.input_ids.shape[0]

dataset = OldSpanishDataset(inputs)

In [14]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.train()
optim = Adam(model.parameters(), lr=LR)

In [15]:
#model = DataParallel(model, device_ids=[1, 2])
model.to(device);

In [None]:
%%capture training_output
%%time

for epoch in range(EPOCHS):
    for step, batch in enumerate(dataloader):
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optim.step()
        
        if epoch % 1 == 0 and step % 10000 == 0:
            print(f"Epoch {epoch} | step {step:03d} Loss: {loss.item()} ")

    print(f"Epoch {epoch} | step {step:03d} Loss: {loss.item()} [end]")

print("Training completed")

In [20]:
training_output.show()

Epoch 0 | step 000 Loss: 13.675179481506348 
Epoch 0 | step 35671 Loss: 0.12197628617286682 [end]
Epoch 1 | step 000 Loss: 0.10325197875499725 
Epoch 1 | step 35671 Loss: 0.1043149009346962 [end]
Epoch 2 | step 000 Loss: 0.08047603815793991 
Epoch 2 | step 35671 Loss: 0.09825348854064941 [end]
Epoch 3 | step 000 Loss: 0.08196956664323807 
Epoch 3 | step 35671 Loss: 0.06962953507900238 [end]
Epoch 4 | step 000 Loss: 0.07411856204271317 
Epoch 4 | step 35671 Loss: 0.08229523152112961 [end]
Training completed
CPU times: user 1d 23h 55min 4s, sys: 1min 59s, total: 1d 23h 57min 3s
Wall time: 1d 23h 56min 27s


And save the model into a file:


In [None]:
torch.save(model, MODEL_SAVE_PATH)

In [None]:
del model
torch.cuda.empty_cache()