# Language Model retraining using MLM task

## 0. Define Hyperparameters

In [1]:
AVAILABLE_GPU = 2 # Available GPU with 0% usage
HUGGINGFACE_MODEL = "dccuchile/bert-base-spanish-wwm-cased" # HuggingFace checkpoint model to train
MODEL_SAVE_PATH = "./output/latam-old-spanish-beto-cased.pt" # Path to save the model

MASK_PROB = 0.15 # Probability of masking within a text
LR = 2e-5 # Learning rate for Adam
EPOCHS = 5 # Number of epochs to train
BATCH_SIZE = 32 # Batch size for training
MAX_TOKENIZER_LENGTH = 512 # Maximum length of the texts in the tokenizer

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = f"{AVAILABLE_GPU}"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
tf_device=f'/gpu:{AVAILABLE_GPU}'

from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.nn.parallel import DataParallel
from torch.optim import Adam
import nltk
import pandas as pd
from datasets import Dataset
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/historynlp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1. Load pre-trained LM for MLM and it's tokenizer

Define the tokenizer and the model, using the Model's `transformers` ID from Hugging Face. It's important to consider the possible differences between _cased_ and _uncased_ models.

In [3]:
tokenizer = AutoTokenizer.from_pretrained(HUGGINGFACE_MODEL)
model = AutoModelForMaskedLM.from_pretrained(HUGGINGFACE_MODEL)

In [4]:
CLS_TOKEN = tokenizer.cls_token_id
SEP_TOKEN = tokenizer.sep_token_id
PAD_TOKEN = tokenizer.pad_token_id
MASK_TOKEN = tokenizer.mask_token_id
CLS_TOKEN, SEP_TOKEN, PAD_TOKEN, MASK_TOKEN

(4, 5, 1, 0)

## 2. Load and pre-process dataset

For training the model, we'll take the training split from the full spanish corpus, after cleaning. Given after the preparation stage:
> This corpus is already chunked for no more than 512 tokens, so there will be no chunking required for models that has this maximum length

In [5]:
df = pd.read_csv("./data/old-spanish-corpus-chunked.tsv", sep="\t", usecols=["text", "source"])
df = df[(df.source == "LatamXIX")]
df.reset_index(drop=True, inplace=True)

dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['source', 'text'],
    num_rows: 29972
})

A random sample of a text:

In [6]:
dataset[10000]["text"]

'Agregábase un acontecimiento del exterior: el 4 de Agosto de 1741 la Suecia declaró la guerra al imperio ruso; pero ya el 25 del mismo mes el general ruso Keith, cuyo gobierno fué puesto al corriente de aquella declaración y de todos los planes de la Sue'

## 3. Tokenize and Mask the dataset

Now, it's possible to start the retraining process of the model for the MLM task. For that, we'll first tokenize the input texts:

In [7]:
%%capture tokenizer_output
%%time

# All the texts of our dataset are stored in dataset["text"]
inputs = tokenizer(dataset["text"], return_tensors='pt', max_length=MAX_TOKENIZER_LENGTH, truncation=True, padding='max_length')
inputs

In [8]:
tokenizer_output.show()

CPU times: user 27.8 s, sys: 3.84 s, total: 31.7 s
Wall time: 4.92 s


{'input_ids': tensor([[    4,  1032, 12204,  ...,     1,     1,     1],
        [    4,  1139,  1009,  ...,     1,     1,     1],
        [    4,  1044,  2570,  ...,     1,     1,     1],
        ...,
        [    4,  2991,  1054,  ...,     1,     1,     1],
        [    4,  1015, 29848,  ...,     1,     1,     1],
        [    4,  9907,  1019,  ...,     1,     1,     1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

Here's a comparation between the masked sentence and the original one:

In [9]:
dataset[100]['text']

'Palabra , el 2º Óvalo no parecia sino un panteon en los dias de Todos Santos. Al fin quiso hablar [aunque sin ocupar la tribuna] un Orangutan ; pero el infeliz padecia de una afeccion asmatica cruelisima y no pudo mas que grasnar y tocer largo rato; así es , que despues de haber paseado sobre todos sus camaradas , tiernas y agonizantes miradas, acabó por extender su larga mano y se despidió. La Hormiga abrazada de la Jirafa , y dejando correr un torrente de amargo llanto por sus hermosas , aunque tostadas mejillas , se despidió tambien , pero á la francesa con tres besos en los carrillos y tres abrazos fortísimos. Tanto que la palpitacion de la que hemos hablado , hubo de acabar con su interesante existencia.'

In [10]:
' '.join([tokenizer.decode(x).replace(' ', '') for x in inputs['input_ids'][100]])

'[CLS] palabra , el [UNK] ó ##valo no pareci ##a sino un pante ##on en los dias de todos santos . al fin quiso hablar [ aunque sin ocupar la tribuna ] un ora ##ng ##uta ##n ; pero el infeliz pad ##ecia de una afec ##cion asma ##tica cruel ##isi ##ma y no pudo mas que gras ##nar y toc ##er largo rato ; así es , que despues de haber pase ##ado sobre todos sus camaradas , tier ##nas y agon ##izantes miradas , acabó por extender su larga mano y se despidió . la horm ##iga abra ##zada de la ji ##ra ##fa , y dejando correr un torrente de amargo llanto por sus hermosas , aunque tos ##tadas mejillas , se despidió tambien , pero á la francesa con tres besos en los carril ##los y tres abrazo ##s fort ##ísimos . tanto que la palp ##ita ##cion de la que hemos hablado , hubo de acabar con su interesante existencia . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

Then, the `labels` of the inputs are defined as a copy of the `input_ids`, and some percentage of the inputs are masked:

In [11]:
inputs['labels'] = inputs.input_ids.detach().clone()
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < MASK_PROB) * (inputs.input_ids != CLS_TOKEN) * (inputs.input_ids != SEP_TOKEN) * (inputs.input_ids != PAD_TOKEN)
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [12]:
mask = mask_arr.bool()
indices = mask.nonzero(as_tuple=False)
inputs.input_ids[indices[:, 0], indices[:, 1]] = MASK_TOKEN 

inputs.input_ids

tensor([[    4,  1032, 12204,  ...,     1,     1,     1],
        [    4,  1139,     0,  ...,     1,     1,     1],
        [    4,  1044,  2570,  ...,     1,     1,     1],
        ...,
        [    4,  2991,  1054,  ...,     1,     1,     1],
        [    4,  1015, 29848,  ...,     1,     1,     1],
        [    4,  9907,  1019,  ...,     1,     1,     1]])

## 4. Train with Optimizer

For the processing, and efficiency for training the model, the `inputs` object will be converted to a Dataset object, and there will be defined an optimizer for the training. Also, if there's available GPU, the model will be moved to the GPU:

In [13]:
class OldSpanishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return self.encodings.input_ids.shape[0]

dataset = OldSpanishDataset(inputs)

In [14]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.train()
optim = Adam(model.parameters(), lr=LR)

In [15]:
#model = DataParallel(model, device_ids=[1, 2])
model.to(device);

In [16]:
%%capture training_output
%%time

for epoch in range(EPOCHS):
    for step, batch in enumerate(dataloader):
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optim.step()
        
        if epoch % 1 == 0 and step % 100 == 0:
            print(f"Epoch {epoch} | step {step:03d} Loss: {loss.item()} ")

    print(f"Epoch {epoch} | step {step:03d} Loss: {loss.item()} [end]")

print("Training completed")

In [17]:
training_output.show()

Epoch 0 | step 000 Loss: 13.151481628417969 
Epoch 0 | step 936 Loss: 0.22948646545410156 [end]
Epoch 1 | step 000 Loss: 0.23694846034049988 
Epoch 1 | step 936 Loss: 0.1889580637216568 [end]
Epoch 2 | step 000 Loss: 0.19381776452064514 
Epoch 2 | step 936 Loss: 0.13998401165008545 [end]
Epoch 3 | step 000 Loss: 0.18480820953845978 
Epoch 3 | step 936 Loss: 0.1526048332452774 [end]
Epoch 4 | step 000 Loss: 0.1915740668773651 
Epoch 4 | step 936 Loss: 0.16695943474769592 [end]
Training completed
CPU times: user 1h 15min 12s, sys: 1.95 s, total: 1h 15min 14s
Wall time: 1h 15min 13s


And save the model into a file:


In [18]:
torch.save(model, MODEL_SAVE_PATH)

In [19]:
del model
torch.cuda.empty_cache()