# Fine-tuning a masked language model

Welcome to the 'simple_mlm_imdb.ipynb' notebook! 

Here we will fine-tune a pretrained model on the masked language modeling task (in English) via the Trainer API/Accelerate to achieve domain adaptation.

Mostly based on a chapter from the Hugging Face NLP Course: https://huggingface.co/learn/nlp-course/chapter7/3

In [None]:
from transformers import AutoModelForMaskedLM

# Checkpoint name shared by the model and the tokenizer below, so both are
# loaded from the same pretrained weights/vocabulary.
model_checkpoint = "distilbert-base-uncased"
# Load the checkpoint through the masked-language-modeling auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

from transformers import AutoTokenizer

# Tokenizer that turns raw text into the inputs the model consumes.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# NOTE(review): torch is not referenced directly in the visible cells —
# presumably imported to make sure the backend is available; confirm.
import torch

# IMDb dataset; later cells read its "text" and "label" columns and its
# "train" split.
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
def tokenize_function(examples):
    """Tokenize the "text" column of a batch.

    When the tokenizer is a fast one, a "word_ids" entry is added that holds,
    for every example in the batch, the word ids reported by the tokenizer.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # Slow tokenizers get no "word_ids" entry.
        return encoded

    batch_len = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(batch_len)]
    return encoded

# Tokenize the whole dataset in batches. The raw "text" and "label" columns
# are removed, so only the columns returned by tokenize_function are kept.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)

chunk_size = 128  # number of items (tokens) in every chunk produced below

# concatenate examples
def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into equal chunks.

    Args:
        examples: Mapping from column name to a list of per-example sequences.
            It must contain an "input_ids" column; the length of the first
            column decides how many chunks are produced.

    Returns:
        A dict with the same columns, each holding a list of chunks of exactly
        ``chunk_size`` items, plus a "labels" column that is an independent
        copy of the "input_ids" chunks. The trailing remainder that does not
        fill a whole chunk is dropped.

    Raises:
        KeyError: If ``examples`` has no "input_ids" column.
    """
    from itertools import chain

    # Flatten every column in linear time; sum(list_of_lists, []) re-copies
    # the accumulated list on each addition and is quadratic.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Length of the concatenated batch, taken from the first column
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split every column into chunk_size-long pieces
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Copy each chunk so that editing input_ids in place (e.g. when masking)
    # can never change the labels through a shared inner list.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

# Apply the concatenate-and-chunk step. Because the map is batched, chunks are
# built per batch and the leftover tokens at the end of each batch are dropped.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

# We need to insert [MASK] tokens at random positions into our data (input_ids).
# DataCollatorForLanguageModeling does this at batch time; mlm_probability=0.15
# asks it to mask 15% of the tokens.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Optional alternative: whole word masking (mask every token of a chosen word)
# instead of masking individual tokens. The names below are used by
# whole_word_masking_data_collator.
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2 # probability of masking each word (20%)

def whole_word_masking_data_collator(features):
    """Collate features into a batch, masking whole words instead of tokens.

    Every word is selected independently with probability ``wwm_probability``.
    All tokens of a selected word are replaced by ``tokenizer.mask_token_id``
    in "input_ids" and keep their original id in "labels"; every other label
    position is set to -100 (the value the loss is expected to ignore).

    The incoming feature dicts are not modified, so the same samples can be
    collated more than once.

    Args:
        features: List of dicts, each with "input_ids", "labels" and
            "word_ids" entries (assumed to be plain Python lists, as produced
            by the dataset in this notebook — confirm if a tensor format is
            set). Other entries are passed through unchanged.

    Returns:
        The result of ``default_data_collator`` on the masked features; the
        "word_ids" entry is not part of the batch.

    Raises:
        KeyError: If a feature has no "word_ids", "input_ids" or "labels".
    """
    masked_features = []
    for feature in features:
        word_ids = feature["word_ids"]
        # Work on copies: popping "word_ids" or overwriting "input_ids" on the
        # caller's dict would make a second call on the same samples fail and
        # would destroy the unmasked token ids.
        masked_feature = {
            key: value for key, value in feature.items() if key != "word_ids"
        }
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]

        # Group token positions by word. A chunk can span several texts whose
        # word ids restart, so a new group starts whenever the word id changes.
        # A special token (word id None) also ends the current word, so equal
        # word ids on either side of it are never merged into one word.
        word_tokens = []
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                word_tokens.append([])
            word_tokens[-1].append(idx)

        # Randomly pick the words to mask: one Bernoulli draw per word
        mask = np.random.binomial(1, wwm_probability, (len(word_tokens),))
        new_labels = [-100] * len(labels)
        for word_index in np.where(mask)[0]:
            for idx in word_tokens[word_index.item()]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

        masked_feature["input_ids"] = input_ids
        masked_feature["labels"] = new_labels
        masked_features.append(masked_feature)

    return default_data_collator(masked_features)

# Downsample the chunked training split to 11,000 samples in total:
# 10,000 for training and 10% of that (1,000) for evaluation.
train_size = 10_000
test_size = int(0.1 * train_size)

# A fixed seed is passed so that the split is reproducible between runs.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)

In [None]:
# Log in to the Hugging Face Hub from the notebook. The training arguments
# below set push_to_hub=True and the model is pushed after training, which
# presumably needs these credentials — confirm for your setup.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# NOTE(review): accelerate is not referenced directly in this cell —
# presumably imported to check that it is installed; confirm.
import accelerate
# define TrainingArguments
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch: log once per
# (number of training samples // batch size) steps.
logging_steps = len(downsampled_dataset["train"]) // batch_size
# Keep only the part after the last "/" so the output directory name does not
# contain an organisation/user prefix.
model_name = model_checkpoint.split("/")[-1]

# NOTE(review): newer transformers releases may expect `eval_strategy`
# instead of `evaluation_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    #fp16=True,                                          # uncomment to enable mixed-precision training, which gives another boost in speed
    logging_steps=logging_steps,                        # to show the training loss
#    remove_unused_columns=False,                       # uncomment when using the whole word masking collator, so the word_ids column is not lost during training
)

# instantiate the Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,                    # standard collator: masks individual tokens, not whole words
    tokenizer=tokenizer,
)

# We're now ready to run trainer.train()

In [None]:
# We use perplexity as the metric: the exponential of the evaluation loss.
import math
import os
# NOTE(review): this allocator setting is applied after the Trainer has been
# created; if CUDA was already initialised by then it may have no effect —
# confirm, and consider setting it at the very top of the notebook.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"

# Perplexity of the pretrained model before any fine-tuning (baseline)
eval_results = trainer.evaluate()
print(f">>> Perplexity before training: {math.exp(eval_results['eval_loss']):.2f}")

# run the training loop
trainer.train()

# Compute the resulting perplexity on the test set, the same way as above
eval_results = trainer.evaluate()
print(f">>> Perplexity after training: {math.exp(eval_results['eval_loss']):.2f}")

# push model to hub
trainer.push_to_hub()

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline from the fine-tuned checkpoint on the Hub.
mask_filler = pipeline(
    "fill-mask", model= "SUPERSOKOL/distilbert-base-uncased-finetuned-imdb" # replace with your own checkpoint
)
# Sample sentence with one [MASK] token for the model to fill in
text = "This is a great [MASK]."
preds = mask_filler(text)

# Print the completed sentence of every prediction
for pred in preds:
    print(f">>> {pred['sequence']}")