In [1]:
# We use the DistilBERT model for this masked language modeling (MLM) fine-tuning task.
# DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base.
# It has 40% fewer parameters than bert-base-uncased and runs 60% faster, while preserving over 95% of
# BERT's performance as measured on the GLUE language understanding benchmark.
# DistilBERT is thus a good fit for quick prototyping and for production environments where real-time inference is necessary.

from transformers import AutoModelForMaskedLM

# Checkpoint name; kept in a variable so the tokenizer cell below can load the same checkpoint.
model_checkpoint = "distilbert-base-uncased"
# Load the pretrained weights for this checkpoint together with a masked-LM head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [2]:
# How many parameters does our model have?
# `num_parameters()` returns the count as an integer, which is printed next to the checkpoint name.
distilbert_num_parameters = model.num_parameters()
print(f"Number of parameters in {model_checkpoint}: {distilbert_num_parameters}")

Number of parameters in distilbert-base-uncased: 66985530


Testing the model as is...

In [3]:
# Test example: a sentence with a [MASK] placeholder that the cells below ask the model to fill in.
text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer
# Load the tokenizer for the same checkpoint that the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
# Tokenize the text, run it through the model, and read off the predictions
# for the masked position.
import torch

inputs = tokenizer(text, return_tensors="pt")

# Inference only: skip autograd bookkeeping so no graph is built for the logits.
with torch.no_grad():
    token_logits = model(**inputs).logits

# Position of the [MASK] token in the sequence. `.item()` requires exactly one
# match, so this raises if `text` contains zero or several mask tokens.
masked_index = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero().item()
probs = torch.nn.functional.softmax(token_logits[0, masked_index], dim=-1)

# Take a single top-k over the probabilities so that each token id is
# guaranteed to be paired with its own probability.
top_5 = torch.topk(probs, 5, dim=-1)
top_5_tokens = top_5.indices.tolist()
top_5_probabilities = top_5.values.tolist()

# Print each candidate token next to its probability.
for token, prob in zip(top_5_tokens, top_5_probabilities):
    print(tokenizer.decode([token]), prob)
    
    

deal 0.0365118607878685
success 0.0239587239921093
adventure 0.0237447340041399
idea 0.016085002571344376
feat 0.010877519845962524


These top predictions are generic, everyday words that are not tied to any particular domain.

To showcase domain adaptation, we'll use the famous Large Movie Review Dataset (or IMDb for short), a corpus of movie reviews that is often used to benchmark sentiment analysis models.

By fine-tuning DistilBERT on this corpus, we expect the language model will adapt its vocabulary from the factual data of Wikipedia that it was pretrained on to the more subjective elements of movie reviews.

We can get the movie review data from the Hugging Face Hub with the load_dataset() function from the Datasets library:

In [13]:
from datasets import load_dataset

# Download and prepare the IMDb dataset; later calls reuse the locally cached copy.
imdb_dataset = load_dataset("imdb")
# Bare last expression: displays the DatasetDict summary (splits, features, row counts).
imdb_dataset

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to C:/Users/Raj/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to C:/Users/Raj/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})