In [2]:
# We use the DistilBERT model for this masked language modeling (MLM) fine-tuning task.
# DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base.
# It has 40% fewer parameters than bert-base-uncased and runs 60% faster while preserving over 95% of BERT's performance as measured on the GLUE language understanding benchmark.
# DistilBERT is thus a good choice for quick prototyping and for production environments where real-time inference is necessary.

from transformers import AutoModelForMaskedLM

# Checkpoint name used to load the pretrained masked-LM weights.
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [3]:
# How many parameters does our model have?
# The count is kept in a variable and reported together with the checkpoint name.
distilbert_num_parameters = model.num_parameters()
print(f"Number of parameters in {model_checkpoint}: {distilbert_num_parameters}")

Number of parameters in distilbert-base-uncased: 66985530


Testing the model as is...

In [4]:
# Test sentence containing a single [MASK] placeholder for the model to fill in.
text = "This is a great [MASK]."

In [5]:
from transformers import AutoTokenizer
# Load the tokenizer from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
# tokenize our text, pass it to the model and get output predictions
import torch

inputs = tokenizer(text, return_tensors="pt")
# Inference only: disable gradient tracking so no autograd state is kept for the forward pass.
with torch.no_grad():
    token_logits = model(**inputs).logits

# Position of the [MASK] token in the (single) input sequence.
# .item() only succeeds when exactly one position matches, i.e. the text holds exactly one [MASK].
masked_index = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero().item()

# Turn the vocabulary logits at the masked position into probabilities.
probs = torch.nn.functional.softmax(token_logits[0, masked_index], dim=-1)

# One top-k call yields both the token ids and their probabilities.
# Softmax preserves ordering, so the ranking is the same as ranking the raw logits.
top_5 = torch.topk(probs, 5, dim=-1)
top_5_tokens = top_5.indices.tolist()
top_5_probabilities = top_5.values.tolist()

# print the results
for token, prob in zip(top_5_tokens, top_5_probabilities):
    print(tokenizer.decode([token]), prob)
    
    

deal 0.0365118607878685
success 0.0239587239921093
adventure 0.0237447340041399
idea 0.016085002571344376
feat 0.010877519845962524


These predictions are generic, everyday words rather than anything specific to a particular domain.

To showcase domain adaptation, we'll use the famous Large Movie Review Dataset (or IMDb for short), which is a corpus of movie reviews that is often used to benchmark sentiment analysis models.

By fine-tuning DistilBERT on this corpus, we expect the language model will adapt its vocabulary from the factual data of Wikipedia that it was pretrained on to the more subjective elements of movie reviews.

We can get the movie review data from the Hugging Face Hub with the load_dataset() function from the Datasets library:

In [7]:
from datasets import load_dataset

# Load the dataset registered under the name "imdb".
imdb_dataset = load_dataset("imdb")
# Bare expression: the notebook displays the loaded splits and their features.
imdb_dataset

Found cached dataset imdb (C:/Users/Raj/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [8]:
# let's check out the train dataset: print three shuffled reviews with their labels
sample_train = imdb_dataset["train"].shuffle(seed=42).select(range(3))
for row in sample_train:
    # plain string prefixes: nothing is interpolated, so no f-string is needed
    print("text: ", row["text"])
    print("label: ", row["label"])
    print()

Loading cached shuffled indices for dataset at C:\Users\Raj\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-9c48ce5d173413c7.arrow


text:  There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...
label:  1

text:  This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stub your toe on the m

In [9]:
# useful sanity check on the test dataset: read a few reviews and confirm the labels look right
sample_test = imdb_dataset["test"].shuffle(seed=42).select(range(3))
for row in sample_test:
    # plain string prefixes: nothing is interpolated, so no f-string is needed
    print("text: ", row["text"])
    print("label: ", row["label"])
    print()

Loading cached shuffled indices for dataset at C:\Users\Raj\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-c1eaa46e94dfbfd3.arrow


text:  <br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, however, I realized that this story was about A Thousand Other Things besides just Acres. I started crying and couldn't stop until long after the movie ended. Thank you Jane, Laura and Jocelyn, for bringing us such a wonderfully subtle and compassionate movie! Thank you cast, for being involved and portraying the characters with such depth and gentleness!<br /><br />I recognized the Angry sister; the Runaway sister and the sister in Denial. I recognized the Abusive Husband and why he was there and then the Father, oh oh the Father... all superbly played. I also recognized myself and this movie was an eye-opener, a relief, a chance to face my OWN truth and finally doing something about it. I truly hope A Thousand Acres has had the same effect on some others out there.<br /><br /

In [10]:
# what does the unsupervised dataset look like? print three shuffled reviews with their labels
sample_unsupervised = imdb_dataset["unsupervised"].shuffle(seed=42).select(range(3))
for row in sample_unsupervised:
    # plain string prefixes: nothing is interpolated, so no f-string is needed
    print("text: ", row["text"])
    print("label: ", row["label"])
    print()

Loading cached shuffled indices for dataset at C:\Users\Raj\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-a1b3692aa5b43ab2.arrow


text:  If you've seen the classic Roger Corman version starring Vincent Price it's hard to put it out of your head, but you probably should do because this one is totally different. Subtlety has been abandoned in favour of gross-out horror - nudity, gore and all-round unpleasantness. OK it's ridiculous, trashy, sensationalised and historically dubious (did any members of the Inquisition really wear horn-rimmed glasses?), but despite all this it is strangely compelling. I literally couldn't tear myself away from the screen until the end of the movie. If there's a bigger compliment you can pay to a film I don't know what it is.
label:  -1

text:  For me, this was the most moving film of the decade. Samira Makhmalbaf shows pure bravery and vision in the making. She has an intelligence and gift for speaking to the people, regardless of their nationality or beliefs. I am inspired and touched by her humanity and can only hope that she has touched many people the same way. Her message in this

In [11]:
# concatenate all the examples and then split them into equal chunks so we don't lose information if we have to truncate them using individual examples.
# we will not set truncation=True in the tokenizer
# we will also grab the word ids for masking later on
# we will wrap this in a simple function and remove text and label columns since we don't need them

def tokenize_function(examples):
    """Tokenize a batch of reviews and, when possible, attach per-token word ids.

    Args:
        examples: batch mapping that provides the raw reviews under "text".

    Returns:
        The encoding produced by the module-level ``tokenizer``. If the
        tokenizer reports ``is_fast``, a "word_ids" entry is added with one
        list per row of "input_ids"; it is needed for masking later on.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # word ids are only collected for fast tokenizers
        return encoded

    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded

# Use batched=True so tokenize_function receives many reviews per call and activates fast multithreading!
# The raw "text" and "label" columns are removed; only the tokenizer outputs are kept.

tokenized_datasets = imdb_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "label"]
)

# Bare expression: display the resulting splits and their features.
tokenized_datasets

Loading cached processed dataset at C:\Users\Raj\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-3dc38a01e511f796.arrow


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
Loading cached processed dataset at C:\Users\Raj\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-c9500a978766f0c3.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [12]:
# next step is to group the tokenized examples into chunks of equal size;
# first look at the tokenizer's maximum sequence length before picking a chunk size
tokenizer.model_max_length

512

In [13]:
# to run our experiments on the GPU we have, we pick a chunk size smaller than the maximum length of 512
# we will use 128 tokens per chunk
chunk_size = 128

In [14]:
# slicing the dataset yields, for each feature, a list holding one list per review
tokenized_samples = tokenized_datasets["train"][:3]
sample_input_ids = tokenized_samples["input_ids"]
# report how many token ids each of the three reviews has
for idx in range(len(sample_input_ids)):
    sample = sample_input_ids[idx]
    print(f"Review {idx} length of input ids: {len(sample)}")

Review 0 length of input ids: 363
Review 1 length of input ids: 304
Review 2 length of input ids: 133


In [15]:
# flatten every feature: join the per-review lists into one long list per feature
concatenated_examples = {}
for k in tokenized_samples.keys():
    concatenated_examples[k] = sum(tokenized_samples[k], [])

# all features have the same flattened length; measure it on input_ids
total_length = len(concatenated_examples["input_ids"])
print(f"Total length of concatenated reviews: {total_length}")

Total length of concatenated reviews: 800


In [16]:
# now we can cut the concatenated examples into consecutive slices of chunk_size items
# (the last slice may be shorter because nothing is dropped here)
chunks = {}
for k in concatenated_examples.keys():
    column = concatenated_examples[k]
    chunks[k] = [column[i:i+chunk_size] for i in range(0, total_length, chunk_size)]

# number of chunks produced for every feature
for k, v in chunks.items():
    print(f"Length of {k} chunks: {len(v)}")

# size of each individual input_ids chunk
for idx, chunk in enumerate(chunks["input_ids"]):
    print(f"Chunk {idx} length of input ids: {len(chunk)}")



Length of input_ids chunks: 7
Length of attention_mask chunks: 7
Length of word_ids chunks: 7
Chunk 0 length of input ids: 128
Chunk 1 length of input ids: 128
Chunk 2 length of input ids: 128
Chunk 3 length of input ids: 128
Chunk 4 length of input ids: 128
Chunk 5 length of input ids: 128
Chunk 6 length of input ids: 32


In [17]:
# let's wrap the above logic in a function
def group_texts(examples, size=None):
    """Concatenate a batch of tokenized examples and re-split it into equal chunks.

    Args:
        examples: mapping from feature name to a list of per-example lists.
            It must contain "input_ids", which determines the total length.
        size: number of items per chunk. When None (the default, and what
            ``Dataset.map`` uses), the module-level ``chunk_size`` is used.

    Returns:
        A dict with the same keys as ``examples``, each holding a list of
        chunks of exactly ``size`` items, plus a "labels" entry containing an
        independent copy of the "input_ids" chunks (the ground truth the model
        is trained on). A trailing remainder shorter than ``size`` is dropped.

    Raises:
        ValueError: if the chunk length is not a positive number.
    """
    # Local import keeps this cell self-contained.
    from itertools import chain

    if size is None:
        size = chunk_size
    if size <= 0:
        raise ValueError(f"chunk size must be positive, got {size}")

    # Flatten each feature in linear time; sum(lists, []) would re-copy the
    # growing result for every example in the batch.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # compute the total length of the concatenated examples
    total_length = len(concatenated_examples["input_ids"])
    # we drop the last chunk if it is smaller than the chunk size
    total_length = (total_length // size) * size
    # split the concatenated examples into chunks of equal size
    result = {
        k: [column[i:i + size] for i in range(0, total_length, size)]
        for k, column in concatenated_examples.items()
    }
    # Labels are the unmasked input ids; copy each chunk so labels never alias input_ids.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result


In [18]:
# apply group_texts to our tokenized datasets using the map() method
# batched=True hands group_texts several examples at once, which it concatenates and re-splits into chunks
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
)
# Bare expression: display the chunked splits and their row counts.
lm_datasets

Loading cached processed dataset at C:\Users\Raj\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-1f4439da88ac132d.arrow


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Loading cached processed dataset at C:\Users\Raj\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-c2f7e7ffd8c7e60a.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [19]:
# let's take a look at the first example: decode the first training chunk back to text
tokenizer.decode(lm_datasets["train"][0]["input_ids"])

'[CLS] i rented i am curious - yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u. s. customs if it ever tried to enter this country, therefore being a fan of films considered " controversial " i really had to see this for myself. < br / > < br / > the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such'

In [20]:
# let's take a look at the second example: decode the second training chunk back to text
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [21]:
# let's take a look at the third example: decode the third training chunk back to text
# (chunks come from concatenated reviews, so one chunk can contain the end of a review and the start of the next)
tokenizer.decode(lm_datasets["train"][2]["input_ids"])

'arguably their answer to good old boy john ford, had sex scenes in his films. < br / > < br / > i do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in america. i am curious - yellow is a good film for anyone wanting to study the meat and potatoes ( no pun intended ) of swedish cinema. but really, this film doesn\'t have much of a plot. [SEP] [CLS] " i am curious : yellow " is a risible and pretentious steaming pile. it doesn'

In [22]:
# let's take a look at the labels of the third example
# labels were created as a copy of input_ids, so they decode to the same text
tokenizer.decode(lm_datasets["train"][2]["labels"])

'arguably their answer to good old boy john ford, had sex scenes in his films. < br / > < br / > i do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in america. i am curious - yellow is a good film for anyone wanting to study the meat and potatoes ( no pun intended ) of swedish cinema. but really, this film doesn\'t have much of a plot. [SEP] [CLS] " i am curious : yellow " is a risible and pretentious steaming pile. it doesn'

In [23]:
# Mask 15% of the input tokens at random so the model can be trained to predict them.
# We use the Hugging Face Transformers data collator for language modeling for this task;
# mlm_probability sets the fraction of tokens selected for masking.
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [37]:
# let's see what masking does to the first two training examples
samples = []
for i in range(2):
    sample = lm_datasets["train"][i]
    # "word_ids" is removed before the examples are handed to the collator
    # NOTE(review): presumably the collator cannot batch this column; confirm
    del sample["word_ids"]
    samples.append(sample)

# run the collator once, then decode each masked sequence of the batch
masked_input_ids = data_collator(samples)["input_ids"]
for chunk in masked_input_ids:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] i rented i am curious - yellow from my video store because of all the controversy that surrounded it when climbing was first released [MASK] 1967 [MASK] i also heard that at first [MASK] was seized by u. s [MASK] customs if it ever tried to enter this country [MASK] therefore being a fan of films considered " controversial " i really had to see this for myself. < br [MASK] > < br / [MASK] the plot [MASK] centered [MASK] a young swedish drama student named [MASK] who [MASK] to learn everything she can about life. in particular she wants [MASK] focus her attentions to making [MASK] sort of documentary on what the average swede thought about [MASK] political issues such'

'>>> as the vietnam war and race [MASK] in the united states. in between asking bobbed [MASK] ordinary [MASK] [MASK]ns [MASK] stockholm about [MASK] opinions on politics [MASK] she has sex with her drama teacher, classmates, [MASK] married men [MASK] < br [MASK] [MASK] < [MASK] / > what kills me about [MASK] 

In [38]:
# Inspect the raw examples that were passed to the collator ("word_ids" already removed).
samples

[{'input_ids': [101,
   1045,
   12524,
   1045,
   2572,
   8025,
   1011,
   3756,
   2013,
   2026,
   2678,
   3573,
   2138,
   1997,
   2035,
   1996,
   6704,
   2008,
   5129,
   2009,
   2043,
   2009,
   2001,
   2034,
   2207,
   1999,
   3476,
   1012,
   1045,
   2036,
   2657,
   2008,
   2012,
   2034,
   2009,
   2001,
   8243,
   2011,
   1057,
   1012,
   1055,
   1012,
   8205,
   2065,
   2009,
   2412,
   2699,
   2000,
   4607,
   2023,
   2406,
   1010,
   3568,
   2108,
   1037,
   5470,
   1997,
   3152,
   2641,
   1000,
   6801,
   1000,
   1045,
   2428,
   2018,
   2000,
   2156,
   2023,
   2005,
   2870,
   1012,
   1026,
   7987,
   1013,
   1028,
   1026,
   7987,
   1013,
   1028,
   1996,
   5436,
   2003,
   8857,
   2105,
   1037,
   2402,
   4467,
   3689,
   3076,
   2315,
   14229,
   2040,
   4122,
   2000,
   4553,
   2673,
   2016,
   2064,
   2055,
   2166,
   1012,
   1999,
   3327,
   2016,
   4122,
   2000,
   3579,
   2014,
   3086,
   20

In [40]:
# let's take a look at the first two examples of the training dataset, including their labels
# NOTE: this rebinds `samples`; slicing the dataset returns one dict of feature lists
# rather than the list of per-example dicts built earlier.
samples = lm_datasets["train"][:2]
samples

{'input_ids': [[101,
   1045,
   12524,
   1045,
   2572,
   8025,
   1011,
   3756,
   2013,
   2026,
   2678,
   3573,
   2138,
   1997,
   2035,
   1996,
   6704,
   2008,
   5129,
   2009,
   2043,
   2009,
   2001,
   2034,
   2207,
   1999,
   3476,
   1012,
   1045,
   2036,
   2657,
   2008,
   2012,
   2034,
   2009,
   2001,
   8243,
   2011,
   1057,
   1012,
   1055,
   1012,
   8205,
   2065,
   2009,
   2412,
   2699,
   2000,
   4607,
   2023,
   2406,
   1010,
   3568,
   2108,
   1037,
   5470,
   1997,
   3152,
   2641,
   1000,
   6801,
   1000,
   1045,
   2428,
   2018,
   2000,
   2156,
   2023,
   2005,
   2870,
   1012,
   1026,
   7987,
   1013,
   1028,
   1026,
   7987,
   1013,
   1028,
   1996,
   5436,
   2003,
   8857,
   2105,
   1037,
   2402,
   4467,
   3689,
   3076,
   2315,
   14229,
   2040,
   4122,
   2000,
   4553,
   2673,
   2016,
   2064,
   2055,
   2166,
   1012,
   1999,
   3327,
   2016,
   4122,
   2000,
   3579,
   2014,
   3086,
   20