#### Understanding MCQ Data Processing for BERT



In [214]:
# Load the example multiple-choice questions (one row per question, with a prompt,
# five options A-E and the correct answer letter) and preview the first rows
import pandas as pd

train_df = pd.read_csv('./data/custom_example.csv')
train_df.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,My pet always barks,it is a dog,it is a cat,it is a monkey,it is a horse,it is a rabbit,A
1,1,My pet always meow,it is a dog,it is a cat,it is a monkey,it is a horse,it is a rabbit,B


In [215]:
# Show every field of the first question, one "column: value" line per field
for column in ('id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer'):
    print(f'{column}:', train_df[column].iloc[0])

id: 0
prompt: My pet always barks
A: it is a dog
B: it is a cat
C: it is a monkey
D: it is a horse
E: it is a rabbit
answer: A


In [216]:
# For convenience we'll turn our pandas DataFrame into a `datasets.Dataset`,
# whose `.map()` method is used further down to tokenize each example
from datasets import Dataset
train_ds = Dataset.from_pandas(train_df)

In [217]:
train_ds

Dataset({
    features: ['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_rows: 2
})

In [218]:
train_ds[0]

{'id': 0,
 'prompt': 'My pet always barks',
 'A': 'it is a dog',
 'B': 'it is a cat',
 'C': 'it is a monkey',
 'D': 'it is a horse',
 'E': 'it is a rabbit',
 'answer': 'A'}

In [219]:
# Load the tokenizer for 'bert-base-uncased', the same checkpoint name the
# model cell further down loads
from transformers import AutoTokenizer
import pprint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [220]:
# Lookup tables between option names (A, B, C, D, E) and their integer indices.
# `indices` is computed from `options`, so both always have the same length.
options = 'ABCDE'
indices = list(range(len(options)))

option_to_index = dict(zip(options, indices))
index_to_option = dict(zip(indices, options))

def preprocess(example):
    """Tokenize one multiple-choice example as (prompt, option) sentence pairs.

    Args:
        example: Mapping with a 'prompt' string, one string per option name
            listed in `options`, and an 'answer' holding the correct option name.

    Returns:
        The tokenizer output for the (prompt, option) pairs, with an extra
        'label' entry holding the index of the correct option.

    Raises:
        KeyError: If an option column is missing from `example`, or if
            `example['answer']` is not a key of `option_to_index`.
    """
    # One candidate answer per option name, in the order given by `options`
    second_sentence = [example[option] for option in options]

    # The AutoModelForMultipleChoice class expects a set of question/answer pairs,
    # so the prompt is repeated once per option (derived from `options` rather
    # than a hard-coded 5, which would mismatch if the option set ever changed)
    first_sentence = [example['prompt']] * len(second_sentence)

    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]

    return tokenized_example

# Tokenize one example at a time; the raw text columns are dropped once encoded
tokenized_train_ds = train_ds.map(
    preprocess,
    batched=False,
    remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [221]:
# Print every tokenized entry to visualize what preprocessing produced
for i, entry in enumerate(tokenized_train_ds):
    print(f"Entry {i}: {entry}")

Entry 0: {'id': 0, 'input_ids': [[101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 3899, 102], [101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 4937, 102], [101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 10608, 102], [101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 3586, 102], [101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 10442, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'label': 0}
Entry 1: {'id': 1, 'input_ids': [[101, 2026, 9004, 2467, 2033, 5004, 102, 2009, 2003, 1037, 3899, 102], [101, 2026, 9004, 2467, 2033, 5004, 102, 2009, 2003, 1037, 4937, 102

In [222]:
# Token IDs for every example: five encoded (prompt, option) sequences per question
tokenized_train_ds['input_ids']

[[[101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 3899, 102],
  [101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 4937, 102],
  [101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 10608, 102],
  [101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 3586, 102],
  [101, 2026, 9004, 2467, 11286, 2015, 102, 2009, 2003, 1037, 10442, 102]],
 [[101, 2026, 9004, 2467, 2033, 5004, 102, 2009, 2003, 1037, 3899, 102],
  [101, 2026, 9004, 2467, 2033, 5004, 102, 2009, 2003, 1037, 4937, 102],
  [101, 2026, 9004, 2467, 2033, 5004, 102, 2009, 2003, 1037, 10608, 102],
  [101, 2026, 9004, 2467, 2033, 5004, 102, 2009, 2003, 1037, 3586, 102],
  [101, 2026, 9004, 2467, 2033, 5004, 102, 2009, 2003, 1037, 10442, 102]]]

In [223]:
# Segment IDs: 0 for the prompt tokens (up to and including the first [SEP]),
# 1 for the option tokens that follow
tokenized_train_ds['token_type_ids']

[[[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]],
 [[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]]

In [224]:
# Attention masks: all 1s at this stage, since no padding tokens have been added yet
tokenized_train_ds['attention_mask']

[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]

In [225]:
# Integer labels: index of the correct option for each question (A -> 0, B -> 1, ...)
tokenized_train_ds['label']

[0, 1]

In [226]:
# Decode every example back into text to see exactly what the model receives
for i in range(len(tokenized_train_ds)):
    input_ids = tokenized_train_ds[i]['input_ids']

    # A multiple-choice entry holds one ID list per option: concatenate them
    # into a single flat sequence so they can be decoded in one call
    if isinstance(input_ids[0], list):
        input_ids = sum(input_ids, [])

    # Special tokens are kept so the [CLS]/[SEP] boundaries stay visible
    decoded_text = tokenizer.decode(input_ids, skip_special_tokens=False)
    print(f"Decoded Entry {i}: {decoded_text}")

Decoded Entry 0: [CLS] my pet always barks [SEP] it is a dog [SEP] [CLS] my pet always barks [SEP] it is a cat [SEP] [CLS] my pet always barks [SEP] it is a monkey [SEP] [CLS] my pet always barks [SEP] it is a horse [SEP] [CLS] my pet always barks [SEP] it is a rabbit [SEP]
Decoded Entry 1: [CLS] my pet always meow [SEP] it is a dog [SEP] [CLS] my pet always meow [SEP] it is a cat [SEP] [CLS] my pet always meow [SEP] it is a monkey [SEP] [CLS] my pet always meow [SEP] it is a horse [SEP] [CLS] my pet always meow [SEP] it is a rabbit [SEP]


In [227]:
### Example to Understand:
# from dataclasses import dataclass
# from typing import Union, Optional
# import torch
# from transformers import PreTrainedTokenizerBase, BertTokenizer
# from torch.utils.data import DataLoader
# from datasets import Dataset

# # Define the DataCollatorForMultipleChoice class
# @dataclass
# class DataCollatorForMultipleChoice:
#     tokenizer: PreTrainedTokenizerBase
#     padding: Union[bool, str] = True
#     max_length: Optional[int] = None
#     pad_to_multiple_of: Optional[int] = None
    
#     def __call__(self, features):
#         label_name = "label" if 'label' in features[0].keys() else 'labels'
#         labels = [feature.pop(label_name) for feature in features]
#         batch_size = len(features)
#         num_choices = len(features[0]['input_ids'])  # Access the number of choices

#         # Flatten the features for each choice
#         flattened_features = []
#         for feature in features:
#             for i in range(num_choices):
#                 flattened_features.append({
#                     'input_ids': feature['input_ids'][i],
#                     'attention_mask': feature['attention_mask'][i]
#                 })
        
#         # Pad the sequences
#         batch = self.tokenizer.pad(
#             flattened_features,
#             padding=self.padding,
#             max_length=self.max_length,
#             pad_to_multiple_of=self.pad_to_multiple_of,
#             return_tensors='pt',
#         )

#         # Reshape the batch
#         batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
#         batch['labels'] = torch.tensor(labels, dtype=torch.int64)
#         return batch

# # Load a pre-trained tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Create a sample MCQ dataset
# data = {
#     'input_ids': [
#         # Each question has four choices represented as lists of token IDs.
#         [
#             [101, 2054, 2003, 1996, 2851, 1997, 2000, 2017, 102],  # Choice 1: "Paris"
#             [101, 2003, 2129, 1997, 2017, 2000, 2018, 102],  # Choice 2: "London"
#             [101, 2003, 1996, 2798, 2003, 2061, 1997, 2000, 102],  # Choice 3: "Berlin"
#             [101, 2054, 2003, 1996, 3451, 2000, 2017, 102],  # Choice 4: "Madrid"
#         ],
#         [
#             [101, 2054, 2003, 1996, 2015, 2001, 2004, 102],  # Choice 1: "3"
#             [101, 2054, 2003, 1996, 2040, 2001, 2004, 102],  # Choice 2: "4"
#             [101, 2054, 2003, 1996, 2000, 2054, 102],  # Choice 3: "5"
#             [101, 2054, 2003, 1996, 2045, 2001, 2004, 102],  # Choice 4: "6"
#         ]
#     ],
#     'attention_mask': [
#         [
#             [1] * 9,  # Attention mask for Choice 1
#             [1] * 8,  # Attention mask for Choice 2
#             [1] * 9,  # Attention mask for Choice 3
#             [1] * 8   # Attention mask for Choice 4
#         ],
#         [
#             [1] * 8,  # Attention mask for Choice 1
#             [1] * 8,  # Attention mask for Choice 2
#             [1] * 7,  # Attention mask for Choice 3
#             [1] * 8   # Attention mask for Choice 4
#         ]
#     ],
#     'label': [0, 1]  # Correct answer index for each question
# }

# # Create a Dataset from the data
# tokenized_train_ds = Dataset.from_dict(data)

# # Create a DataLoader using the DataCollator
# data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer, padding=True)
# dataloader = DataLoader(tokenized_train_ds, batch_size=2, collate_fn=data_collator)

# # Get a batch from the DataLoader
# batch = next(iter(dataloader))

# # Print the resulting batch
# print(batch)

'''
the batch output will look like this for batch_size=1:

{'input_ids': tensor([[[ 101, 2054, 2003, 1996, 2851, 1997, 2000, 2017,  102],
         [ 101, 2003, 2129, 1997, 2017, 2000, 2018,  102,    0],
         [ 101, 2003, 1996, 2798, 2003, 2061, 1997, 2000,  102],
         [ 101, 2054, 2003, 1996, 3451, 2000, 2017,  102,    0]]]), 'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 0]]]), 'labels': tensor([0])}

the batch output will look like this for batch_size=2:

{'input_ids': tensor([[[ 101, 2054, 2003, 1996, 2851, 1997, 2000, 2017,  102],
         [ 101, 2003, 2129, 1997, 2017, 2000, 2018,  102,    0],
         [ 101, 2003, 1996, 2798, 2003, 2061, 1997, 2000,  102],
         [ 101, 2054, 2003, 1996, 3451, 2000, 2017,  102,    0]],

        [[ 101, 2054, 2003, 1996, 2015, 2001, 2004,  102,    0],
         [ 101, 2054, 2003, 1996, 2040, 2001, 2004,  102,    0],
         [ 101, 2054, 2003, 1996, 2000, 2054,  102,    0,    0],
         [ 101, 2054, 2003, 1996, 2045, 2001, 2004,  102,    0]]]), 'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 0]],

        [[1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 0]]]), 'labels': tensor([0, 1])}
         
''';

In [228]:
# Following datacollator (adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice)
# will dynamically pad our questions at batch-time so we don't have to make every question the length
# of our longest question.
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice examples into padded (batch, choices, seq_len) tensors.

    Each incoming feature holds, for every non-label key, one sequence per
    answer choice. The choices of all examples are flattened into a single
    list, padded together by ``tokenizer.pad``, and then reshaped back so the
    first two dimensions are (batch_size, num_choices).
    """

    # Tokenizer whose ``pad`` method performs the padding
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to ``tokenizer.pad``
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Build one batch from a list of tokenized examples.

        Args:
            features: Non-empty list of dicts. Each dict has a 'label' (or
                'labels') entry plus per-choice lists such as 'input_ids'.
                The dicts are not modified.

        Returns:
            Dict of tensors reshaped to (batch_size, num_choices, -1), plus a
            'labels' int64 tensor with one entry per example.
        """
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        # Read the labels without pop() so the caller's feature dicts stay
        # intact and the same features can be collated more than once
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # One flat entry per (example, choice), example-major, built in a single
        # pass (no repeated list concatenation); the label key is skipped here
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Undo the flattening: (batch_size * num_choices, L) -> (batch_size, num_choices, L)
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [229]:
# Now we'll instantiate the model that we'll fine-tune on our public dataset, then use to
# make predictions on the private dataset.
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
model = AutoModelForMultipleChoice.from_pretrained('bert-base-uncased')

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [230]:
# The arguments here are selected to run quickly; feel free to play with them.
# Evaluation and checkpoint saving are both set to run once per epoch; the two
# strategies are kept identical because load_best_model_at_end=True is enabled.
model_dir = 'finetuned_bert'
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=2,  # the example dataset has only 2 rows
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to='none'  # do not report to any experiment-tracking integration
)



In [231]:
# Generally it's a bad idea to validate on your training set, but because our training set
# for this problem is so small we're going to train on all our data.
# NOTE: eval_dataset is the same dataset as train_dataset, so the "Validation Loss"
# reported during training is measured on the training examples themselves.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

In [232]:
# Training should take about a minute
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.610346
2,No log,1.597996
3,No log,1.599787
4,No log,1.586468
5,No log,1.568846
6,No log,1.558079
7,No log,1.547341
8,No log,1.537585
9,No log,1.528217
10,No log,1.524506


TrainOutput(global_step=10, training_loss=1.6088575363159179, metrics={'train_runtime': 23.5736, 'train_samples_per_second': 0.848, 'train_steps_per_second': 0.424, 'total_flos': 616660999200.0, 'train_loss': 1.6088575363159179, 'epoch': 10.0})

In [233]:
# Now we can actually make predictions on our questions
predictions = trainer.predict(tokenized_train_ds)

In [234]:
# Raw prediction output: `predictions` holds one row of logits per question
# (one logit per option), alongside `label_ids` and evaluation `metrics`
predictions

PredictionOutput(predictions=array([[ 0.09753699, -0.06091017, -0.10660436, -0.02741827, -0.05923901],
       [-0.073296  , -0.01930785, -0.10353055, -0.03636026, -0.08368777]],
      dtype=float32), label_ids=array([0, 1]), metrics={'test_loss': 1.5245064496994019, 'test_runtime': 0.0673, 'test_samples_per_second': 29.733, 'test_steps_per_second': 14.866})

In [235]:
# Predicted class: index of the highest logit in each row.
# numpy is imported in this cell because it is the first cell that uses `np`;
# without it a fresh-kernel "Run All" stops here with a NameError.
import numpy as np
np.argmax(predictions.predictions,axis=1)

array([0, 1])

In [236]:
# Option indices for each question, ranked from highest to lowest logit
# (the logits are negated so that argsort's ascending order becomes descending)
np.argsort(-predictions.predictions)

array([[0, 3, 4, 1, 2],
       [1, 3, 0, 4, 2]])

In [245]:
# The following function gets the indices of the highest scoring answers for each row
# and converts them back to our answer format (A, B, C, D, E)
import numpy as np
def predictions_to_map_output(predictions, top_k=3, verbose=True):
    """Convert per-option scores into space-separated ranked answer letters.

    Args:
        predictions: 2-D array-like of shape (num_questions, num_options) in
            which a higher score means a more likely option.
        top_k: How many of the best-scoring options to keep per question.
            Defaults to 3, the value that used to be hard-coded.
        verbose: When True (the default) print the intermediate arrays, as
            the walkthrough has always done; pass False to silence them.

    Returns:
        1-D numpy array of strings such as 'A D E', one per question, best
        option first. An input with zero rows yields an empty array.

    Raises:
        KeyError: If a column index has no entry in `index_to_option`.
    """
    predictions = np.asarray(predictions)

    # Negate so that argsort's ascending order ranks the highest score first
    sorted_answer_indices = np.argsort(-predictions)
    if verbose:
        print('sorted indices:')
        print(sorted_answer_indices)

    top_answer_indices = sorted_answer_indices[:, :top_k]  # Best `top_k` options in each row
    if verbose:
        print('top answer indices:')
        print(top_answer_indices)

    # Plain dict lookups instead of np.vectorize / np.apply_along_axis, both of
    # which raise on zero-size input
    top_answers = np.array(
        [[index_to_option[index] for index in row] for row in top_answer_indices.tolist()],
        dtype=str,
    )
    if verbose:
        print('top_answers:')
        print(top_answers)

    return np.array([' '.join(row) for row in top_answers.tolist()], dtype=str)
    

In [246]:
# Let's double check our output looks correct:
predictions_to_map_output(predictions.predictions)

sorted indices:
[[0 3 4 1 2]
 [1 3 0 4 2]]
top answer indices:
[[0 3 4]
 [1 3 0]]
top_answers:
[['A' 'D' 'E']
 ['B' 'D' 'A']]


array(['A D E', 'B D A'], dtype='<U5')