# Imports

In [None]:
# Mount Google Drive so that files under /content/drive (the dataset and the
# saved model directory used later in this notebook) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture

!pip install transformers datasets evaluate
!pip install --upgrade accelerate

In [None]:
from datasets import DatasetDict, Dataset, load_dataset, load_metric
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

# Instantiate model

In [None]:
from transformers import AutoModelForMultipleChoice, AutoTokenizer, TrainingArguments, Trainer
# Checkpoint name; reused below to load the matching tokenizer and to name the output directory.
model_checkpoint = "bert-base-uncased"
# Load the checkpoint with a multiple-choice head. The load warning recorded below
# shows the classifier weights are newly initialized, so the model must be fine-tuned.
model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

# Per-device batch size, passed to TrainingArguments for both training and evaluation.
batch_size = 16

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fast tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Load and preprocess data

In [None]:
# Load the training records from Drive; the loop in the next cell treats each
# element as a dict.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# so only use it on files from a trusted source.
data = np.load('/content/drive/MyDrive/brain_teaser/datasets/WP-train.npy', allow_pickle=True)

In [None]:
# Column-oriented accumulator: one (initially empty) list per record field.
field_names = ['id', 'question', 'answer', 'distractor1', 'distractor2',
               'distractor(unsure)', 'label', 'choice_list', 'choice_order']
data_dict = {name: [] for name in field_names}

# Transpose the records into columns. A record carrying a field that is not
# listed above raises KeyError.
for record in data:
  for field, value in record.items():
    data_dict[field].append(value)

In [None]:
# Build the frame with an explicit column order, then discard the individual
# distractor columns ('choice_list' is kept).
all_columns = ['id', 'question', 'answer', 'distractor1', 'distractor2', 'distractor(unsure)', 'label', 'choice_list', 'choice_order']
distractor_columns = ['distractor1','distractor2', 'distractor(unsure)']
df = pd.DataFrame(data_dict, columns=all_columns).drop(columns=distractor_columns)
df

Unnamed: 0,id,question,answer,label,choice_list,choice_order
0,WP-0,How do you spell COW in thirteen letters?,SEE O DOUBLE YOU.,1,"[SEE OH DEREFORD, SEE O DOUBLE YOU., COWCOWCOW...","[2, 0, 1, 3]"
1,WP-0_SR,"In thirteen letters, how do you spell COW?",SEE O DOUBLE YOU.,2,"[SEE OH DEREFORD, COWCOWCOWCOWW, SEE O DOUBLE ...","[2, 1, 0, 3]"
2,WP-0_CR,How do you spell COB in seven letters?,SEE O BEE,2,"[COBCOBB, COBBLER, SEE O BEE, None of above.]","[1, 2, 0, 3]"
3,WP-1,"If eleven plus two equals one, what does nine ...",Two.,1,"[Four., Two., Three., None of above.]","[2, 0, 1, 3]"
4,WP-1_SR,What does nine plus five equal if eleven plus ...,Two.,1,"[Three., Two., Four., None of above.]","[1, 0, 2, 3]"
...,...,...,...,...,...,...
391,WP-162_SR,Which sort of bell doesn't ring?,A dumbbell.,0,"[A dumbbell., A doorbell., A cowbell., None of...","[0, 1, 2, 3]"
392,WP-162_CR,"What kind of ""stone"" can't be found in a quarry?",Milestone.,2,"[Cobblestone., Sandstone., Milestone., None of...","[1, 2, 0, 3]"
393,WP-163,What type of ice never melts?,Dice.,0,"[Dice., Flaked ice., Glacier ice., None of abo...","[0, 2, 1, 3]"
394,WP-163_SR,What kind of ice doesn't melt?,Dice.,1,"[Glacier ice., Dice., Flaked ice., None of abo...","[1, 0, 2, 3]"


In [None]:
# Shuffle all rows with a fixed seed (frac=1 keeps every row) and renumber the index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Convert the shuffled frame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

In [None]:
def show_one(example):
    """Print a question, its lettered answer choices, and the correct letter.

    Args:
        example: Mapping with a 'question' string, a 'choice_list' sequence of
            answer strings, and an integer 'label' indexing into 'choice_list'.

    Letters are assigned from 'A' upward, one per entry of 'choice_list', so
    the function works for any number of choices instead of exactly four.
    Each choice is stripped of surrounding whitespace before printing.
    """
    choices = example['choice_list']
    letters = [chr(ord('A') + position) for position in range(len(choices))]

    print(f"Question: {example['question']}")
    for letter, choice in zip(letters, choices):
        print(f"  {letter} - {choice.strip()}")
    print(f"\nGround truth: option {letters[example['label']]}")

In [None]:
# Sanity check: print the first example of the shuffled dataset.
show_one(dataset[0])

Question: What bird is very rude?
  A - Humming bird.
  B - Eagle.
  C - Mocking bird.
  D - None of above.

Ground truth: option C


In [None]:
def preprocess_function(example):
  """Tokenize a batch of puzzles as (question, choice) text pairs.

  Relies on the module-level `tokenizer`.

  Args:
    example: A batch mapping whose 'question' entry is a list of strings and
      whose 'choice_list' entry is a parallel list of answer-choice lists.

  Returns:
    A dict with one entry per tokenizer output field. Each value is a list
    with one item per puzzle, and each item is the list of encodings for
    that puzzle's choices (in 'choice_list' order).
  """
  choice_lists = example['choice_list']

  # Repeat each question once per choice so both flat lists line up pairwise.
  flat_questions = [
      question
      for question, choices in zip(example['question'], choice_lists)
      for _ in choices
  ]
  flat_choices = [choice for choices in choice_lists for choice in choices]

  tokenized_example = tokenizer(flat_questions, flat_choices, truncation=True)

  # Slice boundaries that regroup the flat encodings per puzzle; derived from
  # the actual choice counts rather than a fixed stride of 4.
  bounds = [0]
  for choices in choice_lists:
    bounds.append(bounds[-1] + len(choices))

  return {
      k: [v[start:end] for start, end in zip(bounds, bounds[1:])]
      for k, v in tokenized_example.items()
  }

In [None]:
# Smoke-test the preprocessing on the first five puzzles.
examples = dataset[:5]
features = preprocess_function(examples)
# Prints: number of puzzles, number of encodings for the first puzzle, and the
# token count of each of those encodings.
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]])

['What bird is very rude?', 'What bird is very rude?', 'What bird is very rude?', 'What bird is very rude?', 'What type of currency do vampires use?', 'What type of currency do vampires use?', 'What type of currency do vampires use?', 'What type of currency do vampires use?', 'Guess next letters in the series GNL.', 'Guess next letters in the series GNL.', 'Guess next letters in the series GNL.', 'Guess next letters in the series GNL.', 'What is at the end of a cow and in front of a woman?', 'What is at the end of a cow and in front of a woman?', 'What is at the end of a cow and in front of a woman?', 'What is at the end of a cow and in front of a woman?', "Which sort of bell doesn't ring?", "Which sort of bell doesn't ring?", "Which sort of bell doesn't ring?", "Which sort of bell doesn't ring?"] 20
['Humming bird.', 'Eagle.', 'Mocking bird.', 'None of above.', 'New money.', 'Blood money.', 'Old money.', 'None of above.', 'T,M,G.', 'I, T, S.\n', 'T,N,T.', 'None of above.', 'The letter

In [None]:
# Tokenize the whole dataset in batches.
encoded_dataset = dataset.map(preprocess_function, batched=True)
# Hold out 10% of the rows as the evaluation split, with a fixed seed.
dataset_train_test = encoded_dataset.train_test_split(test_size=0.1, seed=42)

Map:   0%|          | 0/396 [00:00<?, ? examples/s]

['What bird is very rude?', 'What bird is very rude?', 'What bird is very rude?', 'What bird is very rude?', 'What type of currency do vampires use?', 'What type of currency do vampires use?', 'What type of currency do vampires use?', 'What type of currency do vampires use?', 'Guess next letters in the series GNL.', 'Guess next letters in the series GNL.', 'Guess next letters in the series GNL.', 'Guess next letters in the series GNL.', 'What is at the end of a cow and in front of a woman?', 'What is at the end of a cow and in front of a woman?', 'What is at the end of a cow and in front of a woman?', 'What is at the end of a cow and in front of a woman?', "Which sort of bell doesn't ring?", "Which sort of bell doesn't ring?", "Which sort of bell doesn't ring?", "Which sort of bell doesn't ring?", 'How did the champ repair the leaking facet?', 'How did the champ repair the leaking facet?', 'How did the champ repair the leaking facet?', 'How did the champ repair the leaking facet?', 'Wh

In [None]:
idx = 0
# Fetch the row once; indexing the "input_ids" column inside the comprehension
# would re-read the whole column on every iteration.
choice_encodings = encoded_dataset[idx]["input_ids"]
# Decode each choice's token ids back to text to check the pairing.
[tokenizer.decode(encoding) for encoding in choice_encodings]

['[CLS] what bird is very rude? [SEP] humming bird. [SEP]',
 '[CLS] what bird is very rude? [SEP] eagle. [SEP]',
 '[CLS] what bird is very rude? [SEP] mocking bird. [SEP]',
 '[CLS] what bird is very rude? [SEP] none of above. [SEP]']

In [None]:
# Display the column names and row counts of the train/test split.
dataset_train_test

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'label', 'choice_list', 'choice_order', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 356
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'label', 'choice_list', 'choice_order', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40
    })
})

# Train models

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each incoming feature maps every field to a list with one entry per answer
    choice, plus an optional "label"/"labels" entry. The collator flattens the
    choices, pads them together with `tokenizer.pad`, reshapes every padded
    tensor to (batch_size, num_choices, -1) and, when labels are present, adds
    them under "labels" as an int64 tensor.

    The input feature dicts are not modified, so the same list can be collated
    more than once.
    """

    # Tokenizer whose `pad` method pads the flattened encodings.
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch."""
        label_name = "label" if "label" in features[0] else "labels"
        has_labels = label_name in features[0]
        # Read the labels without popping so the caller's dicts stay intact.
        labels = [feature[label_name] for feature in features] if has_labels else None

        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice) pair, built in a single pass.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels (omitted for label-free batches).
        if labels is not None:
            batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Try the collator on the first ten training rows, keeping only the listed columns.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in dataset_train_test["train"][i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Decode the four choices of batch item 8 to inspect the padding added by the collator.
[tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(4)]

['[CLS] what bird is very rude? [SEP] humming bird. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] what bird is very rude? [SEP] eagle. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] what bird is very rude? [SEP] mocking bird. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] what bird is very rude? [SEP] none of above. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']

In [None]:
import numpy as np

def compute_metrics(eval_predictions):
    """Return the accuracy of the highest-scoring choice per example.

    Args:
        eval_predictions: A pair ``(predictions, label_ids)``; the index of
            the largest score along axis 1 of ``predictions`` is compared
            with ``label_ids``.

    Returns:
        ``{"accuracy": <float>}`` — the fraction of matching examples.
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    accuracy = hits.mean().item()
    return {"accuracy": accuracy}

In [None]:
# Name the output directory after the checkpoint (the text after the last "/").
model_name = model_checkpoint.split("/")[-1]
# Training configuration: evaluate and checkpoint once per epoch, and keep the
# checkpoint with the best "accuracy" (the key returned by compute_metrics).
# NOTE(review): the training log recorded in this notebook shows "No log" for the
# training loss even though logging_steps=10 — the saved outputs may predate the
# current arguments; re-run to confirm.
args = TrainingArguments(
    output_dir=f"{model_name}-WP-finetuned",
    seed=0,
    # NOTE(review): newer transformers releases appear to rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_steps=10,
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model="accuracy",
)

In [None]:
# Wire the model, arguments, data splits, collator and metric into a Trainer.
# The held-out "test" split doubles as the per-epoch evaluation set.
# NOTE(review): newer transformers releases appear to deprecate the `tokenizer`
# argument of Trainer — confirm against the installed version.
trainer = Trainer(
    model,
    args,
    train_dataset=dataset_train_test["train"],
    eval_dataset=dataset_train_test["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

## <font color=yellow>BERT-base</font>

### <font color=green>word puzzle</font>

In [None]:
# Fine-tune the model; evaluation runs at the end of every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.787947,0.675
2,No log,0.419304,0.775
3,No log,0.314118,0.85
4,No log,0.574959,0.825
5,No log,0.585125,0.85
6,No log,0.688956,0.825
7,No log,0.685295,0.825
8,No log,0.738119,0.875
9,No log,0.682928,0.85
10,No log,0.704824,0.825


TrainOutput(global_step=230, training_loss=0.19897376350734544, metrics={'train_runtime': 185.5141, 'train_samples_per_second': 19.19, 'train_steps_per_second': 1.24, 'total_flos': 427461182498784.0, 'train_loss': 0.19897376350734544, 'epoch': 10.0})

In [None]:
# Persist the trainer's current model to Google Drive.
trainer.save_model("/content/drive/MyDrive/brain_teaser/final_best_models/BERT_base_uncased_word_puzzle")

In [None]:
# Evaluate the trainer's current model on the held-out split.
# NOTE(review): the recorded output (eval_loss 0.314, accuracy 0.85) matches
# epoch 3 of the training log, not the highest-accuracy epoch (epoch 8, 0.875).
# The saved outputs may come from a run with different arguments; re-run to confirm.
trainer.evaluate()

{'eval_loss': 0.31411826610565186,
 'eval_accuracy': 0.8500000238418579,
 'eval_runtime': 0.6505,
 'eval_samples_per_second': 61.49,
 'eval_steps_per_second': 4.612,
 'epoch': 10.0}