# Imports

In [None]:
# Mount Google Drive at /content/drive (Colab-only). The dataset read later and the
# final saved model both use paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture

!pip install transformers datasets evaluate
!pip install --upgrade accelerate

In [None]:
from datasets import DatasetDict, Dataset, load_dataset, load_metric
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

# Instantiate model

In [None]:
from transformers import RobertaForMultipleChoice, AutoTokenizer, TrainingArguments, Trainer, set_seed

# Checkpoint name shared by the model and the tokenizer.
model_checkpoint = "roberta-large"

# The multiple-choice classifier head is not part of the checkpoint and is newly
# (randomly) initialised on load, so seed the RNGs *before* building the model to make
# runs repeatable. 0 matches the seed passed to TrainingArguments further down.
set_seed(0)
model = RobertaForMultipleChoice.from_pretrained(model_checkpoint)

# Per-device batch size, used for both training and evaluation.
batch_size = 8

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the (fast) tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Load and preprocess data

In [None]:
# Load the sentence-puzzle training records from Drive.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects and can execute
# code while loading — only use it on files from a trusted source.
data = np.load('/content/drive/MyDrive/brain_teaser/datasets/BT_data/SP-train.npy', allow_pickle=True)

In [None]:
# Pivot the list of per-puzzle records into a column-oriented dict: one list per
# field, filled in record order. A record carrying an unknown field raises KeyError.
data_dict = {
    column: []
    for column in (
        'id',
        'question',
        'answer',
        'distractor1',
        'distractor2',
        'distractor(unsure)',
        'label',
        'choice_list',
        'choice_order',
    )
}
for record in data:
    for field, value in record.items():
        data_dict[field].append(value)

In [None]:
# Build the frame with an explicit column order, then discard the raw distractor
# columns (the candidate answers are already available through 'choice_list').
df = (
    pd.DataFrame(
        data_dict,
        columns=[
            'id', 'question', 'answer',
            'distractor1', 'distractor2', 'distractor(unsure)',
            'label', 'choice_list', 'choice_order',
        ],
    )
    .drop(columns=['distractor1', 'distractor2', 'distractor(unsure)'])
)
df

Unnamed: 0,id,question,answer,label,choice_list,choice_order
0,SP-0,Mr. and Mrs. Mustard have six daughters and ea...,Each daughter shares the same brother.,1,[Some daughters get married and have their own...,"[1, 0, 2, 3]"
1,SP-0_SR,The six daughters of Mr. and Mrs. Mustard each...,Each daughter shares the same brother.,2,[Some brothers were not loved by family and mo...,"[2, 1, 0, 3]"
2,SP-0_CR,"A chess team has five players, and each player...",Each player shares the same coach.,0,"[Each player shares the same coach., Some play...","[0, 2, 1, 3]"
3,SP-1,A woman shoots her husband. Then she holds him...,The woman was a photographer. She shot a pictu...,2,[The woman gets arrested for murder after dinn...,"[1, 2, 0, 3]"
4,SP-1_SR,An individual shoots their spouse. She continu...,The woman was a photographer. She shot a pictu...,1,[The woman gets arrested for murder after dinn...,"[1, 0, 2, 3]"
...,...,...,...,...,...,...
502,SP-207_SR,"Ten pears hung high, ten men passed past, Each...",EACH is the name of one of the men.,2,"[Other men can't reach the pear., Other men di...","[1, 2, 0, 3]"
503,SP-207_CR,There were twenty boys with different names in...,All is the name of one of the boys.,1,"[Other boys don't have names., All is the name...","[2, 0, 1, 3]"
504,SP-208,"The more you take, the more you leave behind",Footsteps.,1,"[Love., Footsteps., Money., None of above.]","[1, 0, 2, 3]"
505,SP-208_SR,"The more you take, the more you abandon.",Footsteps.,1,"[Love., Footsteps., Money., None of above.]","[1, 0, 2, 3]"


In [None]:
# Shuffle all rows with a fixed seed (reproducible order), renumber the index, and
# wrap the result as a Hugging Face Dataset.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
dataset = Dataset.from_pandas(df)

In [None]:
def show_one(example):
    """Print one puzzle: its question, the four lettered choices and the gold answer.

    Args:
        example: mapping with a 'question' string, a 'choice_list' holding (at least)
            four strings, and an integer 'label' indexing the correct choice.
    """
    letters = "ABCD"
    print(f"Question: {example['question']}")
    for position, letter in enumerate(letters):
        # Choices are stripped so stray whitespace in the data does not show up.
        print(f"  {letter} - {example['choice_list'][position].strip()}")
    print(f"\nGround truth: option {letters[example['label']]}")

In [None]:
# Eyeball the first (shuffled) example to confirm the fields look as expected.
show_one(dataset[0])

Question: How can a man goes to football team every day but doesn't play football at all.
  A - He is a coach.
  B - Every day the weather is raining.
  C - The football team will play tennis sometime.
  D - None of above.

Ground truth: option A


In [None]:
def preprocess_function(example, num_choices=4):
    """Tokenize a batch of puzzles as (question, choice) pairs for multiple choice.

    Every question is paired with each of its candidate answers, all pairs are
    tokenized in one call to the module-level ``tokenizer``, and the flat result is
    regrouped so that each question owns ``num_choices`` encodings.

    Args:
        example: batch dict (the form ``Dataset.map(..., batched=True)`` passes in)
            with a ``'question'`` list of strings and a ``'choice_list'`` list holding
            one list of choice strings per question.
        num_choices: number of answer choices every question must have (default 4).

    Returns:
        dict mapping each tokenizer output key to a list with one entry per question;
        each entry is the list of ``num_choices`` encodings for that question.

    Raises:
        ValueError: if the two columns differ in length or a question does not have
            exactly ``num_choices`` choices (the regrouping below would otherwise
            silently misalign questions and answers).
    """
    questions = example['question']
    choice_lists = example['choice_list']
    if len(choice_lists) != len(questions):
        raise ValueError(
            f"got {len(questions)} questions but {len(choice_lists)} choice lists"
        )
    for position, choices in enumerate(choice_lists):
        if len(choices) != num_choices:
            raise ValueError(
                f"example {position} has {len(choices)} choices, expected {num_choices}"
            )

    # Flatten: repeat each question once per choice, and lay the choices out in the
    # same order, so the i-th question lines up with the i-th choice.
    first_sentences = [q for q in questions for _ in range(num_choices)]
    second_sentences = [c for choices in choice_lists for c in choices]

    tokenized_example = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten: regroup every run of `num_choices` encodings under its question.
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_example.items()
    }

In [None]:
# Sanity-check the preprocessing on the first five examples.
examples = dataset[:5]
features = preprocess_function(examples)
# Prints: number of examples, number of choices in the first example, and the token
# length of each of that example's encoded (question, choice) pairs.
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]])

["How can a man goes to football team every day but doesn't play football at all.", "How can a man goes to football team every day but doesn't play football at all.", "How can a man goes to football team every day but doesn't play football at all.", "How can a man goes to football team every day but doesn't play football at all.", 'There was a farmer who wanted to go investigate a problem with his chicken coop. As soon as he arrived, he saw a rooster perched atop the barn. Now the barn had a top that resembled a triangle. However, the rooster unexpectedly laid an egg. How did it roll on that side?', 'There was a farmer who wanted to go investigate a problem with his chicken coop. As soon as he arrived, he saw a rooster perched atop the barn. Now the barn had a top that resembled a triangle. However, the rooster unexpectedly laid an egg. How did it roll on that side?', 'There was a farmer who wanted to go investigate a problem with his chicken coop. As soon as he arrived, he saw a roost

In [None]:
# Tokenize the whole dataset in batches, then hold out 10% of it as the evaluation
# split (fixed seed so the split is reproducible).
encoded_dataset = dataset.map(preprocess_function, batched=True)
dataset_train_test = encoded_dataset.train_test_split(test_size=0.1, seed=42)

Map:   0%|          | 0/507 [00:00<?, ? examples/s]



In [None]:
idx = 0
# Decode the four encoded (question, choice) pairs of one example back to text.
# The row is indexed first so only that example is read, instead of pulling the whole
# `input_ids` column just to look at a single entry.
[tokenizer.decode(encoded_dataset[idx]["input_ids"][i]) for i in range(4)]

["<s>How can a man goes to football team every day but doesn't play football at all.</s></s>He is a coach.</s>",
 "<s>How can a man goes to football team every day but doesn't play football at all.</s></s>Every day the weather is raining.</s>",
 "<s>How can a man goes to football team every day but doesn't play football at all.</s></s>The football team will play tennis sometime.</s>",
 "<s>How can a man goes to football team every day but doesn't play football at all.</s></s>None of above.</s>"]

In [None]:
# Display the train/test split summary (column names and row counts).
dataset_train_test

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'label', 'choice_list', 'choice_order', 'input_ids', 'attention_mask'],
        num_rows: 456
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'label', 'choice_list', 'choice_order', 'input_ids', 'attention_mask'],
        num_rows: 51
    })
})

# Train models

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs to a common length.

    Each incoming feature holds, per tokenizer key, one encoding per answer choice.
    The collator flattens all choices of all examples, pads them together through
    ``tokenizer.pad``, and reshapes the result to ``(batch_size, num_choices, seq_len)``.

    The input ``features`` are NOT modified: labels are read, not popped, so the same
    feature dicts can be collated more than once.
    """

    # Tokenizer whose `pad` method performs the padding. The transformers type names
    # below are quoted because they serve as type hints only.
    tokenizer: "PreTrainedTokenizerBase"
    # Forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, "PaddingStrategy"] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: non-empty list of dicts. Each dict has a ``"label"`` (or
                ``"labels"``) entry and, for every other key, a list with one
                encoding per answer choice.

        Returns:
            dict of tensors shaped ``(batch_size, num_choices, seq_len)`` plus a
            ``"labels"`` int64 tensor of shape ``(batch_size,)``.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them, so the caller's dicts stay intact.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat dict per (example, choice) pair, in example-major order; the label
        # is left out because it is not a per-choice sequence.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Smoke-test the collator on the first ten training examples, keeping only the keys
# listed in accepted_keys.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in dataset_train_test["train"][i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Decode the four padded choices of batch item 8 to confirm padding was applied.
[tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(4)]

['<s>James lives in a remote mountainous region that experiences heavy snowfall during the winter season. He wants to set up a cozy cabin on a picturesque plateau, but he needs to transport construction materials and furniture to the location. Unfortunately, there are no roads or accessible paths to reach the plateau. How does James overcome this challenge?</s></s>He waits for the heavy snowfall during the winter months and uses a sled to transport the construction materials and furniture.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<s>James lives in a remote mountainous region that experiences heavy snowfall during the winter season. He wants to set up a cozy cabin on a picturesque plateau, but he needs to transport construction materials and furniture to the 

In [None]:
import numpy as np

def compute_metrics(eval_predictions):
    """Compute multiple-choice accuracy for the Trainer.

    Args:
        eval_predictions: pair ``(predictions, label_ids)`` where ``predictions``
            holds one score per choice along axis 1 and ``label_ids`` the gold indices.

    Returns:
        ``{"accuracy": fraction of examples whose top-scoring choice is the label}``
        as a plain Python float.
    """
    scores, gold = eval_predictions
    # The predicted choice is the highest-scoring column of each row.
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [None]:
# Name the output directory after the checkpoint (last path component of its id).
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    output_dir=f"{model_name}-SP-finetuned",
    seed=0,
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    # NOTE(review): newer transformers releases name this argument `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    # 1e-5 is the rate documented for the reported run ("10 epochs batchsize=8,
    # learning_rate=1e-5"); the cell previously had 1e-4 active, which did not match
    # that description.
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
    # Keep at most two checkpoints on disk and reload the one with the best
    # evaluation accuracy once training finishes.
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model="accuracy",
)

In [None]:
# Wire model, hyper-parameters, data splits, padding collator and accuracy metric
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    train_dataset=dataset_train_test["train"],
    eval_dataset=dataset_train_test["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

## <font color="yellow">RoBERTa-large</font>

### <font color="green">sentence-puzzle</font>

10 epochs, batch size = 8, learning rate = 1e-5

In [None]:
# Run fine-tuning; evaluation and checkpointing happen once per epoch as configured
# in the TrainingArguments.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.2967,1.072988,0.607843
2,0.5132,0.715033,0.745098
3,0.1456,0.656325,0.823529
4,0.2018,0.687917,0.803922
5,0.02,0.845517,0.823529
6,0.0034,0.663994,0.882353
7,0.0009,0.756025,0.862745
8,0.0043,0.87804,0.862745
9,0.0005,0.890724,0.862745
10,0.0003,0.90287,0.862745


TrainOutput(global_step=570, training_loss=0.25715642145328355, metrics={'train_runtime': 772.102, 'train_samples_per_second': 5.906, 'train_steps_per_second': 0.738, 'total_flos': 3471199617063168.0, 'train_loss': 0.25715642145328355, 'epoch': 10.0})

In [None]:
# Persist the trainer's current model to Drive. With load_best_model_at_end=True this
# is the checkpoint that scored the best evaluation accuracy.
trainer.save_model("/content/drive/MyDrive/brain_teaser/final_best_models/RoBERT_large_sentence_puzzle_post_eval")