# Installing and importing required libraries

In [None]:
# Mount Google Drive at /content/drive; the dataset and the saved model are
# read from / written to paths under this mount point later in the notebook.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
%%capture
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!pip install transformers
!pip install rouge_score

In [None]:
from IPython.display import clear_output
from __future__ import unicode_literals

import torch
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import evaluate
from tqdm import tqdm
from datasets import DatasetDict, Dataset, load_dataset, load_metric

# Load the ROUGE metric through the `evaluate` library (the recorded run shows
# it downloading the metric's builder script).
# NOTE(review): `rouge_score` is not referenced in the visible part of this
# notebook -- confirm it is still needed.
rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Preparing and preprocessing the dataset

### Load the training data

In [None]:
# Load the training file from Drive; its entries are iterated as dicts below.
# NOTE: allow_pickle=True makes np.load unpickle Python objects, and unpickling
# can execute code embedded in the file -- only load files from a trusted source.
data = np.load('/content/drive/MyDrive/brain_teaser/datasets/BT_data/WP-train.npy', allow_pickle=True)

In [None]:
# Every key a raw record may carry; the DataFrame keeps this column order.
raw_columns = ['id', 'question', 'answer', 'distractor1', 'distractor2',
               'distractor(unsure)', 'label', 'choice_list', 'choice_order']
# The distractor texts are dropped from the frame right after it is built.
distractor_columns = ['distractor1', 'distractor2', 'distractor(unsure)']

# Pivot the per-example dicts into one list per column. A record with a key
# outside `raw_columns` raises KeyError here.
data_dict = {column: [] for column in raw_columns}
for record in data:
  for key, value in record.items():
    data_dict[key].append(value)

df = pd.DataFrame(data_dict, columns=raw_columns).drop(columns=distractor_columns)
df

Unnamed: 0,id,question,answer,label,choice_list,choice_order
0,WP-0,How do you spell COW in thirteen letters?,SEE O DOUBLE YOU.,1,"[SEE OH DEREFORD, SEE O DOUBLE YOU., COWCOWCOW...","[2, 0, 1, 3]"
1,WP-0_SR,"In thirteen letters, how do you spell COW?",SEE O DOUBLE YOU.,2,"[SEE OH DEREFORD, COWCOWCOWCOWW, SEE O DOUBLE ...","[2, 1, 0, 3]"
2,WP-0_CR,How do you spell COB in seven letters?,SEE O BEE,2,"[COBCOBB, COBBLER, SEE O BEE, None of above.]","[1, 2, 0, 3]"
3,WP-1,"If eleven plus two equals one, what does nine ...",Two.,1,"[Four., Two., Three., None of above.]","[2, 0, 1, 3]"
4,WP-1_SR,What does nine plus five equal if eleven plus ...,Two.,1,"[Three., Two., Four., None of above.]","[1, 0, 2, 3]"
...,...,...,...,...,...,...
391,WP-162_SR,Which sort of bell doesn't ring?,A dumbbell.,0,"[A dumbbell., A doorbell., A cowbell., None of...","[0, 1, 2, 3]"
392,WP-162_CR,"What kind of ""stone"" can't be found in a quarry?",Milestone.,2,"[Cobblestone., Sandstone., Milestone., None of...","[1, 2, 0, 3]"
393,WP-163,What type of ice never melts?,Dice.,0,"[Dice., Flaked ice., Glacier ice., None of abo...","[0, 2, 1, 3]"
394,WP-163_SR,What kind of ice doesn't melt?,Dice.,1,"[Glacier ice., Dice., Flaked ice., None of abo...","[1, 0, 2, 3]"


In [None]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
# NOTE: `df` is reassigned from itself, so re-running this cell shuffles the
# already-shuffled frame again instead of reproducing the same order.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Turn every multiple-choice puzzle into an extractive-QA record:
#   context : "Choices: A: <c0> B: <c1> C: <c2> D: <c3>"
#   answers : lettered text of the correct choice + its character offset in context
#   question: the stripped question text
#   id      : the puzzle id, copied through unchanged
label_texts={0:"A", 1:"B", 2:"C", 3:"D"}
answers=[]
questions=[]
contexts=[]
ids=[]

for i in tqdm(range(len(df))):
  sample=df.iloc[i]
  question=sample['question'].strip()
  label=sample['label']
  # Prefix each choice with its letter; zip(range(4), ...) keeps at most four choices.
  choice_list=[f"{label_texts[k]}: {val}" for k,val in zip(range(4),sample['choice_list'])]

  # Build the context while recording where each choice starts, so the answer
  # offset is exact even if the same text happens to occur earlier in the context
  # (a plain context.index(...) would return the first textual match instead).
  context="Choices:"
  choice_starts=[]
  for choice in choice_list:
    choice_starts.append(len(context)+1)  # +1 skips the separating space
    context+=" "+choice

  answer_text=choice_list[label]
  answer_start=choice_starts[label]
  # Invariant: the recorded offset must point exactly at the answer text.
  assert context[answer_start:answer_start+len(answer_text)]==answer_text, sample['id']

  answers.append({
      "answer_start":[answer_start],
      'text':[answer_text]
  })
  contexts.append(context)
  questions.append(question)
  ids.append(sample['id'])

print()  # blank line after the tqdm progress bar
d={
'answers': answers,
'context': contexts,
'question':questions,
'id':ids
}

train_df=pd.DataFrame(data=d)

100%|██████████| 396/396 [00:00<00:00, 9296.83it/s]







In [None]:
# Inspect the first converted record (answers / context / question / id).
train_df.iloc[0]

answers     {'answer_start': [36], 'text': ['C: Mocking bi...
context     Choices: A: Humming bird. B: Eagle. C: Mocking...
question                              What bird is very rude?
id                                                      WP-31
Name: 0, dtype: object

In [None]:
# Wrap the DataFrame in a Hugging Face Dataset, then hold out 10% of the rows
# as the "test" split; the fixed seed keeps the split the same across runs.
dataset = Dataset.from_pandas(train_df)
dataset_train_test = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Display the split sizes and column names of the train/test DatasetDict.
dataset_train_test

DatasetDict({
    train: Dataset({
        features: ['answers', 'context', 'question', 'id'],
        num_rows: 356
    })
    test: Dataset({
        features: ['answers', 'context', 'question', 'id'],
        num_rows: 40
    })
})

# Loading and pre-processing the dataset

Loading the tokenizer

In [None]:
from transformers import AutoTokenizer

# Checkpoint name used to load the tokenizer.
model_name="t5-large"

# use_fast=True requests the fast tokenizer; preprocess_function below depends
# on its offset mappings and on sequence_ids() of the returned encoding.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map a character span of the context to (start_token, end_token) indices.

    Args:
        offsets: per-token (char_start, char_end) pairs for one encoded example.
        sequence_ids: per-token sequence id for the same example; context
            tokens carry the value 1.
        start_char: character offset in the context where the answer begins.
        end_char: character offset just past the end of the answer.

    Returns:
        A ``(start_token, end_token)`` tuple, or ``(0, 0)`` when the example
        has no context tokens or the answer is not fully inside the context.
    """
    n_tokens = len(sequence_ids)

    # Locate the contiguous run of context tokens; both scans are bounded so a
    # missing context or a missing trailing special token cannot raise IndexError.
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == n_tokens:
        return 0, 0
    context_start = idx
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie completely inside the surviving context. If the start
    # or the end of the answer falls outside it, the example gets label (0, 0)
    # instead of a span that runs onto a non-context token.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer's first character.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token that ends at or after the answer's last character.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_function(examples, max_length=384):
    """Tokenize question/context pairs and attach answer token positions.

    Uses the notebook-level ``tokenizer``. Only the first entry of each
    example's ``answer_start`` / ``text`` lists is used.

    Args:
        examples: batch with "question", "context" and "answers" columns, where
            every answers entry holds "answer_start" (list of character offsets
            into the context) and "text" (list of answer strings).
        max_length: length every encoded pair is padded/truncated to
            (default 384, the value previously hard-coded).

    Returns:
        The tokenizer output without "offset_mapping" and with two added
        columns, "start_positions" and "end_positions" (one int per example).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",  # only the context is ever truncated
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to compute the labels; they are not model inputs.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        start_token, end_token = _answer_token_span(
            offset, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize both splits in batches. The original columns are removed so that
# only the tokenizer outputs and the start/end position labels remain.
raw_column_names = dataset_train_test['train'].column_names
tokenized_datasets = dataset_train_test.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_column_names,
)
tokenized_datasets

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 356
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 40
    })
})

# Fine-tuning T5 with the Trainer API

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the same checkpoint the tokenizer was created from (`model_name`) with a
# question-answering head; the recorded run reports that the head's
# qa_outputs weights are newly initialized, so the model must be fine-tuned.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Use the GPU when one is available instead of failing on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Trailing ';' keeps the notebook from printing the full module tree.
model.to(device);

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at t5-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


T5ForQuestionAnswering(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (dropo

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the Trainer to assemble tokenized examples into batches.
# NOTE(review): this is the seq2seq collator, while the tokenized features here
# are input_ids / attention_mask / start_positions / end_positions (there is no
# "labels" column) -- confirm it is the intended collator for this QA setup.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Per-device batch size (shared by training and evaluation) and epoch budget.
batch_size = 8
num_epochs = 20

# Evaluation and checkpointing both happen once per epoch; at most two
# checkpoints are kept and the best one is loaded back when training ends.
training_options = dict(
    output_dir="/content/brainteaser_t5_base",
    seed=0,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    push_to_hub=False,
    save_total_limit=2,
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_options)

In [None]:
# Assemble the Trainer: the "train" split is used for optimisation and the
# held-out "test" split serves as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
# Run fine-tuning; training and validation loss are reported once per epoch.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,4.0588,2.845515
2,2.0993,1.528631
3,1.6497,1.425109
4,1.6295,1.432126
5,1.5498,1.450054
6,1.4379,1.394873
7,1.4838,1.370882
8,1.3854,1.359361
9,1.345,1.31817
10,1.3889,1.26281


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=900, training_loss=1.5261177455054389, metrics={'train_runtime': 1569.6168, 'train_samples_per_second': 4.536, 'train_steps_per_second': 0.573, 'total_flos': 1.1561386189824e+16, 'train_loss': 1.5261177455054389, 'epoch': 20.0})

In [None]:
# Save the trainer's current model to Google Drive (requires the Drive mount).
path="/content/drive/MyDrive/brain_teaser/final_best_models/T5_large_word_puzzle"
trainer.save_model(path)