# COT Finetuning and Comparison

## Environment Setup

### Installation

In [None]:
# !pip install datasets==2.12.0

In [17]:
# from IPython.display import clear_output
# !pip install transformers

# !pip install torchmetrics
# !pip install SentencePiece
# !pip install accelerate
# !pip install tensorboard
# !pip install protobuf==3.20
# !pip install scikit-learn
# !pip install matplotlib

# clear_output()



### Importing Libraries

In [2]:
from datasets import load_dataset
# import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# import tensorflow_datasets as tfds
import transformers
import datasets
from transformers import AutoTokenizer, TFT5ForConditionalGeneration
import datetime
import os
# %load_ext tensorboard
import math
import torch.optim
import torch
from torch.utils.tensorboard import SummaryWriter
import torchmetrics
import torch
from tqdm import tqdm
# from transformers import T5ForConditionalGeneration
import torch.nn as nn
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import csv



### Checking Versions and Device

In [3]:
# Check PyTorch version
torch_version = torch.__version__
print("PyTorch: ", torch_version)

# Check Transformers version
transformers_version = transformers.__version__
print("Transformers: ", transformers_version)

# Check Datasets version
datasets_version = datasets.__version__
print("Datasets: ", datasets_version)


# Get the GPU device
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda:0"
elif torch.backends.mps.is_available():
    DEVICE = "mps"

device = torch.device(DEVICE)

# Check if the device is available
if device.type == "cuda":
    print("Using GPU")
elif device.type == "mps":
    print("Using Macbook GPU")
else:
    print("Using CPU")
# # Check if TensorFlow version is 2.3+
# tf_version_split = tf_version.split('.')
# assert int(tf_version_split[0]) == 2 and int(tf_version_split[-2]) >= 3, f"Tensorflow version should be '2.3+,x', given {tf_version}"

PyTorch:  2.0.1+cpu
Transformers:  4.29.2
Datasets:  2.12.0
Using CPU


### Hyper parameters

In [4]:
# !ls -a
if os.path.exists("./output") == False:
    !mkdir output
# model_name = "t5-base"
model_name = "google/t5-v1_1-base"
model_folder_name ="t5-v1_1-base"
# model_name = "google/flan-t5-base"
seq_length = 512


# Data Loader Parameters
batch_size = 4 #16
num_workers = 0 # 0 for macbook
pin_memory = False #False for GPU on COLAB

# Training
# You can change these values according to your needs
warmup_steps = 1e4 # Number of steps for the warmup phase
initial_lr = 0.01 # Initial learning rate
lr = 0.01 # Learning rate
num_epochs = 50

data_dir = "./output"
if os.path.exists(f"{data_dir}/experiments") ==False:
    os.mkdir(f"{data_dir}/experiments")

if os.path.exists(f"{data_dir}/experiments/{model_folder_name}")==False:
    os.mkdir(f"{data_dir}/experiments/{model_folder_name}")

if os.path.exists(f"{data_dir}/cache")==False:
    os.mkdir(f"{data_dir}/cache")

if os.path.exists(f"{data_dir}/cache/{model_folder_name}")==False:
    os.mkdir(f"{data_dir}/cache/{model_folder_name}")


log_dir = f"{data_dir}/experiments/{model_folder_name}/logs"
save_path = f"{data_dir}/experiments/{model_folder_name}/models"
cache_path_train = f"{data_dir}/cache/{model_folder_name}.train"
cache_path_test = f"{data_dir}/cache/{model_folder_name}.test"

if os.path.exists(save_path) == False:
    os.mkdir(save_path)

## Dataset
### Loading Dataset for COT

In [5]:
# Read the  file and creating Training and Validation Datasets
import json
from datasets import Dataset as DatasetConvertor
import csv
def readDataset  (file_path):
    data = []
    with open(file_path, 'r') as f:
        lines = csv.reader(f,delimiter ="\t", quotechar = '"')
        for line in lines:
            data.append({ "input" :line[0], "output": (line[1]+". " + line[2]) })
    return  data

complete_data = readDataset("./Dataset/qed_train.tsv")
print(complete_data[0])
complete_dataset = DatasetConvertor.from_list(complete_data)

# print ( complete_dataset)
from sklearn.model_selection import train_test_split
train_data, validation_data = train_test_split(complete_data, test_size=0.2, random_state=0)

# print (train_data)
# print (validation_data)
train_dataset = DatasetConvertor.from_list(train_data)
validation_dataset = DatasetConvertor.from_list(validation_data)

print(train_dataset)
print(validation_dataset)


{'input': 'Passage: Webbed toes is the common name for syndactyly affecting the feet. It is characterised by the fusion of two or more digits of the feet. This is normal in many birds, such as ducks; amphibians, such as frogs; and mammals, such as kangaroos. In humans it is considered unusual, occurring in approximately one in 2,000 to 2,500 live births.\\n\\nQuestion: Based on this passage, what is the medical term for webbed toes?', 'output': 'syndactyly affecting the feet. The relevant information is: Webbed toes is the common name for syndactyly affecting the feet.'}
Dataset({
    features: ['input', 'output'],
    num_rows: 4123
})
Dataset({
    features: ['input', 'output'],
    num_rows: 1031
})



### Visualise Data

In [6]:
validation_dataset[2]

{'input': 'Clinicians classify cardiac arrest into "shockable" versus "non -- shockable", as determined by the ECG rhythm. This refers to whether a particular class of cardiac dysrhythmia is treatable using defibrillation. The two "shockable" rhythms are ventricular fibrillation and pulseless ventricular tachycardia while the two "non -- shockable" rhythms are asystole and pulseless electrical activity.\\n\\nAnswer this question: what are the two shockable rhythms in cardiac arrest?',
 'output': 'ventricular fibrillation and pulseless ventricular tachycardia. This is the relevant information: The two "shockable" rhythms are ventricular fibrillation and pulseless ventricular tachycardia while the two "non -- shockable" rhythms are asystole and pulseless electrical activity.'}

## Pipeline
### The Tokenizer and Model

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_classes = tokenizer.vocab_size+28
vocab_size = num_classes
print(tokenizer)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
print("TOKENIZER & MODEL LOADED")

T5TokenizerFast(name_or_path='google/t5-v1_1-base', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>'

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

TOKENIZER & MODEL LOADED


### DataLoader

In [8]:

class COTDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        output_text = 'answer: ' + example['output']
        input_text = example['input']
        # Tokenize the input and output texts
        # input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)
        # output_ids = tokenizer.encode(output_text, return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)
        input_idx = tokenizer.encode_plus(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)
        output_idx = tokenizer.encode_plus(output_text, return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)
        return {
            'input_ids' : input_idx['input_ids'].squeeze(),
            'attention_mask': input_idx['attention_mask'].squeeze(),
            'labels': output_idx['input_ids'].squeeze(),
            'labels_attention_mask': output_idx['attention_mask'].squeeze(),
        }


In [9]:
train_dataloader = DataLoader(COTDataset(train_dataset), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=num_workers, pin_memory=pin_memory)

print(train_dataloader)
valid_dataloader = DataLoader(COTDataset(validation_dataset), batch_size=batch_size, shuffle=False, drop_last=True, num_workers=num_workers, pin_memory=pin_memory)

<torch.utils.data.dataloader.DataLoader object at 0x0000021E95F162F0>


### Visualise Training and Validation Examples through Loader

In [10]:
print("-"*50)
# Print first 5 examples of train_dataloader
print('Train examples:')
for i, batch in enumerate(train_dataloader):
    if i == 5:
        break
    # print(batch)
    # Convert the input and output ids to lists of integers
    input_ids = batch['input_ids'][0].tolist()
    output_ids = batch['labels'][0].tolist()
    # Flatten the lists using the sum() function
    # print(input_ids)
    # input_ids = sum(input_ids, [])
    # output_ids = sum(output_ids, [])
    input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    print(f'Input: {input_text}')
    print(f'Output: {output_text}')
    print()

print("-"*50)
print("Print first 5 examples of valid_dataloader")
# Print first 5 examples of valid_dataloader
print('Valid examples:')
for i, batch in enumerate(valid_dataloader):
    if i == 5:
        break
    # Convert the input and output ids to lists of integers
    input_ids = batch['input_ids'][0].tolist()
    output_ids = batch['labels'][0].tolist()
    # Flatten the lists using the sum() function
    # print(input_ids)
    # input_ids = sum(input_ids, [])
    # output_ids = sum(output_ids, [])
    input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    print(f'Input: {input_text}')
    print(f'Output: {output_text}')
    print()

--------------------------------------------------
Train examples:
Input: Context: The series is centered around the exploits of Billy (voiced by Richard Steven Horvitz), a dimwitted and happy-go-lucky boy; Mandy (Grey DeLisle), a cynical and merciless girl, and Grim (Greg Eagles), the Grim Reaper who is often tormented by the duo. After cheating at a limbo match against Grim (in retaliation for putting the limbo rod too low for them to go under), he is enslaved in a permanently unwanted friendship with the children. Grim is miserable in the first days of his servitude, and even fantasizes about killing them multiple times. However, as the time passes, he gradually adapts to the new life, and even grows to care for Billy and Mandy, if only somewhat. Despite this, he retains a love-hate relationship with the two and desires to eventually break free from his servitude.nnQuestion: who is the voice of grim from billy and mandy?
Output: answer: Greg Eagles. The relevant sentence in the pass

### Dataset Summary

In [11]:
ntrain = len(train_dataset)
nvalid = len(validation_dataset)
steps = int(np.ceil(ntrain/batch_size))
valid_steps = int(np.ceil(nvalid/batch_size))
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)

Total Steps:  1031
Total Validation Steps:  258


### optimizer and Learning rate Decay

In [12]:
# Define a custom function for the learning rate decay
def lr_lambda(step):
  m = max(warmup_steps, step)
  lr = initial_lr / math.sqrt(m)
  return lr

# Create an optimizer object using torch.optim.Adam
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Create a LambdaLR object and pass it your optimizer
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

### Logging

In [13]:
from datetime import datetime
dt = datetime.now()

# Create a SummaryWriter object and pass it the log_path
writer = SummaryWriter(log_dir)

# Write your own custom logging functions
def log_scalar(tag, value, step):
  # Log a scalar value
  writer.add_scalar(tag, value, step)

def log_histogram(tag, values, step):
  # Log a histogram of tensor values
  writer.add_histogram(tag, values, step)

def log_graph(model, input):
  # Log the model graph
  writer.add_graph(model, input)

# Write your own custom saving function
def save_model(model, epoch, val_loss, val_accuracy, val_f1):
  # Save the model checkpoint
  checkpoint_filepath = f"{save_path}/M-{epoch:04d}-{val_loss:.4f}-{val_accuracy:.4f}-{val_f1:.4f}.ckpt"
  torch.save(model.state_dict(), checkpoint_filepath)

### Evaluation Matrix

In [14]:
# create an F1Score instance and move it to the same device as the input tensors
f1_score = torchmetrics.F1Score(task='multiclass', num_classes=num_classes)
f1_score = f1_score.to(device)
# create an Accuracy instance and move it to the same device as the input tensors
accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)
accuracy = accuracy.to(device)
#
exact_match = torchmetrics.classification.MulticlassExactMatch(num_classes=num_classes).to(device)


In [16]:
def evaluate_performance(dataloader, text = "", epoch = -1):
    loss = 0.0
    f1 = 0.0
    acc = 0.0
    em = 0.0
    f1_score.reset()
    accuracy.reset()
    exact_match.reset()
    # create a tqdm progress bar for the data
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}, {text}")
    with torch.no_grad():
        for batch in pbar:
            # optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # input_ids = input_ids.to(device)
            # output_ids = output_ids.to(device)
            # # print(input_ids.shape)
            # input_ids = torch.squeeze(input_ids, 1)
            # output_ids = torch.squeeze(output_ids, 1)
            # # input_ids = input_ids.un
            # # output_ids = output_ids.view(batch_size, seq_length)
            # outputs = model(input_ids=input_ids, labels=output_ids)
            iterationloss = outputs.loss
            logits = outputs.logits
            loss += iterationloss.item()
            # flatten inputs and pass them to F1Score and Accuracy
            logits = logits.view(-1, num_classes)
            labels = labels.view(-1)
            f1_current = f1_score(logits, labels).item()
            acc_current = accuracy(logits, labels).item()
            em_current = exact_match(logits, labels).item()
            f1 += f1_current
            acc += acc_current
            em += em_current
            # update the progress bar with the metrics
            pbar.set_postfix({"loss": iterationloss.item(), "f1": f1_current, "acc": acc_current, "em" :em_current})
        loss /= len(dataloader)
        f1 /= len(dataloader)
        acc /= len(dataloader)
        em /= len(dataloader)
    print ( f"\n{text}: Average Loss : {loss:.4f}  f1 : {f1:.4f} accuracy : {acc:.4f} Exact_Match: {em:.4f} \n\n")
    return loss,acc,f1

In [15]:
def get_output(input_text):
    encoded_query = tokenizer.encode_plus(input_text,  return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)
    question = encoded_query['input_ids']#.squeeze()
    attention_mask = encoded_query['attention_mask']#.squeeze()
    # input_ids = encoded_query["input_ids"]
    model.eval()
    generated_answer = model.generate( question, attention_mask=attention_mask, max_length=seq_length, early_stopping= True,repetition_penalty=2.9) #top_p=0.95, top_k=50, repetition_penalty=2.0
    decoded_answer = tokenizer.decode(generated_answer[0])
    return decoded_answer


In [16]:
%%time
question = 'Passage: The Nobel Peace Prize (Swedish: Nobels fredspris) is one of the five Nobel Prizes created by the Swedish industrialist, inventor, and armaments manufacturer Alfred Nobel, along with the prizes in Chemistry, Physics, Physiology or Medicine, and Literature. Since March 1901, it has been awarded annually (with some exceptions) to those who have "done the most or the best work for fraternity between nations, for the abolition or reduction of standing armies and for the holding and promotion of peace congresses".\n\nQuestion: Based on this passage, who is the nobel peace prize given to?'
print(question)
input_text =  f"{question}"
print("Answer: ", get_output(input_text))

Passage: The Nobel Peace Prize (Swedish: Nobels fredspris) is one of the five Nobel Prizes created by the Swedish industrialist, inventor, and armaments manufacturer Alfred Nobel, along with the prizes in Chemistry, Physics, Physiology or Medicine, and Literature. Since March 1901, it has been awarded annually (with some exceptions) to those who have "done the most or the best work for fraternity between nations, for the abolition or reduction of standing armies and for the holding and promotion of peace congresses".

Question: Based on this passage, who is the nobel peace prize given to?
Answer:  <pad><extra_id_0>. Question: Based on this passage, who is the nobel peace prize given to? Answers: based upon these passage(es) are Alfred Nobel?<extra_id_1>!<extra_id_2>.... answer<extra_id_3>... answer<extra_id_4>.....<extra_id_5>......<extra_id_6>.<extra_id_7> from whom did they award it in 1901<extra_id_8>??<extra_id_9>???<extra_id_10>?!<extra_id_11>!!!<extra_id_12>!!<extra_id_13>!!!!!!!

### Performance of Model Before Fine Tunning

In [17]:
%%time
question ='Passage: The song "Year of the Cat" began as "Foot of the Stage", a song written by Stewart in 1966 after seeing a performance by comedian Tony Hancock whose patter about "being a complete loser" who might as well "end it all right here" drew laughs from the audience: Stewart\'s intuitive response that Hancock was in genuine despair led to the writing of "Foot of the Stage". It was the melody for this never-recorded song which Stewart set the lyrics of "Year of the Cat" to in 1975: pianist Peter Wood was given a co-writing credit on the song in recognition of his piano riff on the recorded track.nnQuestion: Based on this passage, who played piano on year of the cat?'

# Output: answer: pianist Peter Wood. This is the relevant information: It was the melody for this never-recorded song which Stewart set the lyrics of "Year of the Cat" to in 1975: pianist Peter Wood was given a co-writing credit on the song in recognition of his piano riff on the recorded track.'

print("Answer: ", get_output(question))


Answer:  <pad><extra_id_0>. nbQuestion: What is the song about?<extra_id_1>, who was it written by Stewart in 1966 and 1975 respectively)?<extra_id_2> this year<extra_id_3> these years<extra_id_4>this<extra_id_5> that<extra_id_6> of Hancock<extra_id_7> on his piano<extra_id_8>d<extra_id_9> as "Year Of The Cat" began<extra_id_15>day<extra_id_16>s<extra_id_17>e<extra_id_18>w<extra_id_19>nd<extra_id_20>a<extra_id_23><extra_id_27>'s lyrics to<extra_id_24><extra_id_25><extra_id_25> wrote<extra_id_26> for<extra_id_27> longer than stage-flourishingly long time when considering how much more has been recorded to date (and not only), but</s>
CPU times: total: 35 s
Wall time: 23.1 s


In [20]:
# print("Performance of Model before fine Tunning")
# # model.load_state_dict(torch.load(save_path+"/M-0001-1.3129-0.8890-0.8890.ckpt")) #M-0014-1.7052-0.8886-0.8886
# model.load_state_dict(torch.load(save_path+"/M-0036-0.0665-0.9829-0.9829.ckpt", map_location=torch.device('cpu'))) #
# model= model.to(device)

# # evaluate_performance(valid_dataloader, "Validation ")

Performance of Model before fine Tunning


### Training

In [None]:
from transformers import T5ForConditionalGeneration
from accelerate import Accelerator
# TOKENIZERS_PARALLELISM = False


for epoch in range(num_epochs):
    # create a tqdm progress bar for the training loop
    accelerator = Accelerator()
    model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, valid_dataloader
    )
    train_pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}, Train")
    i = 0
    model.train()
    for batch in train_pbar:
        input_ids = batch['input_ids'].to(device)
        attention_masks = batch['attention_mask'].to(device)
        output_ids = batch['labels'].to(device)
        input_ids = input_ids.view(batch_size, seq_length)
        output_ids = output_ids.view(batch_size, seq_length)
        outputs = model(input_ids=input_ids, labels=output_ids,attention_mask=attention_masks )
        loss = outputs.loss
        logits = outputs.logits

        # loss.backward()
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # flatten inputs and pass them to F1Score and Accuracy
        if i % 10 == 0:
            logits = logits.view(-1, num_classes)
            output_ids = output_ids.view(-1)
            train_f1 = f1_score(logits, output_ids)
            train_acc = accuracy(logits, output_ids)
            # update the progress bar with the metrics
            if accelerator.is_main_process:
                train_pbar.set_postfix({"loss": loss.item(), "f1": train_f1.item(), "acc": train_acc.item()})
        i +=1
    valid_loss, valid_acc, valid_f1=evaluate_performance(valid_dataloader, "Validation ", epoch)

    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(save_path, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(save_path)
    if epoch == 0 or valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        save_model(model, epoch, valid_loss, valid_acc, valid_f1)
    f1_score.reset()
    accuracy.reset()
    exact_match.reset()

### Lets test our model!!

In [None]:
# save_model(model, 1, 15.4699, 0.0097, 0.0097)
# 

In [18]:
save_path

'./output/experiments/t5-v1_1-base/models'

In [19]:
print("Performance of Model after fine Tunning")
# model.load_state_dict(torch.load(save_path+"/M-0001-1.3129-0.8890-0.8890.ckpt")) #M-0014-1.7052-0.8886-0.8886
model.load_state_dict(torch.load(save_path+"/M-0036-0.0665-0.9829-0.9829.ckpt", map_location=torch.device('cpu'))) #
model= model.to(device)

# evaluate_performance(valid_dataloader, "Validation ")

Performance of Model after fine Tunning


In [20]:
%%time
question = "Passage: Webbed toes is the common name for syndactyly affecting the feet. It is characterised by the fusion of two or more digits of the feet. This is normal in many birds, such as ducks; amphibians, such as frogs; and mammals, such as kangaroos. In humans it is considered unusual, occurring in approximately one in 2,000 to 2,500 live births.\n\nQuestion: Based on this passage, what is the medical term for webbed toes?"
print(question)
print("Answer: ", get_output(question))

# short : syndactyly affecting the feet
# COT : The relevant information is: Webbed toes is the common name for syndactyly affecting the feet.

Passage: Webbed toes is the common name for syndactyly affecting the feet. It is characterised by the fusion of two or more digits of the feet. This is normal in many birds, such as ducks; amphibians, such as frogs; and mammals, such as kangaroos. In humans it is considered unusual, occurring in approximately one in 2,000 to 2,500 live births.

Question: Based on this passage, what is the medical term for webbed toes?
Answer:  <pad> answer: syndactyly affecting the feet. To answer this question, consider the following: Syndactyly affects the feet.</s>
CPU times: total: 18.3 s
Wall time: 10.5 s


In [21]:
%%time
question = 'Passage: The Nobel Peace Prize (Swedish: Nobels fredspris) is one of the five Nobel Prizes created by the Swedish industrialist, inventor, and armaments manufacturer Alfred Nobel, along with the prizes in Chemistry, Physics, Physiology or Medicine, and Literature. Since March 1901, it has been awarded annually (with some exceptions) to those who have "done the most or the best work for fraternity between nations, for the abolition or reduction of standing armies and for the holding and promotion of peace congresses".\n\nQuestion: Based on this passage, who is the nobel peace prize given to?'
print(question)
input_text =  f"{question}"
print("Answer: ", get_output(input_text))

Passage: The Nobel Peace Prize (Swedish: Nobels fredspris) is one of the five Nobel Prizes created by the Swedish industrialist, inventor, and armaments manufacturer Alfred Nobel, along with the prizes in Chemistry, Physics, Physiology or Medicine, and Literature. Since March 1901, it has been awarded annually (with some exceptions) to those who have "done the most or the best work for fraternity between nations, for the abolition or reduction of standing armies and for the holding and promotion of peace congresses".

Question: Based on this passage, who is the nobel peace prize given to?
Answer:  <pad> answer: Alfred Nobel. The relevant information to answering this question is: the Swedish industrialist, inventor, and arms manufacturer Alfred Nobel (Swedish: Nobels fredsprius) is: Alfred Nobel.</s>
CPU times: total: 19 s
Wall time: 10.2 s


In [36]:
%%time
question ='context: Apples are considered to be mythical. They are used in religion to represent transition of humans. In modern culture, Apple is the name one of the largest brand of innovative technologies. Question: What is the name of the largest brand?'
# 'context: The Water Cycle is the process by which water is circulated through the Earth\'s ecosystems. This process includes evaporation, transpiration, condensation, precipitation, and runoff. The water cycle is driven by solar radiation and the Earth\'s rotation. question: What is the Water Cycle?'
# 'Context: John Joseph Haley Jr (August 10, 1897 -- June 6, 1979) was an American vaudevillian, actor, radio host, comedian, singer and dancer best known for his portrayal of the Tin Man and his farmhand counterpart Hickory in the classic 1939 MGM film The Wizard of Oz. Question: who played the tin man in the wizard of oz 1939?'
# Short Answer : John Joseph Haley Jr
# The relevant information to answer the above question is: John Joseph Haley Jr (August 10, 1897 -- June 6, 1979) was an American vaudevillian, actor, radio host, comedian, singer and dancer best known for his portrayal of the Tin Man and his farmhand counterpart Hickory in the classic 1939 MGM film The Wizard of Oz.
# answer: the process by which water is circulated through the Earth's ecosystems
print("Answer: ", get_output(question))

Answer:  <pad> answer: Apple. In modern culture, Apple is the name one of the largest brand of innovative technologies. The relevant information to answering this question is: Apple is the name one of the large brand of innovations technologies.</s>
CPU times: total: 19.2 s
Wall time: 10.9 s


In [37]:
%%time
# question ='Context: John Joseph Haley Jr (August 10, 1897 -- June 6, 1979) was an American vaudevillian, actor, radio host, comedian, singer and dancer best known for his portrayal of the Tin Man and his farmhand counterpart Hickory in the classic 1939 MGM film The Wizard of Oz.\n\nQuestion: who played the tin man in the wizard of oz 1939?'
question= 'context: Baroness Sayeeda Warsi said that we should question ourselves and not to judge others, we should ask ourselves what we were saying what we believe and what we were doing what we were saying. We needed to overcome the fear of love to win, she added. Arfa\u2019smother Ms. SaminaAmjad shared memories of her daughter and spoke about her brilliant achievements. Arfa wanted to do something for the people of her country by providing free education to everyone, she added. question: What did Arfa want to do for the people of her country?'
# to become an internationally recognized research-intensive university focused on developing society through excellence in education, research, and entrepreneurship

# 'Context: Some of the fascinating projects demonstrated by ITU students included Istarfaa, a platform that can levitate particles using sound waves; Qufl, a digital locking system that ensures fool-proof security; Celox, an automated and secure table saw; Wall-climber, a robot that climbs walls and cleans glass windows of skyscrapers; and Hugo, a robot that gauges human emotions and attempts to mimic them. Question: What is Istarfaa?'
# Short Answer : John Joseph Haley Jr
# The relevant information to answer the above question is: John Joseph Haley Jr (August 10, 1897 -- June 6, 1979) was an American vaudevillian, actor, radio host, comedian, singer and dancer best known for his portrayal of the Tin Man and his farmhand counterpart Hickory in the classic 1939 MGM film The Wizard of Oz.
# a platform that can levitate particles using sound waves
print("Answer: ", get_output(question))

Answer:  <pad> answer: We should overcome the fear of love and not to judge others, she added. The relevant sentence in the passage is: Baroness Sayeeda Warsi spoke about her daughter Arfe.</s>
CPU times: total: 18 s
Wall time: 10.1 s


## Evaluation

### Formulas

In [76]:
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        total += 1
        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

In [79]:
def overall_validation_predictions(dataloader):
    model.eval()
    outputs = []
    # create a tqdm progress bar for the data
    pbar = tqdm(dataloader, desc=f"Evaluation ")
    with torch.no_grad():
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            # output_ids = batch['labels'].to(device)
            attention_mask=batch['attention_mask'].to(device)
            outs = model.generate(input_ids=input_ids,
                      attention_mask=attention_mask,
                      max_length=512,
                      early_stopping=True)
            outs = tokenizer.decode(outs[0])
            outputs.extend(outs)
    return outputs

In [80]:
answers = overall_validation_predictions (valid_dataloader)
references = []
for ref in validation_dataset:
    references.append(ref['output'])

print(evaluate(references,answers))


Evaluation : 100%|██████████| 257/257 [1:21:46<00:00, 19.09s/it]


{'exact_match': 99.51503394762366, 'f1': 37.63336566440349}
