# COT Finetuning and Comparison

## Environment Setup

### Installation

In [13]:

from IPython.display import clear_output
!pip install transformers
!pip install datasets
!pip install torchmetrics
!pip install SentencePiece
!pip install accelerate
!pip install tensorboard
!pip install protobuf==3.20
!pip install scikit-learn
!pip install matplotlib







### Importing Libraries

In [1]:
from datasets import load_dataset
# import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# import tensorflow_datasets as tfds
import transformers
import datasets
from transformers import AutoTokenizer, TFT5ForConditionalGeneration
import datetime
import os
# %load_ext tensorboard
import math
import torch.optim
import torch
from torch.utils.tensorboard import SummaryWriter
import torchmetrics
import torch
from tqdm import tqdm
# from transformers import T5ForConditionalGeneration
import torch.nn as nn
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import csv



### Checking Versions and Device

In [2]:
# Check PyTorch version
torch_version = torch.__version__
print("PyTorch: ", torch_version)

# Check Transformers version
transformers_version = transformers.__version__
print("Transformers: ", transformers_version)

# Check Datasets version
datasets_version = datasets.__version__
print("Datasets: ", datasets_version)


# Get the GPU device
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda:0"
elif torch.backends.mps.is_available():
    DEVICE = "mps"

device = torch.device(DEVICE)

# Check if the device is available
if device.type == "cuda":
    print("Using GPU")
elif device.type == "mps":
    print("Using Macbook GPU")
else:
    print("Using CPU")
# # Check if TensorFlow version is 2.3+
# tf_version_split = tf_version.split('.')
# assert int(tf_version_split[0]) == 2 and int(tf_version_split[-2]) >= 3, f"Tensorflow version should be '2.3+,x', given {tf_version}"

PyTorch:  2.0.1+cpu
Transformers:  4.29.2
Datasets:  2.12.0
Using CPU


### Hyper parameters

In [3]:
# !ls -a
if os.path.exists("./output") == False:
    !mkdir output
# model_name = "t5-base"
model_name = "google/t5-v1_1-base"
model_folder_name ="t5-v1_1-base"
# model_name = "google/flan-t5-base"
seq_length = 512


# Data Loader Parameters
batch_size = 4 #16
num_workers = 0 # 0 for macbook
pin_memory = False #False for GPU on COLAB

# Training
# You can change these values according to your needs
warmup_steps = 1e4 # Number of steps for the warmup phase
initial_lr = 0.01 # Initial learning rate
lr = 0.01 # Learning rate
num_epochs = 50

data_dir = "./output"
if os.path.exists(f"{data_dir}/experiments") ==False:
    os.mkdir(f"{data_dir}/experiments")

if os.path.exists(f"{data_dir}/experiments/{model_folder_name}")==False:
    os.mkdir(f"{data_dir}/experiments/{model_folder_name}")

if os.path.exists(f"{data_dir}/cache")==False:
    os.mkdir(f"{data_dir}/cache")

if os.path.exists(f"{data_dir}/cache/{model_folder_name}")==False:
    os.mkdir(f"{data_dir}/cache/{model_folder_name}")


log_dir = f"{data_dir}/experiments/{model_folder_name}/logs"
save_path = f"{data_dir}/experiments/{model_folder_name}/models"
cache_path_train = f"{data_dir}/cache/{model_folder_name}.train"
cache_path_test = f"{data_dir}/cache/{model_folder_name}.test"

if os.path.exists(save_path) == False:
    os.mkdir(save_path)

'ls' is not recognized as an internal or external command,
operable program or batch file.


## Dataset
### Loading Dataset for COT

In [5]:
# Read the  file and creating Training and Validation Datasets
import json
from datasets import Dataset as DatasetConvertor
import csv
def readDataset  (file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            # print(len(lines))
            data.append(json.loads(line))
        # for line in lines:
        #     data.append({ "input" :line[0], "output": (line[1]+". " + line[2]) })
    return  data

complete_data = readDataset("./data/dev_retrieval_top100_sp.json")
# print(len(complete_data))
# print(complete_data[0])
complete_dataset = DatasetConvertor.from_list(complete_data)
#
# # print ( complete_dataset)
from sklearn.model_selection import train_test_split
train_data, validation_data = train_test_split(complete_data, test_size=0.2, random_state=0)
#
# print (train_data)
# print (validation_data)
train_dataset = DatasetConvertor.from_list(train_data)
validation_dataset = DatasetConvertor.from_list(validation_data)
#
print(train_dataset)
print(validation_dataset)


Dataset({
    features: ['_id', 'question', 'type', 'sp', 'answer', 'candidate_chains', 'bridge'],
    num_rows: 5924
})
Dataset({
    features: ['_id', 'question', 'type', 'sp', 'answer', 'candidate_chains', 'bridge'],
    num_rows: 1481
})



### Visualise Data

In [6]:
validation_dataset[2]


{'_id': '5ae127c9554299422ee99612',
 'question': 'Fighting Cock is produced in what Kentucky county?',
 'type': 'bridge',
 'sp': [{'sents': ['Fighting Cock is a brand of Kentucky straight bourbon whiskey produced in Bardstown, Kentucky by Heaven Hill Distilleries, Inc. It is sold in 16 oz (1 pint), 750ml, and 1-liter glass bottles.'],
   'sp_sent_ids': [0],
   'title': 'Fighting Cock (bourbon)'},
  {'sents': ['Bardstown is a home rule-class city in Nelson County, Kentucky, in the United States.',
    ' The population was recorded as 11,700 by the 2010 census.',
    ' It is the county seat of Nelson County.',
    ' It is named for the pioneering Bard brothers.',
    ' David Bard obtained a 1,000 acre land grant in 1785 in what was then Jefferson County, Virginia from Governor Patrick Henry.',
    ' William Bard surveyed and platted the town.',
    " It was originally chartered as Baird's Town."],
   'sp_sent_ids': [0],
   'title': 'Bardstown, Kentucky'}],
 'answer': ['Nelson County'],
 

## Pipeline
### The Tokenizer and Model

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_classes = tokenizer.vocab_size+28
vocab_size = num_classes
print(tokenizer)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
print("TOKENIZER & MODEL LOADED")

T5TokenizerFast(name_or_path='google/t5-v1_1-base', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>'

### DataLoader

In [8]:

class HOTPOTDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        output_text = 'answer: ' + ' '.join(example['answer'])
        str = ''
        for s in  example['sp']:
            # print (' '.join(s['sents']))
            str += ' '.join(s['sents'])
        input_text = 'context: '+ str + ' question: '+ example['question']
        input_idx = tokenizer.encode_plus(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)
        output_idx = tokenizer.encode_plus(output_text, return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)
        return {
            'input_ids' : input_idx['input_ids'].squeeze(),
            'attention_mask': input_idx['attention_mask'].squeeze(),
            'labels': output_idx['input_ids'].squeeze(),
            'labels_attention_mask': output_idx['attention_mask'].squeeze(),
        }


In [9]:
train_dataloader = DataLoader(HOTPOTDataset(train_dataset), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=num_workers, pin_memory=pin_memory)

print(train_dataloader)
valid_dataloader = DataLoader(HOTPOTDataset(validation_dataset), batch_size=batch_size, shuffle=False, drop_last=True, num_workers=num_workers, pin_memory=pin_memory)

<torch.utils.data.dataloader.DataLoader object at 0x00000195C76C2410>


### Visualise Training and Validation Examples through Loader

In [10]:
print("-"*50)
# Print first 5 examples of train_dataloader
print('Train examples:')
for i, batch in enumerate(train_dataloader):
    if i == 1:
        break
    # print(batch)
    # Convert the input and output ids to lists of integers
    input_ids = batch['input_ids'][0].tolist()
    output_ids = batch['labels'][0].tolist()
    # Flatten the lists using the sum() function
    # print(input_ids)
    # input_ids = sum(input_ids, [])
    # output_ids = sum(output_ids, [])
    input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    print(f'Input: {input_text}')
    print(f'Output: {output_text}')
    print()

print("-"*50)
print("Print first 5 examples of valid_dataloader")
# Print first 5 examples of valid_dataloader
print('Valid examples:')
for i, batch in enumerate(valid_dataloader):
    if i == 1:
        break
    # Convert the input and output ids to lists of integers
    input_ids = batch['input_ids'][0].tolist()
    output_ids = batch['labels'][0].tolist()
    # Flatten the lists using the sum() function
    # print(input_ids)
    # input_ids = sum(input_ids, [])
    # output_ids = sum(output_ids, [])
    input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    print(f'Input: {input_text}')
    print(f'Output: {output_text}')
    print()

--------------------------------------------------
Train examples:
Input: context: Doana National Park is a natural reserve in Andalusia, southern Spain, in the provinces of Huelva (most of its territory) and Seville. It covers 543 km2, of which 135 km2 are a protected area. The park is an area of marshes, shallow streams, and sand dunes in Las Marismas, the delta where the Guadalquivir River flows into the Atlantic Ocean. It was established as a nature reserve in 1969 when the World Wildlife Fund joined with the Spanish government and purchased a section of marshes to protect it. The eco-system has been under constant threat by the draining of the marshes, the use of river water to boost agricultural production by irrigating land along the coast, water pollution by upriver mining, and the expansion of tourist facilities. It is named after wife of the seventh Duke of Medina-Sidonia.Sierra de Guadarrama National Park (in Spanish: "Parque Nacional de la Sierra de Guadarrama") is a nation

### Dataset Summary

In [11]:
ntrain = len(train_dataset)
nvalid = len(validation_dataset)
steps = int(np.ceil(ntrain/batch_size))
valid_steps = int(np.ceil(nvalid/batch_size))
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)

Total Steps:  1481
Total Validation Steps:  371


### optimizer and Learning rate Decay

In [12]:
# Define a custom function for the learning rate decay
def lr_lambda(step):
  m = max(warmup_steps, step)
  lr = initial_lr / math.sqrt(m/100.0)
  return lr

# Create an optimizer object using torch.optim.Adam
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Create a LambdaLR object and pass it your optimizer
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

### Logging

In [13]:
from datetime import datetime
dt = datetime.now()

# Create a SummaryWriter object and pass it the log_path
writer = SummaryWriter(log_dir)

# Write your own custom logging functions
def log_scalar(tag, value, step):
  # Log a scalar value
  writer.add_scalar(tag, value, step)

def log_histogram(tag, values, step):
  # Log a histogram of tensor values
  writer.add_histogram(tag, values, step)

def log_graph(model, input):
  # Log the model graph
  writer.add_graph(model, input)

# Write your own custom saving function
def save_model(model, epoch, val_loss, val_accuracy, val_f1):
  # Save the model checkpoint
  checkpoint_filepath = f"{save_path}/M-{epoch:04d}-{val_loss:.4f}-{val_accuracy:.4f}-{val_f1:.4f}.ckpt"
  torch.save(model.state_dict(), checkpoint_filepath)

### Evaluation Matrix

In [14]:
# create an F1Score instance and move it to the same device as the input tensors
f1_score = torchmetrics.F1Score(task='multiclass', num_classes=num_classes)
f1_score = f1_score.to(device)
# create an Accuracy instance and move it to the same device as the input tensors
accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)
accuracy = accuracy.to(device)
#
exact_match = torchmetrics.classification.MulticlassExactMatch(num_classes=num_classes).to(device)


In [16]:
def evaluate_performance(dataloader, text = "", epoch = -1):
    model.eval()
    loss = 0.0
    f1 = 0.0
    acc = 0.0
    em = 0.0
    f1_score.reset()
    accuracy.reset()
    exact_match.reset()
    # create a tqdm progress bar for the data
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}, {text}")
    with torch.no_grad():
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            output_ids = batch['labels'].to(device)
            input_ids = input_ids.view(batch_size, seq_length)
            output_ids = output_ids.view(batch_size, seq_length)
            outputs = model(input_ids=input_ids, labels=output_ids)
            iterationloss = outputs.loss
            logits = outputs.logits
            loss += iterationloss.item()
            # flatten inputs and pass them to F1Score and Accuracy
            logits = logits.view(-1, num_classes)
            output_ids = output_ids.view(-1)
            f1_current = f1_score(logits, output_ids).item()
            acc_current = accuracy(logits, output_ids).item()
            em_current = exact_match(logits, output_ids).item()
            f1 += f1_current
            acc += acc_current
            em += em_current
            # update the progress bar with the metrics
            pbar.set_postfix({"loss": iterationloss.item(), "f1": f1_current, "acc": acc_current, "em" :em_current})
        loss /= len(dataloader)
        f1 /= len(dataloader)
        acc /= len(dataloader)
        em /= len(dataloader)
    print ( f"\n{text}: Average Loss : {loss:.4f}  f1 : {f1:.4f} accuracy : {acc:.4f} Exact_Match: {em:.4f} \n\n")
    return loss,acc,f1

In [15]:
def get_output(input_text):
    encoded_query = tokenizer.encode_plus(input_text,  return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length).to(device)
    question = encoded_query['input_ids']#.squeeze()
    attention_mask = encoded_query['attention_mask']#.squeeze()
    # input_ids = encoded_query["input_ids"]
    model.eval()
    generated_answer = model.generate( question, attention_mask=attention_mask, max_length=seq_length, early_stopping= True,repetition_penalty=2.9) #top_p=0.95, top_k=50, repetition_penalty=2.0
    decoded_answer = tokenizer.decode(generated_answer[0])
    return decoded_answer


In [16]:
%%time
question = 'context: Skeptic, colloquially known as Skeptic magazine, is a quarterly science education and science advocacy magazine published internationally by The Skeptics Society, a nonprofit organization devoted to promoting scientific skepticism and resisting the spread of pseudoscience, superstition, and irrational beliefs. Founded by Michael Shermer, founder of the Skeptics Society, the magazine was first published in the spring of 1992 and is published through Millennium Press.The Chronicle of Philanthropy is a magazine that covers the nonprofit world. Based in Washington, DC, it is aimed at charity leaders, foundation executives, fund raisers, and other people involved in philanthropy. The Chronicle of Philanthropy publishes 12 issues a year while updating its Web site daily. It was founded in 1988 by editor Phil Semas and then managing editor Stacy Palmer. It is owned by The Chronicle of Higher Education Inc., which also publishes "The Chronicle of Higher Education", a weekly newspaper covering colleges and universities. question: Which magazine was first published earlier, The Chronicle of Philanthropy or Skeptic?'

# answer: The Chronicle of Philanthropy
print(question)
print("Answer: ", get_output(question))

context: Skeptic, colloquially known as Skeptic magazine, is a quarterly science education and science advocacy magazine published internationally by The Skeptics Society, a nonprofit organization devoted to promoting scientific skepticism and resisting the spread of pseudoscience, superstition, and irrational beliefs. Founded by Michael Shermer, founder of the Skeptics Society, the magazine was first published in the spring of 1992 and is published through Millennium Press.The Chronicle of Philanthropy is a magazine that covers the nonprofit world. Based in Washington, DC, it is aimed at charity leaders, foundation executives, fund raisers, and other people involved in philanthropy. The Chronicle of Philanthropy publishes 12 issues a year while updating its Web site daily. It was founded in 1988 by editor Phil Semas and then managing editor Stacy Palmer. It is owned by The Chronicle of Higher Education Inc., which also publishes "The Chronicle of Higher Education", a weekly newspaper 

### Performance of Model Before Fine Tunning

### Training

In [59]:
from transformers import T5ForConditionalGeneration
from accelerate import Accelerator
# TOKENIZERS_PARALLELISM = False


for epoch in range(num_epochs):
    # create a tqdm progress bar for the training loop
    accelerator = Accelerator()
    model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, valid_dataloader
    )
    train_pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}, Train")
    i = 0
    model.train()
    for batch in train_pbar:
        input_ids = batch['input_ids'].to(device)
        attention_masks = batch['attention_mask'].to(device)
        output_ids = batch['labels'].to(device)
        input_ids = input_ids.view(batch_size, seq_length)
        output_ids = output_ids.view(batch_size, seq_length)
        outputs = model(input_ids=input_ids, labels=output_ids,attention_mask=attention_masks )
        loss = outputs.loss
        # loss.backward()
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # flatten inputs and pass them to F1Score and Accuracy
        if i % 10 == 0:
            logits = outputs.logits
            logits = logits.view(-1, num_classes)
            output_ids = output_ids.view(-1)
            train_f1 = f1_score(logits, output_ids)
            train_acc = accuracy(logits, output_ids)
            # update the progress bar with the metrics
            if accelerator.is_main_process:
                train_pbar.set_postfix({"loss": loss.item(), "f1": train_f1.item(), "acc": train_acc.item()})
        i +=1
    if epoch%3 ==0:
        valid_loss, valid_acc, valid_f1=evaluate_performance(valid_dataloader, "Validation ", epoch)
        if epoch == 0 or valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            save_model(model, epoch, valid_loss, valid_acc, valid_f1)

    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(save_path, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(save_path)
    f1_score.reset()
    accuracy.reset()
    exact_match.reset()

Epoch 1, Train: 100%|██████████| 1481/1481 [51:01<00:00,  2.07s/it, loss=0.0191, f1=0.996, acc=0.996] 
Epoch 1, Validation : 100%|██████████| 370/370 [01:57<00:00,  3.15it/s, loss=0.0108, f1=0.998, acc=0.998, em=0.998] 



Validation : Average Loss : 0.0176  f1 : 0.9967 accuracy : 0.9967 Exact_Match: 0.9967 




Epoch 2, Train: 100%|██████████| 1481/1481 [53:26<00:00,  2.17s/it, loss=0.0381, f1=0.995, acc=0.995]
Epoch 3, Train: 100%|██████████| 1481/1481 [53:09<00:00,  2.15s/it, loss=0.0404, f1=0.996, acc=0.996]
Epoch 4, Train: 100%|██████████| 1481/1481 [53:34<00:00,  2.17s/it, loss=0.0273, f1=0.997, acc=0.997] 
Epoch 4, Validation : 100%|██████████| 370/370 [01:57<00:00,  3.15it/s, loss=0.00512, f1=0.999, acc=0.999, em=0.999]



Validation : Average Loss : 0.0096  f1 : 0.9979 accuracy : 0.9979 Exact_Match: 0.9979 




Epoch 5, Train: 100%|██████████| 1481/1481 [52:50<00:00,  2.14s/it, loss=0.0198, f1=0.997, acc=0.997]
Epoch 6, Train: 100%|██████████| 1481/1481 [53:05<00:00,  2.15s/it, loss=0.023, f1=0.997, acc=0.997]  
Epoch 7, Train: 100%|██████████| 1481/1481 [1:07:13<00:00,  2.72s/it, loss=0.0136, f1=0.997, acc=0.997]   
Epoch 7, Validation : 100%|██████████| 370/370 [05:41<00:00,  1.08it/s, loss=0.00371, f1=0.999, acc=0.999, em=0.999]  



Validation : Average Loss : 0.0081  f1 : 0.9982 accuracy : 0.9982 Exact_Match: 0.9982 




Epoch 8, Train:  72%|███████▏  | 1067/1481 [3:25:34<1:19:45, 11.56s/it, loss=0.0129, f1=0.997, acc=0.997] 


KeyboardInterrupt: 

### Lets test our model!!

In [None]:
# save_model(model, 1, 15.4699, 0.0097, 0.0097)
# 

In [18]:
print("Performance of Model before fine Tunning")
# model.load_state_dict(torch.load(save_path+"/M-0001-1.3129-0.8890-0.8890.ckpt")) #M-0014-1.7052-0.8886-0.8886
model.load_state_dict(torch.load(save_path+"/M-0006-0.0081-0.9982-0.9982.ckpt", map_location=torch.device('cpu'))) #
# model.load_state_dict(torch.load("./../May2023_COT/output/experiments/t5-v1_1-base/models/M-0036-0.0665-0.9829-0.9829.ckpt", map_location=torch.device('mps'))) #
model= model.to(device)

# evaluate_performance(valid_dataloader, "Validation ")

Performance of Model before fine Tunning


In [21]:
%%time
question = "Passage: Webbed toes is the common name for syndactyly affecting the feet. It is characterised by the fusion of two or more digits of the feet. This is normal in many birds, such as ducks; amphibians, such as frogs; and mammals, such as kangaroos. In humans it is considered unusual, occurring in approximately one in 2,000 to 2,500 live births.\n\nQuestion: Based on this passage, what is the medical term for webbed toes?"
print(question)
print("Answer: ", get_output(question))

# short : syndactyly affecting the feet
# COT : The relevant information is: Webbed toes is the common name for syndactyly affecting the feet.

Passage: Webbed toes is the common name for syndactyly affecting the feet. It is characterised by the fusion of two or more digits of the feet. This is normal in many birds, such as ducks; amphibians, such as frogs; and mammals, such as kangaroos. In humans it is considered unusual, occurring in approximately one in 2,000 to 2,500 live births.

Question: Based on this passage, what is the medical term for webbed toes?
Answer:  <pad> answer: syndactyly</s>
CPU times: total: 12.3 s
Wall time: 9.29 s


In [22]:
%%time
question = 'Passage: The Nobel Peace Prize (Swedish: Nobels fredspris) is one of the five Nobel Prizes created by the Swedish industrialist, inventor, and armaments manufacturer Alfred Nobel, along with the prizes in Chemistry, Physics, Physiology or Medicine, and Literature. Since March 1901, it has been awarded annually (with some exceptions) to those who have "done the most or the best work for fraternity between nations, for the abolition or reduction of standing armies and for the holding and promotion of peace congresses".\n\nQuestion: Based on this passage, who is the nobel peace prize given to?'
print(question)
input_text =  f"{question}"
print("Answer: ", get_output(input_text))

Passage: The Nobel Peace Prize (Swedish: Nobels fredspris) is one of the five Nobel Prizes created by the Swedish industrialist, inventor, and armaments manufacturer Alfred Nobel, along with the prizes in Chemistry, Physics, Physiology or Medicine, and Literature. Since March 1901, it has been awarded annually (with some exceptions) to those who have "done the most or the best work for fraternity between nations, for the abolition or reduction of standing armies and for the holding and promotion of peace congresses".

Question: Based on this passage, who is the nobel peace prize given to?
Answer:  <pad> answer: Alfred Nobel</s>
CPU times: total: 10.7 s
Wall time: 8.09 s


In [19]:
%%time
question ='Context: John Joseph Haley Jr (August 10, 1897 -- June 6, 1979) was an American vaudevillian, actor, radio host, comedian, singer and dancer best known for his portrayal of the Tin Man and his farmhand counterpart Hickory in the classic 1939 MGM film The Wizard of Oz.\n\nQuestion: who played the tin man in the wizard of oz 1939?'
# Short Answer : John Joseph Haley Jr
# The relevant information to answer the above question is: John Joseph Haley Jr (August 10, 1897 -- June 6, 1979) was an American vaudevillian, actor, radio host, comedian, singer and dancer best known for his portrayal of the Tin Man and his farmhand counterpart Hickory in the classic 1939 MGM film The Wizard of Oz.

print("Answer: ", get_output(question))

Answer:  <pad> answer: John Joseph Haley Jr</s>
CPU times: total: 12.1 s
Wall time: 8.55 s


In [20]:
%%time
question ='context: Muslim Mahammad oglu Magomayev (Azerbaijani: "Müslüm Maqomayev" ) (18 September 1885 in Grozny – 28 July 1937 in Nalchik) was an Azerbaijani and Soviet composer and conductor. He is the grandfather and a namesake of Azerbaijani opera singer Muslim Magomayev.Muslim Magometovich Magomayev (Azerbaijani: "Müslüm Mhmmd olu Maqomayev", 17 August 1942 – 25 October 2008), dubbed the "King of Songs" and the "Soviet Sinatra" was a Soviet Azerbaijani baritone operatic pop singer. He achieved iconic status in Russia and the post-Soviet countries for his vocal talent and charisma. question: Muslim Mahammad oglu Magomayev is the grandfther of what Soviet Azerbaijani singer who was dubbed the King of Songs?'

#     answer: Muslim Magometovich Magomayev

print("Answer: ", get_output(question))


Answer:  <pad> answer: Muslim Magomayev</s>
CPU times: total: 10.7 s
Wall time: 7.22 s


## Evaluation

### Formulas

In [76]:
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        total += 1
        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

In [79]:
def overall_validation_predictions(dataloader):
    model.eval()
    outputs = []
    # create a tqdm progress bar for the data
    pbar = tqdm(dataloader, desc=f"Evaluation ")
    with torch.no_grad():
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            # output_ids = batch['labels'].to(device)
            attention_mask=batch['attention_mask'].to(device)
            outs = model.generate(input_ids=input_ids,
                      attention_mask=attention_mask,
                      max_length=512,
                      early_stopping=True)
            outs = tokenizer.decode(outs[0])
            outputs.extend(outs)
    return outputs

In [80]:
answers = overall_validation_predictions (valid_dataloader)
references = []
for ref in validation_dataset:
    references.append(ref['output'])

print(evaluate(references,answers))


Evaluation : 100%|██████████| 257/257 [1:21:46<00:00, 19.09s/it]


{'exact_match': 99.51503394762366, 'f1': 37.63336566440349}
