In [1]:
!pip install transformers
!pip install evaluate
!pip install rouge


import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


2024-04-10 15:39:51.748650: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-10 15:39:53.319397: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# TOKENIZER = T5TokenizerFast.from_pretrained("t5-base")
# MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)

In [2]:
TOKENIZER = T5TokenizerFast.from_pretrained("/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_tokenizer2_no_answer_512")
MODEL = T5ForConditionalGeneration.from_pretrained("/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_model2_no_answer_512", return_dict=True)
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 512   # Question Length
T_LEN = 32    # Target Length
BATCH_SIZE = 4
DEVICE = "cuda:0"
MODEL.to(DEVICE)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [3]:
# Extracting context, question, and answers from the dataset

def prepare_data(data):
    articles = []
    
    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                question = qa["question"]

                if not qa["is_impossible"]:
                    answer = qa["answers"][0]["text"]
                else:
                    answer = "no answer"
                
                inputs = {"context": paragraph["context"], "question": question, "answer": answer}

            
                articles.append(inputs)

    return articles

In [4]:
import json
# Loading the data

with open('/mnt/research/ghassemi-capstone/datasets/train-v2.0.json') as f:
    data = json.load(f)

In [11]:
data["data"][0]

{'title': 'Beyoncé',
 'paragraphs': [{'qas': [{'question': 'When did Beyonce start becoming popular?',
     'id': '56be85543aeaaa14008c9063',
     'answers': [{'text': 'in the late 1990s', 'answer_start': 269}],
     'is_impossible': False},
    {'question': 'What areas did Beyonce compete in when she was growing up?',
     'id': '56be85543aeaaa14008c9065',
     'answers': [{'text': 'singing and dancing', 'answer_start': 207}],
     'is_impossible': False},
    {'question': "When did Beyonce leave Destiny's Child and become a solo singer?",
     'id': '56be85543aeaaa14008c9066',
     'answers': [{'text': '2003', 'answer_start': 526}],
     'is_impossible': False},
    {'question': 'In what city and state did Beyonce  grow up? ',
     'id': '56bf6b0f3aeaaa14008c9601',
     'answers': [{'text': 'Houston, Texas', 'answer_start': 166}],
     'is_impossible': False},
    {'question': 'In which decade did Beyonce become famous?',
     'id': '56bf6b0f3aeaaa14008c9602',
     'answers': [{'text

In [22]:
data = prepare_data(data)

# Create a Dataframe
data = pd.DataFrame(data)

In [23]:
class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data['answer']
        
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        question = self.questions[idx]
        context = self.context[idx]
        answer = self.answer[idx]
        
        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                                    truncation=True, pad_to_max_length=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length", 
                                          truncation=True, pad_to_max_length=True, add_special_tokens=True)
        
        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        labels[labels == 0] = -100
        
        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }

In [24]:
# Dataloader

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_sampler = RandomSampler(train_data.index)
val_sampler = RandomSampler(val_data.index)

qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [25]:
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0

for epoch in range(1):
    MODEL.train()
    MODEL = MODEL.to(DEVICE)
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1
    
    #Evaluation
    MODEL.eval()
    for batch in tqdm(val_loader, desc="Validation batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        val_loss += outputs.loss.item()
        val_batch_count += 1
        
    print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

Training batches: 100%|██████████| 26064/26064 [1:35:52<00:00,  4.53it/s]
Validation batches: 100%|██████████| 6516/6516 [22:38<00:00,  4.79it/s]

1/2 -> Train loss: 0.20999822687517378	Validation loss: 0.04596105567056316





In [None]:
# MODEL.save_pretrained("/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_model")
# TOKENIZER.save_pretrained("/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_tokenizer")
MODEL.save_pretrained("/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_model2_no_answer_512_epoch3")
TOKENIZER.save_pretrained("/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_tokenizer2_no_answer_512_epoch3")

In [3]:
def predict_answer(context, question, ref_answer=None):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
    
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
  
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    
    if ref_answer:
        # Load the Bleu metric
        bleu = evaluate.load("google_bleu")
        score = bleu.compute(predictions=[predicted_answer], 
                            references=[ref_answer])
    
        print("Context: \n", context)
        print("\n")
        print("Question: \n", question)
        return {
            "Reference Answer: ": ref_answer, 
            "Predicted Answer: ": predicted_answer, 
            "BLEU Score: ": score
        }
    else:
        return predicted_answer

In [10]:
context = data.iloc[0]["context"]
question =data.iloc[0]["question"]
answer = data.iloc[0]["answer"]

predict_answer(context, question, answer)

Context: 
 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".


Question: 
 When did Beyonce start becoming popular?


{'Reference Answer: ': 'in the late 1990s',
 'Predicted Answer: ': 'late 1990s',
 'BLEU Score: ': {'google_bleu': 0.3}}

In [9]:
MODEL = MODEL.to(DEVICE)
context = data.iloc[0]["context"]
question =data.iloc[0]["question"]
answer = data.iloc[0]["answer"]

predict_answer(context, question, answer)

Context: 
 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".


Question: 
 When did Beyonce start becoming popular?


{'Reference Answer: ': 'in the late 1990s',
 'Predicted Answer: ': 'late 1990s',
 'BLEU Score: ': {'google_bleu': 0.3}}

In [13]:
context = data.iloc[1]["context"]
question =data.iloc[1]["question"]
answer = data.iloc[1]["answer"]

predict_answer(context, question, answer)

Context: 
 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".


Question: 
 What areas did Beyonce compete in when she was growing up?


{'Reference Answer: ': 'singing and dancing',
 'Predicted Answer: ': 'singing and dancing',
 'BLEU Score: ': {'google_bleu': 1.0}}

## Evaluation

In [17]:
import json
# Loading the data

with open('/mnt/research/ghassemi-capstone/datasets/SQuAD-explorer/dataset/dev-v2.0.json') as f:
    test_data = json.load(f)

In [18]:
test_data["data"][-1:]

[{'title': 'Force',
  'paragraphs': [{'qas': [{'question': 'What concept did philosophers in antiquity use to study simple machines?',
      'id': '573735e8c3c5551400e51e71',
      'answers': [{'text': 'force', 'answer_start': 46},
       {'text': 'force', 'answer_start': 46},
       {'text': 'the concept of force', 'answer_start': 31},
       {'text': 'the concept of force', 'answer_start': 31},
       {'text': 'force', 'answer_start': 46},
       {'text': 'force', 'answer_start': 46}],
      'is_impossible': False},
     {'question': 'What was the belief that maintaining motion required force?',
      'id': '573735e8c3c5551400e51e72',
      'answers': [{'text': 'fundamental error', 'answer_start': 387},
       {'text': 'A fundamental error', 'answer_start': 385},
       {'text': 'A fundamental error', 'answer_start': 385},
       {'text': 'A fundamental error', 'answer_start': 385},
       {'text': 'A fundamental error', 'answer_start': 385},
       {'text': 'A fundamental error', 'a

In [20]:
val_data.to_csv("/mnt/research/ghassemi-capstone/datasets/val_data.csv")

In [21]:
train_data.to_csv("/mnt/research/ghassemi-capstone/datasets/train_data.csv")

In [12]:
test_data = prepare_data(test_data)

In [13]:
test_data = pd.DataFrame(test_data)

In [14]:
test_data

Unnamed: 0,context,question,answer
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"Denmark, Iceland and Norway"
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,Rollo
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,10th century
...,...,...,...
11868,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène
11869,"The pound-force has a metric counterpart, less...",What does not have a metric counterpart?,no answer
11870,"The pound-force has a metric counterpart, less...",What is the force exerted by standard gravity ...,no answer
11871,"The pound-force has a metric counterpart, less...",What force leads to a commonly used unit of mass?,no answer


In [13]:
from tqdm import tqdm

def predict_answer1(context, question, ref_answer=None):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
    
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    
    return predicted_answer

l = []
for index, row in tqdm(test_data.iterrows(), total=test_data.shape[0]):
    predicted_answer = predict_answer1(row["context"],row["question"])
    l.append({index:predicted_answer})

100%|██████████| 11873/11873 [39:55<00:00,  4.96it/s]


In [None]:
with open("/mnt/research/ghassemi-capstone/datasets/predictions.json") as f:
    json.dump(l, f)

In [None]:
with open("/mnt/research/ghassemi-capstone/datasets/predictions.json") as f:
    json.dump(l, f)

In [16]:
MODEL = MODEL.to(DEVICE)
def predict_answer2(context, question):
    task_prefix = "Answer the question based on context provided: "
    inputs = TOKENIZER([task_prefix + "question: " + question +  ", context: " + context], return_tensors="pt", padding=True, max_length=512).to(DEVICE)

    output_sequences = MODEL.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      do_sample=False,  # disable sampling to test if batching affects output
      )

    return TOKENIZER.batch_decode(output_sequences, skip_special_tokens=True)

l1 = []
for index, row in tqdm(test_data.iterrows(),total=test_data.shape[0]):
    predicted_answer = predict_answer2(row["context"],row["question"])
    l1.append(predicted_answer)

100%|██████████| 11873/11873 [10:07<00:00, 19.54it/s]


In [21]:
l = [i[0] for i in l]

In [22]:
val_data["predicted_in_context"] = l

In [30]:
l = []
for index, row in tqdm(test_data.iterrows(),total=test_data.shape[0]):
    predicted_answer = predict_answer2(row["context"],row["question"])
    l.append(predicted_answer)

100%|██████████| 11873/11873 [13:41<00:00, 14.46it/s]


In [33]:
l = [i[0] for i in l]

In [35]:
test_data["predicted_in_context"] = l

In [24]:
test_data

Unnamed: 0,context,question,answer,base512_with_prompt2
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,France
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries,10th and 11th centuries
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"Denmark, Iceland and Norway","Denmark, Iceland and Norway"
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,Rollo,Rollo
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,10th century,10th
...,...,...,...,...
11868,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène,pound-force
11869,"The pound-force has a metric counterpart, less...",What does not have a metric counterpart?,no answer,pound-force
11870,"The pound-force has a metric counterpart, less...",What is the force exerted by standard gravity ...,no answer,pound-force
11871,"The pound-force has a metric counterpart, less...",What force leads to a commonly used unit of mass?,no answer,kilogram-force


In [37]:
f1_score, precision, recall = calculate_f1(test_data["predicted_in_context"], test_data["answer"])
exact_match = calculate_exact_match(test_data["predicted_in_context"], test_data["answer"])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

Precision: 0.8481905075196913
Recall: 0.8310499646984304
F1 Score: 0.7604987193868253
Exact Match: 0.3157584435273309


In [23]:
from collections import Counter

def calculate_f1(predictions, ground_truths):
    """
    Calculates average F1 score between two lists of predicted and ground truth sentences.

    Args:
      predictions (list): A list of predicted sentences.
      ground_truths (list): A list of ground truth sentences.

    Returns:
      float: The average F1 score across all sentence pairs.
    """

    f1_scores = []
    precision_scores = []
    recall_scores = []
    for prediction, ground_truth in zip(predictions, ground_truths):
        prediction_tokens = Counter(str(prediction).lower().split())
        ground_truth_tokens = Counter(str(ground_truth).lower().split())

        intersection = sum(min(a, b) for a, b in zip(prediction_tokens.values(), ground_truth_tokens.values()))
        union = sum(prediction_tokens.values()) + sum(ground_truth_tokens.values()) - intersection

        if union == 0:
            f1_scores.append(0.0)
            continue
        
        try:
            precision = intersection / sum(prediction_tokens.values())
            recall = intersection / sum(ground_truth_tokens.values())
            f1 = 2 * (precision * recall) / (precision + recall)
            f1_scores.append(f1)
            precision_scores.append(precision)
            recall_scores.append(recall)
        except:
            continue

    return sum(f1_scores) / len(f1_scores), sum(precision_scores) / len(precision_scores), sum(recall_scores) / len(recall_scores)  # Average F1 score


def calculate_exact_match(predictions, ground_truths):
    """
    Calculates proportion of sentence pairs with exact match between two lists.

    Args:
      predictions (list): A list of predicted sentences.
      ground_truths (list): A list of ground truth sentences.

    Returns:
      float: The proportion of sentence pairs with exact match.
    """

    exact_matches = 0
    for prediction, ground_truth in zip(predictions, ground_truths):
        if str(prediction).lower().strip() == str(ground_truth).lower().strip():
            exact_matches += 1

    return exact_matches / len(predictions)

In [23]:
f1_score, precision, recall = calculate_f1(val_data["predicted_in_context"], val_data["answer"])
exact_match = calculate_exact_match(val_data["predicted_in_context"], val_data["answer"])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

Precision: 0.9031581946170637
Recall: 0.8851828873065034
F1 Score: 0.8444622955769181
Exact Match: 0.5126227747084101


In [38]:
test_data.to_csv("/mnt/research/ghassemi-capstone/datasets/test_data_incontext_results.csv")

In [26]:
val_data.to_csv("/mnt/research/ghassemi-capstone/datasets/val_data_incontext_results.csv")

In [24]:
val_data

Unnamed: 0,context,question,answer,predicted_in_context
125137,It threatened the collapse of large financial ...,What year did the global recession that follow...,2012,2012
30275,"But house was also being developed on Ibiza,[c...",what was a popular club in ibiza that started ...,Amnesia,Amnesia
39176,Although Calvin and Huldrych Zwingli honored M...,In what century did Martin Luther honor Mary a...,Catholics,16th
32129,"Due to extreme variation in elevation, great v...",What is the climate like?,varies from hot and subhumid tropical,hot and subhumid tropical
44136,The Queen addressed the United Nations for a s...,How many times has the Queen toured Canada?,October 2011,sixteen
...,...,...,...,...
101061,"Realizing that war was imminent, Prussia preem...",What region did Austria hope to recapture?,"Austria formed an alliance with France, seeing...",Silesia
93625,EU member countries have shown support for amb...,What group polled the 27 EU member states in 2...,Eurobarometer,Eurobarometer
111003,The terms used to define Greekness have varied...,What do Westerners believe it means to belong ...,"Western standards, the term Greeks has traditi...","Western standards, the term Greeks has traditi..."
75612,Richard Owen showed that fossils of extinct sp...,What type of scientist was John Gould?,ornithologist,ornithologist


# Context Length 512

In [22]:
MODEL = MODEL.to(DEVICE)
def predict_answer2(context, question):
    task_prefix = "Answer the question based on context provided: "
    inputs = TOKENIZER([task_prefix + "question: " + question +  ", context: " + context], return_tensors="pt", padding=True).to(DEVICE)

    output_sequences = MODEL.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      do_sample=False,  # disable sampling to test if batching affects output
      )

    return TOKENIZER.batch_decode(output_sequences, skip_special_tokens=True)

l = []
for index, row in tqdm(test_data.iterrows(),total=test_data.shape[0]):
    predicted_answer = predict_answer2(row["context"],row["question"])
    l.append(predicted_answer)

100%|██████████| 26064/26064 [29:07<00:00, 14.92it/s] 


NameError: name 'l2' is not defined

In [None]:
l1 = []
for index, row in tqdm(test_data.iterrows(),total=test_data.shape[0]):
    predicted_answer = predict_answer2(row["context"],row["question"])
    l1.append(predicted_answer)

In [12]:
l1 = [i[0] for i in l1]
test_data["base512_with_prompt1"] = l1

In [14]:
from collections import Counter

def calculate_f1(predictions, ground_truths):
    """
    Calculates average F1 score between two lists of predicted and ground truth sentences.

    Args:
      predictions (list): A list of predicted sentences.
      ground_truths (list): A list of ground truth sentences.

    Returns:
      float: The average F1 score across all sentence pairs.
    """

    f1_scores = []
    precision_scores = []
    recall_scores = []
    for prediction, ground_truth in zip(predictions, ground_truths):
        prediction_tokens = Counter(str(prediction).lower().split())
        ground_truth_tokens = Counter(str(ground_truth).lower().split())

        intersection = sum(min(a, b) for a, b in zip(prediction_tokens.values(), ground_truth_tokens.values()))
        union = sum(prediction_tokens.values()) + sum(ground_truth_tokens.values()) - intersection

        if union == 0:
            f1_scores.append(0.0)
            continue
        
        try:
            precision = intersection / sum(prediction_tokens.values())
            recall = intersection / sum(ground_truth_tokens.values())
            f1 = 2 * (precision * recall) / (precision + recall)
            f1_scores.append(f1)
            precision_scores.append(precision)
            recall_scores.append(recall)
        except:
            continue

    return sum(f1_scores) / len(f1_scores), sum(precision_scores) / len(precision_scores), sum(recall_scores) / len(recall_scores)  # Average F1 score


def calculate_exact_match(predictions, ground_truths):
    """
    Calculates proportion of sentence pairs with exact match between two lists.

    Args:
      predictions (list): A list of predicted sentences.
      ground_truths (list): A list of ground truth sentences.

    Returns:
      float: The proportion of sentence pairs with exact match.
    """

    exact_matches = 0
    for prediction, ground_truth in zip(predictions, ground_truths):
        if str(prediction).lower().strip() == str(ground_truth).lower().strip():
            exact_matches += 1

    return exact_matches / len(predictions)




In [None]:
l1 = []
for index, row in tqdm(test_data.iterrows(),total=test_data.shape[0]):
    predicted_answer = predict_answer2(row["context"],row["question"])
    l1.append(predicted_answer)

In [18]:
# 4 epoch new model test (fine tuned)
f1_score, precision, recall = calculate_f1(test_data["base512_with_prompt1"], test_data["answer"])
exact_match = calculate_exact_match(test_data["base512_with_prompt1"], test_data["answer"])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

Precision: 0.9257644857332232
Recall: 0.9344818791348036
F1 Score: 0.900858833691499
Exact Match: 0.6658805693590499


In [30]:
f1_score, precision, recall = calculate_f1(test_data["base512_with_prompt1"], test_data["answer"])
exact_match = calculate_exact_match(test_data["base512_with_prompt1"], test_data["answer"])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

Precision: 0.8353348788537716
Recall: 0.8737081929036429
F1 Score: 0.7927565074067925
Exact Match: 0.28636401920323423


In [15]:
MODEL = MODEL.to(DEVICE)
def predict_answer3(context, question):
    task_prefix = "Answer the question based on context provided. If there is no answer in the context, respond with 'no answer'. "
    inputs = TOKENIZER([task_prefix + "question: " + question +  ", context: " + context], return_tensors="pt", padding=True).to(DEVICE)

    output_sequences = MODEL.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      do_sample=False,  # disable sampling to test if batching affects output
      )

    return TOKENIZER.batch_decode(output_sequences, skip_special_tokens=True)

In [16]:
l3 = []
for index, row in tqdm(test_data.iterrows(),total=test_data.shape[0]):
    predicted_answer = predict_answer3(row["context"],row["question"])
    l3.append(predicted_answer)

100%|██████████| 11873/11873 [12:44<00:00, 15.53it/s]


In [17]:
l3 = [i[0] for i in l3]
test_data["base512_with_prompt2"] = l3

In [31]:
f1_score, precision, recall = calculate_f1(test_data["base512_with_prompt2"], test_data["answer"])
exact_match = calculate_exact_match(test_data["base512_with_prompt2"], test_data["answer"])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

Precision: 0.8528006707828603
Recall: 0.8654780187644472
F1 Score: 0.8008807717457961
Exact Match: 0.29470226564474017


In [19]:
test_data

Unnamed: 0,context,question,answer,base512_with_prompt2
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,France
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries,10th and 11th centuries
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"Denmark, Iceland and Norway","Denmark, Iceland and Norway"
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,Rollo,Rollo
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,10th century,10th
...,...,...,...,...
11868,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène,pound-force
11869,"The pound-force has a metric counterpart, less...",What does not have a metric counterpart?,no answer,pound-force
11870,"The pound-force has a metric counterpart, less...",What is the force exerted by standard gravity ...,no answer,pound-force
11871,"The pound-force has a metric counterpart, less...",What force leads to a commonly used unit of mass?,no answer,kilogram-force


In [25]:
l3[0]

'France'

In [28]:
l1[0]

['France']

# New prompt

In [35]:
MODEL = MODEL.to(DEVICE)
def predict_answer4(context, question):
    task_prefix = "Answer the question based on context provided. The answer should be a phrase within the context, if there is no answer within the context, respond with 'no answer'. "
    inputs = TOKENIZER([task_prefix + "question: " + question +  ", context: " + context], return_tensors="pt", padding=True).to(DEVICE)

    output_sequences = MODEL.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      do_sample=False,  # disable sampling to test if batching affects output
      )

    return TOKENIZER.batch_decode(output_sequences, skip_special_tokens=True)

In [36]:
l4 = []
for index, row in tqdm(test_data.iterrows(),total=test_data.shape[0]):
    predicted_answer = predict_answer4(row["context"],row["question"])
    l4.append(predicted_answer)

100%|██████████| 11873/11873 [12:33<00:00, 15.76it/s]


In [37]:
l4 = [i[0] for i in l4]
test_data["base512_with_prompt3"] = l4

In [38]:
f1_score, precision, recall = calculate_f1(test_data["base512_with_prompt3"], test_data["answer"])
exact_match = calculate_exact_match(test_data["base512_with_prompt3"], test_data["answer"])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

Precision: 0.8535953860892543
Recall: 0.86451151232228
F1 Score: 0.8008520014014119
Exact Match: 0.29352311968331507


In [39]:
test_data

Unnamed: 0,context,question,answer,base512_with_prompt2,base512_with_prompt1,base512_with_prompt3
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,France,France,France
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries,10th and 11th centuries,10th and 11th centuries,10th and 11th centuries
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"Denmark, Iceland and Norway","Denmark, Iceland and Norway","Denmark, Iceland and Norway","Denmark, Iceland and Norway"
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,Rollo,Rollo,Rollo,Rollo
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,10th century,10th,10th,10th
...,...,...,...,...,...,...
11868,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène,pound-force,pound-force,pound-force
11869,"The pound-force has a metric counterpart, less...",What does not have a metric counterpart?,no answer,pound-force,pound-force,pound-force
11870,"The pound-force has a metric counterpart, less...",What is the force exerted by standard gravity ...,no answer,pound-force,pound-force,pound-force
11871,"The pound-force has a metric counterpart, less...",What force leads to a commonly used unit of mass?,no answer,kilogram-force,kilogram-force,kilogram-force
