In [1]:
!pip install transformers
!pip install evaluate
!pip install rouge


import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Obtaining dependency information for datasets>=2.0.0 from https://files.pythonhosted.org/packages/95/fc/661a7f06e8b7d48fcbd3f55423b7ff1ac3ce59526f146fda87a1e1788ee4/datasets-2.18.0-py3-none-any.whl.metadata
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Obtaining dependency information for dill from https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl.metadata
  Downloading dill-0.3.8-py3

2024-03-15 14:59:37.997519: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-15 14:59:39.628615: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
TOKENIZER = T5TokenizerFast.from_pretrained("t5-base")
MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 256   # Question Length
T_LEN = 32    # Target Length
BATCH_SIZE = 4
DEVICE = "cuda:0"

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [3]:
# Extracting context, question, and answers from the dataset

def prepare_data(data):
    articles = []
    
    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                question = qa["question"]

                if not qa["is_impossible"]:
                    answer = qa["answers"][0]["text"]
                
                inputs = {"context": paragraph["context"], "question": question, "answer": answer}

            
                articles.append(inputs)

    return articles

In [5]:
import json
# Loading the data

with open('/mnt/research/ghassemi-capstone/datasets/train-v2.0.json') as f:
    data = json.load(f)

In [6]:
data = prepare_data(data)

# Create a Dataframe
data = pd.DataFrame(data)

In [7]:
class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data['answer']
        
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        question = self.questions[idx]
        context = self.context[idx]
        answer = self.answer[idx]
        
        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                                    truncation=True, pad_to_max_length=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length", 
                                          truncation=True, pad_to_max_length=True, add_special_tokens=True)
        
        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        labels[labels == 0] = -100
        
        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }

In [8]:
# Dataloader

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_sampler = RandomSampler(train_data.index)
val_sampler = RandomSampler(val_data.index)

qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [9]:
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0

for epoch in range(2):
    MODEL.train()
    MODEL = MODEL.to(DEVICE)
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1
    
    #Evaluation
    MODEL.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation batches"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

            outputs = MODEL(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels,
                            decoder_attention_mask=decoder_attention_mask
                            )

            # OPTIMIZER.zero_grad()
            # outputs.loss.backward()
            # OPTIMIZER.step()
            val_loss += outputs.loss.item()
            val_batch_count += 1
        
    print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

Training batches: 100%|██████████| 26064/26064 [1:20:08<00:00,  5.42it/s]
Validation batches: 100%|██████████| 6516/6516 [19:00<00:00,  5.71it/s]


1/2 -> Train loss: 0.8220058080602061	Validation loss: 0.3557346291447237


Training batches: 100%|██████████| 26064/26064 [1:18:48<00:00,  5.51it/s]
Validation batches: 100%|██████████| 6516/6516 [18:44<00:00,  5.79it/s]

2/2 -> Train loss: 0.6970656873070631	Validation loss: 0.2608250671060689





In [10]:
MODEL.save_pretrained("/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_model")
TOKENIZER.save_pretrained("/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_tokenizer")

('/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_tokenizer/tokenizer_config.json',
 '/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_tokenizer/special_tokens_map.json',
 '/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_tokenizer/spiece.model',
 '/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_tokenizer/added_tokens.json',
 '/mnt/research/ghassemi-capstone/datasets/checkpoints/qa_tokenizer/tokenizer.json')

In [11]:
def predict_answer(context, question, ref_answer=None):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
    
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
  
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    
    if ref_answer:
        # Load the Bleu metric
        bleu = evaluate.load("google_bleu")
        score = bleu.compute(predictions=[predicted_answer], 
                            references=[ref_answer])
    
        print("Context: \n", context)
        print("\n")
        print("Question: \n", question)
        return {
            "Reference Answer: ": ref_answer, 
            "Predicted Answer: ": predicted_answer, 
            "BLEU Score: ": score
        }
    else:
        return predicted_answer

In [12]:
context = data.iloc[0]["context"]
question =data.iloc[0]["question"]
answer = data.iloc[0]["answer"]

predict_answer(context, question, answer)

Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Context: 
 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".


Question: 
 When did Beyonce start becoming popular?


{'Reference Answer: ': 'in the late 1990s',
 'Predicted Answer: ': 'late 1990s',
 'BLEU Score: ': {'google_bleu': 0.3}}

In [13]:
context = data.iloc[1]["context"]
question =data.iloc[1]["question"]
answer = data.iloc[1]["answer"]

predict_answer(context, question, answer)

Context: 
 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".


Question: 
 What areas did Beyonce compete in when she was growing up?


{'Reference Answer: ': 'singing and dancing',
 'Predicted Answer: ': 'singing and dancing',
 'BLEU Score: ': {'google_bleu': 1.0}}

## Evaluation

In [17]:
import json
# Loading the data

with open('/mnt/research/ghassemi-capstone/datasets/SQuAD-explorer/dataset/dev-v2.0.json') as f:
    test_data = json.load(f)

In [18]:
train_data

Unnamed: 0,context,question,answer
62022,"Between 1948 and 1958, the Jewish population r...",What percent of the Israeli population is not ...,227258
82967,"In 1919, following the Treaty of Versailles, t...",In what year was U.S. President Woodrow Wilson...,Alsatians
81753,"All government officers of the United States, ...",What branch of the government has prosecutoria...,executive branch
29599,"On April 29, 2011, the boundaries for the stat...",What year were the boundaries for the state-ru...,2011
23910,"Poles of the 17th century assumed that ""szlach...",What German word is also suggestive deriving f...,Schlacht
...,...,...,...
128106,Anthropology of development tends to view deve...,What type of development rarely fails?,fail
103694,"DC commutating electric motors, if fitted with...",The now-standard DC frequencies are what?,converted from utility power
860,Fryderyk may have had some piano instruction f...,At what age did Frédéric start giving public c...,7
15795,Hydrogen is highly soluble in many rare earth ...,When is it damaging?,gas's high solubility is a metallurgical probl...


In [20]:
val_data.to_csv("/mnt/research/ghassemi-capstone/datasets/val_data.csv")

In [21]:
train_data.to_csv("/mnt/research/ghassemi-capstone/datasets/train_data.csv")

In [22]:
test_data = prepare_data(test_data)

In [24]:
test_data = pd.DataFrame(test_data)

In [25]:
test_data

Unnamed: 0,context,question,answer
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"Denmark, Iceland and Norway"
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,Rollo
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,10th century
...,...,...,...
11868,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène
11869,"The pound-force has a metric counterpart, less...",What does not have a metric counterpart?,sthène
11870,"The pound-force has a metric counterpart, less...",What is the force exerted by standard gravity ...,sthène
11871,"The pound-force has a metric counterpart, less...",What force leads to a commonly used unit of mass?,sthène


In [None]:
def predict_answer1(context, question, ref_answer=None):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
    
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    
    return predicted_answer

l = []
for index, row in test_data.iterrows():
    predicted_answer = predict_answer1(row["context"],row["question"])
    l.append(predicted_answer)

In [None]:
with open("/mnt/research/ghassemi-capstone/datasets/predictions.json") as f:
    json.dump(l, f)

In [None]:
with open("/mnt/research/ghassemi-capstone/datasets/predictions.json") as f:
    json.dump(l, f)

In [None]:
test_data["predicted"] = new_l

In [None]:
test_data.to_csv("/mnt/research/ghassemi-capstone/datasets/test_data.csv")

In [None]:
from collections import Counter

def calculate_f1(predictions, ground_truths):
    """
    Calculates average F1 score between two lists of predicted and ground truth sentences.

    Args:
      predictions (list): A list of predicted sentences.
      ground_truths (list): A list of ground truth sentences.

    Returns:
      float: The average F1 score across all sentence pairs.
    """

    f1_scores = []
    precision_scores = []
    recall_scores = []
    for prediction, ground_truth in zip(predictions, ground_truths):
        prediction_tokens = Counter(str(prediction).lower().split())
        ground_truth_tokens = Counter(str(ground_truth).lower().split())

        intersection = sum(min(a, b) for a, b in zip(prediction_tokens.values(), ground_truth_tokens.values()))
        union = sum(prediction_tokens.values()) + sum(ground_truth_tokens.values()) - intersection

        if union == 0:
            f1_scores.append(0.0)
            continue
        
        try:
            precision = intersection / sum(prediction_tokens.values())
            recall = intersection / sum(ground_truth_tokens.values())
            f1 = 2 * (precision * recall) / (precision + recall)
            f1_scores.append(f1)
            precision_scores.append(precision)
            recall_scores.append(recall)
        except:
            continue

    return sum(f1_scores) / len(f1_scores), sum(precision_scores) / len(precision_scores), sum(recall_scores) / len(recall_scores)  # Average F1 score


def calculate_exact_match(predictions, ground_truths):
    """
    Calculates proportion of sentence pairs with exact match between two lists.

    Args:
      predictions (list): A list of predicted sentences.
      ground_truths (list): A list of ground truth sentences.

    Returns:
      float: The proportion of sentence pairs with exact match.
    """

    exact_matches = 0
    for prediction, ground_truth in zip(predictions, ground_truths):
        if str(prediction).lower().strip() == str(ground_truth).lower().strip():
            exact_matches += 1

    return exact_matches / len(predictions)

# Example usage


f1_score, precision, recall = calculate_f1(test_data["predicted"], test_data["answer"])
exact_match = calculate_exact_match(test_data["predicted"], test_data["answer"])

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

Test results:


Precision: 0.8553666492018343

Recall: 0.8185787344377639

F1 Score: 0.75678737194348

Exact Match: 0.2963867598753474

In [None]:
val_data["predicted"] = l

f1_score, precision, recall = calculate_f1(val_data["predicted"], val_data["answer"])
exact_match = calculate_exact_match(val_data["predicted"], val_data["answer"])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

Val/train results: (we added val to training, so train accuracy is calculated using val dataset)

Precision: 0.9329698734409485

Recall: 0.8927604979485285

F1 Score: 0.8717708098706461

Exact Match: 0.599025475751995