In [1]:
!pip install transformers datasets torch evaluate
!pip install transformers
!pip install evaluate
!pip install rouge

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [28]:
from datasets import load_dataset, load_metric
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
import nltk
import spacy
import string
import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast
from transformers import AutoModelForQuestionAnswering
import warnings
warnings.filterwarnings("ignore")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [97]:
def load_dataset(path, num_records=10):
    with open(path) as file:
        dataset = json.load(file)

        if num_records is not None:
            if isinstance(dataset, dict):
                keys = list(dataset.keys())[:num_records]
                dataset = {k: dataset[k] for k in keys}
            else:
                dataset = dataset[:num_records]

    return dataset

squad_data = load_dataset('/content/drive/MyDrive/Data/train-v2.0.json')

In [67]:
model_name = "BAAI/bge-m3"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
TOKENIZER = AutoTokenizer.from_pretrained(model_name)
MODEL = AutoModelForQuestionAnswering.from_pretrained(model_name).to(DEVICE)

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at BAAI/bge-m3 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
OPTIMIZER = AdamW(MODEL.parameters(), lr=5e-5)
Q_LEN = 256   # Question Length
T_LEN = 32    # Target Length
BATCH_SIZE = 2
DEVICE = "cuda:0"

In [74]:
# Extracting context, question, and answers from the dataset

def prepare_data(data):
    records = []
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                id = qa['id']
                is_impossible = qa['is_impossible']
                if not is_impossible:
                    for answer in qa['answers']:
                        records.append({
                            "id": id,
                            "question": question,
                            "context": context,
                            "answer_text": answer['text'],
                            "answer_start": answer['answer_start'],
                            "is_impossible": is_impossible
                        })
                else:
                    # For impossible questions, you might want to handle differently
                    # For now, we append a record with no answer
                    records.append({
                        "id": id,
                        "question": question,
                        "context": context,
                        "answer_text": "",
                        "answer_start": -1,
                        "is_impossible": is_impossible
                    })
    return pd.DataFrame.from_records(records)

In [98]:
data = prepare_data(squad_data)

# Create a Dataframe
data = pd.DataFrame(data)

In [93]:
class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, max_length):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Extract items from dataframe
        question = self.data.iloc[idx]["question"]
        context = self.data.iloc[idx]["context"]
        answer = self.data.iloc[idx]["answer_text"]  # Make sure your dataframe has the correct column name for answer text
        answer_start = self.data.iloc[idx]["answer_start"]  # Starting position of the answer in the context

        # Tokenize question and context
        encodings = self.tokenizer.encode_plus(question, context, max_length=self.max_length,
                                               truncation="only_second", padding="max_length",
                                               return_tensors="pt", return_offsets_mapping=True)
        input_ids = encodings["input_ids"].squeeze()
        attention_mask = encodings["attention_mask"].squeeze()
        offset_mapping = encodings["offset_mapping"].squeeze()

        # Find positions of answer in tokenized context
        answer_end = answer_start + len(answer)
        start_position = end_position = None

        for i, (offset_start, offset_end) in enumerate(offset_mapping):
            if start_position is None and offset_start <= answer_start < offset_end:
                start_position = i
            if offset_start < answer_end <= offset_end:
                end_position = i
                break

        # Handling cases where the answer cannot be found in the text
        if start_position is None or end_position is None:
            start_position = end_position = 0

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "start_positions": torch.tensor(start_position, dtype=torch.long),
            "end_positions": torch.tensor(end_position, dtype=torch.long)
        }

In [99]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Define max_length for tokenization
max_length = 512  # Or another value suitable for your model and GPU memory

# Create instances of QA_Dataset for training and validation sets
train_dataset = QA_Dataset(TOKENIZER, train_data, max_length)
val_dataset = QA_Dataset(TOKENIZER, val_data, max_length)

# Batch size
BATCH_SIZE = 8  # Adjust based on your GPU memory

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)  # Shuffle=False for validation data

In [100]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f4d074f03a0>

In [101]:
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0

for epoch in range(2):
    MODEL.train()
    train_loss = 0
    train_batch_count = 0

    for batch in train_loader:  # Adjust according to your loader
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        start_positions = batch["start_positions"].to(DEVICE)
        end_positions = batch["end_positions"].to(DEVICE)

        outputs = MODEL(input_ids=input_ids,
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)

        loss = outputs.loss
        OPTIMIZER.zero_grad()
        loss.backward()
        OPTIMIZER.step()

        train_loss += loss.item()
        train_batch_count += 1
    print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 13.06 MiB is free. Process 9215 has 14.73 GiB memory in use. Of the allocated memory 14.52 GiB is allocated by PyTorch, and 96.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
MODEL.save_pretrained("/content/drive/MyDrive/Data/qa_model_")
TOKENIZER.save_pretrained("/content/drive/MyDrive/Data/qa_tokenizer_")

In [None]:
def predict_answer(context, question, ref_answer=None):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)

    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)

    if ref_answer:
        # Load the Bleu metric
        bleu = evaluate.load("google_bleu")
        score = bleu.compute(predictions=[predicted_answer],
                            references=[ref_answer])

        print("Context: \n", context)
        print("\n")
        print("Question: \n", question)
        return {
            "Reference Answer: ": ref_answer,
            "Predicted Answer: ": predicted_answer,
            "BLEU Score: ": score
        }
    else:
        return predicted_answer

In [None]:
context = data.iloc[0]["context"]
question =data.iloc[0]["question"]
answer = data.iloc[0]["answer"]

predict_answer(context, question, answer)

In [None]:
context = data.iloc[1]["context"]
question =data.iloc[1]["question"]
answer = data.iloc[1]["answer"]

predict_answer(context, question, answer)

EVALUATION

In [None]:
test_data = load_dataset('/content/drive/MyDrive/Data/dev-v2.0.json')

In [None]:
test_data = prepare_data(test_data)

In [None]:
test_data = pd.DataFrame(test_data)

In [None]:
train_data

In [None]:
val_data.to_csv("/content/drive/MyDrive/Data/val_data.csv")

In [None]:
def predict_answer1(context, question, ref_answer=None):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)

    return predicted_answer

l = []
for index, row in test_data.iterrows():
    predicted_answer = predict_answer1(row["context"],row["question"])
    l.append(predicted_answer)

In [None]:
from collections import Counter

def calculate_f1(predictions, ground_truths):
    """
    Calculates average F1 score between two lists of predicted and ground truth sentences.

    Args:
      predictions (list): A list of predicted sentences.
      ground_truths (list): A list of ground truth sentences.

    Returns:
      float: The average F1 score across all sentence pairs.
    """

    f1_scores = []
    precision_scores = []
    recall_scores = []
    for prediction, ground_truth in zip(predictions, ground_truths):
        prediction_tokens = Counter(str(prediction).lower().split())
        ground_truth_tokens = Counter(str(ground_truth).lower().split())

        intersection = sum(min(a, b) for a, b in zip(prediction_tokens.values(), ground_truth_tokens.values()))
        union = sum(prediction_tokens.values()) + sum(ground_truth_tokens.values()) - intersection

        if union == 0:
            f1_scores.append(0.0)
            continue

        try:
            precision = intersection / sum(prediction_tokens.values())
            recall = intersection / sum(ground_truth_tokens.values())
            f1 = 2 * (precision * recall) / (precision + recall)
            f1_scores.append(f1)
            precision_scores.append(precision)
            recall_scores.append(recall)
        except:
            continue

    return sum(f1_scores) / len(f1_scores), sum(precision_scores) / len(precision_scores), sum(recall_scores) / len(recall_scores)  # Average F1 score


def calculate_exact_match(predictions, ground_truths):
    """
    Calculates proportion of sentence pairs with exact match between two lists.

    Args:
      predictions (list): A list of predicted sentences.
      ground_truths (list): A list of ground truth sentences.

    Returns:
      float: The proportion of sentence pairs with exact match.
    """

    exact_matches = 0
    for prediction, ground_truth in zip(predictions, ground_truths):
        if str(prediction).lower().strip() == str(ground_truth).lower().strip():
            exact_matches += 1

    return exact_matches / len(predictions)

# Example usage


f1_score, precision, recall = calculate_f1(test_data["predicted"], test_data["answer"])
exact_match = calculate_exact_match(test_data["predicted"], test_data["answer"])

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")

In [None]:
val_data["predicted"] = l

f1_score, precision, recall = calculate_f1(val_data["predicted"], val_data["answer"])
exact_match = calculate_exact_match(val_data["predicted"], val_data["answer"])
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Exact Match: {exact_match}")