# 0. Imports, libraries and rusable functions

In [2]:
# Standard Library Imports
import ast
import copy
import csv
import json
import math
import os
import re
import time
import warnings
import logging
import random
import collections
from collections import Counter, defaultdict
from typing import List, Tuple, Optional
from IPython.display import HTML, display
import math
import time
from unidecode import unidecode
import string
import multiprocessing as mp



# Data Handling Libraries
import numpy as np
import pandas as pd
import csv
from torch.utils.data import random_split
import datasets
from datasets import ClassLabel, Sequence, Dataset, DatasetDict, load_dataset, load_metric, concatenate_datasets, load_from_disk


# Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
# import scikitplot as skplt  # Uncomment if scikit-plot is installed and needed

# Machine Learning: Model Preparation
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, cross_validate, KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

# Machine Learning: Models and Frameworks
import tensorflow as tf
import torch
from torch.utils.data import DataLoader
import evaluate
import xgboost
import wandb
from xgboost import plot_importance  # Uncomment if xgboost importance plot is required


# NLP and Transformers
import spacy
import transformers
from transformers import (AdamW, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForMultipleChoice,
                          AutoTokenizer, CamembertForSequenceClassification, DistilBertConfig,
                          DistilBertForSequenceClassification, DistilBertModel, EarlyStoppingCallback,
                          get_linear_schedule_with_warmup, RobertaForSequenceClassification, EvalPrediction,
                          Trainer, TrainerCallback, TrainingArguments, XLMRobertaForSequenceClassification,
                         DefaultDataCollator, BertForQuestionAnswering, DataCollatorWithPadding, PreTrainedTokenizerFast,
                         default_data_collator, is_torch_xla_available, pipeline)
from transformers.trainer_utils import PredictionOutput, speed_metrics

# Experiment Tracking and Optimization Utilities
import optuna
from optuna.trial import TrialState
# import wandb  # Uncomment if using Weights & Biases for experiment tracking

# Progress Bar Utilities
from tqdm.auto import tqdm


In [3]:
class LoggingCallback(TrainerCallback):
    def __init__(self, log_path):
        self.log_path = log_path
    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            with open(self.log_path, "a") as f:
                f.write(json.dumps(logs) + "\n")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

GPU: NVIDIA GeForce RTX 4070 Ti SUPER is available.
cuda


# 1. Global Variables

In [5]:
## Arguments and global vriables
pretrained_model_name = "microsoft/deberta-v3-base"
normalized_model_name = pretrained_model_name.replace("/", "-")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
assert isinstance( tokenizer, PreTrainedTokenizerFast )
data_collator = DefaultDataCollator()
max_length = 512 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
pad_on_right = right_padding = tokenizer.padding_side == 'right'
global_counter = 0
traing_answer_mismatches = []
logger = logging.getLogger(__name__)



In [6]:
training_args = TrainingArguments(
    output_dir=f"./{normalized_model_name}-best_model",
    overwrite_output_dir = True,
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    save_total_limit=4, 
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb",  # Enable logging to Weights & Biases
    run_name=f"{normalized_model_name}-best_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    num_train_epochs=3,
    lr_scheduler_type = 'linear',
    fp16=True,  # Enable mixed-precision training
)

# 2. Fine-tuning DeBERTa on a question-answering task (SQUAD v2.0 Dataset)


The below code is taken Question Answering on SQUAD NoteBook in the below link
https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb#scrollTo=DDtsaJeVIrJT

## 2.1 Loading the dataset

In [11]:
squad_v2 = True
squad_v2_datasets = load_dataset("squad_v2" if squad_v2 else "squad")

In [12]:
squad_v2_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [13]:
squad_v2_datasets["train"][0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

## 2.2 Preprocessing the training data

In [40]:
tokenizer("What is your name?", "My name is Sylvain.")

{'input_ids': [1, 458, 269, 290, 601, 302, 2, 573, 601, 269, 92556, 260, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [50]:
## Let's find one long example in our dataset:
for i, example in enumerate(squad_v2_datasets["train"]):
    if len(tokenizer(example["question"], example["context"])["input_ids"]) > 512:
        break
example = squad_v2_datasets["train"][i]

In [52]:
len(tokenizer(example["question"], example["context"])["input_ids"])

684

In [86]:
len(tokenizer(example["question"], example["context"], max_length=max_length, truncation="only_second")["input_ids"])

512

In [88]:
tokenized_example = tokenizer(
    example["question"],
    example["context"],
    max_length=max_length,
    truncation="only_second",
    return_overflowing_tokens=True,
    stride=doc_stride
)

In [66]:
[len(x) for x in tokenized_example["input_ids"]]

[512, 316]

In [68]:
for x in tokenized_example["input_ids"][:2]:
    print(tokenizer.decode(x))

[CLS] Which journalist considered Spectre the worst James Bond movie in three decades?[SEP] Critical appraisal of the film was mixed in the United States. In a lukewarm review for RogerEbert.com, Matt Zoller Seitz gave the film 2.5 stars out of 4, describing Spectre as inconsistent and unable to capitalise on its potential. Kenneth Turan, reviewing the film for Los Angeles Times, concluded that Spectre "comes off as exhausted and uninspired". Manohla Dargis of The New York Times panned the film as having "nothing surprising" and sacrificing its originality for the sake of box office returns. Forbes' Scott Mendelson also heavily criticised the film, denouncing Spectre as "the worst 007 movie in 30 years". Darren Franich of Entertainment Weekly viewed Spectre as "an overreaction to our current blockbuster moment", aspiring "to be a serialized sequel" and proving "itself as a Saga". While noting that "[n]othing that happens in Spectre holds up to even minor logical scrutiny", he had "come

In [90]:
tokenized_example = tokenizer(
    example["question"],
    example["context"],
    max_length=max_length,
    truncation="only_second",
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    stride=doc_stride
)
print(tokenized_example["offset_mapping"][0][:100])

[(0, 0), (0, 5), (5, 16), (16, 27), (27, 35), (35, 39), (39, 45), (45, 51), (51, 56), (56, 62), (62, 65), (65, 71), (71, 79), (79, 80), (0, 0), (0, 8), (8, 18), (18, 21), (21, 25), (25, 30), (30, 34), (34, 40), (40, 43), (43, 47), (47, 54), (54, 61), (61, 62), (62, 65), (65, 67), (67, 76), (76, 83), (83, 87), (87, 93), (93, 94), (94, 98), (98, 99), (99, 102), (102, 103), (103, 108), (108, 113), (113, 115), (115, 121), (121, 126), (126, 130), (130, 135), (135, 137), (137, 138), (138, 139), (139, 145), (145, 149), (149, 152), (152, 154), (154, 155), (155, 166), (166, 174), (174, 177), (177, 190), (190, 194), (194, 201), (201, 204), (204, 215), (215, 218), (218, 222), (222, 232), (232, 233), (233, 241), (241, 247), (247, 248), (248, 258), (258, 262), (262, 267), (267, 271), (271, 275), (275, 283), (283, 289), (289, 290), (290, 300), (300, 305), (305, 313), (313, 315), (315, 319), (319, 320), (320, 324), (324, 327), (327, 337), (337, 341), (341, 352), (352, 353), (353, 354), (354, 359), (3

In [72]:
first_token_id = tokenized_example["input_ids"][0][1]
offsets = tokenized_example["offset_mapping"][0][1]
print(tokenizer.convert_ids_to_tokens([first_token_id])[0], example["question"][offsets[0]:offsets[1]])

▁Which Which


In [74]:
sequence_ids = tokenized_example.sequence_ids()
print(sequence_ids)

[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [76]:
answers = example["answers"]
start_char = answers["answer_start"][0]
end_char = start_char + len(answers["text"][0])

# Start token index of the current span in the text.
token_start_index = 0
while sequence_ids[token_start_index] != 1:
    token_start_index += 1

# End token index of the current span in the text.
token_end_index = len(tokenized_example["input_ids"][0]) - 1
while sequence_ids[token_end_index] != 1:
    token_end_index -= 1

# Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
offsets = tokenized_example["offset_mapping"][0]
if (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
    # Move the token_start_index and token_end_index to the two ends of the answer.
    # Note: we could go after the last offset if the answer is the last word (edge case).
    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
        token_start_index += 1
    start_position = token_start_index - 1
    while offsets[token_end_index][1] >= end_char:
        token_end_index -= 1
    end_position = token_end_index + 1
    print(start_position, end_position)
else:
    print("The answer is not in this feature.")

121 122


In [78]:
print(tokenizer.decode(tokenized_example["input_ids"][0][start_position: end_position+1]))
print(answers["text"][0])

Scott Mendelson
Scott Mendelson


In [19]:
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [96]:
features = prepare_train_features(squad_v2_datasets['train'][:5])

In [21]:
tokenized_datasets = squad_v2_datasets.map(prepare_train_features, batched=True, remove_columns=squad_v2_datasets["train"].column_names)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

## 2.3 Fine-tuning the model

In [104]:
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name)

Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [116]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [118]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [124]:
trainer.save_model("squad-trained-model")

## 2.4 Reusable Functions for Evaluation

In [24]:
def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [26]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions

In [36]:
validation_features = squad_v2_datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=squad_v2_datasets["validation"].column_names
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [103]:
squad_metric = load_metric("squad_v2")

In [368]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punct(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punct(lower(s))))

def f1_score(prediction, ground_truth):
    # Normalize and tokenize both prediction and ground truth
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()

    # If both are empty, it's a perfect match
    if not prediction_tokens and not ground_truth_tokens:
        return 1.0
    
    # Calculate overlap in tokens
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    
    # If there is no overlap and either prediction or ground truth is not empty
    if num_same == 0:
        return 0
    
    # Calculate precision and recall
    precision = 1.0 * num_same / len(prediction_tokens) if prediction_tokens else 0
    recall = 1.0 * num_same / len(ground_truth_tokens) if ground_truth_tokens else 0

    # If either precision or recall is zero, F1 is zero
    if not precision or not recall:
        return 0
    
    # Calculate F1 score
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))

In [370]:
def evaluate_model(dataset):
    global training_answer_mismatches, training_answer_matches
    exact, f1 = 0, 0
    has_ans_exact, has_ans_f1, no_ans_exact, no_ans_f1 = 0, 0, 0, 0
    total, has_ans_total, no_ans_total = 0, 0, 0
    
    for example in tqdm(dataset, desc="Evaluating model"):
        model_answer = get_model_answer(example['context'], example['question'])
        # Assuming there is only one ground truth answer:
        ground_truth = example['answers']['text'][0] if example['answers']['text'] else ''
        
        if normalize_answer(model_answer) != normalize_answer(ground_truth):
            training_answer_mismatches.append({
                'context': example['context'],
                'question': example['question'],
                'model_answer': model_answer,
                'ground_truth': ground_truth
            })
        else:
            training_answer_matches.append({
                'context': example['context'],
                'question': example['question'],
                'model_answer': model_answer,
                'ground_truth': ground_truth
            })
        
        # Increment counters based on whether there is an answer
        if ground_truth:
            has_ans_total += 1
            has_ans_exact += exact_match_score(model_answer, ground_truth)
            has_ans_f1 += f1_score(model_answer, ground_truth)
        else:
            no_ans_total += 1
            no_ans_exact += exact_match_score(model_answer, ground_truth)  # typically, model_answer should be ""
            no_ans_f1 += f1_score(model_answer, ground_truth)  # typically, f1 should be 0 if model_answer is ""

        # General metrics
        exact += exact_match_score(model_answer, ground_truth)
        f1 += f1_score(model_answer, ground_truth)
        total += 1        

    metrics = {
        'exact': 100.0 * exact / total,
        'f1': 100.0 * f1 / total,
        'total': total,
        'HasAns_exact': 100.0 * has_ans_exact / has_ans_total if has_ans_total else 0,
        'HasAns_f1': 100.0 * has_ans_f1 / has_ans_total if has_ans_total else 0,
        'HasAns_total': has_ans_total,
        'NoAns_exact': 100.0 * no_ans_exact / no_ans_total if no_ans_total else 0,
        'NoAns_f1': 100.0 * no_ans_f1 / no_ans_total if no_ans_total else 0,
        'NoAns_total': no_ans_total
    }
    print(metrics)
    return metrics


In [424]:
def get_model_answer(context, question):
    # Run the QA pipeline
    result = qa_pipeline({
        'context': context,
        'question': question
    })

    answer = result['answer'].strip()
    score = result['score'] if 'score' in result else None

    # Remove any unwanted punctuation at the end of the answer
    while answer and answer[-1] in ",.":
        answer = answer[:-1].strip()

    # Implement a check to determine certainty (optional)
    if score and score < 0.2:  # Example threshold
        return ""  # Return empty if low confidence
    
    return answer

## 2.5 Evaluate Vanilla DeBERTa  

In [32]:
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
assert isinstance( tokenizer, PreTrainedTokenizerFast )

Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [42]:
raw_predictions = trainer.predict(validation_features)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


In [45]:
final_predictions = postprocess_qa_predictions(squad_v2_datasets["validation"], validation_features, raw_predictions.predictions)

Post-processing 11873 example predictions split into 11951 features.


  0%|          | 0/11873 [00:00<?, ?it/s]

In [49]:
formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]

In [53]:
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in squad_v2_datasets["validation"]]
squad_metric.compute(predictions=formatted_predictions, references=references)

{'exact': 36.26716078497431,
 'f1': 37.16487613088951,
 'total': 11873,
 'HasAns_exact': 0.0,
 'HasAns_f1': 1.7980051116821147,
 'HasAns_total': 5928,
 'NoAns_exact': 72.43061396131203,
 'NoAns_f1': 72.43061396131203,
 'NoAns_total': 5945,
 'best_exact': 50.07159100480081,
 'best_exact_thresh': 0.0,
 'best_f1': 50.07159100480081,
 'best_f1_thresh': 0.0}

## 2.6 Evaluate Fine-Tuned DeBERTa

In [55]:
path = "./squad-trained-model"

tokenizer = AutoTokenizer.from_pretrained(path)
assert isinstance( tokenizer, PreTrainedTokenizerFast )
model = AutoModelForQuestionAnswering.from_pretrained(path)

In [57]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [59]:
raw_predictions = trainer.predict(validation_features)

In [61]:
final_predictions = postprocess_qa_predictions(squad_v2_datasets["validation"], validation_features, raw_predictions.predictions)

Post-processing 11873 example predictions split into 11951 features.


  0%|          | 0/11873 [00:00<?, ?it/s]

In [62]:
formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]

In [63]:
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in squad_v2_datasets["validation"]]
squad_metric.compute(predictions=formatted_predictions, references=references)

{'exact': 67.21974227238272,
 'f1': 83.71821933404674,
 'total': 11873,
 'HasAns_exact': 47.064777327935225,
 'HasAns_f1': 80.10904489762844,
 'HasAns_total': 5928,
 'NoAns_exact': 87.3170731707317,
 'NoAns_f1': 87.3170731707317,
 'NoAns_total': 5945,
 'best_exact': 67.21974227238272,
 'best_exact_thresh': 0.0,
 'best_f1': 83.7182193340456,
 'best_f1_thresh': 0.0}

## 2.7 Using QA Pipeline to Evaluate the Model 

In [374]:
training_answer_mismatches = []
training_answer_matches = []
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name)
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer, device=device)

metrics = evaluate_model(squad_v2_datasets['validation'])

#metrics = evaluate_model(random.sample(list(squad_v2_datasets['validation']), 20))


Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating model:   0%|          | 0/11873 [00:00<?, ?it/s]

{'exact': 50.07159100480081, 'f1': 50.07159100480081, 'total': 11873, 'HasAns_exact': 0.0, 'HasAns_f1': 0.0, 'HasAns_total': 5928, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 5945}


In [375]:
print(f"Total matches found: {len(training_answer_matches)}")
#for match in training_answer_matches[:3]:
#    print(match)

Total matches found: 5945


In [376]:
print(f"Total mismatches found: {len(training_answer_mismatches)}")
#training_answer_mismatches[:3]
#for match in training_answer_mismatches[:3]:
#    print(match)

Total mismatches found: 5928


In [426]:
training_answer_mismatches = []
training_answer_matches = []
model = AutoModelForQuestionAnswering.from_pretrained("./squad-trained-model")
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer, device=device)

metrics = evaluate_model(squad_v2_datasets['validation'])


Evaluating model:   0%|          | 0/11873 [00:00<?, ?it/s]

{'exact': 60.79339678261602, 'f1': 79.29771307135124, 'total': 11873, 'HasAns_exact': 33.24898785425101, 'HasAns_f1': 70.31068611608363, 'HasAns_total': 5928, 'NoAns_exact': 88.25904121110176, 'NoAns_f1': 88.25904121110176, 'NoAns_total': 5945}


In [427]:
print(f"Total matches found: {len(training_answer_matches)}")
#for match in training_answer_matches[:3]:
#    print(match)

Total matches found: 7218


In [428]:
print(f"Total mismatches found: {len(training_answer_mismatches)}")
#training_answer_mismatches[:3]
#for match in training_answer_mismatches[:3]:
#    print(match)

Total mismatches found: 4655


In [465]:
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
assert isinstance( tokenizer, PreTrainedTokenizerFast )

Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [473]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# End of NoteBook