## 0. Imports, libraries and rusable functions

In [2]:
# Standard Library Imports
import ast
import copy
import csv
import json
import math
import os
import re
import time
import warnings
import logging
import random
import collections
from collections import Counter
from typing import List, Tuple, Optional
from IPython.display import HTML, display
import math
import time
from unidecode import unidecode


# Data Handling Libraries
import numpy as np
import pandas as pd
import csv
from torch.utils.data import random_split
import datasets
from datasets import ClassLabel, Sequence

# Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
# import scikitplot as skplt  # Uncomment if scikit-plot is installed and needed

# Machine Learning: Model Preparation
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, cross_validate, KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

# Machine Learning: Models and Frameworks
import tensorflow as tf
import torch
import evaluate
import xgboost
import wandb
from xgboost import plot_importance  # Uncomment if xgboost importance plot is required


# NLP and Transformers
from transformers import (AdamW, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
                          AutoTokenizer, CamembertForSequenceClassification, DistilBertConfig,
                          DistilBertForSequenceClassification, DistilBertModel, EarlyStoppingCallback,
                          get_linear_schedule_with_warmup, RobertaForSequenceClassification, EvalPrediction,
                          Trainer, TrainerCallback, TrainingArguments, XLMRobertaForSequenceClassification,
                         DefaultDataCollator, BertForQuestionAnswering, DataCollatorWithPadding, PreTrainedTokenizerFast,
                         default_data_collator, is_torch_xla_available)
from datasets import Dataset, DatasetDict, load_dataset
from transformers.trainer_utils import PredictionOutput, speed_metrics

# Experiment Tracking and Optimization Utilities
import optuna
from optuna.trial import TrialState
# import wandb  # Uncomment if using Weights & Biases for experiment tracking

# Progress Bar Utilities
from tqdm.notebook import tqdm


In [3]:
#wandb.login(key='8f7092f0fdaf14add2b4cc07cb0e740080cdd8e7')
wandb.login()

wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


True

In [4]:
class LoggingCallback(TrainerCallback):
    def __init__(self, log_path):
        self.log_path = log_path
    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            with open(self.log_path, "a") as f:
                f.write(json.dumps(logs) + "\n")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

GPU: NVIDIA GeForce RTX 4070 Ti SUPER is available.
cuda


## 1. Global Variables

In [6]:
## Arguments and global vriables
global_doc_stride = 128
pretrained_model_name = "bert-base-cased"
normalized_model_name = pretrained_model_name.replace("/", "-")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
assert isinstance( tokenizer, PreTrainedTokenizerFast )
data_collator = DefaultDataCollator()
max_seq_length = tokenizer.model_max_length
version_2_with_negative = True
no_answer_threshold = 0.5
right_padding = tokenizer.padding_side == 'right'
global_counter = 0
traing_answer_mismatches = []
logger = logging.getLogger(__name__)
global train_dataset
global eval_dataset
global cleaned_training_dataset

## 1. Q&A Reusable Functions

In [8]:
### Compute_metrics function for Question and Answering problem is different to classification, more preocessing required.
metric = evaluate.load("squad_v2")

def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)

In [9]:
"""
A subclass of `Trainer` specific to Question-Answering tasks
"""

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met


class QuestionAnsweringTrainer(Trainer):
    def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_examples = eval_examples
        self.post_process_function = post_process_function

    def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
        eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        eval_examples = self.eval_examples if eval_examples is None else eval_examples

        # Temporarily disable metric computation, we will do it in the loop here.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        start_time = time.time()
        try:
            output = eval_loop(
                eval_dataloader,
                description="Evaluation",
                # No point gathering the predictions if there are no metrics, otherwise we defer to
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
                metric_key_prefix=metric_key_prefix,
            )
        finally:
            self.compute_metrics = compute_metrics
        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )
        if self.post_process_function is not None and self.compute_metrics is not None and self.args.should_save:
            # Only the main node write the results by default
            eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions)
            metrics = self.compute_metrics(eval_preds)

            # Prefix all keys with metric_key_prefix + '_'
            for key in list(metrics.keys()):
                if not key.startswith(f"{metric_key_prefix}_"):
                    metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
            metrics.update(output.metrics)
        else:
            metrics = output.metrics

        if self.args.should_log:
            # Only the main node log the results by default
            self.log(metrics)

        if self.args.tpu_metrics_debug or self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
        return metrics

    def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"):
        predict_dataloader = self.get_test_dataloader(predict_dataset)

        # Temporarily disable metric computation, we will do it in the loop here.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        start_time = time.time()
        try:
            output = eval_loop(
                predict_dataloader,
                description="Prediction",
                # No point gathering the predictions if there are no metrics, otherwise we defer to
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
                metric_key_prefix=metric_key_prefix,
            )
        finally:
            self.compute_metrics = compute_metrics
        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )

        if self.post_process_function is None or self.compute_metrics is None:
            return output

        predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict")
        metrics = self.compute_metrics(predictions)

        # Prefix all keys with metric_key_prefix + '_'
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
        metrics.update(output.metrics)
        return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)

In [10]:
def prepare_validation_features(examples):
  # Some of the questions have lots of whitespace on the left, which is not useful and will make the
  # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
  # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]
  # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
  # in one example possible giving several features when a context is long, each of those features having a
  # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
      examples["question"],
      examples["context"],
      truncation="only_second" if right_padding else "only_first",
      max_length=max_seq_length,
      stride=global_doc_stride,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
      padding="max_length",
  )

  # Since one example might give us several features if it has a long context, we need a map from a feature to
  # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

  # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
  # corresponding example_id and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
      # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 0

      # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

      # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
      # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
      ]

    return tokenized_examples

In [11]:
# Validation preprocessing
def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if right_padding else "context"],
        examples["context" if right_padding else "question"],
        truncation="only_second" if right_padding else "only_first",
        max_length=max_seq_length,
        stride=global_doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
    # corresponding example_id and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if right_padding else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [12]:
# Post-processing:
def post_processing_function(examples, features, predictions, stage="eval"):
    # Post-processing: we match the start logits and end logits to answers in the original context.
    predictions = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=predictions,
        version_2_with_negative=version_2_with_negative, #If true, some of the examples do not have an answer.
        n_best_size=20, #The total number of n-best predictions to generate when looking for an answer.
        max_answer_length=30, #The maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.
        null_score_diff_threshold=no_answer_threshold, #The threshold used to select the null answer: if the best answer has a score that is less than the score of the null answer minus this threshold, the null answer is selected for this example. Only useful when `version_2_with_negative=True`.
        output_dir= None,
        log_level=logging.WARNING,
        prefix=stage,
    )
    # Format the result to the format the metric expects.
    if version_2_with_negative:
        formatted_predictions = [
                {"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
            ]
    else:
        formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()]
    
    
    references = [{"id": str(ex["id"]), "answers": ex["answers"]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)

In [13]:
def postprocess_qa_predictions(
    examples,
    features,
    predictions: Tuple[np.ndarray, np.ndarray],
    version_2_with_negative: bool = False,
    n_best_size: int = 20,
    max_answer_length: int = 30,
    null_score_diff_threshold: float = 0.0,
    output_dir: Optional[str] = None,
    prefix: Optional[str] = None,
    log_level: Optional[int] = logging.WARNING,
):
    """
    Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
    original contexts. This is the base postprocessing functions for models that only return start and end logits.

    Args:
        examples: The non-preprocessed dataset (see the main script for more information).
        features: The processed dataset (see the main script for more information).
        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
            first dimension must match the number of elements of :obj:`features`.
        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the underlying dataset contains examples with no answers.
        n_best_size (:obj:`int`, `optional`, defaults to 20):
            The total number of n-best predictions to generate when looking for an answer.
        max_answer_length (:obj:`int`, `optional`, defaults to 30):
            The maximum length of an answer that can be generated. This is needed because the start and end predictions
            are not conditioned on one another.
        null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
            The threshold used to select the null answer: if the best answer has a score that is less than the score of
            the null answer minus this threshold, the null answer is selected for this example (note that the score of
            the null answer for an example giving several features is the minimum of the scores for the null answer on
            each feature: all features must be aligned on the fact they `want` to predict a null answer).

            Only useful when :obj:`version_2_with_negative` is :obj:`True`.
        output_dir (:obj:`str`, `optional`):
            If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
            :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
            answers, are saved in `output_dir`.
        prefix (:obj:`str`, `optional`):
            If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
            ``logging`` log level (e.g., ``logging.WARNING``)
    """
    if len(predictions) != 2:
        raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).")
    all_start_logits, all_end_logits = predictions

    if len(predictions[0]) != len(features):
        raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")

    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    if version_2_with_negative:
        scores_diff_json = collections.OrderedDict()

    # Logging.
    logger.setLevel(log_level)
    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_prediction = None
        prelim_predictions = []

        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]
            # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
            # available in the current feature.
            token_is_max_context = features[feature_index].get("token_is_max_context", None)

            # Update minimum null prediction.
            feature_null_score = start_logits[0] + end_logits[0]
            if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
                min_null_prediction = {
                    "offsets": (0, 0),
                    "score": feature_null_score,
                    "start_logit": start_logits[0],
                    "end_logit": end_logits[0],
                }

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or len(offset_mapping[start_index]) < 2
                        or offset_mapping[end_index] is None
                        or len(offset_mapping[end_index]) < 2
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    # Don't consider answer that don't have the maximum context available (if such information is
                    # provided).
                    if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
                        continue

                    prelim_predictions.append(
                        {
                            "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
                            "score": start_logits[start_index] + end_logits[end_index],
                            "start_logit": start_logits[start_index],
                            "end_logit": end_logits[end_index],
                        }
                    )
        if version_2_with_negative and min_null_prediction is not None:
            # Add the minimum null prediction
            prelim_predictions.append(min_null_prediction)
            null_score = min_null_prediction["score"]

        # Only keep the best `n_best_size` predictions.
        predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]

        # Add back the minimum null prediction if it was removed because of its low score.
        if (
            version_2_with_negative
            and min_null_prediction is not None
            and not any(p["offsets"] == (0, 0) for p in predictions)
        ):
            predictions.append(min_null_prediction)

        # Use the offsets to gather the answer text in the original context.
        context = example["context"]
        for pred in predictions:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0] : offsets[1]]

        # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
        # failure.
        if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
            predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})

        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
        # the LogSumExp trick).
        scores = np.array([pred.pop("score") for pred in predictions])
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()

        # Include the probabilities in our predictions.
        for prob, pred in zip(probs, predictions):
            pred["probability"] = prob

        # Pick the best prediction. If the null answer is not possible, this is easy.
        if not version_2_with_negative:
            all_predictions[example["id"]] = predictions[0]["text"]
        else:
            # Otherwise we first need to find the best non-empty prediction.
            i = 0
            while predictions[i]["text"] == "":
                i += 1
            best_non_null_pred = predictions[i]

            # Then we compare to the null prediction using the threshold.
            score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
            scores_diff_json[example["id"]] = float(score_diff)  # To be JSON-serializable.
            if score_diff > null_score_diff_threshold:
                all_predictions[example["id"]] = ""
            else:
                all_predictions[example["id"]] = best_non_null_pred["text"]

        # Make `predictions` JSON-serializable by casting np.float back to float.
        all_nbest_json[example["id"]] = [
            {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
            for pred in predictions
        ]

    # If we have an output_dir, let's save all those dicts.
    if output_dir is not None:
        if not os.path.isdir(output_dir):
            raise EnvironmentError(f"{output_dir} is not a directory.")

        prediction_file = os.path.join(
            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
        )
        nbest_file = os.path.join(
            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
        )
        if version_2_with_negative:
            null_odds_file = os.path.join(
                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
            )

        logger.info(f"Saving predictions to {prediction_file}.")
        with open(prediction_file, "w") as writer:
            writer.write(json.dumps(all_predictions, indent=4) + "\n")
        logger.info(f"Saving nbest_preds to {nbest_file}.")
        with open(nbest_file, "w") as writer:
            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
        if version_2_with_negative:
            logger.info(f"Saving null_odds to {null_odds_file}.")
            with open(null_odds_file, "w") as writer:
                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions

In [32]:
# List of questions to debug
debug_questions = [
    "What ideology was sponsored at the Ming court?",
    "Who stopped their trips to Ming China?",
    "Another question of interest"
]

# Training preprocessing
def preprocess_function(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if right_padding else "context"],
        examples["context" if right_padding else "question"],
        truncation="only_second" if right_padding else "only_first",
        max_length=max_seq_length,
        stride=global_doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]        
        question = examples["question"][sample_index]
        example_id = examples["id"][sample_index]
        
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if right_padding else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if right_padding else 0):
                token_end_index -= 1
            
            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Increase the end index here to make sure it includes the last character
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                safe_start_index = max(0, token_start_index - 1)
                tokenized_examples["start_positions"].append(safe_start_index)

                # Adjust this line to ensure that the end character is included
                # Decrement token_end_index to point exactly one past the end of the answer text
                while token_end_index >= 0 and offsets[token_end_index][1] > end_char:
                    token_end_index -= 1
                safe_end_index = min(len(input_ids) - 1, token_end_index + 1)
                tokenized_examples["end_positions"].append(safe_end_index)  # Adjust to +2 if needed
                
        # Debugging code
        global global_counter
        global traing_answer_mismatches
        #if question in debug_questions:
        if len(answers["answer_start"]) > 0:
            # Ensure indices are within bounds
            #safe_start_index = max(0, token_start_index - 1)
            #safe_end_index = min(len(input_ids) - 1, token_end_index + 1)

            # Decode the answer using safe indices
            decoded_answer = tokenizer.decode(input_ids[safe_start_index:safe_end_index])
            actual_answer = answers["text"][0] if answers["text"] else "No answer provided"
            normalized_actual_answer = actual_answer.lower().replace(" ", "")
            normalized_decoded_answer = decoded_answer.lower().replace(" ", "")
            if normalized_decoded_answer != normalized_actual_answer:
                if global_counter == 0: print('Mismatch Found')
                global_counter += 1 
                traing_answer_mismatches.append({
                    'ID': example_id,
                    'Question': question,
                    'Offsets': offsets,
                    'Input IDs':input_ids,
                    'Tokenized Text': tokenizer.decode(input_ids),
                    'Char start/end index': f"{start_char} / {end_char}",
                    'Token start/end index':f"{token_start_index} / {token_end_index}",
                    'Decoded Answer': decoded_answer,
                    'Actual Answer': actual_answer,                    
                })    
    return tokenized_examples

## 2. Data Preparation

In [16]:
# Load the full dataset
full_squad_dataset = load_dataset("squad_v2")

In [17]:
Use_Only_Sample = False
Sample_Size = 10000

if Use_Only_Sample:
    # Load a sample portion of the dataset
    squad_raw = datasets.DatasetDict({
        'train': full_squad_dataset['train'].select(range(0, Sample_Size)),
        'validation': full_squad_dataset['validation'].select(range(0, int(Sample_Size * 0.2)))
    })
else:
    # Load the full dataset
    squad_raw = datasets.DatasetDict({
        'train': full_squad_dataset['train'],
        'validation': full_squad_dataset['validation'],    
    })
    
# Display the sizes of the splits to confirm
print("Train set size:", len(squad_raw['train']))
print("Validation set size:", len(squad_raw['validation']))
squad_raw

Train set size: 130319
Validation set size: 11873


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [31]:
## Preprocess the Training Dataset To find Missmatches
train_dataset = squad_raw['train'].map(
                preprocess_function,
                batched=True,
                remove_columns=squad_raw["train"].column_names,
                desc="Running tokenizer on train dataset",
                load_from_cache_file=False,  # Disable caching
            )


Running tokenizer on train dataset:   0%|          | 0/130319 [00:00<?, ? examples/s]

Mismatch Found


In [35]:
# Analyse Missmatches
mismatches_df = pd.DataFrame(traing_answer_mismatches)
print('Total number of miss matches:',global_counter)
display(mismatches_df.head(5))  

Total number of miss matches: 376


Unnamed: 0,ID,Question,Offsets,Input IDs,Tokenized Text,Char start/end index,Token start/end index,Decoded Answer,Actual Answer
0,56bf7e603aeaaa14008c9681,What event caused Beyonce's depression?,"[(0, 0), (0, 4), (5, 10), (11, 17), (18, 21), ...","[101, 1327, 1856, 2416, 24896, 1320, 2093, 112...",[CLS] What event caused Beyonce's depression? ...,194 / 222,49 / 53,split with Luckett and Rob,split with Luckett and Rober
1,56be973d3aeaaa14008c9123,How many number one singles did Beyonce now ha...,"[(0, 0), (0, 3), (4, 8), (9, 15), (16, 19), (2...","[101, 1731, 1242, 1295, 1141, 3896, 1225, 2489...",[CLS] How many number one singles did Beyonce ...,457 / 460,131 / 129,,six
2,56cbdea66d243a140015edae,At what age did Frédéric start giving public c...,"[(0, 0), (0, 2), (3, 7), (8, 11), (12, 15), (1...","[101, 1335, 1184, 1425, 1225, 13359, 2744, 118...",[CLS] At what age did Frédéric start giving pu...,391 / 392,110 / 109,1817,7
3,56cf609aaab44d1400b89187,At what age did Chopin start playing publicly?,"[(0, 0), (0, 2), (3, 7), (8, 11), (12, 15), (1...","[101, 1335, 1184, 1425, 1225, 22964, 6709, 183...",[CLS] At what age did Chopin start playing pub...,391 / 392,106 / 105,1817,7
4,56d20a6ae7d4791d0090261a,How old was Chopin when he began to perform fo...,"[(0, 0), (0, 3), (4, 7), (8, 11), (12, 15), (1...","[101, 1731, 1385, 1108, 22964, 6709, 1165, 111...",[CLS] How old was Chopin when he began to perf...,391 / 392,110 / 109,1817,7


In [37]:
# List of specific IDs to inspect
debug_ids = [
    "56bf7e603aeaaa14008c9681",
    "56be973d3aeaaa14008c9123",
    "56cbdea66d243a140015edae",
    "56cf609aaab44d1400b89187",
    "56d20a6ae7d4791d0090261a",    
]

# Define a function to filter examples by ID and stop processing when all have been found
def print_specific_records(dataset):
    found_ids = set()  # To track IDs that have been printed
    for example in dataset:
        if example['id'] in debug_ids:
            print(example)
            found_ids.add(example['id'])
            if found_ids == set(debug_ids):
                break  # Stop processing as all requested records have been printed

# Apply the function to the full dataset (assuming the dataset is loaded and named full_squad_dataset)
print_specific_records(full_squad_dataset['train'])


{'id': '56bf7e603aeaaa14008c9681', 'title': 'Beyoncé', 'context': "LeToya Luckett and Roberson became unhappy with Mathew's managing of the band and eventually were replaced by Farrah Franklin and Michelle Williams. Beyoncé experienced depression following the split with Luckett and Roberson after being publicly blamed by the media, critics, and blogs for its cause. Her long-standing boyfriend left her at this time. The depression was so severe it lasted for a couple of years, during which she occasionally kept herself in her bedroom for days and refused to eat anything. Beyoncé stated that she struggled to speak about her depression because Destiny's Child had just won their first Grammy Award and she feared no one would take her seriously. Beyoncé would later speak of her mother as the person who helped her fight it. Franklin was dismissed, leaving just Beyoncé, Rowland, and Williams.", 'question': "What event caused Beyonce's depression?", 'answers': {'text': ['split with Luckett an

In [39]:
# Ensure the mismatches DataFrame is created from non-empty data
if traing_answer_mismatches:
    mismatches_df = pd.DataFrame(traing_answer_mismatches)
    print('Total number of mismatches:', len(mismatches_df))
else:
    print("No mismatches to display. The list is empty.")

def remove_mismatches(dataset, mismatch_ids):
    """
    Filters the dataset to exclude mismatched entries.

    Args:
        dataset (Dataset): The dataset from which to remove mismatched examples.
        mismatch_ids (set): A set of example IDs that have mismatches.

    Returns:
        Dataset: A dataset with mismatched examples removed.
    """
    # Use a set for faster look-up times
    filtered_dataset = dataset.filter(lambda example: example['id'] not in mismatch_ids)
    return filtered_dataset

# Check if mismatches DataFrame exists and is not empty before removing mismatches
if 'traing_answer_mismatches' in locals() and not mismatches_df.empty:
    mismatch_ids = set(mismatches_df['ID'])
    cleaned_training_dataset = remove_mismatches(squad_raw['train'], mismatch_ids)
    print("Original dataset size:", len(squad_raw['train']))
    print("Cleaned dataset size:", len(cleaned_training_dataset))
else:
    print("No mismatches found or mismatch data is empty. No filtering applied.")
    cleaned_training_dataset = squad_raw['train']


Total number of mismatches: 376
Original dataset size: 130319
Cleaned dataset size: 129944


In [41]:

# Preprocessing the datasets
train_dataset = cleaned_training_dataset.map(
                preprocess_function,
                batched=True,
                remove_columns=squad_raw["train"].column_names,
                desc="Running tokenizer on train dataset",
                load_from_cache_file=False,  # Disable caching
            )
eval_dataset = squad_raw['validation'].map(
                prepare_validation_features,
                batched=True,
                remove_columns=squad_raw["train"].column_names,
                desc="Running tokenizer on validation dataset",
                load_from_cache_file=False,  # Disable caching
            )
eval_examples =  squad_raw["validation"]


Running tokenizer on train dataset:   0%|          | 0/129944 [00:00<?, ? examples/s]

Running tokenizer on validation dataset:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [43]:
# Display the sizes of the splits to confirm
print("Train set size:", len(train_dataset))
print("Validation set size:", len(eval_dataset))
print("Validation Examples size:", len(eval_examples))

Train set size: 130049
Validation set size: 11974
Validation Examples size: 11873


## 3. Initial Tuning of BERT Model


In [23]:
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name)
training_args = TrainingArguments(
    output_dir=f'{pretrained_model_name}-finetuned-manual',
    overwrite_output_dir = True,
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    save_total_limit=4, 
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb",  # Enable logging to Weights & Biases
    run_name=f"{pretrained_model_name}-finetune-manual",  # Optionally set a specific run name    
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
)

trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=eval_examples,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.0295,No log,52.716247,67.764357,11873,41.396761,71.536135,5928,64.003364,64.003364,5945,53.465847,0.0,67.772779,0.0
2,0.6739,No log,55.866251,70.884979,11873,41.177463,71.257988,5928,70.513036,70.513036,5945,56.16946,0.0,70.884979,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

TrainOutput(global_step=8160, training_loss=0.9914125863243551, metrics={'train_runtime': 4861.6222, 'train_samples_per_second': 53.704, 'train_steps_per_second': 1.678, 'total_flos': 6.822144762268877e+16, 'train_loss': 0.9914125863243551, 'epoch': 2.0})

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,0.6675,No log,52.092984,67.779827,11873,42.560729,73.9794,5928,61.597981,61.597981,5945,52.615177,0.0,67.788249,0.0
2,0.4124,No log,55.571465,70.988679,11873,40.789474,71.668114,5928,70.311186,70.311186,5945,55.815716,0.0,70.988679,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

TrainOutput(global_step=8160, training_loss=0.5420088936300839, metrics={'train_runtime': 4857.1372, 'train_samples_per_second': 53.753, 'train_steps_per_second': 1.68, 'total_flos': 6.822144762268877e+16, 'train_loss': 0.5420088936300839, 'epoch': 2.0})

## 5. Models Tuning Reusable Functions

In [65]:
Sample_Size = 60000

# Load a sample portion of the dataset
subset_squad_raw = datasets.DatasetDict({
    'train': cleaned_training_dataset.shuffle(seed=42).select(range(Sample_Size)),
    'validation': full_squad_dataset['validation']
})
    
# Display the sizes of the splits to confirm
print("Train set size:", len(subset_squad_raw['train']))
print("Validation set size:", len(subset_squad_raw['validation']))
squad_raw

# Preprocessing the datasets
eval_examples =  subset_squad_raw["validation"]


Train set size: 60000
Validation set size: 11873


In [55]:
class AdvancedEarlyStoppingCallback(TrainerCallback):
    """
    A callback to stop training when either the performance falls below a certain threshold
    or if there is no improvement over a set number of epochs.
    """
    def __init__(self, metric_name, patience, threshold):
        self.metric_name = metric_name
        self.patience = patience
        self.threshold = threshold
        self.best_score = None
        self.no_improve_epochs = 0

    def on_evaluate(self, args, state, control, **kwargs):
        metric_value = kwargs['metrics'].get(self.metric_name)

        if self.best_score is None or metric_value > self.best_score:
            self.best_score = metric_value
            self.no_improve_epochs = 0
        else:
            self.no_improve_epochs += 1

        # Check if performance is below the threshold
        if metric_value < self.threshold:
            control.should_training_stop = True
            print(f"Stopping training: {self.metric_name} below threshold of {self.threshold}")

        # Check if no improvement has been seen over the allowed patience
        if self.no_improve_epochs >= self.patience:
            control.should_training_stop = True
            print(f"Stopping training: No improvement in {self.metric_name} for {self.patience} epochs")


In [77]:
# Define model initialization function
def model_init():
    return AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name)

# Define train dataset initialization function
def train_dataset_init():
    return subset_squad_raw['train'].map(
                preprocess_function,
                batched=True,
                remove_columns=subset_squad_raw["train"].column_names,
                desc="Running tokenizer on train dataset",
            )

# Define validation dataset initialization function
def vald_dataset_init():
    return subset_squad_raw['validation'].map(
                prepare_validation_features,
                batched=True,
                remove_columns=subset_squad_raw["train"].column_names,
                desc="Running tokenizer on validation dataset",
            )

# Optuna objective function for hyperparameter tuning
def objective(trial):
    # Hyperparameters to tune    
    learning_rate = trial.suggest_float('learning_rate', 1e-7, 1e-4, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    weight_decay = trial.suggest_float('weight_decay', 0.01, 0.25)
    adam_beta1 = trial.suggest_float('adam_beta1', 0.8, 0.95)
    adam_beta2 = trial.suggest_float('adam_beta2', 0.990, 0.999)
    adam_epsilon = trial.suggest_float('adam_epsilon', 1e-8, 1e-6)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine', 'cosine_with_restarts','constant_with_warmup'])
    output_dir = f"./{normalized_model_name}-finetuned-squadv2/trial_{trial.number}"
    
    #global global_doc_stride
    #global_doc_stride=trial.suggest_int('doc_stride', 128, 256, step=64)

    # Print trial parameters
    print(f"Current Trial {trial.number} parameters: {trial.params}")
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir = True,
        metric_for_best_model='f1',
        greater_is_better=True,
        load_best_model_at_end=True,
        save_total_limit=2, # Save only the best model unless you specify a different number
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,  # Adjust based on computation limits
        report_to="wandb",  # Enable logging to Weights & Biases        
        run_name=f"{normalized_model_name}-finetune-squadv2",  # Optionally set a specific run name    
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        lr_scheduler_type=lr_scheduler_type,
        fp16=True,  # Enable mixed-precision training
    )    

    trainer = QuestionAnsweringTrainer(
        model=model_init(),
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset_init(),
        eval_dataset=vald_dataset_init(),
        eval_examples=eval_examples,        
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
        #callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
        callbacks=[AdvancedEarlyStoppingCallback(metric_name='eval_f1', patience=1, threshold=40)]
    )  
    

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    #print("Evaluation results:", eval_results)  # Debug print
    return eval_results['eval_f1']


## 6. Hyperparameters Search for BERT Model


In [82]:
# Create a study object and optimize the objective
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2024-05-26 22:47:58,761] A new study created in memory with name: no-name-dec054a1-5aa0-483c-a828-3a8198e82f1a


Current Trial 0 parameters: {'learning_rate': 2.8895616704837976e-06, 'batch_size': 32, 'warmup_steps': 861, 'weight_decay': 0.197267375250397, 'adam_beta1': 0.9105565192513869, 'adam_beta2': 0.9965905541309563, 'adam_epsilon': 5.360504619784085e-07, 'lr_scheduler_type': 'constant_with_warmup'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on train dataset:   0%|          | 0/60000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,2.3188,No log,36.646172,45.51646,11873,22.925101,40.691115,5928,50.328007,50.328007,5945,50.096858,0.0,50.096858,0.0
2,1.7001,No log,44.310621,55.559279,11873,30.296896,52.826472,5928,58.284272,58.284272,5945,50.096858,0.0,55.835714,0.0
3,1.4835,No log,41.927061,55.507553,11873,36.842105,64.042034,5928,46.997477,46.997477,5945,50.096858,0.0,55.664693,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-26 23:16:42,812] Trial 0 finished with value: 55.559279401611526 and parameters: {'learning_rate': 2.8895616704837976e-06, 'batch_size': 32, 'warmup_steps': 861, 'weight_decay': 0.197267375250397, 'adam_beta1': 0.9105565192513869, 'adam_beta2': 0.9965905541309563, 'adam_epsilon': 5.360504619784085e-07, 'lr_scheduler_type': 'constant_with_warmup'}. Best is trial 0 with value: 55.559279401611526.


Current Trial 1 parameters: {'learning_rate': 3.3317262390422633e-06, 'batch_size': 16, 'warmup_steps': 443, 'weight_decay': 0.2395289014748998, 'adam_beta1': 0.9314702937622692, 'adam_beta2': 0.9954068855733819, 'adam_epsilon': 8.895442680688017e-08, 'lr_scheduler_type': 'constant_with_warmup'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.7564,No log,38.911817,51.87911,11873,34.227395,60.199169,5928,43.582843,43.582843,5945,50.096858,0.0,52.286693,0.0
2,1.3695,No log,47.182683,60.831803,11873,35.711876,63.049258,5928,58.62069,58.62069,5945,50.096858,0.0,60.840226,0.0
3,1.2377,No log,44.293776,59.279692,11873,39.642375,69.657183,5928,48.931876,48.931876,5945,50.096858,0.0,59.304959,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-26 23:47:16,836] Trial 1 finished with value: 60.831803292632145 and parameters: {'learning_rate': 3.3317262390422633e-06, 'batch_size': 16, 'warmup_steps': 443, 'weight_decay': 0.2395289014748998, 'adam_beta1': 0.9314702937622692, 'adam_beta2': 0.9954068855733819, 'adam_epsilon': 8.895442680688017e-08, 'lr_scheduler_type': 'constant_with_warmup'}. Best is trial 1 with value: 60.831803292632145.


Current Trial 2 parameters: {'learning_rate': 9.610315082349156e-06, 'batch_size': 16, 'warmup_steps': 70, 'weight_decay': 0.02229216388298138, 'adam_beta1': 0.8341249903812974, 'adam_beta2': 0.9918099833530545, 'adam_epsilon': 6.981765861025366e-07, 'lr_scheduler_type': 'linear'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.4204,No log,43.645246,58.093403,11873,38.61336,67.551109,5928,48.662742,48.662742,5945,50.096858,0.0,58.108376,0.0
2,1.1157,No log,51.815043,65.630159,11873,36.066127,63.735977,5928,67.518923,67.518923,5945,52.429883,0.0,65.630159,0.0
3,0.9824,No log,51.478144,65.714881,11873,38.292848,66.80715,5928,64.625736,64.625736,5945,52.109829,0.0,65.706459,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 00:17:34,949] Trial 2 finished with value: 65.71488138575606 and parameters: {'learning_rate': 9.610315082349156e-06, 'batch_size': 16, 'warmup_steps': 70, 'weight_decay': 0.02229216388298138, 'adam_beta1': 0.8341249903812974, 'adam_beta2': 0.9918099833530545, 'adam_epsilon': 6.981765861025366e-07, 'lr_scheduler_type': 'linear'}. Best is trial 2 with value: 65.71488138575606.


Current Trial 3 parameters: {'learning_rate': 3.257002507875358e-06, 'batch_size': 16, 'warmup_steps': 226, 'weight_decay': 0.22605457772789392, 'adam_beta1': 0.9256993417064603, 'adam_beta2': 0.9978072445101214, 'adam_epsilon': 5.964676983840752e-07, 'lr_scheduler_type': 'constant_with_warmup'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.7539,No log,38.852859,51.81145,11873,34.092443,60.046786,5928,43.599664,43.599664,5945,50.096858,0.0,52.143811,0.0
2,1.3709,No log,46.753137,60.534282,11873,35.576923,63.178734,5928,57.897393,57.897393,5945,50.096858,0.0,60.542705,0.0
3,1.2424,No log,44.706477,59.603747,11873,39.507422,69.344684,5928,49.890664,49.890664,5945,50.096858,0.0,59.637258,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 00:48:03,138] Trial 3 finished with value: 60.53428217781201 and parameters: {'learning_rate': 3.257002507875358e-06, 'batch_size': 16, 'warmup_steps': 226, 'weight_decay': 0.22605457772789392, 'adam_beta1': 0.9256993417064603, 'adam_beta2': 0.9978072445101214, 'adam_epsilon': 5.964676983840752e-07, 'lr_scheduler_type': 'constant_with_warmup'}. Best is trial 2 with value: 65.71488138575606.


Current Trial 4 parameters: {'learning_rate': 2.303378516306679e-06, 'batch_size': 64, 'warmup_steps': 9, 'weight_decay': 0.15556251586553263, 'adam_beta1': 0.8457120370297727, 'adam_beta2': 0.9987553898547082, 'adam_epsilon': 3.886023799216506e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,3.5525,No log,37.606334,44.504575,11873,17.661943,31.478207,5928,57.493692,57.493692,5945,50.080013,0.0,50.080013,0.0
2,2.1248,No log,38.524383,47.723398,11873,23.582996,42.007406,5928,53.423045,53.423045,5945,50.088436,0.0,50.092869,0.0
3,1.9696,No log,38.78548,48.362196,11873,24.612011,43.792907,5928,52.918419,52.918419,5945,50.088436,0.0,50.225246,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 01:15:59,691] Trial 4 finished with value: 48.362195871919646 and parameters: {'learning_rate': 2.303378516306679e-06, 'batch_size': 64, 'warmup_steps': 9, 'weight_decay': 0.15556251586553263, 'adam_beta1': 0.8457120370297727, 'adam_beta2': 0.9987553898547082, 'adam_epsilon': 3.886023799216506e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 2 with value: 65.71488138575606.


Current Trial 5 parameters: {'learning_rate': 2.576136287284426e-07, 'batch_size': 32, 'warmup_steps': 786, 'weight_decay': 0.134616139978681, 'adam_beta1': 0.8589774142157393, 'adam_beta2': 0.9980007316024959, 'adam_epsilon': 7.724820212017663e-07, 'lr_scheduler_type': 'constant_with_warmup'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,4.3384,No log,50.071591,50.071591,11873,0.0,0.0,5928,100.0,100.0,5945,50.071591,0.0,50.071591,0.0
2,3.2392,No log,50.063169,50.063169,11873,0.0,0.0,5928,99.983179,99.983179,5945,50.071591,0.0,50.071591,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 01:35:19,516] Trial 5 finished with value: 50.07159100480081 and parameters: {'learning_rate': 2.576136287284426e-07, 'batch_size': 32, 'warmup_steps': 786, 'weight_decay': 0.134616139978681, 'adam_beta1': 0.8589774142157393, 'adam_beta2': 0.9980007316024959, 'adam_epsilon': 7.724820212017663e-07, 'lr_scheduler_type': 'constant_with_warmup'}. Best is trial 2 with value: 65.71488138575606.


Current Trial 6 parameters: {'learning_rate': 7.176361416135588e-05, 'batch_size': 16, 'warmup_steps': 102, 'weight_decay': 0.141338500720577, 'adam_beta1': 0.8824298717779988, 'adam_beta2': 0.9968626436091191, 'adam_epsilon': 3.0277412934900717e-07, 'lr_scheduler_type': 'constant_with_warmup'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.3255,No log,46.222522,60.549908,11873,36.825236,65.521095,5928,55.592935,55.592935,5945,50.096858,0.0,60.55833,0.0
2,1.013,No log,51.065443,63.928259,11873,32.371795,58.134314,5928,69.705635,69.705635,5945,51.958225,0.0,63.928259,0.0
3,0.8199,No log,45.26236,60.600694,11873,37.634953,68.355608,5928,52.867956,52.867956,5945,50.096858,0.0,60.600694,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 02:05:35,630] Trial 6 finished with value: 63.92825852132462 and parameters: {'learning_rate': 7.176361416135588e-05, 'batch_size': 16, 'warmup_steps': 102, 'weight_decay': 0.141338500720577, 'adam_beta1': 0.8824298717779988, 'adam_beta2': 0.9968626436091191, 'adam_epsilon': 3.0277412934900717e-07, 'lr_scheduler_type': 'constant_with_warmup'}. Best is trial 2 with value: 65.71488138575606.


Current Trial 7 parameters: {'learning_rate': 8.489260450423479e-07, 'batch_size': 64, 'warmup_steps': 607, 'weight_decay': 0.12419477807991067, 'adam_beta1': 0.8396904433225968, 'adam_beta2': 0.9957496548893403, 'adam_epsilon': 7.607206541165466e-07, 'lr_scheduler_type': 'linear'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,5.8054,No log,50.071591,50.071591,11873,0.0,0.0,5928,100.0,100.0,5945,50.071591,0.0,50.071591,0.0
2,2.9231,No log,45.675061,47.554094,11873,5.280027,9.043481,5928,85.954584,85.954584,5945,50.071591,0.0,50.073837,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 02:33:57,410] Trial 7 finished with value: 50.07159100480081 and parameters: {'learning_rate': 8.489260450423479e-07, 'batch_size': 64, 'warmup_steps': 607, 'weight_decay': 0.12419477807991067, 'adam_beta1': 0.8396904433225968, 'adam_beta2': 0.9957496548893403, 'adam_epsilon': 7.607206541165466e-07, 'lr_scheduler_type': 'linear'}. Best is trial 2 with value: 65.71488138575606.


Current Trial 8 parameters: {'learning_rate': 3.0879006441869604e-06, 'batch_size': 64, 'warmup_steps': 14, 'weight_decay': 0.13127911620432747, 'adam_beta1': 0.9337588014459542, 'adam_beta2': 0.9905404978352851, 'adam_epsilon': 9.744955653178852e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,3.6296,No log,38.473848,46.618903,11873,20.546559,36.860027,5928,56.349874,56.349874,5945,50.088436,0.0,50.097583,0.0
2,2.0044,No log,38.364356,49.157628,11873,28.593117,50.210614,5928,48.107653,48.107653,5945,50.096858,0.0,50.81264,0.0
3,1.8096,No log,39.324518,50.228553,11873,29.251012,51.090353,5928,49.369218,49.369218,5945,50.096858,0.0,51.425193,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 03:08:57,743] Trial 8 finished with value: 50.22855340134922 and parameters: {'learning_rate': 3.0879006441869604e-06, 'batch_size': 64, 'warmup_steps': 14, 'weight_decay': 0.13127911620432747, 'adam_beta1': 0.9337588014459542, 'adam_beta2': 0.9905404978352851, 'adam_epsilon': 9.744955653178852e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 2 with value: 65.71488138575606.


Current Trial 9 parameters: {'learning_rate': 4.91978224351371e-05, 'batch_size': 32, 'warmup_steps': 398, 'weight_decay': 0.19404494917360213, 'adam_beta1': 0.8368183686077211, 'adam_beta2': 0.9969107098230887, 'adam_epsilon': 5.874878468800929e-07, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.3158,No log,45.919313,60.987655,11873,40.435223,70.61512,5928,51.387721,51.387721,5945,50.096858,0.0,60.987655,0.0
2,0.8064,No log,55.579887,69.761629,11873,38.157895,66.562047,5928,72.952061,72.952061,5945,56.110503,0.0,69.753206,0.0
3,0.4564,No log,54.249137,69.094824,11873,39.119433,68.853382,5928,69.335576,69.335576,5945,54.80502,0.0,69.086402,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 03:37:38,825] Trial 9 finished with value: 69.76162859449346 and parameters: {'learning_rate': 4.91978224351371e-05, 'batch_size': 32, 'warmup_steps': 398, 'weight_decay': 0.19404494917360213, 'adam_beta1': 0.8368183686077211, 'adam_beta2': 0.9969107098230887, 'adam_epsilon': 5.874878468800929e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 10 parameters: {'learning_rate': 9.41535226399275e-05, 'batch_size': 32, 'warmup_steps': 380, 'weight_decay': 0.07077487856651368, 'adam_beta1': 0.8010331990576185, 'adam_beta2': 0.9935490335829797, 'adam_epsilon': 1.7199442129813214e-07, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.3607,No log,45.700328,60.307245,11873,38.157895,67.413617,5928,53.221194,53.221194,5945,50.096858,0.0,60.330106,0.0
2,0.8317,No log,55.108229,69.014954,11873,36.116734,63.970065,5928,74.045416,74.045416,5945,55.739914,0.0,69.006531,0.0
3,0.3938,No log,54.198602,68.712792,11873,39.237517,68.30752,5928,69.116905,69.116905,5945,54.746062,0.0,68.70437,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 04:06:22,660] Trial 10 finished with value: 69.01495391051611 and parameters: {'learning_rate': 9.41535226399275e-05, 'batch_size': 32, 'warmup_steps': 380, 'weight_decay': 0.07077487856651368, 'adam_beta1': 0.8010331990576185, 'adam_beta2': 0.9935490335829797, 'adam_epsilon': 1.7199442129813214e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 11 parameters: {'learning_rate': 8.159944536374918e-05, 'batch_size': 32, 'warmup_steps': 401, 'weight_decay': 0.05795700215907494, 'adam_beta1': 0.8067226194877563, 'adam_beta2': 0.9929362145788089, 'adam_epsilon': 3.902532280527017e-08, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.3368,No log,45.36343,60.260946,11873,40.941296,70.779051,5928,49.772918,49.772918,5945,50.096858,0.0,60.269369,0.0
2,0.8157,No log,54.358629,68.852438,11873,38.24224,67.271423,5928,70.428932,70.428932,5945,54.981892,0.0,68.852438,0.0
3,0.3924,No log,53.743788,68.66727,11873,39.136302,69.026062,5928,68.309504,68.309504,5945,54.451276,0.0,68.658848,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 04:34:57,013] Trial 11 finished with value: 68.85243802364668 and parameters: {'learning_rate': 8.159944536374918e-05, 'batch_size': 32, 'warmup_steps': 401, 'weight_decay': 0.05795700215907494, 'adam_beta1': 0.8067226194877563, 'adam_beta2': 0.9929362145788089, 'adam_epsilon': 3.902532280527017e-08, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 12 parameters: {'learning_rate': 2.318609422378073e-05, 'batch_size': 32, 'warmup_steps': 611, 'weight_decay': 0.0788564276765058, 'adam_beta1': 0.8010154959384883, 'adam_beta2': 0.9941237737468845, 'adam_epsilon': 2.6717616220681404e-07, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.4475,No log,44.64752,59.233499,11873,39.018219,68.232007,5928,50.260723,50.260723,5945,50.096858,0.0,59.253152,0.0
2,0.9963,No log,53.903815,67.810409,11873,37.415655,65.268722,5928,70.344828,70.344828,5945,54.442854,0.0,67.810409,0.0
3,0.7362,No log,52.58991,66.950249,11873,38.967611,67.729472,5928,66.173255,66.173255,5945,53.196328,0.0,66.950249,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 05:03:31,055] Trial 12 finished with value: 67.81040876588408 and parameters: {'learning_rate': 2.318609422378073e-05, 'batch_size': 32, 'warmup_steps': 611, 'weight_decay': 0.0788564276765058, 'adam_beta1': 0.8010154959384883, 'adam_beta2': 0.9941237737468845, 'adam_epsilon': 2.6717616220681404e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 13 parameters: {'learning_rate': 2.1545575027189555e-05, 'batch_size': 32, 'warmup_steps': 296, 'weight_decay': 0.18253292872945703, 'adam_beta1': 0.8231749605587936, 'adam_beta2': 0.9939072454544569, 'adam_epsilon': 1.895119179143103e-07, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.4149,No log,43.805272,58.775002,11873,39.74359,69.725978,5928,47.855341,47.855341,5945,50.096858,0.0,58.794655,0.0
2,0.986,No log,53.785901,67.739371,11873,37.634953,65.581908,5928,69.890664,69.890664,5945,54.291249,0.0,67.739371,0.0
3,0.7577,No log,52.55622,66.936091,11873,38.866397,67.667378,5928,66.206897,66.206897,5945,53.145793,0.0,66.936091,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 05:32:04,575] Trial 13 finished with value: 67.7393707621622 and parameters: {'learning_rate': 2.1545575027189555e-05, 'batch_size': 32, 'warmup_steps': 296, 'weight_decay': 0.18253292872945703, 'adam_beta1': 0.8231749605587936, 'adam_beta2': 0.9939072454544569, 'adam_epsilon': 1.895119179143103e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 14 parameters: {'learning_rate': 2.8549155447900446e-05, 'batch_size': 32, 'warmup_steps': 577, 'weight_decay': 0.09294699920541574, 'adam_beta1': 0.8774512339865945, 'adam_beta2': 0.9927446497021774, 'adam_epsilon': 4.3144566530926874e-07, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.407,No log,45.405542,60.150774,11873,39.288124,68.820873,5928,51.505467,51.505467,5945,50.088436,0.0,60.162004,0.0
2,0.9412,No log,54.442854,68.575675,11873,37.533738,65.839911,5928,71.303616,71.303616,5945,54.8724,0.0,68.584098,0.0
3,0.65,No log,53.179483,67.640945,11873,39.085695,68.05009,5928,67.232969,67.232969,5945,53.710099,0.0,67.640945,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 06:00:39,339] Trial 14 finished with value: 68.57567543025168 and parameters: {'learning_rate': 2.8549155447900446e-05, 'batch_size': 32, 'warmup_steps': 577, 'weight_decay': 0.09294699920541574, 'adam_beta1': 0.8774512339865945, 'adam_beta2': 0.9927446497021774, 'adam_epsilon': 4.3144566530926874e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 15 parameters: {'learning_rate': 9.60291898281835e-05, 'batch_size': 32, 'warmup_steps': 305, 'weight_decay': 0.041701741689357016, 'adam_beta1': 0.8171426308360827, 'adam_beta2': 0.9951163801971098, 'adam_epsilon': 5.892615403419578e-07, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.3547,No log,45.860355,60.728618,11873,38.394062,68.173226,5928,53.305299,53.305299,5945,50.096858,0.0,60.745463,0.0
2,0.8325,No log,54.072265,68.076549,11873,37.68556,65.73429,5928,70.412111,70.412111,5945,54.712373,0.0,68.059704,0.0
3,0.3964,No log,53.659564,68.407562,11873,38.782051,68.320342,5928,68.494533,68.494533,5945,54.215447,0.0,68.39914,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 06:29:14,218] Trial 15 finished with value: 68.40756230636224 and parameters: {'learning_rate': 9.60291898281835e-05, 'batch_size': 32, 'warmup_steps': 305, 'weight_decay': 0.041701741689357016, 'adam_beta1': 0.8171426308360827, 'adam_beta2': 0.9951163801971098, 'adam_epsilon': 5.892615403419578e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 16 parameters: {'learning_rate': 3.6809224647208096e-05, 'batch_size': 32, 'warmup_steps': 755, 'weight_decay': 0.09221106406952706, 'adam_beta1': 0.859623441525126, 'adam_beta2': 0.9965755321665304, 'adam_epsilon': 9.344525266405982e-07, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.3958,No log,41.758612,57.317797,11873,40.080972,71.243961,5928,43.431455,43.431455,5945,50.088436,0.0,57.337449,0.0
2,0.902,No log,54.889245,68.872396,11873,37.112011,65.118413,5928,72.615643,72.615643,5945,55.301946,0.0,68.872396,0.0
3,0.5655,No log,53.482692,68.072163,11873,38.731444,67.952224,5928,68.191758,68.191758,5945,54.046997,0.0,68.072163,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 06:57:48,776] Trial 16 finished with value: 68.87239553190524 and parameters: {'learning_rate': 3.6809224647208096e-05, 'batch_size': 32, 'warmup_steps': 755, 'weight_decay': 0.09221106406952706, 'adam_beta1': 0.859623441525126, 'adam_beta2': 0.9965755321665304, 'adam_epsilon': 9.344525266405982e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 17 parameters: {'learning_rate': 1.067395298558973e-05, 'batch_size': 32, 'warmup_steps': 499, 'weight_decay': 0.1816655269610026, 'adam_beta1': 0.8210134551977514, 'adam_beta2': 0.9930401718266507, 'adam_epsilon': 4.16055252593504e-07, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.6422,No log,40.25099,54.654761,11873,37.803644,66.652493,5928,42.691337,42.691337,5945,50.096858,0.0,54.680029,0.0
2,1.2215,No log,51.570791,64.843821,11873,35.762483,62.346607,5928,67.333894,67.333894,5945,52.278278,0.0,64.852243,0.0
3,1.0794,No log,49.953676,64.158944,11873,37.887989,66.339261,5928,61.984861,61.984861,5945,50.686431,0.0,64.167366,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 07:26:24,779] Trial 17 finished with value: 64.84382089416286 and parameters: {'learning_rate': 1.067395298558973e-05, 'batch_size': 32, 'warmup_steps': 499, 'weight_decay': 0.1816655269610026, 'adam_beta1': 0.8210134551977514, 'adam_beta2': 0.9930401718266507, 'adam_epsilon': 4.16055252593504e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 18 parameters: {'learning_rate': 1.0883583569586597e-07, 'batch_size': 32, 'warmup_steps': 181, 'weight_decay': 0.2022478637681134, 'adam_beta1': 0.903264299975208, 'adam_beta2': 0.9911374128002458, 'adam_epsilon': 1.5685341855847904e-07, 'lr_scheduler_type': 'cosine'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,5.5922,No log,18.251495,21.213807,11873,0.607287,6.540406,5928,35.845248,35.845248,5945,50.071591,0.0,50.07394,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: eval_f1 below threshold of 40


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: eval_f1 below threshold of 40
Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 07:36:45,689] Trial 18 finished with value: 21.213806891631645 and parameters: {'learning_rate': 1.0883583569586597e-07, 'batch_size': 32, 'warmup_steps': 181, 'weight_decay': 0.2022478637681134, 'adam_beta1': 0.903264299975208, 'adam_beta2': 0.9911374128002458, 'adam_epsilon': 1.5685341855847904e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 69.76162859449346.


Current Trial 19 parameters: {'learning_rate': 4.446867231192033e-05, 'batch_size': 64, 'warmup_steps': 346, 'weight_decay': 0.015127726688795667, 'adam_beta1': 0.8563588297321466, 'adam_beta2': 0.9948246466467797, 'adam_epsilon': 3.2913479553621697e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,2.5682,No log,41.792302,57.135377,11873,39.92915,70.659301,5928,43.650126,43.650126,5945,50.088436,0.0,57.17749,0.0
2,1.0094,No log,53.053146,67.170735,11873,37.651822,65.927486,5928,68.410429,68.410429,5945,53.726943,0.0,67.170735,0.0
3,0.6277,No log,51.360229,66.069656,11873,39.456815,68.917852,5928,63.229605,63.229605,5945,52.261434,0.0,66.058827,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 1 epochs


[I 2024-05-27 08:10:08,726] Trial 19 finished with value: 67.17073493525506 and parameters: {'learning_rate': 4.446867231192033e-05, 'batch_size': 64, 'warmup_steps': 346, 'weight_decay': 0.015127726688795667, 'adam_beta1': 0.8563588297321466, 'adam_beta2': 0.9948246466467797, 'adam_epsilon': 3.2913479553621697e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 9 with value: 69.76162859449346.


## 7. Summary of Hyperparameters Search

Each row of the below table corresponds to a trial and shows all specified parameters along with measured performance metrics with Trial 9 acheiving the highest performance.



| Trial | Learning Rate     | Batch Size | Warmup Steps | Weight Decay   | Adam Beta1 | Adam Beta2 | Adam Epsilon | LR Scheduler Type          | Exact   | F1       | Hasans Exact | Hasans F1 | Noans Exact | Noans F1 |
|-------|-------------------|------------|--------------|----------------|------------|------------|--------------|----------------------------|---------|----------|--------------|-----------|-------------|----------|
| 0     | 0.00000288956     | 32         | 861          | 0.19727        | 0.91056    | 0.99659    | 0.00000053605| constant_with_warmup       | 41.92706 | 55.50755 | 36.84211     | 64.04203  | 46.99748    | 46.99748 |
| 1     | 0.00000333173     | 16         | 443          | 0.23953        | 0.93147    | 0.99541    | 0.00000008895| constant_with_warmup       | 44.29378 | 59.27969 | 39.64238     | 69.65718  | 48.93188    | 48.93188 |
| 2     | 0.00000961032     | 16         | 70           | 0.02229        | 0.83412    | 0.99181    | 0.00000069818| linear                     | 51.47814 | 65.71488 | 38.29285     | 66.80715  | 64.62574    | 64.62574 |
| 3     | 0.00000325700     | 16         | 226          | 0.22605        | 0.92570    | 0.99781    | 0.00000059647| constant_with_warmup       | 44.70648 | 59.60375 | 39.50742     | 69.34468  | 49.89066    | 49.89066 |
| 4     | 0.00000230338     | 64         | 9            | 0.15556        | 0.84571    | 0.99876    | 0.00000038860| cosine_with_restarts       | 38.78548 | 48.36220 | 24.61201     | 43.79291  | 52.91842    | 52.91842 |
| 5     | 0.00000025761     | 32         | 786          | 0.13462        | 0.85898    | 0.99800    | 0.00000077248| constant_with_warmup       | 50.06317 | 50.06317 | 0.00000      | 0.00000   | 99.98318    | 99.98318 |
| 6     | 0.00007176        | 16         | 102          | 0.14134        | 0.88243    | 0.99686    | 0.00000030277| constant_with_warmup       | 45.26236 | 60.60069 | 37.63495     | 68.35561  | 52.86796    | 52.86796 |
| 7     | 0.00000084893     | 64         | 607          | 0.12419        | 0.83969    | 0.99575    | 0.00000076072| linear                     | 45.67506 | 47.55409 | 5.28003      | 9.04348   | 85.95458    | 85.95458 |
| 8     | 0.000003088       | 64         | 14           | 0.13128        | 0.93376    | 0.99054    | 0.00000097450| cosine_with_restarts       | 39.32452 | 50.22855 | 29.25101     | 51.09035  | 49.36922    | 49.36922 |
| __9__     | __0.00004920__        | __32__         | __398__          | __0.19404__        | __0.83682__    | __0.99691__    | __0.00000058749__ | __cosine__                     | __54.24914__ | __69.09482__ | __39.11943__     | __68.85338__  | __69.33558__    | __69.33558__ |
| 10    | 0.00009415        | 32         | 380          | 0.07077        | 0.80103    | 0.99355    | 0.00000017199| cosine                     | 54.19860 | 68.71279 | 39.23752     | 68.30752  | 69.11691    | 69.11691 |
| 11    | 0.00008160        | 32         | 401          | 0.05796        | 0.80672    | 0.99294    | 0.00000003903| cosine                     | 53.74379 | 68.66727 | 39.13630     | 69.02606  | 68.30950    | 68.30950 |
| 12    | 0.00002319        | 32         | 611          | 0.07886        | 0.80102    | 0.99412    | 0.00000026718| cosine                     | 52.58991 | 66.95025 | 38.96761     | 67.72947  | 66.17326    | 66.17326 |
| 13    | 0.00002155        | 32         | 296          | 0.18253        | 0.82317    | 0.99391    | 0.00000018951| cosine                     | 52.55622 | 66.93609 | 38.86640     | 67.66738  | 66.20690    | 66.20690 |
| 14    | 0.00002855        | 32         | 577          | 0.09295        | 0.87745    | 0.99274    | 0.00000043145| cosine                   | 53.17948| 67.64095 | 39.08570     | 68.05009  | 67.23297    | 67.23297 |
| 15    | 0.00009603        | 32         | 305          | 0.04170        | 0.81714    | 0.99512    | 0.00000058926| cosine                   | 53.65956| 68.40756 | 38.78205     | 68.32034  | 68.49453    | 68.49453 |
| 16    | 0.00003681        | 32         | 755          | 0.09221        | 0.85962    | 0.99658    | 0.00000093445| cosine                   | 53.48269| 68.07216 | 38.73144     | 67.95222  | 68.19176    | 68.19176 |
| 17    | 0.00001067        | 32         | 499          | 0.18167        | 0.82101    | 0.99304    | 0.00000041606| cosine                   | 49.95368| 64.15894 | 37.88799     | 66.33926  | 61.98486    | 61.98486 |
| 18    | 0.00000010884     | 32         | 181          | 0.20225        | 0.90326    | 0.99114    | 0.00000015685| cosine                   | 18.25150| 21.21381 | 0.60729      | 6.54041   | 35.84525    | 35.84525 |
| 19    | 0.00004447        | 64         | 346          | 0.01513        | 0.85636    | 0.99482    | 0.00000032913| cosine_with_restarts     | 51.36023| 66.06966 | 39.45682     | 68.91785  | 63.22961    | 63.22961 |

## Tuning BERT Using the Best Hyperparameters

In [57]:
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name)
training_args = TrainingArguments(
    output_dir=f"./{normalized_model_name}-best_model",
    overwrite_output_dir = True,
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    save_total_limit=4, 
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    report_to="wandb",  # Enable logging to Weights & Biases
    run_name=f"{normalized_model_name}-best_model",
    learning_rate=4.91978224351371e-05,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=398,
    weight_decay=0.19404494917360213,
    adam_beta1=0.8368183686077211,
    adam_beta2=0.9969107098230887,
    adam_epsilon=5.874878468800929e-07,
    lr_scheduler_type='cosine',
    fp16=True,  # Enable mixed-precision training
)

trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=eval_examples,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
        callbacks=[AdvancedEarlyStoppingCallback(metric_name='eval_f1', patience=3, threshold=40)]
)

trainer.train()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact,F1,Total,Hasans Exact,Hasans F1,Hasans Total,Noans Exact,Noans F1,Noans Total,Best Exact,Best Exact Thresh,Best F1,Best F1 Thresh
1,1.0699,No log,58.022404,70.754999,11873,36.454116,61.955821,5928,79.529016,79.529016,5945,58.013981,0.0,70.746577,0.0
2,0.7867,No log,57.786575,71.976954,11873,39.271255,67.692708,5928,76.248949,76.248949,5945,57.786575,0.0,71.976954,0.0
3,0.5562,No log,56.363177,71.408624,11873,40.317139,70.451179,5928,72.363331,72.363331,5945,56.363177,0.0,71.408624,0.0
4,0.4,No log,56.001011,71.080669,11873,39.794197,69.996758,5928,72.16148,72.16148,5945,56.152615,0.0,71.080669,0.0
5,0.269,No log,55.125074,70.405812,11873,39.777328,70.382625,5928,70.428932,70.428932,5945,55.529352,0.0,70.405812,0.0


  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

Stopping training: No improvement in eval_f1 for 3 epochs


TrainOutput(global_step=20325, training_loss=0.6738202459844422, metrics={'train_runtime': 5762.4661, 'train_samples_per_second': 225.683, 'train_steps_per_second': 7.054, 'total_flos': 1.6990690958380032e+17, 'train_loss': 0.6738202459844422, 'epoch': 5.0})