In [1]:
# misc
import os
import re
import time
import ast
import warnings
import math
import copy
import matplotlib.pyplot as plt
from xgboost import plot_importance
import seaborn as sns

# data
import pandas as pd
import numpy as np
import csv
from data_preprocessing import SquadDataset
from torch.utils.data import random_split
import datasets

# ML
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn import metrics

#import scikitplot as skplt
import xgboost
import tensorflow as tf
import torch

from transformers import DefaultDataCollator
from transformers import AutoTokenizer, AutoModelForSequenceClassification, XLMRobertaForSequenceClassification
from transformers import DistilBertConfig, DistilBertModel
from transformers import AutoModel
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModelForQuestionAnswering, DistilBertForSequenceClassification, DistilBertModel, CamembertForSequenceClassification, RobertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
import json
from transformers import TrainerCallback, EarlyStoppingCallback

from sklearn.metrics import accuracy_score
from collections import Counter
import torch
import evaluate
import optuna

metric = evaluate.load("squad")

def ensemble_score(output1,output2,output3, true_labels):
    final_labels = []
    
    for pred1,pred2,pred3 in zip(output1.predictions,output2.predictions, output3.predictions):
        label1 = np.argmax(pred1)
        label2 = np.argmax(pred2)
        label3 = np.argmax(pred3)
        labels = [label1,label2,label3]
        final_labels.append(max(labels,key=labels.count))
        
    acc = accuracy_score(true_labels,final_labels)
    return final_labels, acc


class LoggingCallback(TrainerCallback):
    def __init__(self, log_path):
        self.log_path = log_path
    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            with open(self.log_path, "a") as f:
                f.write(json.dumps(logs) + "\n")

### Compute_metrics function for Question and Answering problem is different to classification, more preocessing required.
def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

GPU: NVIDIA GeForce RTX 2060 is available.
cuda


In [2]:
max_length = 384
stride = 128
model = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model)

## The dataset does not have the ending index of the "Answers" which is required for the Question and Answering transformer. 
## This function uses the data in the Squad dataset to extract the start index and end index as well as tokenizes the dataset.
## after running this function, the data is ready to be used in the model

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs



In [3]:
## IMPORT RAW DATASET

squad_raw = load_dataset("squad", split="train[:5000]+validation[:1000]")
train_testvalid = squad_raw.train_test_split(test_size=0.2, shuffle=True)

test_valid = train_testvalid['test'].train_test_split(test_size=0.5, shuffle=True)

squad_raw = datasets.DatasetDict({
                                'train': train_testvalid['train'],
                                'validation': test_valid['train'],
                                'test': test_valid['test']
                                })

In [4]:
squad_raw

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4800
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 600
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 600
    })
})

In [5]:
## APPLY PREPROCESSING

tokenized_squad = squad_raw.map(preprocess_function, batched=True, remove_columns=squad_raw["train"].column_names)

Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [6]:
###############################################################
# UNCOMMENT THIS TO TEST TRAINER BEFORE STARTING TUNING
###############################################################


# model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
# training_args = TrainingArguments(
#     output_dir="test",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01
# )
# data_collator = DefaultDataCollator()
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_squad["train"],
#     eval_dataset=tokenized_squad["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
# )

# trainer.train()

In [14]:

# Define model initialization function, "model" parameter is defined in the second cell of the notebook
def model_init():
    return AutoModelForQuestionAnswering.from_pretrained(model)

# Hyperparameter tuning with Optuna
def objective(trial):
    # Hyperparameters to tune    
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-4, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    warmup_steps = trial.suggest_int('warmup_steps', 0, 500)
    weight_decay = trial.suggest_float('weight_decay', 0.01, 0.3)
    # Training arguments
    arguments = TrainingArguments(
        output_dir="testOutput",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=10,  # More epochs
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        warmup_steps=warmup_steps,
        weight_decay=weight_decay
    )
    # Initialize the Trainer
    trainer = Trainer(
        model_init=model_init,
        args=arguments,
        train_dataset=tokenized_squad['train'],
        eval_dataset=tokenized_squad['validation'],
        compute_metrics=compute_metrics
    )
    # Train the model
    trainer.train()
    # Evaluate the model
    eval_results = trainer.evaluate()
    return eval_results['f1']

In [15]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2024-05-22 00:06:53,186] A new study created in memory with name: no-name-cecce350-0eab-48b0-b9fd-c2f54783beaf
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[W 2024-05-22 00:06:56,664] Trial 0 failed with parameters: {'learning_rate': 1.5723087865213442e-05, 'batch_size': 32, 'warmup_steps': 57, 'weight_decay': 0.2539952048328979} because of the following error: OutOfMemoryError('CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 11.88 GiB is allocated by PyTorch, and 174.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentat

OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 11.88 GiB is allocated by PyTorch, and 174.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print("Best Trial:")
print("Value (Performance Metric):", study.best_trial.value)
print("Parameters:")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")

In [13]:
torch.cuda.empty_cache()