In [1]:
import collections
import logging
import os
from typing import Optional, Tuple
#from tqdm.auto import tqdm

# misc
import os
import re
import time
import ast
import warnings
import math
import copy
import matplotlib.pyplot as plt
from xgboost import plot_importance
import seaborn as sns

# data
import pandas as pd
import numpy as np
import csv
from torch.utils.data import random_split
import datasets

# ML
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn import metrics

#import scikitplot as skplt
import xgboost
import tensorflow as tf
import torch

from transformers import DefaultDataCollator, EvalPrediction
from transformers import AutoTokenizer, AutoModelForSequenceClassification, XLMRobertaForSequenceClassification
from transformers import DistilBertConfig, DistilBertModel
from transformers import AutoModel
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModelForQuestionAnswering, DistilBertForSequenceClassification, DistilBertModel, CamembertForSequenceClassification, RobertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
import json
from transformers import TrainerCallback, EarlyStoppingCallback
from sklearn.metrics import accuracy_score
from collections import Counter
import torch
import evaluate
import optuna

class LoggingCallback(TrainerCallback):
    """Trainer callback that appends each log record to a JSON-lines file.

    Args:
        log_path: Path of the file to append to; one JSON object is written
            per log event.
    """

    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append `logs`, without its ``total_flos`` entry, to ``self.log_path``.

        Nothing is written when `logs` is ``None`` or when
        ``state.is_local_process_zero`` is false.
        """
        # `logs` defaults to None; calling .pop on it would raise AttributeError.
        if logs is None:
            return
        if not state.is_local_process_zero:
            return
        # Filter into a new dict rather than popping from the caller's dict, so
        # the object handed to this callback is left untouched.
        record = {key: value for key, value in logs.items() if key != "total_flos"}
        with open(self.log_path, "a") as f:
            f.write(json.dumps(record) + "\n")

### The compute_metrics function for a question-answering problem differs from the classification one: more processing is required.

# SQuAD metric object, loaded once and shared by `compute_metrics`.
metric = evaluate.load("squad")


def compute_metrics(p: EvalPrediction):
    """Score a prediction bundle with the module-level SQuAD `metric`.

    Args:
        p: Object exposing `predictions` and `label_ids`; they are passed to
            the metric as `predictions` and `references` respectively.

    Returns:
        Whatever `metric.compute` returns for those inputs.
    """
    scores = metric.compute(predictions=p.predictions, references=p.label_ids)
    return scores

# Query CUDA once so the status message and the chosen `device` always agree.
_cuda_ready = torch.cuda.is_available()

_status = (
    f"GPU: {torch.cuda.get_device_name(0)} is available."
    if _cuda_ready
    else "No GPU available. Training will run on CPU."
)
print(_status)

device = torch.device("cuda" if _cuda_ready else "cpu")
print(device)

GPU: NVIDIA GeForce RTX 2060 is available.
cuda


In [2]:
"""
A subclass of `Trainer` specific to Question-Answering tasks
"""
import math
import time

from transformers import Trainer, is_torch_xla_available
from transformers.trainer_utils import PredictionOutput, speed_metrics


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met


class QuestionAnsweringTrainer(Trainer):
    """`Trainer` subclass that can post-process raw model outputs before scoring.

    Args:
        *args: Forwarded unchanged to `Trainer.__init__`.
        eval_examples: Examples handed to `post_process_function` by
            `evaluate` when the call does not supply its own.
        post_process_function: Callable invoked as
            `post_process_function(examples, dataset, predictions)` in
            `evaluate`, and with an extra `"predict"` argument in `predict`.
            Its result is passed to `compute_metrics`. When it is `None`, only
            the metrics produced by the loop itself are reported.
        **kwargs: Forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_examples = eval_examples
        self.post_process_function = post_process_function

    def _qa_run_loop(self, dataloader, description, ignore_keys, metric_key_prefix):
        """Run the evaluation/prediction loop with metric computation disabled and add speed metrics."""
        # Temporarily disable metric computation; the caller does it after post-processing.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        start_time = time.time()
        try:
            output = eval_loop(
                dataloader,
                description=description,
                # No point gathering the predictions if there are no metrics, otherwise we defer to
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
                metric_key_prefix=metric_key_prefix,
            )
        finally:
            # Restore the callback even if the loop raised.
            self.compute_metrics = compute_metrics

        total_batch_size = self.args.eval_batch_size * self.args.world_size
        jit_key = f"{metric_key_prefix}_jit_compilation_time"
        if jit_key in output.metrics:
            # Shift the start forward by the reported JIT compilation time.
            start_time += output.metrics[jit_key]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )
        return output

    @staticmethod
    def _qa_prefix_metrics(metrics, metric_key_prefix):
        """Rename, in place, every key of `metrics` that lacks the `<metric_key_prefix>_` prefix."""
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

    def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
        """Run evaluation and return the metrics dict.

        Args:
            eval_dataset: Dataset to evaluate; defaults to `self.eval_dataset`.
            eval_examples: Examples passed to `post_process_function`;
                defaults to `self.eval_examples`.
            ignore_keys: Forwarded to the evaluation loop.
            metric_key_prefix: Prefix applied to every reported metric key.

        Returns:
            The metrics dict. Post-processed scores are included only when
            `post_process_function` and `compute_metrics` are both set and
            `self.args.should_save` is true; otherwise only the loop's own
            metrics are returned.
        """
        eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        eval_examples = self.eval_examples if eval_examples is None else eval_examples

        output = self._qa_run_loop(eval_dataloader, "Evaluation", ignore_keys, metric_key_prefix)

        if self.post_process_function is not None and self.compute_metrics is not None and self.args.should_save:
            # Only the main node write the results by default
            eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions)
            metrics = self.compute_metrics(eval_preds)
            self._qa_prefix_metrics(metrics, metric_key_prefix)
            metrics.update(output.metrics)
        else:
            metrics = output.metrics

        if self.args.should_log:
            # Only the main node log the results by default
            self.log(metrics)

        # `xm` and `met` are imported only when torch_xla is available, so the
        # debug report must be skipped elsewhere to avoid a NameError.
        # `tpu_metrics_debug` is read defensively because it is a deprecated argument.
        debug_requested = getattr(self.args, "tpu_metrics_debug", False) or self.args.debug
        if debug_requested and is_torch_xla_available():
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
        return metrics

    def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"):
        """Run prediction on `predict_dataset`.

        Args:
            predict_dataset: Dataset fed to the test dataloader.
            predict_examples: Examples passed to `post_process_function`.
            ignore_keys: Forwarded to the prediction loop.
            metric_key_prefix: Prefix applied to every reported metric key.

        Returns:
            The raw loop output when `post_process_function` or
            `compute_metrics` is `None`; otherwise a `PredictionOutput` built
            from the post-processed predictions and the merged metrics.
        """
        predict_dataloader = self.get_test_dataloader(predict_dataset)

        output = self._qa_run_loop(predict_dataloader, "Prediction", ignore_keys, metric_key_prefix)

        if self.post_process_function is None or self.compute_metrics is None:
            return output

        predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict")
        metrics = self.compute_metrics(predictions)
        self._qa_prefix_metrics(metrics, metric_key_prefix)
        metrics.update(output.metrics)
        return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)

In [3]:
NUM_RECORDS = 200


def _iter_squad_triples(articles):
    """Yield one (context, question, answer) triple per answer entry, in file order."""
    for article in articles:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']
            for qa_item in paragraph['qas']:
                prompt = qa_item['question']
                for gold in qa_item['answers']:
                    yield passage, prompt, gold


with open('./train-v2.0.json', 'rb') as squad_file:
    squad = json.load(squad_file)

# Flatten the whole file first, then keep only the first NUM_RECORDS triples.
_records = list(_iter_squad_triples(squad['data']))[:NUM_RECORDS]
contexts = [record[0] for record in _records]
questions = [record[1] for record in _records]
answers = [record[2] for record in _records]

In [4]:
def add_answer_end(answers, contexts, max_shift=2):
    """Add an exclusive ``answer_end`` character offset to every answer dict, in place.

    For each answer the nominal span ``[answer_start, answer_start + len(text))``
    is compared with the context. If it does not match, the span is moved left
    by 1..``max_shift`` characters until it does, and ``answer_start`` is
    corrected accordingly.

    Args:
        answers: Dicts with ``"text"`` and ``"answer_start"`` keys; each gains
            an ``"answer_end"`` key.
        contexts: Context strings, paired position by position with `answers`.
        max_shift: Largest left shift to try.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer["text"]
        start_idx = answer["answer_start"]
        end_idx = start_idx + len(gold_text)

        # Never shift past the start of the string: a negative slice start
        # would silently index from the end of the context.
        shift = next(
            (
                s
                for s in range(min(max_shift, start_idx) + 1)
                if context[start_idx - s:end_idx - s] == gold_text
            ),
            None,
        )

        if shift is None:
            # Keep the nominal span so later cells still find `answer_end`,
            # but make the misalignment visible.
            warnings.warn(
                f"Answer {gold_text!r} not found near offset {start_idx}; "
                "keeping the unverified span."
            )
            answer["answer_end"] = end_idx
        else:
            answer["answer_start"] = start_idx - shift
            answer["answer_end"] = end_idx - shift


add_answer_end(answers, contexts)

In [5]:
# Display the answer dicts as left by the alignment loop in the previous cell.
answers

[{'text': 'in the late 1990s', 'answer_start': 269, 'answer_end': 286},
 {'text': 'singing and dancing', 'answer_start': 207, 'answer_end': 226},
 {'text': '2003', 'answer_start': 526, 'answer_end': 530},
 {'text': 'Houston, Texas', 'answer_start': 166, 'answer_end': 180},
 {'text': 'late 1990s', 'answer_start': 276, 'answer_end': 286},
 {'text': "Destiny's Child", 'answer_start': 320, 'answer_end': 335},
 {'text': 'Dangerously in Love', 'answer_start': 505, 'answer_end': 524},
 {'text': 'Mathew Knowles', 'answer_start': 360, 'answer_end': 374},
 {'text': 'late 1990s', 'answer_start': 276, 'answer_end': 286},
 {'text': 'lead singer', 'answer_start': 290, 'answer_end': 301},
 {'text': 'Dangerously in Love', 'answer_start': 505, 'answer_end': 524},
 {'text': '2003', 'answer_start': 526, 'answer_end': 530},
 {'text': 'five', 'answer_start': 590, 'answer_end': 594},
 {'text': 'lead singer', 'answer_start': 290, 'answer_end': 301},
 {'text': 'Dangerously in Love', 'answer_start': 505, 'answ

In [6]:
from transformers import BertTokenizerFast
# Fast BERT tokenizer, then one batched encoding of every (context, question) pair.
# `encodings` is what the later cell uses for its `char_to_token` lookups.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
encodings = tokenizer(
    text=contexts,
    text_pair=questions,
    truncation=True,
    padding=True,
)



In [7]:
# Wrap the three parallel lists in a `Dataset`, one column per list.
dataset = Dataset.from_dict(
    {
        "contexts": contexts,
        "answers": answers,
        "questions": questions,
    }
)

In [8]:
def _tokenize_row(row):
    """Tokenize one row's context/question pair with `padding='max_length'` and truncation."""
    return tokenizer(row['contexts'], row['questions'], padding='max_length', truncation=True)


dataset = dataset.map(_tokenize_row)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [9]:
# Show the dataset summary after the tokenization `map` in the previous cell.
dataset

Dataset({
    features: ['contexts', 'answers', 'questions', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

In [10]:
def _token_index_or_sentinel(batch_index, char_index):
    """Map a character offset to its token index, or `tokenizer.model_max_length` when there is none."""
    token_index = encodings.char_to_token(batch_index, char_index)
    if token_index is None:
        return tokenizer.model_max_length
    return token_index


# `answer_end` is exclusive, so the last answer character sits at `answer_end - 1`.
start_positions = [
    _token_index_or_sentinel(row, answer["answer_start"])
    for row, answer in enumerate(answers)
]
end_positions = [
    _token_index_or_sentinel(row, answer["answer_end"] - 1)
    for row, answer in enumerate(answers)
]

In [11]:
# Attach the start token indices as a new column.
dataset = dataset.add_column(name="start_positions", column=start_positions)

In [12]:
# Attach the end token indices as a new column.
dataset = dataset.add_column(name="end_positions", column=end_positions)

In [13]:
# Show the dataset summary after the two `add_column` cells above.
dataset

Dataset({
    features: ['contexts', 'answers', 'questions', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 200
})

In [14]:
# Same display as the previous cell; nothing has changed `dataset` in between.
dataset

Dataset({
    features: ['contexts', 'answers', 'questions', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 200
})

In [15]:
# Fixed seed so both shuffled splits are the same after every kernel restart.
SPLIT_SEED = 42

# First hold out 20% of the rows, then halve that hold-out into validation and test.
train_testvalid = dataset.train_test_split(test_size=0.2, shuffle=True, seed=SPLIT_SEED)

test_valid = train_testvalid['test'].train_test_split(test_size=0.5, shuffle=True, seed=SPLIT_SEED)

# NOTE: `dataset` is rebound from a Dataset to a DatasetDict here, so this cell
# cannot be re-run without re-running the cells that build `dataset`.
dataset = datasets.DatasetDict({
                                'train': train_testvalid['train'],
                                'validation': test_valid['train'],
                                'test': test_valid['test']
                                })

In [16]:
# Validation split with the tokenizer/label columns removed, i.e. the remaining raw columns only.
_MODEL_INPUT_COLUMNS = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
val_examples = dataset["validation"].remove_columns(_MODEL_INPUT_COLUMNS)

In [17]:
# Drop the raw text columns from every split before handing the data to the trainer.
_RAW_TEXT_COLUMNS = ['contexts', 'answers', 'questions']
dataset = dataset.remove_columns(_RAW_TEXT_COLUMNS)

In [18]:
# Collator and model used for fine-tuning.
data_collator = DefaultDataCollator()
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Hyper-parameters, gathered in one place before building TrainingArguments.
_hyperparameters = dict(
    output_dir="test",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)
training_args = TrainingArguments(**_hyperparameters)

# `post_process_function=post_processing_function` is still disabled here, so
# QuestionAnsweringTrainer.evaluate reports only the loop's own metrics.
trainer = QuestionAnsweringTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    eval_examples=val_examples,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,5.390177


KeyboardInterrupt: 