# COVID QA Analysis (BERT-Base)
## Advanced Statistical NLP (CSE 291-3) 
### Yash Khandelwal, Kaushik Ravindran

github: https://github.com/yashskhandelwal/Covid_QA_Analysis

#### Expected outputs per model:

##### Evaluation:
*   Exact Match
*   F1 Score

##### Training time:
*   Time taken to fine tune the model
*   Average prediction time

##### Environmental impact:
*   GPU Details
*   CO2 emission impact of training the model

#### List of models

*   BERT: Base, Large
*   RoBERTa: Base, Large
*   DistilBERT: Base
*   ALBERT: Base, XXL
*   ELECTRA: Base
*   LongFormer: Base, Large
*   BigBird: Base

#### Main libraries:

*   PyTorch
*   transformers (HuggingFace)
*   tokenizers (HuggingFace)
*   datasets (HuggingFace)
*   codecarbon




In [None]:
%%capture
# env setup
# install relevant libraries
!pip install datasets transformers
!pip install accelerate
!pip install humanize
!pip install millify
!pip install tqdm
!pip install codecarbon

In [None]:
# imports
import math, statistics, time
from collections import defaultdict
import numpy as np
from tqdm.auto import tqdm
from datetime import datetime

import torch
from codecarbon import EmissionsTracker
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# The training arguments further down set push_to_hub=True, so the session
# presumably has to be authenticated before training starts.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# constants
# Dataset identifier passed to load_dataset().
dataset = "covid_qa_deepset"
# Pre-trained checkpoint used for both the tokenizer and the QA model.
pre_trained_model_checkpoint = "bert-base-cased"
# Passed as the first positional argument of TrainingArguments.
model_name = "covid_qa_analysis_bert_base"
# Repository id passed to TrainingArguments(hub_model_id=...).
hub_model_id = "armageddon/covid_qa_analysis_bert_base"

### Section 1: Prepping the dataset

##### Section 1.1: Load the COVID-QA dataset and get our bearings

In [None]:
# Load the dataset named by the `dataset` constant.
raw_datasets = load_dataset(dataset)

In [None]:
# Display the loaded dataset object.
raw_datasets

In [None]:
# Display the feature schema of the train split.
raw_datasets['train'].features

##### Section 1.2: Print some basic stats for the dataset

In [None]:
# Context length statistics; lengths are len() of each raw context entry.
context_lengths = [len(context) for context in raw_datasets['train']['context']]
print('Average context length is:', statistics.mean(context_lengths))
print('Max context length is:', max(context_lengths))
print('Min context length is:', min(context_lengths))
print('Median context length is:', statistics.median(context_lengths))

In [None]:
# Question length statistics; lengths are len() of each raw question entry.
question_lengths = [len(question) for question in raw_datasets['train']['question']]
print('Average question length is:', statistics.mean(question_lengths))
print('Max question length is:', max(question_lengths))
print('Min question length is:', min(question_lengths))
print('Median question length is:', statistics.median(question_lengths))

In [None]:
# Number of reference answers per question (length of each answers['text'] list).
answer_count = [len(row['answers']['text']) for row in raw_datasets['train']]
print('Average answer count is:', statistics.mean(answer_count))
print('Max answer count is:', max(answer_count))
print('Min answer count is:', min(answer_count))
print('Median answer count is:', statistics.median(answer_count))

In [None]:
# Length statistics of the first reference answer of every question.
answer_lengths = [len(row['answers']['text'][0]) for row in raw_datasets['train']]
print('Average answer length is:', statistics.mean(answer_lengths))
print('Max answer length is:', max(answer_lengths))
print('Min answer length is:', min(answer_lengths))
print('Median answer length is:', statistics.median(answer_lengths))

##### Section 1.3: Split dataset into train and validation

In [None]:
# Keep 90% of the original train split for training; the fixed seed keeps the
# split the same across runs.
raw_datasets_split = raw_datasets["train"].train_test_split(train_size=0.9, seed=42)
# Re-key the held-out 'test' part as 'validation'.
raw_datasets_split['validation'] = raw_datasets_split.pop('test')
# From here on raw_datasets refers to the train/validation split.
raw_datasets = raw_datasets_split

In [None]:
# Display the dataset object after the train/validation split.
raw_datasets

#### Section 2: Tokenize the dataset

In [None]:
# Load the tokenizer that belongs to the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_checkpoint)

###### Section 2.1 Preprocessing raw_dataset

In [None]:
# pre-processing for training 
# split long context into multiple features 
# find answer start and end token id in each of the features
def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and label every feature with its answer span.

    A long context is split into several overlapping features. For each
    feature the token indices at which the first reference answer starts and
    ends are computed. A feature that does not contain the whole answer is
    labelled ``(0, 0)``, i.e. it points at the first token of the sequence.

    Args:
        examples: batch with ``"question"``, ``"context"`` and ``"answers"``
            columns; every answer holds ``"text"`` and ``"answer_start"`` lists.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added, and with ``offset_mapping`` / ``overflow_to_sample_mapping``
        removed.
    """
    # number of tokens shared by consecutive features of the same context
    stride = 50

    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answers"]

    # use model tokenizer to tokenize examples
    inputs = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # return_overflowing_tokens -- for each feature, it represents the original example it belonged to
    # return_offsets_mapping -- for each token, it returns the start and end position of the word represented by that token in the original context

    # pop offset_mapping and overflow_to_sample mapping
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    # start and end token of the answer in each feature
    start_positions = []
    end_positions = []

    # for each feature
    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_map[i]]  # answer of the example this feature came from
        start_char = answer["answer_start"][0]  # start char of answer in original context
        end_char = start_char + len(answer["text"][0])  # end char of answer in original context

        # labels in tokenized input indicating whether token belongs to question (0), context (1), or special token (None)
        sequence_ids = inputs.sequence_ids(i)

        # find the first and last context token of this feature
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # the bound check guards against a context that runs to the very end
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside this feature's context, label it (0, 0).
        # The feature must start at or before the answer start AND end at or
        # after the answer end; a partial overlap would otherwise produce a
        # label that points outside the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # last context token that starts at or before the answer start
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # first context token that ends at or after the answer end
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# pre-processing for validation examples
def preprocess_validation_examples(examples):
    """Tokenize a batch of QA examples for evaluation.

    A long context is split into several overlapping features. Each feature
    records the id of the example it came from, and its offset mapping is
    masked so that only context tokens keep their character offsets (all other
    positions become ``None``).

    Args:
        examples: batch with ``"id"``, ``"question"`` and ``"context"``
            columns. No answer column is read, so unlabeled data works too.

    Returns:
        The tokenizer output with the masked ``offset_mapping`` and a new
        ``example_id`` entry; ``overflow_to_sample_mapping`` is removed.
    """
    # number of tokens shared by consecutive features of the same context
    stride = 50

    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    # use model tokenizer to tokenize examples
    inputs = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # return_overflowing_tokens -- for each feature, it represents the original example it belonged to
    # return_offsets_mapping -- for each token, it returns the start and end position of the word represented by that token in the original context

    # pop overflow_to_sample mapping
    sample_map = inputs.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]
    offset_mapping = inputs["offset_mapping"]
    example_ids = []

    for i, offset in enumerate(offset_mapping):
        # id of the original example this feature was cut from
        example_ids.append(source_ids[sample_map[i]])

        # labels in tokenized input indicating whether token belongs to question (0), context (1), or special token (None)
        sequence_ids = inputs.sequence_ids(i)

        # keep offsets of context tokens only; everything else becomes None
        offset_mapping[i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    # add a new column to inputs and return
    inputs["example_id"] = example_ids
    return inputs


In [None]:
# Build the training features (tokenized inputs plus start/end token labels).
# The raw columns are dropped: preprocessing can emit several features per
# example, so the output rows no longer line up with the input rows.
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
# Display the tokenized training dataset.
train_dataset

In [None]:
# Build the evaluation features for the validation split (tokenized inputs,
# masked offset mapping and example_id); the raw columns are dropped.
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

In [None]:
# Display the tokenized validation dataset.
validation_dataset

In [None]:
# Preprocess the *training* split with the validation pipeline as well, so the
# fine-tuned model can also be scored on the data it was trained on.
train_valid_dataset = raw_datasets["train"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
# Display the training split preprocessed for evaluation.
train_valid_dataset

#### Section 3: Setting up evaluation for validation

In [None]:
# Post-processing settings read by compute_metrics.
# Number of highest-scoring start / end positions considered per feature.
n_best = 20
# Longest answer span, in tokens, that compute_metrics will accept.
max_answer_length = 30
# Metric object; compute_metrics calls metric.compute(...) on it.
# NOTE(review): load_metric may be deprecated in newer `datasets` releases -- confirm against the installed version.
metric = load_metric("squad")

In [None]:
def compute_metrics(start_logits, end_logits, features, examples):
    """Convert start/end logits into text answers and score them with ``metric``.

    For every example, the ``n_best`` highest start and end logits of each of
    its features are paired. Pairs that fall outside the context, run
    backwards, or span more than ``max_answer_length`` tokens are discarded.
    The surviving pair with the highest summed logit gives the predicted text;
    an example without any valid pair gets an empty prediction.

    Args:
        start_logits: per-feature start logits, indexed like ``features``.
        end_logits: per-feature end logits, indexed like ``features``.
        features: tokenized features carrying ``example_id`` and ``offset_mapping``.
        examples: original examples carrying ``id``, ``context`` and ``answers``.

    Returns:
        Whatever ``metric.compute`` returns for the predictions and references.
    """
    # Group feature positions by the id of the example they were cut from.
    feature_rows_by_example = defaultdict(list)
    for row, feature in enumerate(features):
        feature_rows_by_example[feature["example_id"]].append(row)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        for row in feature_rows_by_example[example_id]:
            start_scores = start_logits[row]
            end_scores = end_logits[row]
            offsets = features[row]["offset_mapping"]

            # Positions of the n_best highest logits, best first.
            best_starts = np.argsort(start_scores)[::-1][:n_best].tolist()
            best_ends = np.argsort(end_scores)[::-1][:n_best].tolist()

            for start in best_starts:
                # A start outside the context can never form a valid answer.
                if offsets[start] is None:
                    continue
                for end in best_ends:
                    if offsets[end] is None:
                        continue
                    span_tokens = end - start + 1
                    # Reject reversed spans and spans that are too long.
                    if span_tokens < 1 or span_tokens > max_answer_length:
                        continue
                    candidates.append(
                        {
                            "text": context[offsets[start][0] : offsets[end][1]],
                            "logit_score": start_scores[start] + end_scores[end],
                        }
                    )

        # Keep the highest-scoring candidate, or an empty string if none survived.
        if candidates:
            top = max(candidates, key=lambda cand: cand["logit_score"])
            prediction_text = top["text"]
        else:
            prediction_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": prediction_text})

    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=references)

#### Section 4: Finetuning the model

In [None]:
def current_milli_time():
    """Return the current wall-clock time as whole milliseconds since the epoch."""
    seconds_since_epoch = time.time()
    return round(seconds_since_epoch * 1000)

# define a training loop
def finetune_model(model, args, train_dataset, val_dataset, tokenizer):
    """Fine-tune ``model`` with a Hugging Face ``Trainer`` and report its cost.

    Training is wrapped in a codecarbon ``EmissionsTracker``. After training
    the function prints the tracked emissions, the GPU name and memory (when
    CUDA is available) and the elapsed wall-clock time in minutes.

    Args:
        model: model to fine-tune.
        args: ``TrainingArguments`` for the run.
        train_dataset: tokenized training features.
        val_dataset: tokenized evaluation features handed to the trainer as
            ``eval_dataset``; may be ``None``.
        tokenizer: tokenizer handed to the trainer.

    Returns:
        The ``Trainer`` instance after training.
    """
    import torch
    from codecarbon import EmissionsTracker
    from transformers import Trainer

    tracker = EmissionsTracker()
    tracker.start()
    start_time = current_milli_time()

    emissions = None
    try:
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,  # previously hard-coded to None, ignoring the argument
            tokenizer=tokenizer,
        )
        trainer.train()
    finally:
        # Always stop the tracker, even when training fails part-way through.
        emissions = tracker.stop()

    print('Emissions:', emissions, 'CO_2 eq (in KG)')
    if torch.cuda.is_available():
        gpu = torch.cuda.get_device_properties(0)
        print('GPU device name:', gpu.name)
        # total_memory is in bytes; divide by 2**30 so the value matches the GiB label
        print('GPU device memory:', gpu.total_memory / 2**30, "GiB")
    # elapsed time converted from milliseconds to minutes
    print('Training time:', (current_milli_time() - start_time) / (1000 * 60))
    return trainer

In [None]:
# set model and training arguments
# The QA model is initialised from the same checkpoint as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(pre_trained_model_checkpoint)
args = TrainingArguments(
    model_name,
    evaluation_strategy="no",  # no evaluation during training; scoring happens in Section 5
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision only when a CUDA device is present, so the notebook
    # still runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    hub_model_id=hub_model_id,
    push_to_hub=True,
)

In [None]:
# finetune model
# None is passed for val_dataset: no evaluation set is used during training.
trainer = finetune_model(model, args, train_dataset, None, tokenizer)

#### Section 5: validating the model

In [None]:
# validate on training dataset
# train_valid_dataset is the training split preprocessed with the validation
# pipeline, so it carries the example_id / offset_mapping that compute_metrics reads.
predictions = trainer.predict(train_valid_dataset)
# predictions.predictions is unpacked into the start and end logits
start_logits, end_logits = predictions.predictions

print("validation metrics on training dataset are as follows:]\n",compute_metrics(start_logits, end_logits, train_valid_dataset,raw_datasets["train"]))

In [None]:
# validate on validation dataset
# Run the fine-tuned model over the held-out validation features.
predictions = trainer.predict(validation_dataset)
# predictions.predictions is unpacked into the start and end logits
start_logits, end_logits = predictions.predictions

print("validation metrics on validation dataset are as follows:]\n",compute_metrics(start_logits, end_logits, validation_dataset,raw_datasets["validation"]))

#### Section 6: Push model to the Hugging Face Hub

In [None]:
# Push the fine-tuned model to the Hugging Face Hub if needed;
# the commit message records the current timestamp.
trainer.push_to_hub(commit_message="Run {}".format(datetime.now()))