# Script to fine-tune model
## COVID-QA Analysis
### Yash Khandelwal, Kaushik Ravindran

github: https://github.com/yashskhandelwal/Covid_QA_Analysis





In [None]:
%%capture
# env setup
# install relevant libraries (%%capture hides the installer output of this cell)
!pip install datasets transformers
!pip install accelerate
!pip install humanize
!pip install millify
!pip install tqdm
# NOTE(review): git-lfs is presumably needed for pushing model files to the Hub — confirm
!apt-get install git-lfs
!pip install codecarbon
!git lfs install

In [None]:
%%capture
# for running on tpu: installs cloud-tpu-client, torch 1.9.0 and the torch_xla 1.9 wheel
# NOTE(review): the torch_xla wheel is tagged cp37 (CPython 3.7) — confirm it matches the runtime's Python version
!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

In [None]:
# imports
import math, statistics, time
from collections import defaultdict
import numpy as np
from tqdm.auto import tqdm
from datetime import datetime
import torch_xla
import torch_xla.core.xla_model as xm

import torch
from codecarbon import EmissionsTracker
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments

import warnings
warnings.filterwarnings("ignore")

In [None]:
# login to hugging face
# NOTE(review): presumably required because the training arguments set push_to_hub=True — confirm
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Run configuration shared by the cells of this notebook.

# Name of the dataset handed to `load_dataset`.
dataset = "covid_qa_deepset"

# Checkpoint the tokenizer and the model are loaded from.
pre_trained_model_checkpoint = "twmkn9/bert-base-uncased-squad2"

# Output directory name of the fine-tuned model and its repository id on the Hub.
model_name = "bert-base-uncased-squad2-covid-qa-deepset"
hub_model_id = "armageddon/bert-base-uncased-squad2-covid-qa-deepset"

# Overlap passed to the tokenizer when a long context is split into several features.
stride = 150

# NOTE(review): not referenced by the visible cells — presumably used at evaluation time; confirm.
max_answer_length = 150

#### Section 1: Prepping the dataset

In [None]:
# Load the dataset named by the `dataset` constant.
raw_datasets = load_dataset(dataset)

In [None]:
# Split the original "train" split into train (90%) and test (10%) with a fixed seed.
raw_datasets_split = raw_datasets["train"].train_test_split(train_size=0.9, seed=42)
# From here on `raw_datasets` refers to the train/test split, not the originally loaded dataset.
raw_datasets = raw_datasets_split

#### Section 2: Tokenize the dataset

In [None]:
# Load the tokenizer belonging to the pre-trained checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_checkpoint)

In [None]:
# pre-processing for training 
# split long context into multiple features 
# find answer start and end token id in each of the features
def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and label each feature with answer positions.

    A context that is too long for one input is split into several
    overlapping features (overlap of ``stride`` tokens). For every feature
    the token indices of the answer's first and last token are recorded. A
    feature that does not contain the *whole* answer is labelled ``(0, 0)``.

    Uses the module-level ``tokenizer`` and ``stride``.

    Args:
        examples: Batch with "question", "context" and "answers" columns.
            Only the first answer of each example is used
            (``answer["answer_start"][0]`` / ``answer["text"][0]``).

    Returns:
        The tokenizer output without "offset_mapping" and
        "overflow_to_sample_mapping", extended with "start_positions" and
        "end_positions" (one entry per feature).
    """
    questions = [q.strip() for q in examples["question"]]
    answers = examples["answers"]

    # use model tokenizer to tokenize examples
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # return_overflowing_tokens -- for each feature, it represents the original example it belonged to
    # return_offsets_mapping -- for each token, it returns the start and end position of the word
    # represented by that token in the original context

    # pop offset_mapping and overflow_to_sample mapping
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    # map the start and end token of answer in each feature
    start_positions = []
    end_positions = []

    # for each feature
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]  # get original example index
        answer = answers[sample_idx]  # get the answer for that example
        start_char = answer["answer_start"][0]  # start char of answer in original context
        end_char = start_char + len(answer["text"][0])  # end char of answer in original context

        # labels in tokenized input indicating whether token belongs to
        # question (0), context (1), or special token (None)
        sequence_ids = inputs.sequence_ids(i)

        # find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # if the answer is not fully inside the context, label is (0, 0).
        # The feature must begin at or before the answer start AND end at or
        # after the answer end; a partially contained answer would otherwise
        # produce a label pointing outside the context tokens.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize and label the training split in batches. The original columns are
# dropped so only the features produced by the preprocessing function remain,
# and the cache is bypassed so the mapping is always recomputed.
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    load_from_cache_file=False,
    remove_columns=raw_datasets["train"].column_names,
)

#### Section 4: Finetuning the model

In [None]:
def current_milli_time():
    """Return the current wall-clock time in whole milliseconds since the epoch."""
    seconds_since_epoch = time.time()
    return round(seconds_since_epoch * 1000)

# define a training loop
def finetune_model(model, args, train_dataset, val_dataset, tokenizer):
    """Fine-tune ``model`` with a ``Trainer`` and report emissions and run time.

    Args:
        model: Model handed to ``Trainer``.
        args: Training arguments handed to ``Trainer``.
        train_dataset: Dataset used for training.
        val_dataset: Dataset handed to ``Trainer`` as ``eval_dataset``; may be
            ``None`` when no evaluation is wanted.
        tokenizer: Tokenizer handed to ``Trainer``.

    Returns:
        The ``Trainer`` instance after ``train()`` has finished.
    """
    from transformers import Trainer
    from codecarbon import EmissionsTracker
    import torch

    tracker = EmissionsTracker()
    tracker.start()
    start_time = current_milli_time()

    try:
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
        )
        trainer.train()
    finally:
        # Stop the tracker even when training fails so it is not left running.
        emissions = tracker.stop()

    print('Emissions:', emissions, 'CO_2 eq (in KG)')
    if torch.cuda.is_available():
        gpu_properties = torch.cuda.get_device_properties(0)
        print('GPU device name:', gpu_properties.name)
        # total_memory is in bytes; divide by 2**30 so the value matches the "GiB" label.
        print('GPU device memory:', gpu_properties.total_memory / 2**30, "GiB")
    # Elapsed time is converted from milliseconds to minutes.
    print('Training time:', (current_milli_time() - start_time) / (1000 * 60))
    return trainer

In [None]:
# Load the pre-trained question-answering model that will be fine-tuned.
model = AutoModelForQuestionAnswering.from_pretrained(pre_trained_model_checkpoint)

# Training configuration: no evaluation during training, one checkpoint per
# epoch, and the result is pushed to the Hub repository `hub_model_id`.
args = TrainingArguments(
    output_dir=model_name,
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=False,
    # evaluation / checkpointing
    evaluation_strategy="no",
    save_strategy="epoch",
    # Hub upload
    push_to_hub=True,
    hub_model_id=hub_model_id,
)

In [None]:
# finetune model (no validation dataset is passed: the 4th argument, val_dataset, is None)
trainer = finetune_model(model, args, train_dataset, None, tokenizer)

#### Section 5: Push model to the Hugging Face Hub

In [None]:
# push the fine-tuned model to the Hugging Face Hub if needed
trainer.push_to_hub(commit_message="Train finetuned model checkpoint")