# Learning references
1. BPE for Tamil/Hindi \
   a. https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models
2. Extractive Question and Answering Task\
    a. Main reference https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb \
    b. How to handle long context paragraphs that are longer than pretrained max_length
3. Extractive Q&A architecture
    a. https://www.youtube.com/watch?v=l8ZYCvgGu0o

# Trials
1. Training with provided dataset only
2. Training with external dataset + provided dataset

In [None]:
!pip install openpyxl
# For training in TPU
# !pip install accelerate
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp37-cp37m-linux_x86_64.whl

In [None]:
#torch libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

#tensorflow libraries
import tensorflow as tf
import tensorflow_addons as tfa

#huggingface libraries
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup
)
# For training with TPU
# from accelerate import Accelerator
# from accelerate import notebook_launcher


#sklearn libraries
from sklearn.model_selection import StratifiedKFold

# python libraries
import numpy as np 
import multiprocessing
import pandas as pd 
import random
import os
import gc
import time
from tqdm.notebook import tqdm

In [None]:
class CONFIG:
    """Central store of the notebook's hyperparameters.

    Every setting is a plain class attribute and is read as ``CONFIG.<name>``
    by the cells below. ``optimizer_params`` and ``scheduler_params`` are
    selected while the class body executes, based on the ``optimizer`` and
    ``scheduler`` names chosen here.
    """

    # ---- debugging -------------------------------------------------------
    debug = False
    debug_sample = 200  # number of rows kept when debug is enabled
    perform_fold = [4]  # folds that are actually trained

    # ---- miscellaneous ---------------------------------------------------
    seed = 42
    num_workers = 2

    # ---- training --------------------------------------------------------
    train_batchsize = 4
    val_batchsize = 8
    epochs = 2
    n_splits = 5

    # ---- model / tokenisation -------------------------------------------
    model = 'deepset/xlm-roberta-large-squad2'
    # Both values follow the Hugging Face question-answering guide and are
    # meant to be tuned.
    max_input_length = 384
    doc_stride = 128

    # ---- optimizer -------------------------------------------------------
    optimizer = "AdamW"  # supported: AdamW, Adam, SGD

    # Intended gradient-clipping threshold (guards against exploding
    # gradients).
    max_grad_norm = 1.0

    if optimizer in ("AdamW", "Adam"):
        optimizer_params = {
            "betas": (0.9, 0.999),
            "lr": 1e-5,
            "eps": 1e-8,
            "weight_decay": 0.01,
            "amsgrad": False,
        }
    elif optimizer == "SGD":
        optimizer_params = {
            "lr": 0.001,
            "momentum": 0,
            "weight_decay": 0,
            "dampening": 0,
            "nesterov": False,
        }

    # ---- learning-rate scheduler ----------------------------------------
    # PyTorch: CosineAnnealing, ReduceLROnPlateu, CosineAnnealingWarmRestarts
    # Hugging Face: linear_with_warmup, cosine_with_warmup
    scheduler = "cosine_with_warmup"

    if scheduler == "CosineAnnealing":
        scheduler_params = {
            "T_max": 3,
            "eta_min": 0,
            "last_epoch": -1,
            "verbose": True,
        }
    elif scheduler == "ReduceLROnPlateu":
        scheduler_params = {
            "mode": 'min',
            "factor": 0.1,
            "patience": 10,
            "threshold": 1e-4,
            "threshold_mode": 'rel',
            "cooldown": 0,
            "min_lr": 0,
            "eps": 1e-8,
            "verbose": True,
        }
    elif scheduler == "CosineAnnealingWarmRestarts":
        scheduler_params = {
            "T_0": 3,
            "T_mult": 1,
            "eta_min": 0,
            "last_epoch": -1,
            "verbose": True,
        }
    elif scheduler in ("linear_with_warmup", "cosine_with_warmup"):
        # Fraction of the total step count spent warming up.
        scheduler_params = {"warmup_steps_ratio": 0.1}

    # ---- optional features ----------------------------------------------
    SWA = False  # stochastic weight averaging

    # Mixed precision; the author noted that the loss did not decrease with
    # torch.cuda.amp enabled.
    FP16 = False

    ACCELERATE = False  # Hugging Face accelerate


## Setting Seed

In [None]:
def set_random_seed(seed):
    """Seed Python's ``random`` and NumPy's global RNG, then export the seed
    as the ``PYTHONHASHSEED`` environment variable."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    

def set_torch_seed(seed):
    """Seed PyTorch's RNGs through ``torch.manual_seed``,
    ``torch.cuda.manual_seed`` and ``torch.cuda.manual_seed_all``."""
    for seeder in (torch.manual_seed,
                   torch.cuda.manual_seed,
                   torch.cuda.manual_seed_all):
        seeder(seed)
    # The cuDNN switches below are deliberately left disabled:
    #     torch.backends.cudnn.benchmark = False
    #     torch.backends.cudnn.deterministic = True

def set_tf_seed(seed):
    """Seed TensorFlow's global random generator."""
    tf.random.set_seed(seed=seed)
    
# Seed Python/NumPy and PyTorch with the configured seed before any data is
# sampled or any model is built. TensorFlow seeding is left disabled.
set_random_seed(CONFIG.seed)
set_torch_seed(CONFIG.seed)
# set_tf_seed(CONFIG.seed)

## Reading the data

In [None]:
# Load the competition training set.
train_df = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/train.csv")

if CONFIG.debug:
    # Work on a small subset while debugging. The sample is seeded explicitly
    # so it does not depend on the state of NumPy's global RNG, and it is
    # capped at the frame length so a large debug_sample cannot raise.
    train_df = train_df.sample(min(CONFIG.debug_sample, len(train_df)),
                               random_state=CONFIG.seed)

# Every row starts in fold -1 and then receives the index of the split in
# which it is part of the held-out set. Splits are stratified on language.
train_df['fold'] = -1
k_fold = StratifiedKFold(n_splits=CONFIG.n_splits, shuffle=True, random_state=CONFIG.seed)
for fold_num, (train_idx, test_idx) in enumerate(k_fold.split(train_df['id'], y=train_df['language'])):
    # test_idx is positional, hence iloc; the column is looked up by name
    # instead of assuming 'fold' is the last column.
    train_df.iloc[test_idx, train_df.columns.get_loc('fold')] = fold_num

train_df.head()

## Adding external dataset

In [None]:
# Extra question/answer rows read from the pre-processed MLQA and XQuAD files.
external_mlqa = pd.read_csv('../input/mlqa-hindi-processed/mlqa_hindi.csv')
external_xquad = pd.read_csv('../input/mlqa-hindi-processed/xquad.csv')

# Stack both sources, number the rows 1..N as their id and put them in
# fold -1, which never equals a validation fold index.
external_train = pd.concat([external_mlqa, external_xquad]).assign(
    id=lambda frame: list(np.arange(1, len(frame) + 1)),
    fold=-1,
)

# Append the external rows to the competition data and renumber the index.
train_df = pd.concat([train_df, external_train]).reset_index(drop=True)
train_df

## While we cannot preprocess the context as it will affect the answer_start, we can clean up the question

In [None]:
def preprocess_question(df):
    """Strip leading and trailing whitespace from the ``question`` column.

    The frame is modified in place and the same object is returned.
    """
    stripped_questions = df['question'].str.strip()
    df['question'] = stripped_questions
    return df

# Clean the question text; the context column is left untouched so that
# answer_start stays valid.
train_df = preprocess_question(train_df)
train_df.head()

## Break up large context

In [None]:
def _find_answer_token_span(offset_map, sequence_ids, start_char, end_char):
    """Translate a character-level answer span into token indices of one window.

    Args:
        offset_map: per-token ``(char_start, char_end)`` pairs of the window.
        sequence_ids: per-token sequence ids of the window; context tokens
            carry the value 1.
        start_char: character offset of the answer inside the context.
        end_char: character offset one past the end of the answer.

    Returns:
        ``(start_token, end_token)``, both inclusive, or ``None`` when the
        context part of the window does not fully contain the answer.
    """
    context_start = sequence_ids.index(1)
    context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    answer_inside = (offset_map[context_start][0] <= start_char
                     and offset_map[context_end][1] >= end_char)
    if not answer_inside:
        return None

    # Walk forward to the first token that starts after the answer start.
    # The scan is limited to the context tokens: special and padding tokens
    # that follow the context report offset (0, 0) and would otherwise always
    # satisfy the comparison, pushing the start onto the last token of the
    # window whenever the answer begins in the final context token.
    token_start_index = context_start
    while token_start_index <= context_end and offset_map[token_start_index][0] <= start_char:
        token_start_index += 1

    # Walk backward to the last token that ends before the answer end,
    # again without leaving the context tokens.
    token_end_index = context_end
    while token_end_index >= context_start and offset_map[token_end_index][1] >= end_char:
        token_end_index -= 1

    return token_start_index - 1, token_end_index + 1


def break_long_context(df, tokenizer, train=True):
    """Tokenise every (question, context) row into fixed-length windows.

    Contexts longer than ``CONFIG.max_input_length`` tokens are split into
    overlapping windows (overlap ``CONFIG.doc_stride``); each window becomes
    one training feature.

    Args:
        df: frame with ``question``, ``context``, ``answer_start``,
            ``answer_text`` and ``fold`` columns.
        tokenizer: tokenizer called with the question/context pair; it must
            return offset mappings and overflowing windows.
        train: only ``True`` is implemented. For ``False`` nothing is built
            and ``None`` is returned.

    Returns:
        A list of dicts with the keys ``input_ids``, ``attention_mask``,
        ``sliced_text``, ``offset_mapping``, ``fold``, ``start_position``,
        ``end_position``, ``tokenized_answer`` and ``answer_text``. Windows
        that do not contain the answer (or rows without an answer) point both
        positions at the CLS token and carry empty answer strings.
    """
    if not train:
        # No inference-time featurisation exists in this function.
        return None

    full_train_set = []
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        # tokenizer parameters can be found here
        # https://huggingface.co/transformers/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase
        tokenized_examples = tokenizer(row['question'],
                                       row['context'],
                                       padding='max_length',
                                       max_length=CONFIG.max_input_length,
                                       truncation='only_second',  # only the context is cut
                                       stride=CONFIG.doc_stride,
                                       return_overflowing_tokens=True,  # one entry per window
                                       return_offsets_mapping=True     # token -> character offsets
                                       )

        # Remaining keys afterwards: 'input_ids', 'attention_mask'
        sample_mappings = tokenized_examples.pop("overflow_to_sample_mapping")
        offset_mappings = tokenized_examples.pop("offset_mapping")

        has_answer = not np.isnan(row["answer_start"])

        for j in range(len(sample_mappings)):
            input_ids = tokenized_examples["input_ids"][j]
            attention_mask = tokenized_examples["attention_mask"][j]
            offset_map = offset_mappings[j]

            sliced_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids))
            final_example = dict(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 sliced_text=sliced_text,
                                 offset_mapping=offset_map,
                                 fold=row['fold'])

            # Most of the time cls_index is 0
            cls_index = input_ids.index(tokenizer.cls_token_id)

            span = None
            if has_answer:
                start_char = row["answer_start"]
                end_char = start_char + len(row["answer_text"])
                # sequence ids look like: None, 0, 0, ... None, None, 1, 1, ...
                span = _find_answer_token_span(offset_map,
                                               tokenized_examples.sequence_ids(j),
                                               start_char,
                                               end_char)

            if span is None:
                # No answer in this window: both positions point at CLS.
                final_example['start_position'] = cls_index
                final_example['end_position'] = cls_index
                final_example['tokenized_answer'] = ""
                final_example['answer_text'] = ""
            else:
                start_position, end_position = span
                final_example['start_position'] = start_position
                final_example['end_position'] = end_position
                # end_position is inclusive, hence the +1 when slicing.
                final_example['tokenized_answer'] = tokenizer.convert_tokens_to_string(
                    tokenizer.convert_ids_to_tokens(input_ids[start_position:end_position + 1]))
                final_example['answer_text'] = row['answer_text']

            full_train_set.append(final_example)

    return full_train_set

## Exporting to do a sanity check to make sure preprocessing is correct

In [None]:
# Load the tokenizer that belongs to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.model)

# Split every row into fixed-length windows, then dump the resulting features
# to a spreadsheet so the answer alignment can be inspected by hand.
full_train_set = break_long_context(train_df,tokenizer)
full_train_df = pd.DataFrame.from_dict(full_train_set)
full_train_df.to_excel("full_train_df.xlsx")

print(f"Total training examples = {len(full_train_set)}")

In [None]:
# Preview the first few tokenised windows.
full_train_df.head()

## Based on the analysis, we can do the following post-processing
post processing \
strip \
\"-" #strip front maybe dangerous\
(\
)\
[\
]

change\
கிமீ2 -> கிமீ²\ #kmsquare
கி.மீ2 ->கி.மீ²\ #kmsquare
இந்தியாவில் ->இந்தியா # from in india to india\
இந்தியாவின்->இந்தியா # from india's to india\
ஹென்றி பெக்கொரெலுக்கு ->ஹென்றி பெக்கொரெலு \
right strip என்றும் # always known as\
दोनों->दो #both to two

## Creating the dataset

In [None]:
# Show which fields each tokenised window carries.
full_train_set[0].keys()
# we will only need input_ids, attention_mask, start_position and end_position for training

In [None]:
class ChaiDataset(Dataset):
    """Map-style dataset over a list of feature dicts.

    With ``is_train=True`` each item holds the token tensors plus the answer
    start/end positions. With ``is_train=False`` the positions are replaced
    by ``sequence_ids``, ``id`` (read from ``example_id``), ``context`` and
    ``question``, passed through unchanged.
    """

    # Fields converted to long tensors in both modes.
    _TENSOR_FIELDS = ('input_ids', 'attention_mask', 'offset_mapping')

    def __init__(self, dataset, is_train=True):
        super().__init__()
        self.dataset = dataset  # list of feature dicts
        self.is_train = is_train

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        record = self.dataset[index]
        item = {
            field: torch.tensor(record[field], dtype=torch.long)
            for field in self._TENSOR_FIELDS
        }

        if self.is_train:
            for field in ('start_position', 'end_position'):
                item[field] = torch.tensor(record[field], dtype=torch.long)
            return item

        item['sequence_ids'] = record['sequence_ids']
        item['id'] = record['example_id']
        item['context'] = record['context']
        item['question'] = record['question']
        return item
        

## Creating model class

In [None]:
class ChaiModel(nn.Module):
    """Pretrained encoder followed by a two-output linear layer that scores
    every token as answer start and as answer end.

    The encoder is loaded from ``CONFIG.model``; ``model_config`` is only
    consulted for ``hidden_size``.
    """

    def __init__(self, model_config):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(CONFIG.model)
        self.linear = nn.Linear(model_config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``; per the original notes each
        is shaped (batch, sequence_length)."""
        # First backbone output: (batch, sequence_length, hidden_dim)
        hidden_states = self.backbone(input_ids, attention_mask=attention_mask)[0]
        # (batch, sequence_length, 2) -> two tensors with the last dim removed
        start_logits, end_logits = torch.unbind(self.linear(hidden_states), dim=-1)
        return start_logits, end_logits
        

## Utility functions

In [None]:
def loss_fn(y_pred, y_true):
    """Mean of the start-position and end-position cross-entropy losses.

    Args:
        y_pred: ``(start_logits, end_logits)``.
        y_true: ``(start_positions, end_positions)``; targets equal to -1
            are ignored.
    """
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    start_logits, end_logits = y_pred
    start_targets, end_targets = y_true

    start_loss = criterion(start_logits, start_targets)
    end_loss = criterion(end_logits, end_targets)
    return (start_loss + end_loss) / 2

def get_optimizer(model):
    """Create the optimizer named by ``CONFIG.optimizer`` over all model
    parameters, configured with ``CONFIG.optimizer_params``.

    Raises:
        NotImplementedError: for any name other than AdamW, Adam or SGD.
    """
    optimizer_classes = {
        "AdamW": torch.optim.AdamW,
        "Adam": torch.optim.Adam,
        "SGD": torch.optim.SGD,
    }
    if CONFIG.optimizer not in optimizer_classes:
        raise NotImplementedError

    optimizer_kwargs = {"params": model.parameters(), **CONFIG.optimizer_params}
    return optimizer_classes[CONFIG.optimizer](**optimizer_kwargs)

def get_scheduler(optimizer, total_steps_per_epoch):
    """Create the learning-rate scheduler named by ``CONFIG.scheduler``.

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        total_steps_per_epoch: step count handed to the warmup schedulers as
            their total number of training steps; the warmup length is
            ``warmup_steps_ratio`` times this value. The PyTorch schedulers
            ignore it.

    Raises:
        NotImplementedError: for an unknown scheduler name.
    """
    name = CONFIG.scheduler
    lr_scheduler = torch.optim.lr_scheduler

    if name == "CosineAnnealing":
        params = CONFIG.scheduler_params
        return lr_scheduler.CosineAnnealingLR(optimizer, params['T_max'],
                                              eta_min=params['eta_min'],
                                              last_epoch=params['last_epoch'],
                                              verbose=params['verbose'])

    if name == "ReduceLROnPlateu":
        kwargs = {"optimizer": optimizer, **CONFIG.scheduler_params}
        return lr_scheduler.ReduceLROnPlateau(**kwargs)

    if name == "CosineAnnealingWarmRestarts":
        kwargs = {"optimizer": optimizer, **CONFIG.scheduler_params}
        return lr_scheduler.CosineAnnealingWarmRestarts(**kwargs)

    if name in ("linear_with_warmup", "cosine_with_warmup"):
        num_warmup_steps = int(CONFIG.scheduler_params["warmup_steps_ratio"] * total_steps_per_epoch)
        if name == "linear_with_warmup":
            build_schedule = get_linear_schedule_with_warmup
        else:
            build_schedule = get_cosine_schedule_with_warmup
        return build_schedule(optimizer, num_warmup_steps, total_steps_per_epoch)

    raise NotImplementedError
    
    

In [None]:
def get_data_loaders(train_data, val_data):
    """Wrap the train and validation feature lists in ``DataLoader`` objects.

    Both sets use ``ChaiDataset`` in its default (training) mode, so the
    validation batches also carry start/end positions. Only the training
    loader shuffles.

    Returns:
        ``(train_dataloader, val_dataloader)``.
    """
    shared_options = dict(num_workers=CONFIG.num_workers,
                          drop_last=False,
                          pin_memory=True)

    train_dataloader = DataLoader(ChaiDataset(train_data),
                                  batch_size=CONFIG.train_batchsize,
                                  shuffle=True,
                                  **shared_options)
    val_dataloader = DataLoader(ChaiDataset(val_data),
                                batch_size=CONFIG.val_batchsize,
                                shuffle=False,
                                **shared_options)
    return train_dataloader, val_dataloader

In [None]:
class AverageMeter:
    """Tracks the latest value, weighted sum, count, running average and the
    extremes of a stream of values."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Return every statistic to its initial state."""
        self.val = self.avg = self.sum = self.count = self.max = 0
        self.min = 1e5

    def update(self, val, count=1):
        """Record ``val`` as observed ``count`` times."""
        self.sum += val * count
        self.count += count
        self.val = val
        self.avg = self.sum / self.count
        self.max = max(self.max, val)
        self.min = min(self.min, val)


In [None]:
def train_step(model, train_dataloader, optimizer,scheduler, device, scaler=None):
    """Train the model for one pass over ``train_dataloader``.

    Args:
        model: called as ``model(input_ids=..., attention_mask=...)`` and
            expected to return the pair consumed by ``loss_fn``.
        train_dataloader: yields dicts with ``input_ids``, ``attention_mask``,
            ``start_position`` and ``end_position`` tensors.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.
        scaler: optional ``torch.cuda.amp.GradScaler``; when given, the
            forward pass runs under autocast and the scaler drives the
            backward pass and the optimizer step.

    Returns:
        The sample-weighted average training loss of the epoch.

    NOTE: ``CONFIG.max_grad_norm`` is not applied here; no gradient clipping
    takes place.
    """
    model.train() #switch to train mode
    model.zero_grad()

    loss_meter = AverageMeter()
    prog_bar = tqdm(train_dataloader, desc='Train')
    for features in prog_bar:
        #move data to device
        input_ids = features['input_ids'].to(device)
        attention_mask = features['attention_mask'].to(device)
        start_position = features['start_position'].to(device)
        end_position = features['end_position'].to(device)

        optimizer.zero_grad() #clear any gradient first

        if scaler is None:
            #forward
            y_pred = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(y_pred, (start_position, end_position))

            #backward
            loss.backward()
            optimizer.step()
        else:
            # Only the forward pass and the loss run under autocast; the
            # backward pass and the optimizer step happen outside of it.
            with torch.cuda.amp.autocast():
                y_pred = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = loss_fn(y_pred, (start_position, end_position))

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        # Advance the schedule on both paths. The mixed-precision path used to
        # skip this, which left a warmup schedule stuck at its initial
        # learning rate for the whole run.
        scheduler.step()

        loss_meter.update(loss.detach().item(), input_ids.shape[0])
        prog_bar.set_postfix({"train_loss": loss_meter.avg})

    return loss_meter.avg
        
    
def evaluation_step(model, val_dataloader, device):
    """Compute the sample-weighted average loss over ``val_dataloader``
    with the model in eval mode and gradients disabled."""
    model.eval()
    meter = AverageMeter()
    progress = tqdm(val_dataloader)

    with torch.no_grad():
        for batch in progress:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = (batch['start_position'].to(device),
                       batch['end_position'].to(device))

            predictions = model(input_ids, attention_mask=attention_mask)
            batch_loss = loss_fn(predictions, targets).detach().item()

            meter.update(batch_loss, input_ids.shape[0])
            progress.set_postfix({"val_loss":meter.avg})

    return meter.avg

## Main training loop

### Hardware selection

In [None]:
# Choose the device used for training: the accelerate-managed device when
# CONFIG.ACCELERATE is set, otherwise CUDA if available, else the CPU.
if CONFIG.ACCELERATE:
    # NOTE(review): the `from accelerate import Accelerator` import at the top
    # of the notebook is commented out, so this branch raises NameError unless
    # that import is re-enabled.
    accelerator = Accelerator()
    device = accelerator.device
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
print(f"Using {device}")

In [None]:
# Train one model per selected fold. Rows whose fold equals n_fold form the
# validation set; everything else is used for training.
for n_fold in range(CONFIG.n_splits):
    if n_fold not in CONFIG.perform_fold:
        continue

    train_features = full_train_df[full_train_df['fold']!=n_fold].to_dict('records')
    val_features   = full_train_df[full_train_df['fold']==n_fold].to_dict('records')

    train_dataloader, val_dataloader = get_data_loaders(train_features, val_features)

    # The scheduler is stepped once per batch, so it spans all epochs.
    total_steps = len(train_dataloader) * CONFIG.epochs

    config = AutoConfig.from_pretrained(CONFIG.model)
    model = ChaiModel(config)
    model.to(device)
    optimizer = get_optimizer(model)
    scheduler = get_scheduler(optimizer,total_steps)

    scaler = torch.cuda.amp.GradScaler() if CONFIG.FP16 else None

    # Start from +inf so the first finite validation loss always produces a
    # checkpoint, however large it is.
    best_val_loss = float("inf")

    for epoch in range(CONFIG.epochs):
        print(f"EPOCH {epoch}")

        train_loss = train_step(model, train_dataloader, optimizer,scheduler, device, scaler)
        val_loss = evaluation_step(model, val_dataloader, device)

        # Keep a checkpoint for every epoch that improves the validation loss.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), f"./pytorch_model_fold_{n_fold}_epoch{epoch}.pth")

    # Drop the per-fold objects and hand cached GPU memory back before the
    # next fold allocates a new model.
    del model
    del optimizer
    del scheduler
    del train_dataloader
    del val_dataloader
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
# Final cell: prints a marker once it is reached.
print("done")