## References
https://www.kaggle.com/rhtsingh/chaii-qa-5-fold-xlmroberta-torch-infer

The post-processing code is taken from the reference above.


In [None]:
#torch libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

#tensorflow libraries
import tensorflow as tf
import tensorflow_addons as tfa

#huggingface libraries
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    AdamW,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup
)

#sklearn libraries
from sklearn.model_selection import StratifiedKFold

#python libraries
import pandas as pd
import numpy as np
import os
import random
import gc
from tqdm.notebook import tqdm
import collections
from string import punctuation

In [None]:
class CONFIG:
    """Static settings for the notebook, read everywhere as ``CONFIG.<name>``.

    The optimizer and scheduler parameter dicts are picked once, while the
    class body executes, from the ``optimizer`` / ``scheduler`` names below.
    """

    # --- debug ---
    debug = False
    debug_sample = 200  # number of rows to keep when debugging on a smaller dataset
    perform_fold = [0]

    # --- misc ---
    seed = 42
    num_workers = 2

    # --- training ---
    train_batchsize = 4
    val_batchsize = 8
    epochs = 1
    n_splits = 5

    # --- model ---
    model = '../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2'
    # Both values are tunable hyperparameters (defaults follow the huggingface guide).
    max_input_length = 384
    doc_stride = 128

    # --- optimizer: one of "AdamW", "Adam", "SGD" ---
    optimizer = "AdamW"

    # Gradient-norm clipping threshold, guards against exploding gradients.
    max_grad_norm = 1.0

    if optimizer in ("AdamW", "Adam"):
        optimizer_params = {
            "betas": (0.9, 0.999),
            "lr": 0.001,
            "eps": 1e-8,
            "weight_decay": 0.01,
            "amsgrad": False,
        }
    elif optimizer == "SGD":
        optimizer_params = {
            "lr": 0.001,
            "momentum": 0,
            "weight_decay": 0,
            "dampening": 0,
            "nesterov": False,
        }

    # --- scheduler ---
    # Recognised names: "CosineAnnealing", "ReduceLROnPlateu",
    # "CosineAnnealingWarmRestarts" (pytorch), plus "linear_with_warmup" and
    # "cosine_with_warmup" (huggingface).
    scheduler = "cosine_with_warmup"
    if scheduler == "CosineAnnealing":
        scheduler_params = {
            "T_max": 3,
            "eta_min": 0,
            "last_epoch": -1,
            "verbose": True,
        }
    elif scheduler == "ReduceLROnPlateu":
        scheduler_params = {
            "mode": 'min',
            "factor": 0.1,
            "patience": 10,
            "threshold": 1e-4,
            "threshold_mode": 'rel',
            "cooldown": 0,
            "min_lr": 0,
            "eps": 1e-8,
            "verbose": True,
        }
    elif scheduler == "CosineAnnealingWarmRestarts":
        scheduler_params = {
            "T_0": 3,
            "T_mult": 1,
            "eta_min": 0,
            "last_epoch": -1,
            "verbose": True,
        }
    elif scheduler in ("linear_with_warmup", "cosine_with_warmup"):
        # Both huggingface warmup schedulers take the same single setting.
        scheduler_params = {
            "warmup_steps_ratio": 0.1,
        }

    # --- feature switches ---
    SWA = False         # stochastic weight averaging
    FP16 = False        # FP16 toggle
    ACCELERATE = False  # accelerate toggle

## Setting seed

In [None]:
def set_random_seed(seed):
    """Seed Python's ``random`` module and NumPy's global RNG.

    Also writes ``str(seed)`` to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    

def set_torch_seed(seed):
    """Seed the PyTorch RNGs: the default generator, the current CUDA device, and all CUDA devices."""
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
    # NOTE: torch.backends.cudnn.benchmark / .deterministic are not touched here.

def set_tf_seed(seed):
    """Seed TensorFlow by forwarding ``seed`` to ``tf.random.set_seed``."""
    tf.random.set_seed(seed)  
    
# Seed Python/NumPy and PyTorch from the single seed stored in CONFIG.
set_random_seed(CONFIG.seed)
set_torch_seed(CONFIG.seed)
# TensorFlow seeding is currently disabled:
# set_tf_seed(CONFIG.seed)

## Loading the test set

In [None]:
# Read the competition test CSV; later cells use its id, question and context columns.
test_df = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
# Notebook preview of the first rows.
test_df.head()

## Process the features

In [None]:
def preprocess_question(df):
    """Strip leading/trailing whitespace from the ``question`` column.

    The column is overwritten on ``df`` itself and the same object is returned.
    """
    stripped_questions = df['question'].str.strip()
    df['question'] = stripped_questions
    return df

# Strip surrounding whitespace from the test questions (test_df is modified and rebound).
test_df = preprocess_question(test_df)

In [None]:
def break_long_context(df, tokenizer, train=True):
    """Split every (question, context) row of ``df`` into fixed-length token windows.

    Each row is tokenized as a question/context pair with
    ``truncation='only_second'`` (only the context is cut), padded to
    ``CONFIG.max_input_length`` tokens, with ``CONFIG.doc_stride`` passed as the
    tokenizer's ``stride`` and overflowing tokens returned as extra windows.
    One feature dict is produced per window.

    Args:
        df: DataFrame with ``question`` and ``context`` columns. With
            ``train=True`` it must also have ``answer_start``, ``answer_text``
            and ``fold``; with ``train=False`` it must have ``id``.
        tokenizer: callable tokenizer that supports
            ``return_overflowing_tokens`` / ``return_offsets_mapping`` and
            whose result exposes ``sequence_ids(j)`` (presumably a HuggingFace
            fast tokenizer -- confirm against the caller).
        train: when True, label each window with the answer's token span;
            when False, emit inference features carrying the original text.

    Returns:
        list[dict]: one dict per window.

        * ``train=True``: ``input_ids``, ``attention_mask``, ``sliced_text``,
          ``offset_mapping``, ``fold``, ``start_position``, ``end_position``,
          ``tokenized_answer``, ``answer_text``. Windows without the (whole)
          answer point both positions at the CLS token and carry empty
          answer strings.
        * ``train=False``: ``input_ids``, ``attention_mask``,
          ``offset_mapping``, ``example_id``, ``context``, ``question``,
          ``sequence_ids`` (with ``None`` replaced by 0).

    Note:
        The previous version only executed ``return full_set`` inside the
        ``train=False`` branch, so ``train=True`` returned ``None``. Both modes
        now return the feature list.
    """
    full_set = []
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        # tokenizer parameters can be found here
        # https://huggingface.co/transformers/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase
        tokenized_examples = tokenizer(
            row['question'],
            row['context'],
            padding='max_length',
            max_length=CONFIG.max_input_length,
            truncation='only_second',
            stride=CONFIG.doc_stride,
            return_overflowing_tokens=True,  # extra windows for the overflowing context
            return_offsets_mapping=True,     # per-token (start_char, end_char) offsets
        )

        # Remaining keys afterwards: 'input_ids', 'attention_mask'
        sample_mappings = tokenized_examples.pop("overflow_to_sample_mapping")
        offset_mappings = tokenized_examples.pop("offset_mapping")

        for j in range(len(sample_mappings)):
            input_ids = tokenized_examples["input_ids"][j]
            attention_mask = tokenized_examples["attention_mask"][j]
            offset_map = offset_mappings[j]

            if not train:
                full_set.append(dict(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    offset_mapping=offset_map,
                    example_id=row['id'],
                    context=row['context'],
                    question=row['question'],
                    # None (special tokens) is folded into 0 so the list holds only ints
                    sequence_ids=[0 if value is None else value
                                  for value in tokenized_examples.sequence_ids(j)],
                ))
                continue

            sliced_text = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(input_ids))
            final_example = dict(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 sliced_text=sliced_text,
                                 offset_mapping=offset_map,
                                 fold=row['fold'])

            # Most of the time cls_index is 0
            cls_index = input_ids.index(tokenizer.cls_token_id)
            # None, 0, 0, .... None, None, 1, 1,.....
            sequence_ids = tokenized_examples.sequence_ids(j)

            answer_in_window = False
            if not np.isnan(row["answer_start"]):
                start_char = row["answer_start"]
                end_char = start_char + len(row["answer_text"])

                # First and last token of the context segment in this window.
                token_start_index = sequence_ids.index(1)
                token_end_index = len(sequence_ids) - 1 - (sequence_ids[::-1].index(1))

                answer_in_window = (
                    offset_map[token_start_index][0] <= start_char
                    and offset_map[token_end_index][1] >= end_char
                )

            if not answer_in_window:
                # No answer, or the answer is not fully inside this window:
                # both positions point at the CLS token.
                final_example['start_position'] = cls_index
                final_example['end_position'] = cls_index
                final_example['tokenized_answer'] = ""
                final_example['answer_text'] = ""
            else:
                # Advance to the first token starting after the answer start, then step back one.
                while token_start_index < len(offset_map) and offset_map[token_start_index][0] <= start_char:
                    token_start_index += 1
                final_example['start_position'] = token_start_index - 1

                # Walk back to the last token ending before the answer end, then step forward one.
                # end_position is inclusive, hence the +1 in the slice below.
                while offset_map[token_end_index][1] >= end_char:
                    token_end_index -= 1
                final_example['end_position'] = token_end_index + 1

                answer_ids = input_ids[final_example['start_position']:final_example['end_position'] + 1]
                final_example['tokenized_answer'] = tokenizer.convert_tokens_to_string(
                    tokenizer.convert_ids_to_tokens(answer_ids))
                final_example['answer_text'] = row['answer_text']

            full_set.append(final_example)

    return full_set

In [None]:
# Load the tokenizer from the same path as the model and window the test rows
# into inference features (train=False, so no answer labels are produced).
tokenizer = AutoTokenizer.from_pretrained(CONFIG.model)
test_set = break_long_context(test_df, tokenizer, train=False)
# Optional inspection dump of the features, left disabled:
# full_test_df = pd.DataFrame.from_dict(test_set)
# full_test_df.to_excel("full_test_df.xlsx")

# print(f"Total test examples = {len(full_test_df)}")

## Creating the dataset

In [None]:
class ChaiDataset(Dataset):
    """Torch ``Dataset`` over a list of feature dicts.

    ``input_ids``, ``attention_mask`` and ``offset_mapping`` are always
    returned as ``torch.long`` tensors. In training mode the item also holds
    ``start_position`` / ``end_position`` tensors; otherwise it carries
    ``sequence_ids``, ``id`` (read from ``example_id``), ``context`` and
    ``question`` untouched.
    """

    _TENSOR_KEYS = ('input_ids', 'attention_mask', 'offset_mapping')
    _LABEL_KEYS = ('start_position', 'end_position')

    def __init__(self, dataset, is_train=True):
        super().__init__()
        self.dataset = dataset  # list of features
        self.is_train = is_train

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        record = self.dataset[index]
        sample = {
            key: torch.tensor(record[key], dtype=torch.long)
            for key in self._TENSOR_KEYS
        }

        if self.is_train:
            for key in self._LABEL_KEYS:
                sample[key] = torch.tensor(record[key], dtype=torch.long)
            return sample

        # Inference items keep the raw metadata that post-processing reads.
        sample['sequence_ids'] = record['sequence_ids']
        sample['id'] = record['example_id']
        sample['context'] = record['context']
        sample['question'] = record['question']
        return sample
        

## Creating the model

In [None]:
class ChaiModel(nn.Module):
    """Pretrained backbone followed by a linear layer that scores each token as answer start / end."""

    def __init__(self, model_config):
        super().__init__()
        # The backbone is always loaded from CONFIG.model; model_config only
        # supplies the hidden size for the 2-unit head.
        self.backbone = AutoModel.from_pretrained(CONFIG.model)
        self.linear = nn.Linear(model_config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)`` for a batch.

        Each returned tensor is the head output with its last (size-2) axis
        removed, i.e. presumably (batchsize, sequencelength).
        """
        # First element of the backbone output: presumably
        # (batchsize, sequencelength, hidden_dim).
        hidden_states = self.backbone(input_ids, attention_mask=attention_mask)[0]

        # (..., 2): channel 0 scores the start, channel 1 scores the end.
        span_logits = self.linear(hidden_states)
        start_logits = span_logits[..., 0]
        end_logits = span_logits[..., 1]

        return start_logits, end_logits
        

In [None]:
def get_model(model_checkpoint):
    """Build a ``ChaiModel`` and restore its weights from ``model_checkpoint``.

    Args:
        model_checkpoint: path (or file-like object) of a saved state dict
            compatible with ``ChaiModel``.

    Returns:
        The model in eval mode, with its weights on the CPU. Callers move it
        to the target device themselves.
    """
    config = AutoConfig.from_pretrained(CONFIG.model)
    reloaded_model = ChaiModel(config)
    # map_location="cpu": a checkpoint saved from a GPU would otherwise be
    # deserialized back onto that GPU, which fails on a CPU-only host and
    # needlessly stages a second copy of the weights in GPU memory.
    state_dict = torch.load(model_checkpoint, map_location="cpu")
    reloaded_model.load_state_dict(state_dict)
    reloaded_model.eval()
    return reloaded_model

In [None]:
def get_test_loader(test_features):
    """Wrap inference features in a ``ChaiDataset`` and return a sequential ``DataLoader``.

    Batches hold ``CONFIG.val_batchsize`` items; order is preserved and the
    final partial batch is kept.
    """
    loader_options = dict(
        shuffle=False,
        num_workers=CONFIG.num_workers,
        drop_last=False,
        pin_memory=True,
    )
    return DataLoader(
        ChaiDataset(test_features, is_train=False),
        CONFIG.val_batchsize,
        **loader_options,
    )

In [None]:
def get_prediction(test_dataloader, model_checkpoint, device):
    """Run one checkpoint over ``test_dataloader`` and collect its logits.

    Returns two NumPy arrays (start logits, end logits), each built by
    vertically stacking the per-batch model outputs in loader order.
    """
    model = get_model(model_checkpoint)
    model.eval()
    model.to(device)

    start_chunks, end_chunks = [], []
    with torch.no_grad():
        for features in tqdm(test_dataloader, total=len(test_dataloader)):
            input_ids = features['input_ids'].to(device)
            attention_mask = features['attention_mask'].to(device)
            # One row of start logits and one of end logits per item in the batch.
            batch_start, batch_end = model(input_ids, attention_mask)
            start_chunks.append(batch_start.cpu().numpy())
            end_chunks.append(batch_end.cpu().numpy())

    # Drop the model before the next checkpoint is loaded.
    del model
    gc.collect()
    return np.vstack(start_chunks), np.vstack(end_chunks)

## Set device

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## Prepare dataloader

In [None]:
# Batch the windowed test features for inference (no shuffling, no dropped batch).
test_dataloader = get_test_loader(test_set)

## Predictions

In [None]:
# Run each of the five fold checkpoints over the same test loader. Every call
# loads its model, predicts, and releases it again, so only one model is held
# at a time. Note the checkpoint file names are not uniform across folds.
start_logits_0, end_logits_0 = get_prediction(test_dataloader,
                                          "../input/chai-finetuned-model-provided-data/pytorch_model_fold_0.pth", 
                                          device)
start_logits_1, end_logits_1 = get_prediction(test_dataloader,
                                          "../input/chai-finetuned-model-provided-data/pytorch_model_fold_1.pth", 
                                          device)
start_logits_2, end_logits_2 = get_prediction(test_dataloader,
                                          "../input/chai-finetuned-model-provided-data/pytorch_model_fold_2_epoch1.pth", 
                                          device)
start_logits_3, end_logits_3 = get_prediction(test_dataloader,
                                          "../input/chai-finetuned-model-provided-data/pytorch_model_fold_3_epoch0.pth", 
                                          device)
start_logits_4, end_logits_4 = get_prediction(test_dataloader,
                                          "../input/chai-finetuned-model-provided-data/pytorch_model_fold_4_epoch0.pth", 
                                          device)

In [None]:
# Ensemble: element-wise mean of the five folds' logits (equal weights).
start_logits = (start_logits_0 + start_logits_1 + start_logits_2 +start_logits_3+ start_logits_4 )/5
end_logits = (end_logits_0 + end_logits_1 + end_logits_2 +end_logits_3 + end_logits_4)/5

## Post processing

In [None]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Turn per-window start/end logits into one answer string per example.

    For every example, each of its windows contributes candidate spans built
    from the ``n_best_size`` highest start logits and the ``n_best_size``
    highest end logits. A candidate is kept only if both tokens belong to the
    context (``sequence_ids == 1``), the end is not before the start, and the
    span covers at most ``max_answer_length`` tokens. The candidate with the
    highest ``start_logit + end_logit`` wins (the first one seen on ties);
    its text is sliced from the example's context by character offsets.

    Args:
        examples: DataFrame with ``id`` and ``context`` columns. Any index is
            accepted; rows are matched to features by ``id``.
        features: sequence of dicts with ``example_id``, ``offset_mapping``
            and ``sequence_ids``. The dicts are not modified.
        raw_predictions: ``(all_start_logits, all_end_logits)``, each indexable
            by feature position and yielding a 1-D array of token logits.
        n_best_size: number of top start and top end tokens to combine.
        max_answer_length: maximum answer length in tokens.

    Returns:
        collections.OrderedDict mapping example id to the predicted answer
        text, in the row order of ``examples``. Examples without any valid
        candidate map to ``""``.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature positions by the id of the example they were cut from.
    # Matching on id (not on the DataFrame index label) keeps the lookup
    # correct when `examples` does not have a default RangeIndex.
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    context_index = 1  # sequence id of context tokens (the question/special tokens are 0)

    for _, example in examples.iterrows():
        context = example["context"]
        best_text = ""
        best_score = None

        for feature_index in features_per_example.get(example["id"], ()):
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            feature = features[feature_index]
            sequence_ids = feature["sequence_ids"]

            # Mask the offsets of non-context tokens in a local copy so the
            # caller's feature dicts stay untouched.
            offset_mapping = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(feature["offset_mapping"])
            ]

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip tokens outside the window or outside the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    score = start_logits[start_index] + end_logits[end_index]
                    # Strict '>' keeps the earliest candidate on ties.
                    if best_score is None or score > best_score:
                        best_score = score
                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        best_text = context[start_char: end_char]

        predictions[example["id"]] = best_text

    return predictions

In [None]:
# Decode the averaged logits into one raw answer string per test id.
predictions = postprocess_qa_predictions(test_df, test_set, (start_logits, end_logits))

submission = []
for p1, p2 in predictions.items():
    # Collapse every run of whitespace into a single space ...
    p2 = " ".join(p2.split())
    # ... then drop leading/trailing characters found in string.punctuation.
    p2 = p2.strip(punctuation)
    submission.append((p1, p2))
    
sample = pd.DataFrame(submission, columns=["id", "PredictionString"])

# Attach the predictions to the test rows by id; test_df is rebound to the merged frame.
test_df =pd.merge(left=test_df,right=sample,on='id')
test_df

In [None]:
# Characters / sequences trimmed from the two ends of a prediction.
bad_starts = [".", ",", "(", ")", "-", "–",  ",", ";"]
bad_endings = ["...", "-", "(", ")", "–", ",", ";"]

# Suffixes after which a trailing "." is restored when the context contains
# the prediction followed by a dot.
tamil_ad = "கி.பி"
tamil_bc = "கி.மு"
tamil_km = "கி.மீ"
hindi_ad = "ई"
hindi_bc = "ई.पू"


cleaned_preds = []
for pred, context in test_df[["PredictionString", "context"]].to_numpy():
    # Empty predictions pass through unchanged.
    if pred != "":
        # Trim unwanted leading characters one at a time.
        while pred.startswith(tuple(bad_starts)):
            pred = pred[1:]

        # Trim unwanted trailing pieces; "..." is removed as a unit.
        while pred.endswith(tuple(bad_endings)):
            pred = pred[:-3] if pred.endswith("...") else pred[:-1]
        if pred.endswith("..."):
            pred = pred[:-3]

        dotted_suffixes = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)
        if pred.endswith(dotted_suffixes) and pred+"." in context:
            pred = pred+"."

    cleaned_preds.append(pred)

In [None]:
# Replace the raw predictions with their cleaned versions (same row order as the loop above).
test_df["PredictionString"] = cleaned_preds
test_df

In [None]:
# Keep only the two submission columns and write them to submission.csv without the index.
submission = test_df[['id','PredictionString']]
submission.to_csv("submission.csv", index=False)
submission