# DistilBERT

Tokeniser: DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Model: DistilBertForQuestionAnswering

Pre-trained model: [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased)

In [1]:
import json

### Data Prep

In [2]:
from collections.abc import Iterable

# Flatten arbitrarily nested lists of strings so that the context pieces can be joined into a single string

def flatten(xs):
    """Lazily yield the leaf items of an arbitrarily nested iterable.

    Any iterable element is descended into, depth-first and left to right.
    ``str`` and ``bytes`` values are treated as leaves so that text is
    yielded whole rather than character by character.
    """
    # Stack of iterators: the top one is the level currently being walked.
    pending = [iter(xs)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
                # Descend: the partially consumed parent stays on the stack
                # and is resumed once this child is exhausted.
                pending.append(iter(item))
                break
            yield item
        else:
            # The current level ran dry without descending further.
            pending.pop()

In [3]:
# To convert data into 3 lists: contexts, questions and answers respectively

def read_data(path):
    """Load a QA JSON file and split it into three parallel lists.

    The file must contain a list of records, each with the keys
    ``'context'``, ``'question'`` and ``'answer'``. The context may be a
    nested list of strings; it is flattened and concatenated (with no
    separator) into one string per record.

    Records whose answer is exactly ``"yes"`` or ``"no"`` are skipped,
    because such answers are not spans of the context.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    contexts = []
    questions = []
    answers = []

    for group in data:
        # assumes the answer is a string (it is later passed to str.find)
        answer = group['answer']
        # Compare the whole answer rather than testing for a substring:
        # `"no" in answer` would also discard genuine span answers such as
        # "piano" or "casino".
        if answer.strip() in ('yes', 'no'):
            continue
        contexts.append(''.join(flatten(group['context'])))
        questions.append(group['question'])
        answers.append(answer)

    return contexts, questions, answers

In [4]:
# Load the training and validation splits; each call returns three parallel
# lists (contexts, questions, answers). Paths are relative to the working directory.
train_contexts, train_questions, train_answers = read_data('train_set.json')
val_contexts, val_questions, val_answers = read_data('dev_set.json')

In [5]:
# The DistilBERT takes in input answer_start (start index of answer in context) and answer_end (end index of answer in context).
# NOTE: Some questions will inadvertently be removed because their answers contain yes/no, and will hence be missing from the corpus
# Return a list of dictionaries with {text:answerstring, answer_start: start_idx, answer_end: end_idx}

def update_train_answers(answers, contexts):
    """Locate every answer string inside its context.

    Args:
        answers: Answer strings, parallel to ``contexts``.
        contexts: Context strings in which the answers are searched.

    Returns:
        A list of dicts ``{'text', 'answer_start', 'answer_end'}`` where
        ``answer_start`` is the character index of the first occurrence of
        the answer in its context and ``answer_end`` is the exclusive end
        index (``answer_start + len(answer)``).

        Answers that do not occur in their context are printed and left out,
        so the result can be shorter than the inputs.
        NOTE(review): when that happens the result no longer lines up
        index-by-index with ``contexts``; callers that pair the two by
        position should filter the contexts/questions the same way.
    """
    located = []
    for answer, context in zip(answers, contexts):
        start_idx = context.find(answer)
        if start_idx == -1:
            # Not a span of the context (e.g. a yes/no answer): report it and
            # skip, instead of letting -1 leak into slice arithmetic.
            print(answer)
            continue
        # str.find guarantees context[start_idx:end_idx] == answer, so no
        # further offset correction is required.
        end_idx = start_idx + len(answer)
        located.append({'text': answer, 'answer_start': start_idx, 'answer_end': end_idx})
    return located

In [6]:
# Turn the raw answer strings into character spans within their contexts.
# Answers that cannot be found are printed by update_train_answers.
train_answers_n = update_train_answers(train_answers,train_contexts)
val_answers_n = update_train_answers(val_answers,val_contexts)

### Tokenize Inputs

In [7]:
from transformers import DistilBertTokenizerFast

# Load the fast DistilBERT tokenizer from the pretrained 'distilbert-base-uncased'
# checkpoint. The resulting encodings are later queried with char_to_token.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [8]:
# Encode each (context, question) pair with truncation and padding enabled.
# The context is passed first, so it is the first sequence of every pair.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [9]:
# Add the start and end position tokens which is required for the DistilBERTQA model
def add_token_answers(encodings, answers, max_length=None):
    """Add answer start/end token positions to ``encodings`` in place.

    Args:
        encodings: Tokenizer output providing ``char_to_token(batch_index,
            char_index)`` and ``update(mapping)``.
        answers: One dict per encoded example with the character offsets
            ``'answer_start'`` (inclusive) and ``'answer_end'`` (exclusive).
        max_length: Position stored when the answer was truncated out of the
            encoded sequence. Defaults to ``tokenizer.model_max_length`` of
            the notebook-level tokenizer.

    Returns:
        None. ``encodings`` gains the keys ``'start_positions'`` and
        ``'end_positions'``.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        first_char = answer['answer_start']
        # 'answer_end' is exclusive, so the answer's last character is one
        # before it. Looking up 'answer_end' itself would return the token
        # *after* the answer whenever no space follows (e.g. punctuation).
        last_char = max(answer['answer_end'] - 1, first_char)

        start_token = encodings.char_to_token(i, first_char)
        if start_token is None:
            # The answer begins beyond the truncated context, so its end does too.
            start_positions.append(max_length)
            end_positions.append(max_length)
            continue

        # Walk backwards over the answer until a character maps to a token.
        # This skips trailing whitespace or a truncated tail. The walk is
        # bounded by the answer start, so it always terminates.
        end_token = None
        for char_idx in range(last_char, first_char - 1, -1):
            end_token = encodings.char_to_token(i, char_idx)
            if end_token is not None:
                break

        start_positions.append(start_token)
        end_positions.append(max_length if end_token is None else end_token)

    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions,
    })

In [10]:
# Attach start/end token positions to the training encodings (modified in place).
add_token_answers(train_encodings,train_answers_n)

In [11]:
# Attach start/end token positions to the validation encodings (modified in place).
add_token_answers(val_encodings,val_answers_n)

In [None]:
import torch


class NLDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings.

    ``encodings`` must offer ``items()`` yielding ``(name, per-example
    sequence)`` pairs and expose an ``input_ids`` attribute whose length is
    the number of examples.
    """

    def __init__(self, encodings):
        # Kept by reference; nothing is copied or converted up front.
        self.encodings = encodings

    def __len__(self):
        # One row of input_ids per example.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build the example lazily: one tensor per encoded field.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

# Wrap both sets of encodings (including the added start/end positions)
# so they can be fed to a DataLoader.
train_dataset = NLDataset(train_encodings)
val_dataset = NLDataset(val_encodings)

### Fine-tuning

In [None]:
from transformers import DistilBertForQuestionAnswering

# Initialise the DistilBERT QA model from the pre-trained 'distilbert-base-uncased' weights.
# NOTE(review): presumably the span-prediction head is not part of that base
# checkpoint and is learned during fine-tuning below — confirm against the load warnings.
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model over to the detected device.
model.to(device)

In [None]:
# Optimiser over all model parameters. No weight_decay argument is passed,
# so the optimiser's own default applies.
# NOTE(review): transformers.AdamW is deprecated in newer transformers
# releases; check the weight_decay default before swapping in torch.optim.AdamW.
optim = AdamW(model.parameters(), lr=5e-5)

# Shuffled mini-batches of 16 training examples.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Names of the tensors every training step needs from a batch.
batch_fields = ('input_ids', 'attention_mask', 'start_positions', 'end_positions')

# Put the model in training mode before the first epoch ...
model.train()

for epoch in range(3):
    # ... and again at the top of every epoch.
    model.train()
    # tqdm wraps the loader to draw one progress bar per epoch.
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        # Clear the gradients accumulated by the previous step.
        optim.zero_grad()
        # Move the tensors needed for this step onto the training device.
        input_ids, attention_mask, start_positions, end_positions = (
            batch[field].to(device) for field in batch_fields
        )
        # Forward pass; passing the gold positions makes the model return a loss.
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        # The loss is the first element of the model output.
        loss = outputs[0]
        # Back-propagate to get gradients for every trainable parameter.
        loss.backward()
        # Apply the parameter update.
        optim.step()
        # Show the epoch number and current loss on the progress bar.
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
# Save the fine-tuned weights and the tokenizer files into the same directory,
# which is the path later passed to from_pretrained to reload the model.
model_path = 'models/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Switch the model out of training mode.
model.eval()

# Validation batches are read in dataset order.
val_loader = DataLoader(val_dataset, batch_size=16)

# Running totals counted per example. Averaging per-batch accuracies instead
# would give a smaller final batch the same weight as a full one.
correct = 0
total = 0

# Progress bar over the validation batches.
loop = tqdm(val_loader)
for batch in loop:
    # No gradients are needed because nothing is being trained.
    with torch.no_grad():
        # Pull the batched tensors from the loader onto the device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # Predict without labels, so only logits are produced.
        outputs = model(input_ids, attention_mask=attention_mask)
        # The most likely start/end token index for each example.
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # A start or an end counts as correct only on an exact index match.
        correct += (start_pred == start_true).sum().item()
        correct += (end_pred == end_true).sum().item()
        total += start_true.numel() + end_true.numel()

# Fraction of exactly matched start/end indices over the whole validation set.
# An empty validation set yields NaN instead of raising ZeroDivisionError.
# start_true/end_true/start_pred/end_pred keep the values of the last batch.
acc = correct / total if total else float('nan')

In [None]:
# Side-by-side view of gold vs. predicted token indices. Only the tensors left
# over from the final validation batch are shown.
print("T/F\tstart\tend\n")
for gold_start, gold_end, pred_start, pred_end in zip(start_true, end_true, start_pred, end_pred):
    print(f"true\t{gold_start}\t{gold_end}\n"
          f"pred\t{pred_start}\t{pred_end}\n")

In [None]:
# Overall exact-match accuracy of the predicted start and end token indices on the validation set.
print(acc)

### Load fine-tuned model

In [18]:
# Reload the fine-tuned model from the directory it was saved to with save_pretrained.
model = DistilBertForQuestionAnswering.from_pretrained('models/distilbert-custom')