Dependency import

In [1]:
import os
import requests
import json

Creating a dataset folder

In [2]:
os.mkdir("./finetune_squad")

FileExistsError: [Errno 17] File exists: './finetune_squad'

In [4]:
link = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

Download data

In [5]:
train_test_key = ['train-v2.0.json','dev-v2.0.json']

for key in train_test_key:
    response = requests.get(f'{link}{key}')
    with open (f'finetune_squad/{key}','wb') as file:
        for small_chunk in response.iter_content(chunk_size=10):
            file.write(small_chunk)

Data Prep

In [7]:
def get_train_test_data(path):
    contexts = []
    answers = []
    questions = []
    with open(path) as file:
        json_data = json.load(file)
        for item in json_data['data']:
            paragraph = item['paragraphs']
            for sub_item in paragraph:
                context = sub_item['context']
                for qas in sub_item['qas']:
                    question = qas['question']

                    if 'plausible_answers' not in qas.keys():
                        answer = qas['answers'][0]
                    else:
                        try:
                            answer = qas['plausible_answers'][0]
                        except:
                            continue
                    
                    contexts.append(context)
                    answers.append(answer)
                    questions.append(question)
    return contexts,answers,questions

In [8]:
train_path = 'finetune_squad/train-v2.0.json'
test_path = 'finetune_squad/dev-v2.0.json'

train_contexts,train_answers,train_questions = get_train_test_data(train_path)
test_contexts,test_answers,test_questions = get_train_test_data(test_path)

Adding end index for SQUAD

In [9]:
def add_end_index(answers,contexts):
    for answer, context in zip(answers,contexts):
        answer_text = answer['text']
        starting_index = answer['answer_start']
        ending_index = starting_index + len(answer_text)

        if context[starting_index:ending_index] == answer_text:
            answer['answer_end'] = ending_index

        else:
            for shift in [1,2]:
                if context[starting_index-shift:ending_index-shift] == answer_text:
                    answer['answer_start'] = starting_index-shift
                    answer['answer_end'] = ending_index-shift

add_end_index(train_answers,train_contexts)
add_end_index(test_answers,test_contexts)

Encoding/ Tokenizing

In [11]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [12]:
train_encodings = tokenizer(train_contexts,train_questions,truncation=True,padding=True)
test_encodings = tokenizer(test_contexts,test_questions,truncation=True,padding=True)

In [13]:
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best - selling girl groups of all time. their hiatus saw the release of beyonce\'s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP] when did beyonce start becoming popular? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [14]:
train_encodings['input_ids'][0][67]

1999

In [15]:
train_encodings.char_to_token(0,train_answers[0]['answer_end'])

In [17]:
train_answers[0]

{'text': 'in the late 1990s', 'answer_start': 269, 'answer_end': 286}

In [18]:
def get_answer_tokens(encodings,answers):
    start_pos = []
    end_pos = []
    for i in range(len(answers)):
        start_pos.append(encodings.char_to_token(i,answers[i]['answer_start']))
        end_pos.append(encodings.char_to_token(i,answers[i]['answer_end']))

        if start_pos[-1] is None:
            start_pos[-1] = tokenizer.model_max_length
        
        back = 1
        while end_pos[-1] is None:
            end_pos[-1] = encodings.char_to_token(i,answers[i]['answer_end']-back)

            back += 1 

        encodings.update({"start_pos":start_pos,"end_pos":end_pos})

get_answer_tokens(train_encodings,train_answers)
get_answer_tokens(test_encodings,test_answers)

In [19]:
import torch

class sqdat(torch.utils.data.Dataset):
    def __init__(self,encodings):
        self.encodings = encodings
    
    def __getitem__(self,idx):
        return {key:torch.tensor(val[idx]) for key,val, in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)


train_datasets = sqdat(train_encodings)


test_datasets = sqdat(test_encodings)


Fine Tuning using a transformers model

In [20]:
from transformers import DistilBertForQuestionAnswering

model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading: 100%|██████████| 256M/256M [00:19<00:00, 13.5MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_output

In [21]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [22]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr = 5e-5)



In [23]:
train_loader = DataLoader(train_datasets, batch_size=8,shuffle=True)



for epoch in range(10):
    loop = tqdm(train_loader)
    for batch in loop:
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)

        attention_mask = batch['attention_mask'].to(device)

        start_pos = batch['start_pos'].to(device)

        end_pos = batch['end_pos'].to(device)

        outputs = model(input_ids,attention_mask = attention_mask,start_positions = start_pos,end_positions = end_pos)
        

        loss = outputs[0]

        loss.backward()

        optim.step()


        loop.set_description(f'Epoch {epoch}')

        loop.set_postfix(loss=loss.item())

Epoch 0:   1%|          | 85/16290 [05:48<18:26:24,  4.10s/it, loss=3.73]


KeyboardInterrupt: 

Saving the model

In [None]:
model_path = 'model/distil_fine_tuned'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Inference

In [None]:
model.eval()

In [None]:
test_loader = DataLoader(test_datasets)

accuracy = []


loop = tqdm(test_loader)
for batch in loop:

    with torch.no_grad():

        input_ids = batch['input_ids'].to(device)

        attention_mask = batch['attention_mask'].to(device)

        start_pos = batch['start_pos'].to(device)

        end_pos = batch['end_pos'].to(device)

        outputs = model(input_ids,attention_mask = attention_mask)

        start_preds = torch.argmax(outputs['start_logits'],dim=1)
        end_preds = torch.argmax(outputs['end_logits'],dim=1)

        accuracy.append(((start_pos == start_preds).sum()/len(start_preds)).item())
        accuracy.append(((end_pos == end_preds).sum()/len(end_preds)).item())
# calculate average accuracy in total
acc = sum(accuracy)/len(accuracy)

In [None]:
torch.argmax(outputs['start_logits'], dim=1)
torch.argmax(outputs['end_logits'], dim=1)