# Libraries

Installing HuggingFace Transformers (https://github.com/huggingface/transformers)

In [None]:
!pip install transformers

import json
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast 
import torch

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 9.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 30.2MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 22.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=9925cfd

# Dataset processing

Loading the dataset and splitting the data into train and validation sets

In [None]:
# Read the whole QA dataset from a JSON file in the working directory.
path = Path('qa_dataset.json')
data = json.loads(path.read_text(encoding='utf-8'))

# Hold out 20% of the records for validation.
# The split must be made on `data`: no `all_data` variable is defined in the
# notebook, so the previous call raised NameError on a fresh kernel.
# NOTE(review): assumes `data` is a list of groups (one dict per context) —
# confirm against the JSON layout.
# `random_state` is fixed so that re-running the notebook yields the same split.
train, val = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

Getting contexts, questions and answers from the train and validation sets

In [None]:
def read_set(set):
    """Flatten nested question-answering groups into three aligned lists.

    Parameters
    ----------
    set : iterable of dict
        Groups to flatten. Each group provides a 'context' value and a 'qas'
        iterable; each entry of 'qas' provides a 'question' value and an
        'answers' iterable.

    Returns
    -------
    tuple of (list, list, list)
        ``(contexts, questions, answers)``. There is one entry per answer, and
        the context and question are repeated for every answer they belong to,
        so position ``k`` of each list describes the same example.
    """
    contexts, questions, answers = [], [], []

    for group in set:
        passage = group['context']
        for qa in group['qas']:
            prompt = qa['question']
            found = list(qa['answers'])
            # One copy of the context and question per answer keeps the
            # three lists index-aligned.
            contexts.extend([passage] * len(found))
            questions.extend([prompt] * len(found))
            answers.extend(found)

    return contexts, questions, answers

# Flatten each split into its contexts, questions and answers lists.
train_contexts, train_questions, train_answers = read_set(train)
val_contexts, val_questions, val_answers = read_set(val)

Adding span tags to answers and contexts

In [None]:
def add_tags(answers, contexts):
    """Walk the answer/context pairs and look up each answer's span fields.

    For every pair produced by ``zip(answers, contexts)`` the answer's
    'text', 'answer_start' and 'answer_end' entries are read, in that order.
    The values are not stored, nothing is written back to ``answers`` or
    ``contexts``, and the function returns ``None``. The only observable
    effect is a ``KeyError`` when an answer lacks one of the three keys.

    NOTE(review): presumably this was meant to validate or adjust the answer
    span against the context; as written it changes nothing — confirm the
    intended behaviour.

    Parameters
    ----------
    answers : iterable of dict
        Answer records with 'text', 'answer_start' and 'answer_end' keys.
    contexts : iterable
        Contexts paired positionally with ``answers``; currently unused
        beyond limiting how many pairs are visited.
    """
    span_keys = ('text', 'answer_start', 'answer_end')
    for answer, _context in zip(answers, contexts):
        # The lookups are kept so that a record missing a key still fails here.
        _gold_text, _start, _end = [answer[key] for key in span_keys]

# Run add_tags over the answers and contexts of both splits.
add_tags(train_answers, train_contexts)
add_tags(val_answers, val_contexts)

# Tokenization and vectorization

Initializing BertTokenizerFast from HuggingFace for the BERT base multilingual cased pre-trained model

In [None]:
# Fast tokenizer loaded from the 'bert-base-multilingual-cased' pretrained checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

Tokenizing and vectorizing questions and contexts with BertTokenizerFast

In [None]:
# Encode each (context, question) pair, context first, with truncation and
# padding both enabled.
# NOTE(review): with truncation on, an answer span may fall outside the tokens
# that are kept — confirm this is handled where token positions are computed.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

# Preparing the data for training

Adding token positions to answers

In [None]:
def add_positions(encodings, answers, missing_position=None):
    """Attach answer start/end token indices to ``encodings`` in place.

    For example ``i`` the character offsets ``answers[i]['answer_start']`` and
    ``answers[i]['answer_end']`` are converted to token indices with
    ``encodings.char_to_token``. The two resulting lists are stored under the
    'start_positions' and 'end_positions' keys via ``encodings.update``.

    ``char_to_token`` may return ``None`` when a character offset has no
    corresponding token. A ``None`` cannot be converted to a tensor later on,
    so it is replaced by an integer fallback.

    Parameters
    ----------
    encodings
        Tokenizer output providing ``char_to_token(batch_index, char_index)``,
        ``update(dict)`` and ``encodings['input_ids']``.
    answers : sequence of dict
        One record per encoded example, each with integer 'answer_start' and
        'answer_end' character offsets.
    missing_position : int, optional
        Token index to store when an offset maps to no token. Defaults to
        ``len(encodings['input_ids'][i])``, one past the last token of
        example ``i``.
        NOTE(review): presumably the QA model's loss ignores targets outside
        the sequence, as the Hugging Face QA heads do — confirm for the model
        that is trained on this data.

    Returns
    -------
    None
        ``encodings`` is modified in place.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start = encodings.char_to_token(i, answer['answer_start'])
        # NOTE(review): if 'answer_end' is an exclusive offset, the last answer
        # character is at answer_end - 1 — confirm the dataset convention.
        end = encodings.char_to_token(i, answer['answer_end'])

        if start is None or end is None:
            fallback = missing_position
            if fallback is None:
                # First index past the end of this example's token sequence.
                fallback = len(encodings['input_ids'][i])
            if start is None:
                start = fallback
            if end is None:
                end = fallback

        start_positions.append(start)
        end_positions.append(end)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Add 'start_positions' / 'end_positions' to the encodings of both splits (in place).
add_positions(train_encodings, train_answers)
add_positions(val_encodings, val_answers)

Adapting the data for training with PyTorch

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset backed by a tokenizer encodings mapping.

    ``encodings`` must provide ``items()`` yielding ``(key, per-example
    sequence)`` pairs and an ``input_ids`` attribute whose length is the
    number of examples.
    """

    def __init__(self, encodings):
        """Keep a reference to ``encodings``; nothing is copied."""
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from ``encodings.input_ids``."""
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per encodings key."""
        example = {}
        for name, column in self.encodings.items():
            example[name] = torch.tensor(column[idx])
        return example

# Wrap the encodings of each split in the Dataset class.
train_dataset = Dataset(train_encodings)
val_dataset = Dataset(val_encodings)