In [1]:
import json
import requests
from google.colab import drive
drive.mount('/content/drive')
# Constants
NUM_PARAGRAPHS = 20
NUM_QAS = 20

# Download the dataset
url = "https://trung19991.github.io/squad_data/squad.json"
response = requests.get(url)
data = response.json()

# Function to create a lite version of the data
def create_lite_data(data, start_idx, end_idx):
    lite_data = {
        "version": data["version"],
        "data": []
    }
    for article in data["data"][start_idx:end_idx]:
        lite_article = {
            "title": article["title"],
            "paragraphs": []
        }
        for paragraph in article["paragraphs"][:NUM_PARAGRAPHS]:
            lite_paragraph = {
                "context": paragraph["context"],
                "qas": paragraph["qas"][:NUM_QAS]  # Take the first NUM_QAS Q&A pairs
            }
            lite_article["paragraphs"].append(lite_paragraph)
        lite_data["data"].append(lite_article)
    return lite_data

# Split data into training (70%) and validation (30%) parts
total_articles = len(data["data"])
train_size = int(total_articles * 0.8)

# Create the training data
train_data = create_lite_data(data, 0, train_size)

# Create the validation data
val_data = create_lite_data(data, train_size, total_articles)

# Save the lite data to JSON files
train_file_path = "/content/drive/My Drive/ColabNotebooks/train.json"
val_file_path = "/content/drive/My Drive/ColabNotebooks/dev.json"

with open(train_file_path, "w", encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)

with open(val_file_path, "w", encoding='utf-8') as f:
    json.dump(val_data, f, ensure_ascii=False, indent=2)

print(f"Data has been split and saved into {train_file_path} and {val_file_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data has been split and saved into /content/drive/My Drive/ColabNotebooks/train.json and /content/drive/My Drive/ColabNotebooks/dev.json


# **Question Answering❓**
with fine-tuned BERT on SQuAD 2.0.  

Question answering comes in many forms. We’ll look at the particular type of extractive QA that involves answering a question about a passage by highlighting the segment of the passage that answers the question. This involves fine-tuning a model which predicts a start position and an end position in the passage. More specifically, we will fine tune the [bert-base-uncased](https://huggingface.co/bert-base-uncased) model on the [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) dataset.

I have followed [this tutorial](https://huggingface.co/transformers/v3.2.0/custom_datasets.html#question-answering-with-squad-2-0) from the huggingface community for how to fine tune BERT on custom datasets which in our case is the SQuAD 2.0.

**Some first imports**

In [2]:
import requests
import json
import torch
import os
from tqdm import tqdm

**Connecting Google Drive in order to save the model**

In [3]:
if not os.path.exists('/content/drive/MyDrive/ColabNotebooks/Roberta_xlm_new'):
  os.mkdir('/content/drive/MyDrive/ColabNotebooks/Roberta_xlm_new')

In [4]:
!pip install transformers



### **Download SQuAD 2.0 ⬇️**

SQuAD consists of two json files.

* train dataset
* validation dataset

## **Data preprocessing 💽**

In this section of data preprocessing, our goal is to get our data in the following form:

<div>
<img src="http://www.mccormickml.com/assets/BERT/SQuAD/input_formatting.png" width="650"/>
</div>


In short, we have to do the following:

1. Extract the data from the jsons files
2. Tokenize the data
3. Define the datasets

In [5]:
file_path = '/content/drive/My Drive/ColabNotebooks/train.json'
with open(file_path, 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [6]:
# Each 'data' dict has two keys (title and paragraphs)
squad['data'][0].keys()

dict_keys(['title', 'paragraphs'])

In [7]:
# Iterate through all groups and print their titles
for idx, group in enumerate(squad['data']):
    print(group['title'])


HUIT
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Tuyển sinh, mã trường, tên ngành
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các câu hỏi thường gặp
Các

In [8]:
# let's check on Greece which is 186th (0-based indexing)
# we can see that we have a context and many questions and answers following
squad['data'][0]

{'title': 'HUIT',
 'paragraphs': [{'context': 'Trường Đại học Công Thương Thành phố Hồ Chí Minh, từ ngày 01/07/2023',
   'qas': [{'question': 'HUIT là gì?',
     'id': 'a7f8880e-0a14-4259-8659-63b2c8628115',
     'answers': [{'answer_start': 0,
       'text': 'Trường Đại học Công Thương Thành phố Hồ Chí Minh'}],
     'is_impossible': False},
    {'question': 'HUIT là viết tắt của trường gì?',
     'id': '456b6dda-79e3-4f87-892c-274c0161c3f6',
     'answers': [{'answer_start': 0,
       'text': 'Trường Đại học Công Thương Thành phố Hồ Chí Minh'}],
     'is_impossible': False},
    {'question': 'Trường gì có chữ viết tắt là HUIT? Ở đâu',
     'id': '92b8a280-8248-4dd0-b75d-199e74ef4274',
     'answers': [{'answer_start': 0,
       'text': 'Trường Đại học Công Thương Thành phố Hồ Chí Minh'}],
     'is_impossible': False},
    {'question': 'Trường nào có chữ viết tắt là HUIT? Ở đâu',
     'id': 'd4e2e23a-2372-49d3-a33d-b8e9265acc3e',
     'answers': [{'answer_start': 0,
       'text': 'Trư

In [9]:
# and this is the context given for NYC
squad['data'][0]

{'title': 'HUIT',
 'paragraphs': [{'context': 'Trường Đại học Công Thương Thành phố Hồ Chí Minh, từ ngày 01/07/2023',
   'qas': [{'question': 'HUIT là gì?',
     'id': 'a7f8880e-0a14-4259-8659-63b2c8628115',
     'answers': [{'answer_start': 0,
       'text': 'Trường Đại học Công Thương Thành phố Hồ Chí Minh'}],
     'is_impossible': False},
    {'question': 'HUIT là viết tắt của trường gì?',
     'id': '456b6dda-79e3-4f87-892c-274c0161c3f6',
     'answers': [{'answer_start': 0,
       'text': 'Trường Đại học Công Thương Thành phố Hồ Chí Minh'}],
     'is_impossible': False},
    {'question': 'Trường gì có chữ viết tắt là HUIT? Ở đâu',
     'id': '92b8a280-8248-4dd0-b75d-199e74ef4274',
     'answers': [{'answer_start': 0,
       'text': 'Trường Đại học Công Thương Thành phố Hồ Chí Minh'}],
     'is_impossible': False},
    {'question': 'Trường nào có chữ viết tắt là HUIT? Ở đâu',
     'id': 'd4e2e23a-2372-49d3-a33d-b8e9265acc3e',
     'answers': [{'answer_start': 0,
       'text': 'Trư

### **Get data 📁**

After we got a taste of the jsons files data format let's extract our data and store them into some data structures.

In [10]:
def read_data(path):
  # load the json file
  with open(path, 'rb') as f:
    squad = json.load(f)

  contexts = []
  questions = []
  answers = []

  for group in squad['data']:
    for passage in group['paragraphs']:
      context = passage['context']
      for qa in passage['qas']:
        question = qa['question']
        for answer in qa['answers']:
          contexts.append(context)
          questions.append(question)
          answers.append(answer)

  return contexts, questions, answers

In [11]:
train_contexts, train_questions, train_answers = read_data('/content/drive/My Drive/ColabNotebooks/train.json')
valid_contexts, valid_questions, valid_answers = read_data('/content/drive/My Drive/ColabNotebooks/dev.json')

In [12]:
# print a random question and answer
print(f'There are {len(train_questions)} questions')
print(train_questions[-1])
print(train_answers[-1])

There are 284 questions
E muốn đăng ký làm thẻ ocb thì đến đâu để đăng ký ạ
{'answer_start': -1, 'text': 'chào em em đến ngân hàng ocb tại trường nhé.'}


As you can see above, the answers are dictionaries whith the answer text and an integer which indicates the start index of the answer in the context. As the SQuAD does not give us the end index of the answer in the context we have to find it ourselves. So, let's get the character position at which the answer ends in the passage. Note that sometimes SQuAD answers are off by one or two characters, so we will also adjust for that.

In [13]:
def add_end_idx(answers, contexts):
  for answer, context in zip(answers, contexts):
    gold_text = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(gold_text)

    # sometimes squad answers are off by a character or two so we fix this
    if context[start_idx:end_idx] == gold_text:
      answer['answer_end'] = end_idx
    elif context[start_idx-1:end_idx-1] == gold_text:
      answer['answer_start'] = start_idx - 1
      answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
    elif context[start_idx-2:end_idx-2] == gold_text:
      answer['answer_start'] = start_idx - 2
      answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(valid_answers, valid_contexts)

In [14]:
# You can see that now we get the answer_end also
print(train_questions[-1])
print(train_answers[-1])

E muốn đăng ký làm thẻ ocb thì đến đâu để đăng ký ạ
{'answer_start': -1, 'text': 'chào em em đến ngân hàng ocb tại trường nhé.'}


### **Tokenization 🔢**

As we know we have to tokenize our data in form that is acceptable for the BERT model. We are going to use the `BertTokenizerFast` instead of `BertTokenizer` as the first one is much faster. Since we are going to train our model in batches we need to set `padding=True`.

In [15]:
from transformers import XLMRobertaTokenizerFast

tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Let's see what we got after tokenizing our data.

In [16]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [17]:
no_of_encodings = len(train_encodings['input_ids'])
print(f'We have {no_of_encodings} context-question pairs')

We have 284 context-question pairs


In [18]:
train_encodings['input_ids']

[[0,
  33015,
  18832,
  2546,
  8215,
  81629,
  22049,
  15902,
  21433,
  33872,
  13985,
  4,
  2368,
  3063,
  3413,
  46288,
  1549,
  3742,
  2,
  2,
  22438,
  6371,
  580,
  4062,
  32,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [19]:
tokenizer.decode(train_encodings['input_ids'][1])

'<s> Trường Đại học Công Thương Thành phố Hồ Chí Minh, từ ngày 01/07/2023</s></s> HUIT là viết tắt của trường gì?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

We can see that each word is assigned a number.

For example,

beyonce $\rightarrow$ 20773  
[CLS] $\rightarrow$ 101  
[SEP] $\rightarrow$ 102   
[PAD] $\rightarrow$ 0  

We see that the above form matches the one in the image we saw in the Data preprocessing section before.

Next we need to convert our character start/end positions to token start/end positions. Why is that? Because our words converted into tokens, so the answer start/end needs to show the index of start/end token which contains the answer and not the specific characters in the context.

In [20]:
def add_token_positions(encodings, answers, tokenizer):
    start_positions = []
    end_positions = []

    for i in range(len(answers)):
        # Handle cases with negative 'answer_start' or missing key
        start_pos = answers[i].get('answer_start', -1)  # Use .get() to handle missing key
        if start_pos >= 0:
            start_token = encodings.char_to_token(i, start_pos)
        else:
            start_token = None

        # Handle potential negative end position or missing key
        end_pos = answers[i].get('answer_end', 0) - 1  # Use .get() to handle missing key
        if end_pos >= 0:
            end_token = encodings.char_to_token(i, end_pos)
        else:
            end_token = None

        # if start position is None, the answer passage has been truncated
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        # Convert None to the tokenizer.pad_token_id for filtering
        start_positions.append(start_token if start_token != tokenizer.pad_token_id else None)
        end_positions.append(end_token if end_token != tokenizer.pad_token_id else None)

    # Update encodings with start and end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Example usage
add_token_positions(train_encodings, train_answers, tokenizer)
add_token_positions(valid_encodings, valid_answers, tokenizer)


In [21]:
train_encodings['start_positions']

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 42,
 512,
 115,
 None,
 65,
 512,
 126,
 512,
 None,
 24,
 40,
 512,
 93,
 None,
 33,
 112,
 512,
 512,
 None,
 41,
 None,
 512,
 512,
 512,
 118,
 171,
 171,
 512,
 512,
 None,
 17,
 51,
 85,
 111,
 136,
 164,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 None,
 512,
 146,
 99,
 231,
 512,
 None,
 36,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 50,
 None,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 None,
 512,
 None,
 512,
 512,
 512,
 None,
 512,
 512,
 512,
 30,
 512,
 None,
 512,
 512,
 512,
 None,
 None,
 None,
 None,
 None,
 None,
 512,
 512,
 512,
 None,
 None,
 None,
 No

### **Dataset definition 🗄️**

We have to define our dataset using the PyTorch Dataset class from `torch.utils` in order create our dataloaders after that.

In [22]:
class SQuAD_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Handle potential None values by replacing them with a default
        return {key: torch.tensor(val[idx]) if val[idx] is not None else torch.tensor(0)
                for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

In [23]:
train_dataset = SQuAD_Dataset(train_encodings)
valid_dataset = SQuAD_Dataset(valid_encodings)

### **Dataloaders 🔁**

In [24]:
from torch.utils.data import DataLoader

# Define the dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32)

## **Fine-Tuning ⚙️**

### **Model definition 🤖**

We are going to use the `bert-case-uncased` from the huggingface transformers.

In [25]:
from transformers import XLMRobertaForQuestionAnswering

model = XLMRobertaForQuestionAnswering.from_pretrained("xlm-roberta-base")

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Training 🏋️‍♂️**

Μy choices for some parameters:

* Use of `AdamW` which is a stochastic optimization method that modifies the typical implementation of weight decay in Adam, by decoupling weight decay from the gradient update. This helps to avoid overfitting which is necessary in this case were the model is very complex.

* Set the `lr=5e-5` as I read that this is the best value for the learning rate for this task.

In [26]:
# Check on the available device - use GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

Working on cpu


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import torch

# Hyperparameters
N_EPOCHS = 2  # Giảm số lượng epochs để nhanh chóng thử nghiệm
LEARNING_RATE = 2e-5  # Thay đổi learning rate
WARMUP_STEPS = 0
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
SEED = 42
TOTAL_STEPS = len(train_loader) * N_EPOCHS


# Set seed for reproducibility
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Optimizer and scheduler
optim = AdamW(model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.999), eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=WARMUP_STEPS, num_training_steps=TOTAL_STEPS)

# Move model to device
model.to(device)
model.train()

# Training loop
for epoch in range(N_EPOCHS):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optim.step()
        scheduler.step()

        loop.set_description(f'Epoch {epoch + 1}')
        loop.set_postfix(loss=loss.item())


  0%|          | 0/9 [00:00<?, ?it/s]

**Save the model in my drive in order not to run it each time**

In [None]:
model_save_path = '/content/drive/MyDrive/ColabNotebooks/Roberta_xlm_new/model'
tokenizer_save_path = '/content/drive/MyDrive/ColabNotebooks/Roberta_xlm_new/tokenizer'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

**Respectively, load the saved model**

In [None]:
from transformers import XLMRobertaForQuestionAnswering, XLMRobertaTokenizerFast
import torch

model_path = '/content/drive/MyDrive/ColabNotebooks/Roberta_xlm_new'

try:
    model = XLMRobertaForQuestionAnswering.from_pretrained(model_path)
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_path)
except Exception as e:
    print(f"An error occurred: {e}")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

model = model.to(device)


### **Testing ✅**

We are evaluating the model on the validation set by checking the model's predictions for the answer's start and end indexes and comparing with the true ones.

In [None]:
from sklearn.metrics import f1_score

model.eval()

acc = []
f1_scores = []
em_scores = []

for batch in tqdm(valid_loader):
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        start_pred = torch.argmax(outputs.start_logits, dim=1)
        end_pred = torch.argmax(outputs.end_logits, dim=1)

        # Accuracy
        acc.append(((start_pred == start_true).sum().item() / len(start_true)))
        acc.append(((end_pred == end_true).sum().item() / len(end_true)))

        # F1 Score
        f1_start = f1_score(start_true.cpu(), start_pred.cpu(), average='macro')
        f1_end = f1_score(end_true.cpu(), end_pred.cpu(), average='macro')
        f1_scores.append((f1_start + f1_end) / 2)

        # Exact Match
        em = ((start_pred == start_true) & (end_pred == end_true)).float().mean().item()
        em_scores.append(em)

acc = sum(acc) / len(acc)
f1 = sum(f1_scores) / len(f1_scores)
em = sum(em_scores) / len(em_scores)

print(f"\n\nAccuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Exact Match: {em:.4f}\n")


In [None]:
#model_path = '/content/drive/MyDrive/BERT-SQuAD'
#model.save_pretrained(model_path)
#tokenizer.save_pretrained(model_path)
model_save_path = '/content/drive/MyDrive/Roberta_xlm_new/model'
tokenizer_save_path = '/content/drive/MyDrive/Roberta_xlm_new/tokenizer'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

### **Ask questions 🙋**

We are going to use some functions from the [*official Evaluation Script v2.0*](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) of SQuAD in order to test the fine-tuned model by asking some questions given a context. I have also looked at this [notebook](https://colab.research.google.com/github/fastforwardlabs/ff14_blog/blob/master/_notebooks/2020-06-09-Evaluating_BERT_on_SQuAD.ipynb#scrollTo=MzPlHgWEBQ8D) which evaluates BERT on SQuAD.

In [None]:
def get_prediction(context, question):
  inputs = tokenizer.encode_plus(question, context, return_tensors='pt').to(device)
  outputs = model(**inputs)

  answer_start = torch.argmax(outputs[0])
  answer_end = torch.argmax(outputs[1]) + 1

  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

  return answer

def normalize_text(s):
  """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
  import string, re
  def remove_articles(text):
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    return re.sub(regex, " ", text)
  def white_space_fix(text):
    return " ".join(text.split())
  def remove_punc(text):
    exclude = set(string.punctuation)
    return "".join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()

  return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match(prediction, truth):
    return bool(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  common_tokens = set(pred_tokens) & set(truth_tokens)

  # if there are no common tokens then f1 = 0
  if len(common_tokens) == 0:
    return 0

  prec = len(common_tokens) / len(pred_tokens)
  rec = len(common_tokens) / len(truth_tokens)

  return round(2 * (prec * rec) / (prec + rec), 2)

def question_answer(context, question, answer):
    prediction = get_prediction(context, question)
    em_score = exact_match(prediction, answer)
    f1_score = compute_f1(prediction, answer)

    print(f'Question: {question}')
    print(f'Prediction: {prediction}')
    print(f'True Answer: {answer}')
    print(f'Exact match: {em_score}')
    print(f'F1 score: {f1_score}\n')
