In [1]:
import requests
import json
import torch
import os
from tqdm import tqdm
import pickle

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### preprocess data and save it

In [3]:
with open('COVID-QA.json', 'rb') as f:
    covid = json.load(f)

print(len(covid))
print(len(covid['data']))
print(len(covid['data'][0]['paragraphs'][0]))
print(covid['data'][0]['paragraphs'][0]['context'])

1
147
3
Functional Genetic Variants in DC-SIGNR Are Associated with Mother-to-Child Transmission of HIV-1

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2752805/

Boily-Larouche, Geneviève; Iscache, Anne-Laure; Zijenah, Lynn S.; Humphrey, Jean H.; Mouland, Andrew J.; Ward, Brian J.; Roger, Michel
2009-10-07
DOI:10.1371/journal.pone.0007211
License:cc-by

Abstract: BACKGROUND: Mother-to-child transmission (MTCT) is the main cause of HIV-1 infection in children worldwide. Given that the C-type lectin receptor, dendritic cell-specific ICAM-grabbing non-integrin-related (DC-SIGNR, also known as CD209L or liver/lymph node–specific ICAM-grabbing non-integrin (L-SIGN)), can interact with pathogens including HIV-1 and is expressed at the maternal-fetal interface, we hypothesized that it could influence MTCT of HIV-1. METHODS AND FINDINGS: To investigate the potential role of DC-SIGNR in MTCT of HIV-1, we carried out a genetic association study of DC-SIGNR in a well-characterized cohort of 197 H

In [4]:
def read_data(groups):
    contexts = []
    questions = []
    answers = []
    for group in groups:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers

In [5]:
# split the dataset
train_contexts, train_questions, train_answers = read_data(covid['data'][0:103])
test_contexts, test_questions, test_answers = read_data(covid['data'][103:])

In [6]:
# add the end index
def add_end_idx(answers, contexts):
  for answer, context in zip(answers, contexts):
    gold_text = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(gold_text)

    # sometimes answers are off by a character or two so we fix this
    if context[start_idx:end_idx] == gold_text:
      answer['answer_end'] = end_idx
    elif context[start_idx-1:end_idx-1] == gold_text:
      answer['answer_start'] = start_idx - 1
      answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
    elif context[start_idx-2:end_idx-2] == gold_text:
      answer['answer_start'] = start_idx - 2
      answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(test_answers, test_contexts)

In [7]:
# save the train data and test data
train_data = [train_contexts, train_questions, train_answers]
test_data = [test_contexts, test_questions, test_answers]
with open('train.pkl', 'wb') as f:
    pickle.dump(train_data, f)

with open('test.pkl', 'wb') as f:
    pickle.dump(test_data, f)

### read data

In [8]:
# load train data
with open('train.pkl', 'rb') as f:
    train_data = pickle.load(f)

train_contexts, train_questions, train_answers = train_data[0], train_data[1], train_data[2]
print(f'There are {len(train_questions)} questions')
print(train_answers[100])

There are 1025 questions
{'text': 'amplification step', 'answer_start': 9670, 'answer_end': 9688}


In [9]:
# load test data
with open('test.pkl', 'rb') as f:
    test_data = pickle.load(f)

test_contexts, test_questions, test_answers = test_data[0], test_data[1], test_data[2]

### Tokenization
As we know we have to tokenize our data in form that is acceptable for the BERT model. We are going to use the `BertTokenizerFast` instead of `BertTokenizer` as the first one is much faster. Since we are going to train our model in batches we need to set `padding=True`

In [10]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# conduct trunction and padding
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

print(train_encodings.keys())

no_of_encodings = len(train_encodings['input_ids'])
print(f'We have {no_of_encodings} context-question pairs')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
We have 1025 context-question pairs


In [11]:
# change the start and end positions
def add_token_positions(encodings, answers):
  start_positions = []
  end_positions = []
  for i in range(len(answers)):
    start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
    end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

    # if start position is None, the answer passage has been truncated
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
    if end_positions[-1] is None:
      end_positions[-1] = tokenizer.model_max_length

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
  return encodings

train_encodings = add_token_positions(train_encodings, train_answers)
test_encodings = add_token_positions(test_encodings, test_answers)

In [12]:
train_encodings['start_positions'][:10]

[131, 512, 512, 512, 512, 512, 512, 512, 512, 512]

### Dataset definition
define our dataset using the PyTorch Dataset class from `torch.utils` in order to create our dataloaders after that.

In [13]:
class COVID_Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

  def __len__(self):
    return len(self.encodings.input_ids)

train_dataset = COVID_Dataset(train_encodings)
test_dataset = COVID_Dataset(test_encodings)

In [14]:
# define dataloaders
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1)

### Fine-Tuning

In [15]:
# import pretrained model
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# check on the available device - use GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

Working on cuda


In [None]:
# training
from transformers import AdamW

N_EPOCHS = 5
optim = AdamW(model.parameters(), lr=5e-5)

model.to(device)
model.train()

for epoch in range(N_EPOCHS):
  loop = tqdm(train_loader, leave=True)
  for batch in loop:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    loss.backward()
    optim.step()

    loop.set_description(f'Epoch {epoch+1}')
    loop.set_postfix(loss=loss.item())

model_path = './drive/MyDrive/colab/main/model/bert-COVID-QA'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Epoch 1: 100%|██████████| 65/65 [01:29<00:00,  1.37s/it, loss=nan]
Epoch 2: 100%|██████████| 65/65 [01:28<00:00,  1.37s/it, loss=1.61]
Epoch 3: 100%|██████████| 65/65 [01:28<00:00,  1.37s/it, loss=nan]
Epoch 4: 100%|██████████| 65/65 [01:28<00:00,  1.36s/it, loss=nan]
Epoch 5: 100%|██████████| 65/65 [01:28<00:00,  1.36s/it, loss=nan]


('./drive/MyDrive/colab/main/model/bert-COVID-QA/tokenizer_config.json',
 './drive/MyDrive/colab/main/model/bert-COVID-QA/special_tokens_map.json',
 './drive/MyDrive/colab/main/model/bert-COVID-QA/vocab.txt',
 './drive/MyDrive/colab/main/model/bert-COVID-QA/added_tokens.json',
 './drive/MyDrive/colab/main/model/bert-COVID-QA/tokenizer.json')

### load the saved model

In [17]:
from transformers import BertForQuestionAnswering, BertTokenizerFast
import torch

model_path = './drive/MyDrive/colab/Bert/model/bert-COVID-QA'
model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

model = model.to(device)

Working on cuda


### Evaluate

In [18]:
model.eval()
predictions, references = [], []
loop = tqdm(test_loader, leave=True)
i = 0
for batch in loop:
  input_ids = batch['input_ids'].to(device)
  attention_mask = batch['attention_mask'].to(device)
  start_positions = batch['start_positions'].to(device)
  end_positions = batch['end_positions'].to(device)
  outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
  start_scores = outputs.start_logits
  end_scores = outputs.end_logits
  # Convert token scores to tokens
  start_index = torch.argmax(start_scores, dim=1).item()
  end_index = torch.argmax(end_scores, dim=1).item()
  predicted_tokens = input_ids[0][start_index:end_index+1]
  predicted_answer = tokenizer.decode(predicted_tokens, skip_special_tokens=True)
  prediction = {"id": i, "prediction_text": predicted_answer}
  predictions.append(prediction)
  reference = {"id": i, "answers": {"text": [test_answers[i]['text']], "answer_start": [test_answers[i]['answer_start']]}}
  references.append(reference)
  i += 1

100%|██████████| 994/994 [00:33<00:00, 29.42it/s]


In [22]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

In [19]:
from datasets import load_metric
metric = load_metric("squad")
metric.compute(predictions=predictions, references=references)

  metric = load_metric("squad")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

{'exact_match': 0.1006036217303823, 'f1': 4.689592791214975}