### Adapted from:

https://blog.scaleway.com/2019/understanding-text-with-bert/

In [48]:
# Make imports
import time
import os
import torch
import numpy as np
import random
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.optim import Adam
#from apex.optimizers import FusedAdam
#from apex.fp16_utils import FP16_Optimizer
from pytorch_transformers import BertForQuestionAnswering, BertTokenizer
from utils_squad import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions)

In [4]:
# Set parameters
random.seed(123)
np.random.seed(123)
torch.manual_seed(123)
torch.cuda.manual_seed_all(123)
num_train_epochs = 1
train_batch_size = 8
model_arch = 'bert-base-uncased'

In [5]:
# Load SQUAD training examples
train_examples = read_squad_examples('data/train-v1.1.json', is_training = True, version_2_with_negative = False)

In [6]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained(model_arch)

In [7]:
# Tokenize train examples
train_features = convert_examples_to_features(train_examples, tokenizer, max_seq_length=384, doc_stride=128,
                                             max_query_length=64, is_training=True)

In [8]:
#the command above seems to return a number of examples which is higher than the original sample size, see output below:
print('original number of examples: ', len(train_examples))
print('length of generated features: ', len(train_features))

original number of examples:  87599
length of generated features:  88641


In [9]:
# Transform to TensorDataset
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions)

In [10]:
# Initialize random data sampler (iterator)
train_sampler = RandomSampler(train_data)

In [11]:
# Initialize data loader (iterator)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

In [12]:
# Initialize BERT model for question answering with half precision
bert_model = BertForQuestionAnswering.from_pretrained(model_arch).cuda()

In [13]:
# Initialize optimizer
param_optimizer = list(bert_model.named_parameters())
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] #this command removes the two pooler layers from the optimizer list
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [14]:
bert_model.train()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [15]:
start_time = time.time()

In [16]:
for epoch in range(num_train_epochs):
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.cuda() for t in batch)
        input_ids, input_mask, segment_ids, start_positions, end_positions = batch
        outputs = bert_model(input_ids, segment_ids, input_mask, start_positions, end_positions)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
    if epoch==0:
        print("Time it took to complete the first training epoch: {}".format(time.time()-start_time))
    print('Loss after epoch ', epoch, ': ', loss.item())

Time it took to complete the first training epoch: 12638.740458250046
Loss after epoch  0 :  0.18543410301208496


In [25]:
#Save trained model to model directory (Create the directory if it does not exist; otherwise override the contents)
if not os.path.exists('models'):
    os.makedirs('models')

model_to_save = bert_model.module if hasattr(bert_model, 'module') else bert_model
model_to_save.save_pretrained('models')

In [54]:
!ls

BERT_Exploratory_Analysis.ipynb  data	 __pycache__  utils_squad_evaluate.py
BERT_fine_tuning_SQUAD.ipynb	 models  results      utils_squad.py


In [38]:
# Load development data
dev_file = "data/dev-v1.1.json"

In [39]:
predict_batch_size = 32

In [40]:
eval_examples = read_squad_examples(input_file=dev_file, is_training=False, version_2_with_negative=False)

In [41]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained(model_arch)

In [42]:
eval_features = convert_examples_to_features(examples=eval_examples, tokenizer=tokenizer, max_seq_length=384, 
                                             doc_stride=128, max_query_length=64, is_training=False)

In [43]:
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

In [44]:
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=predict_batch_size)

In [49]:
bert_model.eval()
all_results = []
for input_ids, input_mask, segment_ids, example_indices in eval_dataloader:
    input_ids = input_ids.cuda()
    input_mask = input_mask.cuda()
    segment_ids = segment_ids.cuda()
    with torch.no_grad():
        batch_start_logits, batch_end_logits = bert_model(input_ids, segment_ids, input_mask)
    for i, example_index in enumerate(example_indices):
        start_logits = batch_start_logits[i].detach().cpu().tolist()
        end_logits = batch_end_logits[i].detach().cpu().tolist()
        eval_feature = eval_features[example_index.item()]
        unique_id = int(eval_feature.unique_id)
        all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))

In [50]:
#Save results to results directory (Create the directory if it does not exist; otherwise override the contents)
if not os.path.exists('results'):
    os.makedirs('results')

output_prediction_file = os.path.join('results', "predictions.json")
output_nbest_file = os.path.join('results', "nbest_predictions.json")
output_null_log_odds_file = os.path.join('results', "null_odds.json")

preds = write_predictions(eval_examples, eval_features, all_results, 20, 30, True, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, True, False, 0.0)

In [58]:
! python evaluate-v1.1.py data/dev-v1.1.json results/predictions.json

{"f1": 87.11795233073177, "exact_match": 79.3755912961211}


In [60]:
BertForQuestionAnswering.from_pretrained('models')

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_