# Install transformers

In [None]:
# Install the transformers library from the GitHub default branch, then clone
# the repository so its example scripts are available on disk (the Train and
# Evaluate cells below run transformers/examples/question-answering/run_squad.py
# from this clone).
# NOTE(review): neither command pins a version or tag, so both follow whatever
# the default branch contains at run time; the example-script path used below
# may not exist in newer checkouts — confirm, and consider pinning a release.
!pip install git+https://github.com/huggingface/transformers
!git clone https://github.com/huggingface/transformers

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from tqdm import trange
from tqdm.notebook import tqdm
import os
import json
import sys
import logging
from dataclasses import dataclass, field
from typing import Dict, Optional

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)


# Download SQuAD dataset

In [2]:
!wget -P ./squad_data https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
!wget -P ./squad_data https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json

--2020-08-23 00:59:17--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.109.153, 185.199.110.153, 185.199.111.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.109.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30288272 (29M) [application/json]
Saving to: ‘./squad_data/train-v1.1.json’


2020-08-23 00:59:20 (13.5 MB/s) - ‘./squad_data/train-v1.1.json’ saved [30288272/30288272]

--2020-08-23 00:59:20--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.110.153, 185.199.111.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4854279 (4.6M) [application/json]
Saving to: ‘./squad_data/dev-v1.1.json’


2020-08-23 00:59:21 (6.99 MB/s) - ‘./squad_data/dev-v1.1.jso

# Load Dataset

## Raw data

In [6]:
# Read both SQuAD v1.1 splits from the download directory into plain dicts.
data_dir = "./squad_data"
train_path = os.path.join(data_dir, 'train-v1.1.json')
dev_path = os.path.join(data_dir, 'dev-v1.1.json')

# The files are UTF-8 JSON; pass the encoding explicitly so decoding does not
# depend on the platform's default locale encoding.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open(dev_path, 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

In [7]:
# Quick look at the raw structure: number of top-level entries, the start of
# one context passage, and the fields of one question/answer record.
articles = train_data["data"]
first_paragraph = articles[0]["paragraphs"][0]

print("Nb of data: ", len(articles))
print()
print("Context example: ")
print(first_paragraph["context"][:200])
print()
print("QA example: ")
for field_name, field_value in first_paragraph["qas"][0].items():
    print(field_name, field_value)

Nb of data:  442

Context example: 
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper sta

QA example: 
answers [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}]
question To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
id 5733be284776f41900661182


## Preprocess data

In [8]:
from transformers.data.processors.squad import SquadProcessor, SquadV1Processor
from transformers.data.processors.squad import squad_convert_examples_to_features

In [9]:
# Build the SQuAD v1 processor and let it read the dev split found under
# data_dir. No filename is passed, so the processor's own default dev file
# name is used (presumably dev-v1.1.json — confirm against the installed
# transformers version).
processor = SquadV1Processor()
examples = processor.get_dev_examples(data_dir)

100%|██████████| 48/48 [00:03<00:00, 15.21it/s]


In [10]:
# `tokenizer` was previously only defined much further down the notebook, so
# this cell raised NameError on a fresh kernel. Define it here with the same
# uncased BERT vocabulary the training run uses. The slow (pure-Python)
# BertTokenizer is used on purpose: the legacy SQuAD feature conversion is
# written against slow tokenizers.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Convert the parsed examples into fixed-length model inputs. Long contexts are
# split into overlapping windows (doc_stride) of at most max_seq_length tokens.
# return_dataset="pt" additionally returns a PyTorch dataset of the features.
features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=256,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    return_dataset="pt",
    tqdm_enabled=False,
)



In [11]:
# Inspect one converted item: how many tensors it holds, then the tensors.
# (The bare `len(...)` expression was silently discarded because it was not
# the last line of the cell; print it so the value is actually shown.)
print(len(dataset[1]))
print(dataset[1])
# NOTE(review): the features above were built with is_training=False, so each
# item presumably holds, in order:
#   all_input_ids, all_attention_masks, all_token_type_ids,
#   all_feature_index, all_cls_index, all_p_mask
# The start/end positions and is_impossible tensors are only expected for
# training features — confirm against the installed transformers version.

(tensor([  101,  2029,  5088,  2136,  3421,  1996, 22309,  2012,  3565,  4605,
         2753,  1029,   102,  3565,  4605,  2753,  2001,  2019,  2137,  2374,
         2208,  2000,  5646,  1996,  3410,  1997,  1996,  2120,  2374,  2223,
         1006,  5088,  1007,  2005,  1996,  2325,  2161,  1012,  1996,  2137,
         2374,  3034,  1006, 10511,  1007,  3410,  7573, 14169,  3249,  1996,
         2120,  2374,  3034,  1006, 22309,  1007,  3410,  3792, 12915,  2484,
         1516,  2184,  2000,  7796,  2037,  2353,  3565,  4605,  2516,  1012,
         1996,  2208,  2001,  2209,  2006,  2337,  1021,  1010,  2355,  1010,
         2012, 11902,  1005,  1055,  3346,  1999,  1996,  2624,  3799,  3016,
         2181,  2012,  4203, 10254,  1010,  2662,  1012,  2004,  2023,  2001,
         1996, 12951,  3565,  4605,  1010,  1996,  2223, 13155,  1996,  1000,
         3585,  5315,  1000,  2007,  2536,  2751,  1011, 11773, 11107,  1010,
         2004,  2092,  2004,  8184, 28324,  2075,  1996,  4535,

# Train

In [12]:
# If you do not use colabs
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="2"

from tqdm.notebook import tqdm


!python transformers/examples/question-answering/run_squad.py \
    --model_type bert \
    --model_name_or_path bert-base-uncased \
    --do_train \
    --train_file squad_data/dev-v1.1.json \
    --predict_file squad_data/dev-v1.1.json \
    --learning_rate 3e-5 \
    --num_train_epochs 10 \
    --max_seq_length 256 \
    --doc_stride 128 \
    --output_dir ./outputs/squad \
    --per_gpu_train_batch_size=4   \
    --save_steps 4000 \
    --overwrite_output_dir


08/23/2020 01:02:46 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/jennybae/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
08/23/2020 01:02:46 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

08/23/2020 01:02:47 - INFO - transformers.configuration_utils -   loading configu

# Evaluate

In [33]:
!python transformers/examples/question-answering/run_squad.py \
    --model_type bert \
    --model_name_or_path bert-base-uncased \
    --do_eval \
    --predict_file squad_data/dev-v1.1.json \
    --n_best_size 5 \
    --max_seq_length 256 \
    --doc_stride 128 \
    --output_dir ./outputs/squad \
    --per_gpu_eval_batch_size=4  \


08/21/2020 14:43:21 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/jennybae/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
08/21/2020 14:43:21 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

08/21/2020 14:43:21 - INFO - transformers.configuration_utils -   loading configu

## Outputs

In [16]:
# Locations of the prediction files inside the evaluation output directory:
# the single best answer per question, and the n-best candidate list.
output_dir = "./outputs/squad"
pred_path, nbest_pred_path = (
    os.path.join(output_dir, file_name)
    for file_name in ('predictions_.json', 'nbest_predictions_.json')
)

In [35]:
# Load the best-answer predictions; `pred` is indexed by question id below.
with open(pred_path, "r") as best_file:
    pred = json.load(best_file)

# Load the n-best candidates; each entry is iterated per question id below.
with open(nbest_pred_path, "r") as nbest_file:
    nbest_pred = json.load(nbest_file)

In [46]:
import json
import os

# Reload the dev split so this section can be run on its own.
squad_path = "./squad_data"
file_path = os.path.join(squad_path, 'dev-v1.1.json')
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

In [63]:
import random

# Pick one random question from a fixed article and show the gold record next
# to the model's best and n-best predictions.
# To sample the article as well: data_idx = random.randrange(len(dev_data['data']))
data_idx = 0

# Bug fix: the paragraph index used to be drawn from len(dev_data['data'][data_idx]),
# which is the number of keys in the article dict rather than the number of
# paragraphs, so only the first couple of paragraphs could ever be chosen.
paragraphs = dev_data['data'][data_idx]['paragraphs']
paragraph_idx = random.randrange(len(paragraphs))

qas = paragraphs[paragraph_idx]["qas"]
qas_idx = random.randrange(len(qas))
test_data_sample = qas[qas_idx]
idx = test_data_sample['id']  # question id; key into both prediction dicts

print("PROBLEM")
print(test_data_sample)
print()
print("PREDICT")
print(pred[idx])
print()
print("N-BEST")
for ele in nbest_pred[idx]:
    print(ele)

PROBLEM
{'answers': [{'answer_start': 116, 'text': '2015'}, {'answer_start': 346, 'text': '2016'}, {'answer_start': 116, 'text': '2015'}], 'question': 'What year did the Denver Broncos secure a Super Bowl title for the third time?', 'id': '56bf10f43aeaaa14008c94fd'}

PREDICT
with various gold-themed initiatives, as well as

N-BEST
{'text': 'with various gold-themed initiatives, as well as', 'probability': 0.2824446786483579, 'start_logit': 0.6809132695198059, 'end_logit': 0.946496307849884}
{'text': 'themed initiatives, as well as', 'probability': 0.25602599568125195, 'start_logit': 0.582709550857544, 'end_logit': 0.946496307849884}
{'text': 'with various gold-themed initiatives, as well as temporarily suspending the tradition of naming', 'probability': 0.24208653932372287, 'start_logit': 0.6809132695198059, 'end_logit': 0.7923088669776917}
{'text': 'themed initiatives, as well as temporarily suspending the tradition of naming', 'probability': 0.21944278634666725, 'start_logit': 0.5827

# Load fine-tuned model 

In [7]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

# Load a BERT-large checkpoint already fine-tuned on SQuAD, together with the
# tokenizer published under the same name. Both are fetched on first use.
SQUAD_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

model = BertForQuestionAnswering.from_pretrained(SQUAD_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(SQUAD_CHECKPOINT)

In [8]:
# Hand-written example: the question, and the passage the answer is read from.
question = "How many parameters does BERT-large have?"
answer_text = (
    "BERT-large is really big... it has 24-layers and an embedding size of 1,024, "
    "for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take "
    "a couple minutes to download to your Colab instance."
)

## Tokenize the input and convert to ids 

In [9]:
# Encode question and passage together as one sequence pair; the tokenizer
# inserts the special tokens (see the token table printed in the next cell).
input_ids = tokenizer.encode(question, answer_text)
num_tokens = len(input_ids)
print('The input has a total of {:} tokens.'.format(num_tokens))

The input has a total of 70 tokens.


In [10]:
# Map the ids back to token strings so the encoding can be inspected.
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# Print the token string and its ID in two columns. The loop variable is named
# `token_id` rather than `id` so the builtin id() is not shadowed for the rest
# of the session.
for token, token_id in zip(tokens, input_ids):
    is_sep = token_id == tokenizer.sep_token_id
    # Surround each [SEP] token with blank lines to make it stand out.
    if is_sep:
        print('')
    print('{:<12} {:>6,}'.format(token, token_id))
    if is_sep:
        print('')
    

[CLS]           101
how           2,129
many          2,116
parameters   11,709
does          2,515
bert         14,324
-             1,011
large         2,312
have          2,031
?             1,029

[SEP]           102

bert         14,324
-             1,011
large         2,312
is            2,003
really        2,428
big           2,502
.             1,012
.             1,012
.             1,012
it            2,009
has           2,038
24            2,484
-             1,011
layers        9,014
and           1,998
an            2,019
em            7,861
##bed         8,270
##ding        4,667
size          2,946
of            1,997
1             1,015
,             1,010
02            6,185
##4           2,549
,             1,010
for           2,005
a             1,037
total         2,561
of            1,997
340          16,029
##m           2,213
parameters   11,709
!               999
altogether   10,462
it            2,009
is            2,003
1             1,015
.             1,01

## Make segment ids for the input (question:0, context:1)

In [16]:
# Position of the first [SEP] token: it closes the question segment.
sep_index = input_ids.index(tokenizer.sep_token_id)

# Segment A is everything up to and including that [SEP]; segment B is the rest.
num_seg_a = sep_index + 1
num_seg_b = len(input_ids) - num_seg_a

# 0 for every position in segment A, 1 for every position after it.
segment_ids = [int(position > sep_index) for position in range(len(input_ids))]

# Every input token must have exactly one segment id.
assert len(segment_ids) == len(input_ids)

## Predict the logits

In [18]:
# Score every token as a possible answer start / answer end.
# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(
        torch.tensor([input_ids]),                   # the tokens representing our input text
        token_type_ids=torch.tensor([segment_ids]),  # segment ids: question vs. answer_text
    )

# Index positionally instead of tuple-unpacking the result: library versions
# that return a dict-like output object would otherwise unpack into field-name
# strings, while positional indexing works for both that object and a tuple.
start_scores, end_scores = outputs[0], outputs[1]

In [19]:
# The predicted span runs from the highest-scoring start token to the
# highest-scoring end token (inclusive).
answer_start, answer_end = torch.argmax(start_scores), torch.argmax(end_scores)

# Naive reconstruction: join the raw word pieces with spaces.
span_tokens = tokens[answer_start:answer_end+1]
answer = ' '.join(span_tokens)

print('Answer: "' + answer + '"')

Answer: "340 ##m"


In [20]:
# Rebuild the answer text from the predicted span: a piece starting with '##'
# continues the previous word and is glued on without its marker; any other
# piece starts a new word and gets a leading space.
pieces = [tokens[answer_start]]
for piece in tokens[answer_start + 1:answer_end + 1]:
    if piece.startswith('##'):
        pieces.append(piece[2:])
    else:
        pieces.append(' ' + piece)
answer = ''.join(pieces)

print('Answer: "' + answer + '"')

Answer: "340m"
