# Evaluating the architecture of BERT for Question-Answering

In [31]:
import torch
from transformers import AutoTokenizer, BertForQuestionAnswering

In [32]:
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-uncased-squad2")

In [33]:
model

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

## Testing with the cleaned SQuAD-2.0 Dataset for random samples

In [34]:
train_csv_path = "../data/train_df.csv"

In [35]:
import pandas as pd

In [36]:
train_df = pd.read_csv(train_csv_path)

In [33]:
train_df

Unnamed: 0,idx,question,answer,answer_start,is_impossible,context
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,in the late 1990s,269,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,singing and dancing,207,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,2003,526,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,"Houston, Texas",166,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,late 1990s,276,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
...,...,...,...,...,...,...
86816,5735d259012e2f140011a09d,In what US state did Kathmandu first establish...,Oregon,229,False,"Kathmandu Metropolitan City (KMC), in order to..."
86817,5735d259012e2f140011a09e,What was Yangon previously known as?,Rangoon,414,False,"Kathmandu Metropolitan City (KMC), in order to..."
86818,5735d259012e2f140011a09f,With what Belorussian city does Kathmandu have...,Minsk,476,False,"Kathmandu Metropolitan City (KMC), in order to..."
86819,5735d259012e2f140011a0a0,In what year did Kathmandu create its initial ...,1975,199,False,"Kathmandu Metropolitan City (KMC), in order to..."


In [37]:
tokenizer = AutoTokenizer.from_pretrained("deepset/bert-base-uncased-squad2")

In [38]:
q = train_df.iloc[0:10]["question"]

In [39]:
q

0             When did Beyonce start becoming popular?
1    What areas did Beyonce compete in when she was...
2    When did Beyonce leave Destiny's Child and bec...
3        In what city and state did Beyonce  grow up? 
4           In which decade did Beyonce become famous?
5           In what R&B group was she the lead singer?
6        What album made her a worldwide known artist?
7               Who managed the Destiny's Child group?
8                       When did Beyoncé rise to fame?
9       What role did Beyoncé have in Destiny's Child?
Name: question, dtype: object

In [40]:
c = train_df.iloc[0:10]["context"]

In [65]:
type(c)

pandas.core.series.Series

In [73]:
def process( q: str, c: str):

    if isinstance(q, str) and isinstance(c, str):
        return tokenizer(
        q,
        c,
        truncation="only_second",
        padding="longest",
        max_length=512,
        return_tensors="pt")

    return tokenizer(
        [[a, b] for a, b in zip(q, c)],
        truncation="only_second",
        padding="longest",
        max_length=512,
        return_tensors="pt")

In [75]:
process(q, c)

{'input_ids': tensor([[ 101, 2043, 2106,  ...,    0,    0,    0],
        [ 101, 2054, 2752,  ...,  102,    0,    0],
        [ 101, 2043, 2106,  ..., 1000, 1012,  102],
        ...,
        [ 101, 2040, 3266,  ...,    0,    0,    0],
        [ 101, 2043, 2106,  ...,    0,    0,    0],
        [ 101, 2054, 2535,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [45]:
prompt = tokenizer([[a, b] for a, b in zip(q, c)], truncation="only_second", padding="longest", return_tensors="pt")

In [47]:
prompt.input_ids.shape

torch.Size([10, 181])

In [42]:
outputs = model(**prompt)

In [43]:
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ 2.2298, -4.6121, -5.3336, -5.2261, -5.8015, -5.8606, -5.7483, -6.2092,
         -1.6701,  1.5185, -0.9734, -2.7100,  0.3109, -3.1559, -1.0464, -1.3659,
         -2.8769, -3.2209, -5.1629, -5.4390, -5.0180, -5.1036, -5.0535, -5.1325,
         -4.6149, -2.8693, -5.4491, -4.8233, -5.2988, -5.5535, -4.6595, -2.8359,
          0.8846,  2.3510,  4.0506,  1.6662, -1.7486,  4.2999, -1.2688, -2.3620,
         -2.6707, -1.9643, -2.0897, -5.1541, -1.9234, -4.9418, -2.4350, -2.4140,
         -4.7538, -2.3700, -1.5672, -0.1169, -4.5734, -2.2279, -2.8869, -0.0741,
         -3.8213, -0.8613, -0.4781,  0.9880, -2.6846, -4.3749, -2.4699, -1.6268,
         -4.8492, -2.6007, -2.2432, -1.2542, -2.6415, -0.3159,  0.3272,  1.4804,
          3.8874, -0.3280,  1.7922,  7.5550,  9.6724, 10.5494,  9.2525, -2.0965,
         -1.5884, -1.9894, -3.6670, -0.9500, -4.3003, -3.6991, -1.9921, -4.9055,
         -3.2902,  0.3638, -2.8201, -3.9051, -1.7892, -0

In [54]:
start_id = outputs.start_logits.argmax()
end_id = outputs.end_logits.argmax()

In [55]:
start_id, end_id

(tensor(77), tensor(78))

In [56]:
predict_answer_tokens = prompt.input_ids[0, start_id : end_id + 1]

In [57]:
predict_answer_tokens

tensor([2397, 4134])

In [58]:
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

'late 1990s'

In [59]:
train_df.iloc[0]["answer"]

'in the late 1990s'

## Checking output shape of headless BERT

In [52]:
from transformers import AutoModel, AutoConfig

In [54]:
config = AutoConfig.from_pretrained("bert-base-uncased", add_pooling_layer=False)

In [57]:
model = AutoModel.from_config(config, add_pooling_layer=False)

In [58]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [6]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [11]:
test_text = ["This is a sentence"] * 5

In [12]:
test_text

['This is a sentence',
 'This is a sentence',
 'This is a sentence',
 'This is a sentence',
 'This is a sentence']

In [13]:
inputs = tokenizer(test_text, return_tensors="pt")

In [14]:
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 6251,  102],
        [ 101, 2023, 2003, 1037, 6251,  102],
        [ 101, 2023, 2003, 1037, 6251,  102],
        [ 101, 2023, 2003, 1037, 6251,  102],
        [ 101, 2023, 2003, 1037, 6251,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]])}

In [16]:
outputs = model(**inputs)

In [20]:
outputs.last_hidden_state.shape

torch.Size([5, 6, 768])

In [21]:
import torch.nn as nn

In [22]:
linear = nn.Linear(in_features=768, out_features=2)

In [23]:
qa_outputs = linear(outputs.last_hidden_state)

In [28]:
start_logits, end_logits = qa_outputs.split(1, dim=-1)

In [30]:
start_logits.shape, end_logits.shape

(torch.Size([5, 6, 1]), torch.Size([5, 6, 1]))