In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

In [2]:
data = pd.read_csv("exec.csv")

In [3]:
data.columns

Index(['Unnamed: 0', 'id', 'question', 'context', 'context_id', 'answer_start',
       'answer_text'],
      dtype='object')

In [4]:
data = data.drop(columns='Unnamed: 0')

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [6]:
data

Unnamed: 0,id,question,context,context_id,answer_start,answer_text
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",0,515,Saint Bernadette Soubirous
1,5733bf84d058e614000b61be,When did the Scholastic Magazine of Notre dame...,"As at most other universities, Notre Dame's st...",1,248,September 1876
2,5733bed24776f41900661188,Where is the headquarters of the Congregation ...,The university is the major seat of the Congre...,2,119,Rome
3,5733a6424776f41900660f51,How many BS level degrees are offered in the C...,The College of Engineering was established in ...,3,487,eight
4,5733a70c4776f41900660f64,What entity provides help with the management ...,All of Notre Dame's undergraduate students are...,4,496,Learning Resource Center
...,...,...,...,...,...,...
819,57324bd1b9d445190005e9de,How often did Jehovah Witnesses congregations ...,Meetings for worship and study are held at Kin...,18479,779,three times each week
820,573255bce99e3014001e66d8,When did NYC buy land for its parks?,The northern side of the borough includes the ...,18610,1240,1888
821,5732a488d6dcfa19001e8a5b,Who quoted the line of Terence most notably?,The ad fontes principle also had many applicat...,18678,1035,Seneca
822,5735a9fbe853931400426ab2,What is the Kathmandu Valley's average tempera...,Five major climatic regions are found in Nepal...,18847,749,50.2


In [7]:
def encode_data(row):
    question = row["question"]
    context = row["context"]
    inputs = tokenizer(question, context, padding="max_length", max_length=512, truncation=True, return_tensors="pt")
    return inputs

data["encoded_data"] = data.apply(encode_data, axis=1)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [8]:
data["input_ids"] = data["encoded_data"].apply(lambda x: x["input_ids"])
data["attention_mask"] = data["encoded_data"].apply(lambda x: x["attention_mask"])
data["token_type_ids"] = data["encoded_data"].apply(lambda x: x.get("token_type_ids"))

In [9]:
from torch.utils.data import DataLoader, TensorDataset

input_ids = torch.stack(data["input_ids"].tolist())
attention_mask = torch.stack(data["attention_mask"].tolist())
token_type_ids = torch.stack(data["token_type_ids"].tolist())

dataset = TensorDataset(input_ids, attention_mask, token_type_ids)

dataloader = DataLoader(dataset, batch_size=16)


In [10]:
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
import torch

def answer_question(question, context, model, tokenizer):
    inputs = tokenizer(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    
    start_idx = torch.argmax(start_logits)
    end_idx = torch.argmax(end_logits)
    
    answer = tokenizer.decode(inputs.input_ids[0][start_idx:end_idx+1])
    
    return answer

In [12]:
sample_question = "what did the mob do"
sample_context= "jfk was a victim of murder, the assasination was long distance, jfk's murder reason is unexplained. the mob might have 'whacked' him maybe but not central intelligence authority"
answer = answer_question(sample_question, sample_context, model, tokenizer)
print("Answer:", answer)

Answer: the mob might have'whacked'him
