In [None]:
# NOTE: the model has not been trained yet because of GPU constraints; training is planned for later.

In [None]:
# downloads squad dataset
!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

In [4]:
import json
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForQuestionAnswering

In [5]:
def get_squad(path):
    """Flatten a SQuAD-style JSON file into three parallel lists.

    One entry is produced per answer: a question with several answers is
    repeated once for each of them, and a question whose 'answers' list is
    empty contributes nothing.

    Args:
        path: location of the JSON file to read.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists,
        where each answer is the dict stored in the file.
    """
    with open(path, 'rb') as handle:
        squad = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa_entry in paragraph['qas']:
                question_text = qa_entry['question']
                gold_answers = qa_entry['answers']
                # Repeat the context/question once per answer so the lists stay aligned.
                contexts.extend(passage_text for _ in gold_answers)
                questions.extend(question_text for _ in gold_answers)
                answers.extend(gold_answers)
    return contexts, questions, answers

In [6]:
def add_end_idx(answers, contexts):
    """Add an exclusive 'answer_end' character offset to every answer dict, in place.

    The recorded 'answer_start' is checked against the context. If the gold
    text is found one or two characters earlier than recorded, both
    'answer_start' and 'answer_end' are shifted to the matching position.

    If no alignment matches, 'answer_end' is still set (to
    ``answer_start + len(text)``) so that later steps which read
    ``answer['answer_end']`` do not fail with ``KeyError``.

    Args:
        answers: list of dicts with 'text' and 'answer_start' keys (mutated).
        contexts: list of context strings, parallel to ``answers``.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            if shifted_start < 0:
                # A negative start would make the slice wrap around the end of the string.
                continue
            if context[shifted_start:end_idx - shift] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - shift
                break
        else:
            # No alignment matched: keep the recorded start and fall back to the nominal end.
            answer['answer_end'] = end_idx

In [7]:
# Read the train and dev JSON files into parallel (contexts, questions, answers) lists.
train_contexts, train_questions, train_answers = get_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = get_squad('squad/dev-v2.0.json')
# Annotate the answer dicts in place with their 'answer_end' character offsets.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [8]:
# Sanity check: the three parallel training lists should all have the same length.
print(len(train_contexts), len(train_questions), len(train_answers))

86821 86821 86821


In [9]:
# Show the first training example: its context, its question and its answer dict.
print(train_contexts[0], train_questions[0], train_answers[0])

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy". When did Beyonce start becoming popular? {'text': 'in the late 1990s', 'answer_start': 269, 'answer_end': 286}


In [None]:
# Load the fast tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# Encode every context together with its question (context is passed as the first
# sequence), with truncation and padding enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [11]:
# Show how many fields the training encoding holds, then the number of entries in each field.
print(len(train_encodings))
for key, val in train_encodings.items():
    print(key, len(val))

3
input_ids 86821
token_type_ids 86821
attention_mask 86821


In [12]:
def add_token_positions(encodings, answers):
    """Attach token-level answer spans to ``encodings`` in place.

    For example ``i`` the character offsets ``answer_start`` and
    ``answer_end - 1`` are mapped to token indices with
    ``encodings.char_to_token``. The results are stored under the
    'start_positions' and 'end_positions' keys.

    Args:
        encodings: tokenizer output exposing ``char_to_token`` and ``update``.
        answers: list of dicts with 'answer_start' and 'answer_end' keys,
            parallel to the encoded examples.

    Note:
        Reads the notebook-level ``tokenizer`` for ``model_max_length``.
    """
    token_starts, token_ends = [], []
    for row, answer in enumerate(answers):
        first_token = encodings.char_to_token(row, answer['answer_start'])
        last_token = encodings.char_to_token(row, answer['answer_end'] - 1)
        # A None result is taken to mean the answer passage was truncated away;
        # the position is then replaced by tokenizer.model_max_length.
        if first_token is None:
            first_token = tokenizer.model_max_length
        if last_token is None:
            last_token = tokenizer.model_max_length
        token_starts.append(first_token)
        token_ends.append(last_token)
    encodings.update({'start_positions': token_starts, 'end_positions': token_ends})

In [13]:
# Add 'start_positions' / 'end_positions' to both encodings (in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)
# Confirm the new fields are present and as long as the existing ones.
for key, val in train_encodings.items():
    print(key, len(val))

input_ids 86821
token_type_ids 86821
attention_mask 86821
start_positions 86821
end_positions 86821


In [14]:
class GetDataset(Dataset):
    """Dataset view over tokenizer encodings, optionally capped to a subset.

    Args:
        encodings: mapping from field name to a per-example sequence; it must
            contain 'input_ids', which defines the number of examples.
        max_samples: upper bound on the reported length. Defaults to 500 so
            that training fits on a small GPU; pass ``None`` to use every
            example.
    """

    def __init__(self, encodings, max_samples=500):
        self.encodings = encodings
        self.max_samples = max_samples

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of usable examples, never more than are available."""
        available = len(self.encodings['input_ids'])
        if self.max_samples is None:
            return available
        # Clamp to the real size: a fixed 500 would make the DataLoader request
        # indices that do not exist when fewer examples are present.
        return min(self.max_samples, available)

In [15]:
# Wrap both encodings as torch datasets.
train_data = GetDataset(train_encodings)
val_data = GetDataset(val_encodings)
# Report the dataset sizes and show one example as a dict of tensors.
print(len(train_data), len(val_data), train_data[42])

500 500 {'input_ids': tensor([  101, 20773, 21025, 19358, 22815,  2001,  2141,  1999,  5395,  1010,
         3146,  1010,  2000,  8292,  4244, 10196,  5754,  1000, 11958,  1000,
        22815,  1006,  7663, 20289,  2378,  3401,  1007,  1010,  1037,  2606,
        16200, 18116,  1998, 11090,  3954,  1010,  1998, 25436, 22815,  1010,
         1037,  1060, 10624,  2595,  4341,  3208,  1012, 20773,  1005,  1055,
         2171,  2003,  1037,  7050,  2000,  2014,  2388,  1005,  1055, 10494,
         2171,  1012, 20773,  1005,  1055,  3920,  2905, 14017, 22043,  2003,
         2036,  1037,  3220,  1998,  1037,  2280,  2266,  1997, 10461,  1005,
         1055,  2775,  1012, 25436,  2003,  3060,  1011,  2137,  1010,  2096,
        11958,  2003,  1997,  5773, 21414,  6934,  1006,  2007,  3060,  1010,
         3128,  2137,  1010,  2413,  1010,  6187, 19792,  1010,  1998,  6802,
         3493,  1998,  3009, 11377,  1007,  1012,  2083,  2014,  2388,  1010,
        20773,  2003,  1037, 12608,  1997,

In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_epochs = 10
batch_size = 16
# Fine-tuning BERT needs a small step size: the BERT authors recommend 2e-5 to 5e-5.
# The previous 3e-4 is an order of magnitude larger and tends to make fine-tuning diverge.
lr = 3e-5
print(device)

cuda


In [None]:
# Load the 'bert-base-uncased' checkpoint into a question-answering model and move it to `device`.
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased").to(device)
# Adam over all model parameters, using the notebook-level learning rate `lr`.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [18]:
# Batch both datasets; both loaders shuffle and use two worker processes with pinned memory.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
# Pull a single training batch to inspect its tensor shapes.
batch = next(iter(train_loader))
print(len(train_loader), len(val_loader), batch['input_ids'].shape, batch['attention_mask'].shape, batch['start_positions'].shape, batch['end_positions'].shape)

32 32 torch.Size([16, 512]) torch.Size([16, 512]) torch.Size([16]) torch.Size([16])


In [19]:
# Sanity check: run the sample batch through the model once and inspect the output.
input_ids = batch['input_ids'].to(device)
# The tokenizer emits token_type_ids to tell the two sequences (context / question) apart.
# BERT uses them as segment embeddings, so they must be forwarded; otherwise the model
# silently treats the whole input as a single segment.
token_type_ids = batch['token_type_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
start_positions = batch['start_positions'].to(device)
end_positions = batch['end_positions'].to(device)
outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, start_positions=start_positions, end_positions=end_positions)
print(outputs)

QuestionAnsweringModelOutput(loss=tensor(6.2325, device='cuda:0', grad_fn=<DivBackward0>), start_logits=tensor([[-2.2985e-01,  8.5316e-05, -2.8801e-01,  ..., -6.6056e-02,
         -9.5035e-02, -2.8879e-01],
        [-1.5738e-01, -1.5655e-01, -3.6717e-01,  ...,  1.9020e-01,
         -4.4244e-03, -1.2919e-02],
        [-1.4702e-01, -1.0066e-01, -6.2619e-02,  ..., -1.1542e-01,
         -1.5171e-01, -1.8741e-01],
        ...,
        [-1.2539e-01,  1.9737e-01, -3.4023e-01,  ..., -2.2637e-01,
         -1.3013e-01, -9.2296e-02],
        [-9.6327e-02,  9.6213e-02, -5.1269e-01,  ..., -7.0526e-02,
         -2.1700e-01, -1.1765e-01],
        [-2.5480e-01, -1.0072e-01, -9.5956e-02,  ..., -3.4874e-01,
         -2.4745e-01, -2.0516e-01]], device='cuda:0',
       grad_fn=<SqueezeBackward1>), end_logits=tensor([[-0.1982,  0.3241, -0.4594,  ..., -0.5038, -0.5163, -0.0474],
        [-0.1607,  0.0071,  0.0885,  ..., -0.4400, -0.2835, -0.2438],
        [-0.1918,  0.0542, -0.0493,  ..., -0.4441, -0.2341, 

In [20]:
def loop(model, loader, is_train, epoch=None):
    """Run one full pass over ``loader``, training or evaluating ``model``.

    Args:
        model: question-answering model returning the loss as its first output
            when start/end positions are supplied.
        loader: iterable of batches (dicts of tensors) with 'input_ids',
            'attention_mask', 'start_positions', 'end_positions' and,
            optionally, 'token_type_ids'.
        is_train: when True, gradients are enabled and the notebook-level
            ``optimizer`` is stepped after every batch; when False the model
            is only evaluated.
        epoch: label shown in the progress bar. When omitted, the
            notebook-level ``epoch`` variable is used if it exists.

    Returns:
        The mean batch loss of the pass as a float (NaN for an empty loader).

    Note:
        Relies on the notebook-level ``device`` and ``optimizer``.
    """
    if epoch is None:
        # Existing callers rely on a global `epoch`; fall back to it without
        # raising NameError when the function is used outside the training cell.
        epoch = globals().get('epoch', '?')
    model.train(is_train)
    losses = []
    pbar = tqdm(loader, total=len(loader))
    # Iterate over the progress bar itself, otherwise it never advances.
    for batch in pbar:
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'start_positions': batch['start_positions'].to(device),
            'end_positions': batch['end_positions'].to(device),
        }
        # Forward the segment ids when the tokenizer produced them, so BERT can
        # tell the context apart from the question.
        if 'token_type_ids' in batch:
            inputs['token_type_ids'] = batch['token_type_ids'].to(device)
        with torch.set_grad_enabled(is_train):
            outputs = model(**inputs)
            loss = outputs[0]
        losses.append(loss.item())
        if is_train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        pbar.set_description(f'epoch={epoch}, train={int(is_train)}, loss={np.mean(losses):.4f}')
    return float(np.mean(losses)) if losses else float('nan')

In [None]:
# run this cell when you get a better GPU
for epoch in range(n_epochs):
    loop(model, train_loader, True)
    loop(model, val_loader, False)

In [38]:
@torch.no_grad()
def get_answer(context, question):
    """Predict the answer span for ``question`` inside ``context``.

    Args:
        context: passage that is expected to contain the answer.
        question: question about the passage.

    Returns:
        The decoded answer text, or a fixed explanatory message when the
        predicted start token lies after the predicted end token.

    Note:
        Uses the notebook-level ``model``, ``tokenizer`` and ``device``.
    """
    model.eval()
    # return_tensors='pt' yields batched tensors for every field the tokenizer
    # produces (including token_type_ids), so they can be forwarded as-is.
    encoding = tokenizer(context, question, truncation=True, return_tensors='pt').to(device)
    # No label positions are passed: only the logits are needed at inference time.
    output = model(**encoding)
    # argmax over the raw logits equals argmax over their softmax.
    start_idx = output.start_logits.argmax(-1).item()
    end_idx = output.end_logits.argmax(-1).item()
    if start_idx > end_idx:
        return 'start_idx > end_idx, could not answer the question'
    # The end index is inclusive (training labels point at the last answer token),
    # so the slice has to extend one past it.
    answer_ids = encoding['input_ids'][0, start_idx:end_idx + 1].tolist()
    # decode() merges word pieces back into readable text instead of leaving '##' fragments.
    return tokenizer.decode(answer_ids, skip_special_tokens=True)

In [39]:
# Try the model on a single hand-written example.
context = "Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\"."
question = "When did Beyonce start becoming popular?"
answer = get_answer(context, question)
print(answer)  # prints the 'start_idx > end_idx' message here because the model has not been fine-tuned yet

start_idx > end_idx, could not answer the question
