<a href="https://colab.research.google.com/github/tienhuynh96/End-to-end-Question-Answering/blob/main/Question_Answering_Extractive_Approach_LSTM_based_SQuAD_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Extractive Question Answering with an LSTM-based Model***

`input_text = question + ' <sep> ' + context`

Backbone: bidirectional LSTM (Bi-LSTM)

## **1. Import libraries and create temp datasets**

In [None]:
!pip install datasets



In [None]:
import numpy as np
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset



In [None]:
# Load the SQuAD training split and keep only shard 0 of 40,
# i.e. a small subset that keeps the rest of the notebook fast.
qa_dataset = load_dataset('squad', split='train').shard(num_shards=40, index=0)
qa_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 2190
})

In [None]:
# Preview the context of the first example.
# Index the row first: `qa_dataset['context'][0]` has to go through the whole
# 'context' column just to show one entry, and row-first access matches the
# `qa_dataset[0][...]` style used in the later cells.
qa_dataset[0]['context']

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [None]:

# Number of examples (rows) in the sharded dataset
data_size = len(qa_dataset)
data_size

2190

In [None]:
# Shape of the dataset as reported by NumPy: (number of rows, number of features)
np.shape(qa_dataset)

(2190, 5)

## **2. Build vocabulary**

In [None]:
import re

# Replace special characters with spaces.
# Regex notes for the pattern used below:
#   \w     any word character (letters, digits, underscore)
#   \s     any whitespace character (space, tab, line break)
#   [^...] negated character class: anything NOT listed inside the brackets
# => [^\w\s] matches every character that is neither a word character nor whitespace.
def text_normalize(text):
  """Return `text` with each non-word, non-whitespace character replaced by one space.

  Every matched character is replaced individually, so the result has the
  same length as the input (runs of punctuation become runs of spaces).
  """
  special_chars = re.compile(r'[^\w\s]')
  return special_chars.sub(' ', text)


# Tokenizer: torchtext's built-in 'basic_english' tokenizer
tokenizer = get_tokenizer('basic_english')

# Generator required by `build_vocab_from_iterator`: it must yield one list of
# tokens per example so the vocabulary can be counted from them.
def yield_tokens(data):
  """Yield the token list of '<context> <sep> <question>' for every item in `data`.

  Each item is expected to expose 'context' and 'question' entries; both
  are normalized with `text_normalize` before tokenization.
  """
  for item in data:
    context_text = text_normalize(item['context'])
    question_text = text_normalize(item['question'])
    yield tokenizer(context_text + ' <sep> ' + question_text)

# Create the vocabulary from all context/question tokens.
# The special tokens are listed first in `specials`.
vocab = build_vocab_from_iterator(
    yield_tokens(qa_dataset),
    specials = ['<unk>','<pad>','<bos>','<eos>','<sep>']
)

# Any token that is not in the vocabulary is mapped to the index of '<unk>'
vocab.set_default_index(vocab['<unk>'])

# Show only a compact summary: dumping the full string-to-index mapping
# (`vocab.get_stoi()`) floods the cell output with tens of thousands of entries.
print(f'Vocabulary size: {len(vocab)}')
vocab.get_itos()[:20]


{'하': 26910,
 '큰': 26909,
 '소': 26904,
 '사람': 26903,
 '나라이름': 26900,
 '魄': 26899,
 '魂': 26898,
 '金陵邑': 26896,
 '越城': 26891,
 '豊臣秀吉': 26890,
 '義皇帝': 26888,
 '禅': 26884,
 '现代汉语通用字表': 26883,
 '浙江': 26879,
 '法王': 26878,
 '汉语水平考试': 26876,
 '水': 26874,
 '校尉': 26873,
 '李閏': 26872,
 '方块字': 26869,
 '平安': 26867,
 '小': 26866,
 '唐入り': 26862,
 '北軍': 26860,
 '凹田': 26859,
 '冶城': 26858,
 '下': 26855,
 'トワイライトプリンセス': 26853,
 'ゼルダの伝説': 26850,
 'ἥλιος': 26849,
 'โรงเร': 26844,
 'สลาม': 26843,
 'ยนศาสนาอ': 26842,
 'ब': 26841,
 'نصراني': 26839,
 'نصارى': 26838,
 'العربية': 26834,
 'языка': 26830,
 'хийума': 26828,
 'стиль': 26826,
 'русского': 26825,
 'рмонтов': 26824,
 'пу': 26823,
 'никола': 26822,
 'насекомое': 26821,
 'н': 26820,
 'мок': 26819,
 'михаи': 26818,
 'кий': 26815,
 'замо': 26811,
 'дов': 26809,
 'голь': 26806,
 'высо': 26804,
 'χαλκός': 26799,
 'φαναῖος': 26797,
 'τεχνικά': 26795,
 'πικρό': 26794,
 'πάνορμος': 26793,
 'λύκη': 26792,
 'λύκειος': 26791,
 'λύκειο': 26790,
 'λόγος': 26789,
 'λυκ

In [None]:
# Sanity check: tokenize a short sentence and map every token to its vocabulary id.
# Tokens missing from the vocabulary fall back to the default ('<unk>') index.
tokenizer = get_tokenizer('basic_english')

text = 'I love AIVN'
tokens = [vocab[word] for word in tokenizer(text)]
print(tokens)

[127, 1081, 0]


## **3. Create vectorize function**

In [None]:
# Padding / truncation settings
# MAX_SEQ_LENGTH: target length passed to pad_and_truncate for every vectorized input
# PAD_IDX: vocabulary index of the '<pad>' token, used to fill short sequences
MAX_SEQ_LENGTH = 512
PAD_IDX = vocab['<pad>']

# Define pad and truncate function
def pad_and_truncate(input_ids, max_seq_len, pad_idx=None):
  """Return a copy of `input_ids` that is exactly `max_seq_len` items long.

  Sequences longer than `max_seq_len` are cut at the end; shorter ones are
  right-padded with `pad_idx`. The caller's list is never modified (the
  previous implementation extended it in place with `+=`).

  Args:
    input_ids: list of token ids.
    max_seq_len: desired output length.
    pad_idx: id used for padding; defaults to the module-level PAD_IDX.

  Returns:
    A new list of length `max_seq_len`.
  """
  if pad_idx is None:
    pad_idx = PAD_IDX

  # Slicing always creates a new list, so the argument stays untouched
  trimmed = list(input_ids[:max_seq_len])
  # A non-positive multiplier yields an empty list, so no padding is added then
  return trimmed + [pad_idx] * (max_seq_len - len(trimmed))

In [None]:
# Sanity check for pad_and_truncate: vectorize a short sentence,
# then pad it up to MAX_SEQ_LENGTH and print both versions.
MAX_SEQ_LENGTH = 512
PAD_IDX = vocab['<pad>']

text = 'I love AIVN'
tokens = [vocab[word] for word in tokenizer(text)]
print(tokens)

padded_tokens = pad_and_truncate(tokens, MAX_SEQ_LENGTH)
print(padded_tokens)


[127, 1081, 0]
[127, 1081, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
# Define vectorize function
def _find_sublist(sequence, pattern, start=0):
  """Return the first index >= `start` where `pattern` occurs contiguously in `sequence`, or -1."""
  span = len(pattern)
  if span == 0:
    return -1
  for begin in range(max(start, 0), len(sequence) - span + 1):
    if sequence[begin:begin + span] == pattern:
      return begin
  return -1


def vectorize(question, context, answer):
  """Convert a (question, context, answer) triple into model inputs and span labels.

  The input sequence is the token ids of '<question> <sep> <context>',
  padded or truncated to MAX_SEQ_LENGTH. The labels are the positions of
  the first and last answer token inside that sequence.

  The answer is located by matching the complete answer token sequence inside
  the context part of the input. Matching only the first answer token
  anywhere in the input (the previous behaviour) could point at the question
  or at an earlier, unrelated occurrence of that token, and could produce an
  end position beyond the sequence length after truncation.

  Args:
    question: question text.
    context: context passage text.
    answer: answer text expected to appear in the context.

  Returns:
    Tuple of torch.long tensors (input_ids, start_positions, end_positions).
    Both positions are 0 when the answer cannot be found in the (possibly
    truncated) context.
  """
  question_text = text_normalize(question)
  input_text = question_text + ' <sep> ' + text_normalize(context)
  input_ids = [vocab[token] for token in tokenizer(input_text)]
  input_ids = pad_and_truncate(input_ids, MAX_SEQ_LENGTH)

  answer_ids = [vocab[token] for token in tokenizer(text_normalize(answer))]

  # The context starts right after the question tokens and the '<sep>' token
  context_start = len(tokenizer(question_text)) + 1

  start_positions = _find_sublist(input_ids, answer_ids, context_start)
  if start_positions >= 0:
    # A full match lies inside input_ids, so the end index is always in range
    end_positions = start_positions + len(answer_ids) - 1
  else:
    # Answer missing, empty, or cut off by truncation: fall back to position 0
    start_positions = 0
    end_positions = 0

  input_ids = torch.tensor(input_ids, dtype = torch.long)
  start_positions = torch.tensor(start_positions, dtype = torch.long)
  end_positions = torch.tensor(end_positions, dtype = torch.long)

  return input_ids, start_positions, end_positions



In [None]:
# Show the raw question, context and first reference answer of example 0
print(qa_dataset[0]['question'])
print(qa_dataset[0]['context'])
print(qa_dataset[0]['answers']['text'][0])
# (the answer is printed as stored in the dataset, i.e. before text_normalize)



To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Saint Bernadette Soubirous


In [None]:
# Sanity check for vectorize: encode example 0 and print the resulting
# input ids together with the start/end labels of its first answer.
MAX_SEQ_LENGTH = 512
input_ids, start_positions, end_positions  = vectorize(
    qa_dataset[0]['question'],
    qa_dataset[0]['context'],
    qa_dataset[0]['answers']['text'][0]
)

print(input_ids)
print(start_positions)
print(end_positions)

tensor([    9,  1255,    47,     5,  1419,   586,  8194,  1477,     8,  4079,
            8,  9275,   243,     4, 10671,     5,   187,    33,    10,   766,
         1325,  5137,     5,   370,   379,    16,  1390,  4217,    12,    10,
         1992,  3679,     6,     5,  1419,   586,  1185,     8,   849,     6,
            5,   370,   379,     7,  6071,    25,    12,    10,  1278,  3679,
            6,  1104,    17,  1433, 26012,    17,     5,  3308, 26126,  1168,
         2108, 22331,   613,     9,     5,   370,   379,    12,     5,  4141,
            6,     5,  2400,  2347,  1185,  1023,     5,  4141,    12,     5,
        12011,    10, 12671,   221,     6,  3965,     7, 23629,    25,    12,
           10, 13474,     6,     5, 12011,    26,  9275,   243,    79,     5,
         1419,   586, 23758,  1375,     9,   644, 16125, 24745,     8,  4079,
           26,     5,   168,     6,     5,   370,  1700,     7,     8,    10,
          583,   281,    18,  5954,   115,   117,  4428,     7, 

## **4. Create datasets**

In [None]:
class QADataset(Dataset):
  """Map-style dataset that vectorizes question-answering examples on access.

  `data` must support `len()` and integer indexing, and each item must
  provide 'question', 'context' and 'answers' -> 'text' entries.
  """

  def __init__(self, data):
    self.data = data

  def __len__(self):
    """Number of examples in the wrapped data."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return (input_ids, start_positions, end_positions) for example `idx`.

    Only the first reference answer of the example is used.
    """
    record = self.data[idx]
    first_answer = record['answers']['text'][0]
    input_ids, start_positions, end_positions = vectorize(
        record['question'], record['context'], first_answer
    )
    return input_ids, start_positions, end_positions


In [None]:
# Decode: turn a sequence of token ids back into readable text
def decode(input_ids):
  """Look up the token of every id in `input_ids` and join them with single spaces."""
  tokens = []
  for token_id in input_ids:
    tokens.append(vocab.lookup_token(token_id))
  return ' '.join(tokens)

In [None]:
# Dataloader
# Wrap the raw dataset so items are vectorized on access, then serve it in
# shuffled mini-batches of 20 examples.
train_dataset = QADataset(qa_dataset)
train_loader = DataLoader(train_dataset, batch_size = 20, shuffle=True)

In [None]:
# Peek at one batch: [input_ids, start_positions, end_positions]
next(iter(train_loader))

[tensor([[ 162,   10, 7941,  ...,    1,    1,    1],
         [   5,  330,    7,  ...,    1,    1,    1],
         [  40,  605,    5,  ...,    1,    1,    1],
         ...,
         [  21,   24,    5,  ...,    1,    1,    1],
         [  21, 1346, 1006,  ...,    1,    1,    1],
         [  21,   64,  608,  ...,    1,    1,    1]]),
 tensor([ 52,  43,  97, 153,  10,  35,  51,  54,  72,  23,  85,   2,   7,  31,
          17,  37,  15,  23,  17,  28]),
 tensor([ 52,  46,  98, 154,  12,  37,  70,  56,  77,  23,  85,   6,   9,  32,
          18,  39,  15,  34,  18,  30])]

In [None]:
# Decode the first sequence of a (randomly shuffled) batch back to text
decode(next(iter(train_loader))[0][0])


'compared to other countries how large is the us <sep> by total area water as well as land the united states is either slightly larger or smaller than the people s republic of china making it the world s third or fourth largest country china and the united states are smaller than russia and canada in total area but are larger than brazil by land area only exclusive of waters the united states is the world s third largest country after russia and china with canada in fourth whether the us or china is the third largest country by total area depends on two factors 1 the validity of china s claim on aksai chin and trans karakoram tract both these territories are also claimed by india so are not counted and 2 how us calculates its own surface area since the initial publishing of the world factbook the cia has updated the total area of united states a number of times <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

In [None]:

# Inspect a single sample from ONE batch. Looping over every batch printed
# thousands of lines (the output was truncated) without adding information.
input_ids, start_positions, end_positions = next(iter(train_loader))

sample_idx = 1
# Decode once and reuse the text for both printouts
decoded_text = decode(input_ids[sample_idx])
print(f'{decoded_text} \n {input_ids[sample_idx]} \n {start_positions[sample_idx]} \n  {end_positions[sample_idx]} \n')

# Tokens covered by the labelled answer span (end position is inclusive)
print(decoded_text.split()[start_positions[sample_idx]: end_positions[sample_idx]+1])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          354,  1194,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,

## **5. Create models**

In [None]:
# Span-prediction model built on a bidirectional LSTM
class QAModel(nn.Module):
  """Bi-LSTM encoder with two linear heads scoring each token as answer start / end.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_size: hidden size of the LSTM, per direction.
    n_layers: number of stacked LSTM layers.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers):
    super().__init__()

    # Token ids -> dense vectors
    self.input_embedding = nn.Embedding(vocab_size, embedding_dim)

    # Encodes the whole '<question> <sep> <context>' sequence in both directions
    self.lstm = nn.LSTM(
        input_size=embedding_dim,
        hidden_size=hidden_size,
        num_layers=n_layers,
        batch_first=True,
        bidirectional=True,
    )

    # Forward and backward states are concatenated, hence 2 * hidden_size features
    lstm_out_dim = hidden_size * 2
    self.start_linear = nn.Linear(lstm_out_dim, 1)
    self.end_linear = nn.Linear(lstm_out_dim, 1)

  def forward(self, text):
    """Return (start_logits, end_logits), each of shape (batch, seq_len).

    `text` holds token ids of shape (batch, seq_len).
    """
    # (batch, seq_len) -> (batch, seq_len, embedding_dim) -> (batch, seq_len, 2 * hidden_size)
    token_features, _ = self.lstm(self.input_embedding(text))

    # Each head yields (batch, seq_len, 1); squeeze(-1) drops the trailing singleton axis
    start_logits = self.start_linear(token_features).squeeze(-1)
    end_logits = self.end_linear(token_features).squeeze(-1)

    return start_logits, end_logits




In [None]:
# Model parameters
EMBEDDING_DIM = 64
HIDDEN_SIZE = 128
VOCAB_SIZE = len(vocab)
N_LAYERS = 3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = QAModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, N_LAYERS).to(device)

# Smoke test with random token ids of shape (batch=1, seq_len=10).
# Named `dummy_input` so the Python builtin `input` is not shadowed.
dummy_input = torch.randint(0, 10, size=(1, 10)).to(device)
print(dummy_input.shape)

model.eval()
with torch.no_grad():
  start_logits, end_logits = model(dummy_input)

# One logit per input position is expected: (batch, seq_len)
print(start_logits.shape)

torch.Size([1, 10])
torch.Size([1, 10])


## **6. Training models**

In [None]:
# Training hyperparameters
LR = 1e-3
EPOCHS = 15
# Adam updates all model parameters with learning rate LR
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# Cross-entropy over the per-position logits: the target is the index of the
# labelled start (or end) position within the sequence
criterion = nn.CrossEntropyLoss()

In [None]:
# tqdm renders a progress bar, useful for tracking long-running loops
from tqdm import tqdm

# Switch to training mode (the earlier smoke test left the model in eval mode)
model.train()
# One progress-bar step per epoch
for epoch in tqdm(range(EPOCHS)):
  # Per-batch losses of the current epoch, averaged after the inner loop
  train_losses = []

  for idx, (input_ids, start_positions, end_positions) in enumerate(train_loader):
    # Move the batch to the same device as the model
    input_ids = input_ids.to(device)
    start_positions = start_positions.to(device)
    end_positions = end_positions.to(device)

    # Clear gradients left over from the previous step
    optimizer.zero_grad()
    start_logits, end_logits = model(input_ids)

    # Start and end are scored independently; the objective is the mean of both losses
    start_loss = criterion(start_logits, start_positions)
    end_loss = criterion(end_logits, end_positions)
    total_loss = (start_loss + end_loss) / 2

    total_loss.backward()
    optimizer.step()
    # .item() stores a plain Python float rather than the loss tensor
    train_losses.append(total_loss.item())

  # Mean batch loss of this epoch
  train_loss = sum(train_losses) / len(train_losses)
  print(f'EPOCH {epoch +1}\tTraining Loss: {train_loss}')

  7%|▋         | 1/15 [00:04<01:03,  4.56s/it]

EPOCH 1	Training Loss: 4.897293333573775


 13%|█▎        | 2/15 [00:09<01:01,  4.71s/it]

EPOCH 2	Training Loss: 4.555733689394864


 20%|██        | 3/15 [00:13<00:55,  4.61s/it]

EPOCH 3	Training Loss: 4.171222942525691


 27%|██▋       | 4/15 [00:18<00:50,  4.56s/it]

EPOCH 4	Training Loss: 3.655843563513322


 33%|███▎      | 5/15 [00:23<00:46,  4.64s/it]

EPOCH 5	Training Loss: 2.8159552617506547


 40%|████      | 6/15 [00:27<00:41,  4.60s/it]

EPOCH 6	Training Loss: 1.7791858434677124


 47%|████▋     | 7/15 [00:32<00:36,  4.60s/it]

EPOCH 7	Training Loss: 0.9988406953486529


 53%|█████▎    | 8/15 [00:37<00:32,  4.65s/it]

EPOCH 8	Training Loss: 0.5093625824559819


 60%|██████    | 9/15 [00:41<00:27,  4.59s/it]

EPOCH 9	Training Loss: 0.2953043590892445


 67%|██████▋   | 10/15 [00:46<00:23,  4.64s/it]

EPOCH 10	Training Loss: 0.1508714346215129


 73%|███████▎  | 11/15 [00:50<00:18,  4.60s/it]

EPOCH 11	Training Loss: 0.09026917082003572


 80%|████████  | 12/15 [00:55<00:13,  4.56s/it]

EPOCH 12	Training Loss: 0.055852409659630876


 87%|████████▋ | 13/15 [00:59<00:09,  4.61s/it]

EPOCH 13	Training Loss: 0.020423701155761427


 93%|█████████▎| 14/15 [01:04<00:04,  4.57s/it]

EPOCH 14	Training Loss: 0.009936430339637975


100%|██████████| 15/15 [01:08<00:00,  4.59s/it]

EPOCH 15	Training Loss: 0.0031602649780159645





## **7. Test**

In [None]:
# Predict the answer span for one example taken from the dataset
model.eval()
with torch.no_grad():
  # Get data to test
  sample = qa_dataset[4]
  question = sample['question']
  context = sample['context']
  answer = sample['answers']['text'][0]

  # Normalize first so the token counts below match what vectorize feeds the model
  question = text_normalize(question)
  context = text_normalize(context)
  answer = text_normalize(answer)

  # Vectorize: only the input ids are needed for inference, the gold labels are ignored
  input_ids = vectorize(question, context, answer)[0]
  # Add the batch dimension and move to the model's device
  input_ids = input_ids.unsqueeze(0).to(device)

  # Compute start and end logits, shape (1, seq_len)
  start_logits, end_logits = model(input_ids)

  # The context starts after the question tokens and the <sep> token
  context_tokens = tokenizer(context)
  offset = len(tokenizer(question)) + 1
  # Number of context tokens that are actually present in the (truncated) input
  num_context = min(len(context_tokens), start_logits.size(1) - offset)

  if num_context > 0:
    # Take the argmax only over context positions. Positions in the question or
    # in the padding can never be an answer, and clamping them into the context
    # afterwards would report a span the model did not predict.
    context_span = slice(offset, offset + num_context)
    start_position = int(torch.argmax(start_logits[0, context_span]))
    end_position = int(torch.argmax(end_logits[0, context_span]))
  else:
    # No context token fits into the input: force an empty prediction
    start_position, end_position = 0, -1

  # Positions are now relative to the context tokens; the end is inclusive
  if end_position >= start_position:
    predicted_answer_tokens = context_tokens[start_position: end_position + 1]
    predicted_answer = ' '.join(predicted_answer_tokens)

  else:
    predicted_answer = ' '

  print(f'Context: {context}')
  print(f'Question: {question}')
  print(f'Answer: {answer}')
  print(f'Start position: {start_position}')
  print(f'End position: {end_position}')
  print(f'Prediction: {predicted_answer}')

Context: The university owns several centers around the world used for international studies and research  conferences abroad  and alumni support  The university has had a presence in London  England  since 1968  Since 1998  its London center has been based in the former United University Club at 1 Suffolk Street in Trafalgar Square  The center enables the Colleges of Arts   Letters  Business Administration  Science  Engineering and the Law School to develop their own programs in London  as well as hosting conferences and symposia  Other Global Gateways are located in Beijing  Chicago  Dublin  Jerusalem and Rome 
Question: In what year did the Suffolk Street location start to house a Notre Dame facility 
Answer: 1998
Start position: 31
End position: 31
Prediction: 1998


In [None]:
# Predict the answer span for a hand-written question/context pair
model.eval()
with torch.no_grad():
  # Get data to test
  question = "what is Tien from?"
  context = 'Tien is a student and he is from Vietnam'
  answer = 'Vietnam'

  # Normalize first so the token counts below match what vectorize feeds the model
  question = text_normalize(question)
  context = text_normalize(context)
  answer = text_normalize(answer)

  # Vectorize: only the input ids are needed for inference, the gold labels are ignored
  input_ids = vectorize(question, context, answer)[0]
  # Add the batch dimension and move to the model's device
  input_ids = input_ids.unsqueeze(0).to(device)

  # Compute start and end logits, shape (1, seq_len)
  start_logits, end_logits = model(input_ids)

  # The context starts after the question tokens and the <sep> token
  context_tokens = tokenizer(context)
  offset = len(tokenizer(question)) + 1
  # Number of context tokens that are actually present in the (truncated) input
  num_context = min(len(context_tokens), start_logits.size(1) - offset)

  if num_context > 0:
    # Take the argmax only over context positions. Positions in the question or
    # in the padding can never be an answer, and clamping them into the context
    # afterwards would report a span the model did not predict.
    context_span = slice(offset, offset + num_context)
    start_position = int(torch.argmax(start_logits[0, context_span]))
    end_position = int(torch.argmax(end_logits[0, context_span]))
  else:
    # No context token fits into the input: force an empty prediction
    start_position, end_position = 0, -1

  # Positions are now relative to the context tokens; the end is inclusive
  if end_position >= start_position:
    predicted_answer_tokens = context_tokens[start_position: end_position + 1]
    predicted_answer = ' '.join(predicted_answer_tokens)

  else:
    predicted_answer = ' '

  print(f'Context: {context}')
  print(f'Question: {question}')
  print(f'Answer: {answer}')
  print(f'Start position: {start_position}')
  print(f'End position: {end_position}')
  print(f'Prediction: {predicted_answer}')

Context: Tien is a student and he is from Vietnam
Question: what is Tien from 
Answer: Vietnam
Start position: 2
End position: 8
Prediction: a student and he is from vietnam
