<a href="https://colab.research.google.com/github/tienhuynh96/End-to-end-Question-Answering/blob/main/Question_Answering_Extractive_Approach_LSTM_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Extractive Question Answering with an LSTM-based Model***

`input_text = question + ' <sep> ' + context`

Backbone: bidirectional LSTM (Bi-LSTM)

## **1. Import libraries and create temp datasets**

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [None]:
# Toy question-answering dataset.
# Each record is a dict whose keys are, in this order: 'context', 'question', 'answer'.
# To add an example, append one (context, question, answer) tuple below.
qa_dataset = [
    {'context': context, 'question': question, 'answer': answer}
    for context, question, answer in (
        (
            'My name is AIVN and I am from Vietnam.',
            'What is my name?',
            'AIVN',
        ),
        (
            'I love painting and my favorite artist is Vicent Van Gogh.',
            'What is my favorite activity?',
            'painting',
        ),
        (
            'I am studying computer science at the University of Tokyo.',
            'What am I Studying?',
            'computer science',
        ),
        (
            'My favorite book is "To kill a Mockingbird" by Harper Lee.',
            'What is my favorite book?',
            '"To kill a Mockingbird"',
        ),
        (
            'I have a pet dog named Max who loves to play fetch',
            'What is the name of my pet?',
            'Max',
        ),
        (
            'I was born in Paris, but now I live in New york City',
            'Where do I live now?',
            'New York City',
        ),
    )
]

# Number of examples in the dataset (displayed as the cell output)
data_size = len(qa_dataset)
data_size

6

In [None]:
# Shape of the dataset container: a flat list, so a single dimension
np.shape(qa_dataset)

(6,)

## **2. Build vocabulary**

In [None]:
# Tokenizer shared by the vocabulary, vectorize and test cells
# (torchtext's 'basic_english' tokenizer)
tokenizer = get_tokenizer('basic_english')

# Token-list generator consumed by "build_vocab_from_iterator"
def yield_tokens(data):
  """Yield one token list per example.

  For every item in ``data`` the 'context' and 'question' strings are joined
  with ' <sep> ' in between and passed through the notebook-level ``tokenizer``.
  """
  for example in data:
    joined_text = ' <sep> '.join((example['context'], example['question']))
    yield tokenizer(joined_text)

# Create the vocabulary from the tokenized examples, registering the special tokens
vocab = build_vocab_from_iterator(
    yield_tokens(qa_dataset),
    specials = ['<unk>','<pad>','<bos>','<eos>','<sep>']
)

# Use the index of '<unk>' as the default, so that unknown words are mapped to '<unk>'
vocab.set_default_index(vocab['<unk>'])
# Display the string-to-index (stoi) mapping
vocab.get_stoi()


{'to': 24,
 ',': 25,
 'pet': 21,
 'who': 61,
 'gogh': 39,
 'the': 23,
 'fetch': 37,
 'play': 52,
 'van': 56,
 'now': 19,
 'was': 59,
 'a': 14,
 'name': 13,
 'aivn': 27,
 'i': 5,
 'studying': 22,
 'and': 15,
 'where': 60,
 '<unk>': 0,
 'favorite': 11,
 'by': 32,
 'artist': 28,
 'live': 18,
 '<eos>': 3,
 'harper': 40,
 'dog': 36,
 'loves': 45,
 '<pad>': 1,
 'computer': 34,
 '.': 10,
 'born': 30,
 'is': 6,
 'my': 7,
 'book': 16,
 'science': 53,
 'of': 20,
 '<bos>': 2,
 '<sep>': 4,
 'what': 9,
 'am': 12,
 'named': 48,
 'at': 29,
 'but': 31,
 'in': 17,
 'from': 38,
 'tokyo': 54,
 'city': 33,
 'kill': 42,
 'lee': 43,
 'love': 44,
 '?': 8,
 'do': 35,
 'max': 46,
 'mockingbird': 47,
 'york': 62,
 'new': 49,
 'painting': 50,
 'paris': 51,
 'university': 55,
 'have': 41,
 'vicent': 57,
 'activity': 26,
 'vietnam': 58}

In [None]:
# Sanity check: convert a sample sentence into vocabulary ids
# (re-creates the same 'basic_english' tokenizer as the vocabulary cell)
tokenizer = get_tokenizer('basic_english')

text = 'I love AIVN'
tokens = tokenizer(text)
# From here on `tokens` holds ids, not strings
tokens = [vocab[token] for token in tokens]
print(tokens)

[5, 44, 27]


## **3. Create vectorize function**

In [None]:
# Pad and truncate settings: fixed sequence length and the id of the '<pad>' token
MAX_SEQ_LENGTH = 22
PAD_IDX = vocab['<pad>']

# Define pad and truncate function
def pad_and_truncate(input_ids, max_seq_len, pad_idx=None):
  """Return a copy of ``input_ids`` with exactly ``max_seq_len`` entries.

  Longer sequences are cut at ``max_seq_len``; shorter ones are right-padded.
  The caller's list is never modified (the previous implementation padded it
  in place with ``+=``).

  Args:
    input_ids: list of token ids.
    max_seq_len: target length of the returned list.
    pad_idx: id used for padding. Defaults to the notebook-level ``PAD_IDX``.

  Returns:
    A new list of length ``max_seq_len``.
  """
  if pad_idx is None:
    # Looked up lazily so that an explicit pad_idx works without the global
    pad_idx = PAD_IDX

  clipped = list(input_ids[:max_seq_len])
  return clipped + [pad_idx] * (max_seq_len - len(clipped))

In [None]:
# Sanity check for pad_and_truncate
# (re-declares the padding settings so the cell can be read on its own)
MAX_SEQ_LENGTH = 22
PAD_IDX = vocab['<pad>']

text = 'I love AIVN'
tokens = tokenizer(text)
# From here on `tokens` holds ids, not strings
tokens = [vocab[token] for token in tokens]
print(tokens)
# Pad the three ids up to MAX_SEQ_LENGTH
padded_tokens = pad_and_truncate(tokens, MAX_SEQ_LENGTH)
print(padded_tokens)


[5, 44, 27]
[5, 44, 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# Define vectorize function
def vectorize(question, context, answer):
  """Convert one QA example into model inputs and answer-span labels.

  The input text is ``question + ' <sep> ' + context``; it is tokenized,
  mapped to vocabulary ids and padded/truncated to ``MAX_SEQ_LENGTH``.
  The label is the position of the complete answer token sequence inside the
  context part of the input (after the '<sep>' token). Matching the whole
  span, and only in the context, avoids picking up a stray occurrence of the
  first answer token in the question or earlier in the context.

  Args:
    question: question string.
    context: context string that contains the answer.
    answer: answer string, expected to appear in ``context``.

  Returns:
    Tuple ``(input_ids, start_positions, end_positions)`` of ``torch.long``
    tensors: the id sequence of length ``MAX_SEQ_LENGTH`` and two scalar
    tensors holding the start and (inclusive) end index of the answer span.

  Raises:
    ValueError: if the answer has no tokens, or if the full answer span is not
      present in the context part of the (possibly truncated) input.
  """
  input_text = question + ' <sep> ' + context
  input_ids = [vocab[token] for token in tokenizer(input_text)]
  input_ids = pad_and_truncate(input_ids, MAX_SEQ_LENGTH)

  answer_ids = [vocab[token] for token in tokenizer(answer)]
  if not answer_ids:
    raise ValueError(f'Answer has no tokens: {answer!r}')

  # The context starts right after '<sep>'; if '<sep>' was truncated away
  # there is no context left to search in.
  sep_id = vocab['<sep>']
  context_start = input_ids.index(sep_id) + 1 if sep_id in input_ids else len(input_ids)

  # Find the first position in the context where the whole answer matches
  span_len = len(answer_ids)
  start_positions = next(
      (
          pos
          for pos in range(context_start, len(input_ids) - span_len + 1)
          if input_ids[pos:pos + span_len] == answer_ids
      ),
      None
  )
  if start_positions is None:
    raise ValueError(
        f'Answer {answer!r} was not found in the first {MAX_SEQ_LENGTH} tokens of the input'
    )
  # End position is inclusive
  end_positions = start_positions + span_len - 1

  input_ids = torch.tensor(input_ids, dtype = torch.long)
  start_positions = torch.tensor(start_positions, dtype = torch.long)
  end_positions = torch.tensor(end_positions, dtype = torch.long)

  return input_ids, start_positions, end_positions



In [None]:
# Sanity check for vectorize on the first dataset example
MAX_SEQ_LENGTH = 22
input_ids, start_positions, end_positions  = vectorize(
    qa_dataset[0]['question'],
    qa_dataset[0]['context'],
    qa_dataset[0]['answer']
)

# Padded id sequence, followed by the start and end index of the answer span
print(input_ids)
print(start_positions)
print(end_positions)

tensor([ 9,  6,  7, 13,  8,  4,  7, 13,  6, 27, 15,  5, 12, 38, 58, 10,  1,  1,
         1,  1,  1,  1])
tensor(9)
tensor(9)


## **4. Create datasets**

In [None]:
class QADataset(Dataset):
  """Dataset wrapper that vectorizes QA examples when they are accessed.

  ``data`` is an indexable collection of dicts with the keys 'question',
  'context' and 'answer'.
  """

  def __init__(self,data):
    self.data = data

  def __len__(self):
    """Number of examples in the wrapped collection."""
    return len(self.data)

  def __getitem__(self,idx):
    """Return ``(input_ids, start_positions, end_positions)`` for example ``idx``."""
    example = self.data[idx]
    token_ids, span_start, span_end = vectorize(
        example['question'],
        example['context'],
        example['answer'],
    )
    return token_ids, span_start, span_end


In [None]:
# Decode: turn a sequence of token ids back into text
def decode(input_ids):
  """Look up every id in ``vocab`` and join the resulting tokens with single spaces."""
  words = []
  for token_id in input_ids:
    words.append(vocab.lookup_token(token_id))
  return ' '.join(words)

In [None]:
# Wrap the toy dataset and iterate over it in shuffled mini-batches of two examples
train_dataset = QADataset(qa_dataset)
train_loader = DataLoader(train_dataset, batch_size = 2, shuffle=True)

In [None]:
# Peek at one batch: [input_ids, start_positions, end_positions]
next(iter(train_loader))

[tensor([[60, 35,  5, 18, 19,  8,  4,  5, 59, 30, 17, 51, 25, 31, 19,  5, 18, 17,
          49, 62, 33,  1],
         [ 9,  6,  7, 11, 26,  8,  4,  5, 44, 50, 15,  7, 11, 28,  6, 57, 56, 39,
          10,  1,  1,  1]]),
 tensor([18,  9]),
 tensor([20,  9])]

In [None]:
# Decode the first sequence of a newly drawn batch
# (the loader shuffles, so this is not necessarily the batch displayed earlier)
decode(next(iter(train_loader))[0][0])


'what is the name of my pet ? <sep> i have a pet dog named max who loves to play fetch <pad>'

In [None]:

# Inspect the second example of every batch: decoded text, ids, labels,
# and the tokens selected by the (inclusive) start/end labels.
for batch in train_loader:
  input_ids, start_positions, end_positions = batch
  decoded_text = decode(input_ids[1])
  print (f'{decoded_text} \n {input_ids[1]} \n {start_positions[1]} \n  {end_positions[1]} \n')

  # Convert the label tensors to plain ints before slicing the token list
  span_start = start_positions[1].item()
  span_end = end_positions[1].item()
  print(decoded_text.split()[span_start: span_end + 1])

SyntaxError: invalid syntax (<ipython-input-116-722dfffa30fe>, line 5)

## **5. Create models**

In [None]:
# Bi-LSTM model for extractive question answering
class QAModel(nn.Module):
  """Embedding -> bidirectional LSTM -> two linear heads (start / end).

  For every position of the input sequence the model produces one start logit
  and one end logit.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers):
    super().__init__()
    # Token id -> dense vector
    self.input_embedding = nn.Embedding(vocab_size, embedding_dim)

    # Encoder over the joined question/context sequence
    self.lstm = nn.LSTM(
        input_size=embedding_dim,
        hidden_size=hidden_size,
        num_layers=n_layers,
        batch_first=True,
        bidirectional=True,
    )

    # Forward and backward states are concatenated, hence twice the hidden size
    encoder_features = 2 * hidden_size
    self.start_linear = nn.Linear(encoder_features, 1)
    self.end_linear = nn.Linear(encoder_features, 1)

  def forward(self, text):
    """Return ``(start_logits, end_logits)``, each of shape (batch, seq_len)."""
    embedded = self.input_embedding(text)

    # Keep only the per-position outputs; the final (h, c) states are not used
    encoded = self.lstm(embedded)[0]

    # Each head yields (batch, seq_len, 1); drop the trailing singleton dimension
    start_logits = self.start_linear(encoded).squeeze(-1)
    end_logits = self.end_linear(encoded).squeeze(-1)

    return start_logits, end_logits




In [None]:
# Model parameters
EMBEDDING_DIM = 64
HIDDEN_SIZE = 128
VOCAB_SIZE = len(vocab)
N_LAYERS = 2

model = QAModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, N_LAYERS)

# Smoke test with random token ids of shape (batch=1, sequence length=10).
# Named `dummy_input` rather than `input` so the Python built-in `input()`
# is not shadowed for the rest of the kernel session.
dummy_input = torch.randint(0, 10, size=(1, 10))
print(dummy_input.shape)

# Forward pass only: eval mode, no gradient tracking
model.eval()
with torch.no_grad():
  start_logits, end_logits = model(dummy_input)

# One logit per input position
print(start_logits.shape)

torch.Size([1, 10])
torch.Size([1, 10])


## **6. Training models**

In [None]:
# Training hyperparameters: learning rate and number of passes over the data
LR = 1e-3
EPOCHS = 20
# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# Cross-entropy loss, applied in the training loop to the start and end logits
criterion = nn.CrossEntropyLoss()

In [None]:

# Switch the model to training mode before optimising
model.train()
for _ in range(EPOCHS):
  for idx, (input_ids, start_positions, end_positions) in enumerate(train_loader):
    # Clear the gradients accumulated in the previous step
    optimizer.zero_grad()

    start_logits, end_logits = model(input_ids)

    # The logits have one entry per sequence position, so the target "class"
    # is the index of the start (resp. end) token
    start_loss = criterion(start_logits, start_positions)
    end_loss = criterion(end_logits, end_positions)
    # Average the two losses into a single objective
    total_loss = (start_loss + end_loss) / 2

    total_loss.backward()
    optimizer.step()
    # Log the loss of every batch
    print(total_loss.item())

3.088099241256714
3.088763952255249
3.076913356781006
2.989846706390381
3.003505229949951
2.935987710952759
2.9173898696899414
2.842987060546875
2.7384867668151855
2.728424072265625
2.640113353729248
2.4949307441711426
2.428497791290283
2.2472143173217773
2.2086668014526367
2.0647835731506348
1.758601188659668
1.7666841745376587
1.4358246326446533
1.2988473176956177
1.5729725360870361
1.1286625862121582
1.2045471668243408
0.9661874771118164
1.1532917022705078
0.7436903119087219
0.7646517157554626
0.6815716028213501
0.7659229636192322
0.6826803684234619
0.6893978118896484
0.42751944065093994
0.6361417770385742
0.4645516872406006
0.46301761269569397
0.3367407023906708
0.33089935779571533
0.21035964787006378
0.24283066391944885
0.2306172400712967
0.14449387788772583
0.1378726363182068
0.1386144459247589
0.07431986927986145
0.10917755961418152
0.06932646036148071
0.12645593285560608
0.012542685493826866
0.09758003056049347
0.009313525632023811
0.030084727331995964
0.05382411181926727
0.015

## **7. Test**

In [None]:
# Run the trained model on one example and extract the predicted answer span
model.eval()
with torch.no_grad():
  sample = qa_dataset[2]
  # Read the fields by key instead of relying on the order of the dict's keys
  context = sample['context']
  question = sample['question']
  answer = sample['answer']
  input_ids, start_positions, end_positions = vectorize(question, context, answer)

  # Add the batch dimension
  input_ids = input_ids.unsqueeze(0)

  # Compute start and end logits
  start_logits, end_logits = model(input_ids)

  # The model predicts positions in "question <sep> context";
  # the offset is the number of question tokens plus one for '<sep>'
  offset = len(tokenizer(question)) + 1
  start_position = torch.argmax(start_logits, dim=1).item() - offset
  end_position = torch.argmax(end_logits, dim=1).item() - offset

  # Tokenize the context once and clamp both positions to its bounds
  context_tokens = tokenizer(context)
  start_position = max(start_position, 0)
  end_position = min(end_position, len(context_tokens) - 1)

  # A valid span needs the end to be at or after the start
  if end_position >= start_position:
    # Extract the predicted answer span (end position is inclusive)
    predicted_answer_tokens = context_tokens[start_position: end_position + 1]
    predicted_answer = ' '.join(predicted_answer_tokens)

  else:
    predicted_answer = ' '

  print(f'Context: {context}')
  print(f'Question: {question}')
  print(f'Start position: {start_position}')
  print(f'End position: {end_position}')
  print(f'Prediction: {predicted_answer}')

Context: I am studying computer science at the University of Tokyo.
Question: What am I Studying?
Start position: 3
End position: 4
Prediction: computer science
