<a href="https://colab.research.google.com/github/tienhuynh96/NLP_Projects/blob/main/Demo_Question_Answering_Classification_Approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **1. Import libraries and create temp dataset**

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader



In [None]:
# Toy question-answering dataset.
# Each record holds a context passage, a question about it, and the answer,
# which is written exactly as it appears inside the context.
qa_dataset = [
    {
        'context': 'My name is AIVN and I am from Vietnam.',
        'question': 'What is my name?',
        'answer': 'AIVN'
    },
    {
        'context': 'I love painting and my favorite artist is Vicent Van Gogh.',
        'question': 'What is my favorite activity?',
        'answer': 'painting'
    },
    {
        'context': 'I am studying computer science at the University of Tokyo.',
        'question': 'What am I Studying?',
        'answer': 'computer science'
    },
    {
        'context': 'My favorite book is "To kill a Mockingbird" by Harper Lee.',
        'question': 'What is my favorite book?',
        'answer': '"To kill a Mockingbird"'
    },
    {
        'context': 'I have a pet dog named Max who loves to play fetch',
        'question': 'What is the name of my pet?',
        'answer': 'Max'
    },
    {
        # Casing fixed ('New york City' -> 'New York City') so that the answer
        # string is a literal substring of its context, like every other record.
        'context': 'I was born in Paris, but now I live in New York City',
        'question': 'Where do I live now?',
        'answer': 'New York City'
    },
]

# Number of records in the dataset (displayed as the cell output)
data_size = len(qa_dataset)
data_size

6

In [None]:
# Shape of the dataset when viewed as an array: one axis, one entry per record
np.asarray(qa_dataset).shape

(6,)

## **2. Vectorization**

In [None]:
# Sanity check: build the 'basic_english' tokenizer and run it on a short
# sentence; the resulting token list is displayed as the cell output.
tokenizer = get_tokenizer('basic_english')

text = 'I love AIVN'
tokenizer(text)

['i', 'love', 'aivn']

In [None]:
# Tokenizer used to split contexts and questions into tokens
# (the 'basic_english' tokenizer returned by torchtext's get_tokenizer).
tokenizer = get_tokenizer('basic_english')

# Generator of token lists, in the form expected by "build_vocab_from_iterator".
# Every record contributes the tokens of its context followed by its question.
def yield_tokens(data):
  """Yield one token list per record in ``data``.

  Each record is indexed with the keys 'context' and 'question'; the two
  strings are joined with a single space and passed to the module-level
  ``tokenizer``.
  """
  for record in data:
    combined_text = ' '.join((record['context'], record['question']))
    yield tokenizer(combined_text)

# Build the vocabulary from the tokenized context + question of every record.
# The special tokens are registered through the `specials` argument.
vocab = build_vocab_from_iterator(
    yield_tokens(qa_dataset),
    specials = ['<unk>','<pad>','<bos>','<eos>','<sep>']
)

# Use the index of '<unk>' as the default index, so that looking up a token
# that is not in the vocabulary returns the '<unk>' index.
vocab.set_default_index(vocab['<unk>'])
# Display the token -> index mapping (stoi stands for "string to index")
vocab.get_stoi()


{'to': 24,
 ',': 25,
 'pet': 21,
 'who': 61,
 'gogh': 39,
 'the': 23,
 'fetch': 37,
 'play': 52,
 'van': 56,
 'now': 19,
 'was': 59,
 'a': 14,
 'name': 13,
 'aivn': 27,
 'i': 5,
 'studying': 22,
 'and': 15,
 'where': 60,
 '<unk>': 0,
 'favorite': 11,
 'by': 32,
 'artist': 28,
 'live': 18,
 '<eos>': 3,
 'harper': 40,
 'dog': 36,
 'loves': 45,
 '<pad>': 1,
 'computer': 34,
 '.': 10,
 'born': 30,
 'is': 6,
 'my': 7,
 'book': 16,
 'science': 53,
 'of': 20,
 '<bos>': 2,
 '<sep>': 4,
 'what': 9,
 'am': 12,
 'named': 48,
 'at': 29,
 'but': 31,
 'in': 17,
 'from': 38,
 'tokyo': 54,
 'city': 33,
 'kill': 42,
 'lee': 43,
 'love': 44,
 '?': 8,
 'do': 35,
 'max': 46,
 'mockingbird': 47,
 'york': 62,
 'new': 49,
 'painting': 50,
 'paris': 51,
 'university': 55,
 'have': 41,
 'vicent': 57,
 'activity': 26,
 'vietnam': 58}

In [None]:
# Sanity check: tokenize a sentence and map every token to its vocabulary index
tokenizer = get_tokenizer('basic_english')

text = 'I love AIVN'
tokens = [vocab[word] for word in tokenizer(text)]
print(tokens)

[5, 44, 27]


In [None]:
# Answer classes: every distinct answer string in the dataset is one class.
classes = set([item['answer'] for item in qa_dataset])

# The iteration order of a set of strings is not guaranteed to be the same
# from one interpreter run to the next, so enumerate a sorted list instead.
# This keeps the class <-> index mapping stable across kernel restarts.
sorted_classes = sorted(classes)

# Class name -> class index
classes_to_idx = {
    cls_name : idx for idx, cls_name in enumerate(sorted_classes)
}
# Class index -> class name
idx_to_classes = {
    idx : cls_name for idx, cls_name in enumerate(sorted_classes)
}

print(idx_to_classes)

{0: 'AIVN', 1: '"To kill a Mockingbird"', 2: 'New York City', 3: 'Max', 4: 'painting', 5: 'computer science'}


In [None]:
# Padding / truncation settings
max_seq_len = 10  # target sequence length
PAD_IDX = vocab['<pad>']  # vocabulary index of the '<pad>' token
# Equivalent hard-coded value for this vocabulary: PAD_IDX = 1

# define pad and truncate function
def pad_and_truncate(input_ids, max_seq_len, pad_idx=None):
  """Return a new list holding ``input_ids`` fitted to ``max_seq_len`` items.

  Sequences longer than ``max_seq_len`` are cut at the end; shorter ones are
  right-padded with the padding index. The argument is never modified: the
  result is always a fresh list (padding used to extend the caller's list in
  place, while truncation returned a new one).

  Args:
    input_ids: list of token indices.
    max_seq_len: required length of the result.
    pad_idx: index used for padding. When None (the default), the
      module-level PAD_IDX is used.

  Returns:
    A list of exactly ``max_seq_len`` token indices.
  """
  if pad_idx is None:
    pad_idx = PAD_IDX

  # Slicing copies, so the caller's list is left untouched.
  fitted = list(input_ids[:max_seq_len])
  # The multiplier is 0 when the sequence already has the target length.
  return fitted + [pad_idx] * (max_seq_len - len(fitted))

In [None]:
# Sanity check: pad a short tokenized sentence up to max_seq_len
max_seq_len = 10
PAD_IDX = 1

text = 'I love AIVN'
tokens = [vocab[word] for word in tokenizer(text)]
print(tokens)
padded_tokens = pad_and_truncate(tokens, max_seq_len)
print(padded_tokens)


[5, 44, 27]
[5, 44, 27, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# Vectorization: text -> fixed-length tensors of vocabulary indices
def vectorize(question, context):
  """Convert a question/context pair into two index tensors.

  Each text is tokenized with ``tokenizer``, mapped through ``vocab`` and
  fitted with ``pad_and_truncate`` to MAX_QUESTION_LEN (question) or
  MAX_CONTEXT_LEN (context).

  Returns:
    (input_question_ids, input_context_ids): two ``torch.long`` tensors.
  """
  def encode(text, target_len):
    # tokens -> vocabulary indices -> fixed length -> tensor
    ids = [vocab[token] for token in tokenizer(text)]
    return torch.tensor(pad_and_truncate(ids, target_len), dtype = torch.long)

  return encode(question, MAX_QUESTION_LEN), encode(context, MAX_CONTEXT_LEN)



In [None]:
# Fixed lengths that vectorize() applies to questions and contexts
MAX_QUESTION_LEN = 10
MAX_CONTEXT_LEN = 10

# Vectorize the first record, then show both tensors and its answer class index
input_question_ids, input_context_ids = vectorize(qa_dataset[0]['question'], qa_dataset[0]['context'])

print(
    input_question_ids,
    input_context_ids,
    classes_to_idx[qa_dataset[0]['answer']],
    sep='\n'
)

tensor([ 9,  6,  7, 13,  8,  1,  1,  1,  1,  1])
tensor([ 7, 13,  6, 27, 15,  5, 12, 38, 58, 10])
0


## **3. Create datasets**

In [None]:
class QADataset(Dataset):
  """Dataset over a list of QA records.

  Each record is indexed with the keys 'question', 'context' and 'answer'.
  Indexing the dataset returns
  ``(input_question_ids, input_context_ids, answer_id)``: the two tensors
  produced by ``vectorize`` plus the answer's class index (looked up in
  ``classes_to_idx``) as a ``torch.long`` tensor.
  """

  def __init__(self,data):
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self,idx):
    record = self.data[idx]

    question_ids, context_ids = vectorize(record['question'], record['context'])
    # The label is the class index of the answer string
    label = torch.tensor(classes_to_idx[record['answer']], dtype = torch.long)

    return question_ids, context_ids, label


In [None]:
# Decode
# Define decode function: convert id to token
def decode(input_ids):
  return ' '.join([vocab.lookup_token(token) for token in input_ids])

In [None]:
# Wrap the toy dataset and serve it in shuffled mini-batches of 2 records.
# NOTE(review): no random seed is set in the visible cells, so the batch order
# (and therefore the recorded outputs) will differ between runs.
train_dataset = QADataset(qa_dataset)
train_loader = DataLoader(train_dataset, batch_size = 2, shuffle=True)

In [None]:
# Peek at one batch: question ids, context ids and answer class ids
next(iter(train_loader))

[tensor([[ 9,  6,  7, 11, 26,  8,  1,  1,  1,  1],
         [ 9, 12,  5, 22,  8,  1,  1,  1,  1,  1]]),
 tensor([[ 5, 44, 50, 15,  7, 11, 28,  6, 57, 56],
         [ 5, 12, 22, 34, 53, 29, 23, 55, 20, 54]]),
 tensor([4, 5])]

In [None]:
# Walk through one full pass over the loader and print every batch.
for batch in train_loader:
  # Each batch unpacks in the order returned by QADataset.__getitem__
  input_question_ids, input_context_ids, answer_id = batch
  print(input_question_ids, input_context_ids, answer_id)

tensor([[ 9,  6,  7, 11, 16,  8,  1,  1,  1,  1],
        [ 9, 12,  5, 22,  8,  1,  1,  1,  1,  1]]) tensor([[ 7, 11, 16,  6, 24, 42, 14, 47, 32, 40],
        [ 5, 12, 22, 34, 53, 29, 23, 55, 20, 54]]) tensor([1, 5])
tensor([[ 9,  6,  7, 13,  8,  1,  1,  1,  1,  1],
        [60, 35,  5, 18, 19,  8,  1,  1,  1,  1]]) tensor([[ 7, 13,  6, 27, 15,  5, 12, 38, 58, 10],
        [ 5, 59, 30, 17, 51, 25, 31, 19,  5, 18]]) tensor([0, 2])
tensor([[ 9,  6,  7, 11, 26,  8,  1,  1,  1,  1],
        [ 9,  6, 23, 13, 20,  7, 21,  8,  1,  1]]) tensor([[ 5, 44, 50, 15,  7, 11, 28,  6, 57, 56],
        [ 5, 41, 14, 21, 36, 48, 46, 61, 45, 24]]) tensor([4, 3])


## **4. Create models**

In [None]:
# Bi-LSTM classifier over the concatenated question + context embeddings
class QAModel(nn.Module):
  """Bidirectional-LSTM model that classifies a (question, context) pair.

  The question and the context are embedded with separate embedding tables,
  concatenated along the sequence axis, encoded by a multi-layer
  bidirectional LSTM, and mapped to ``n_classes`` logits by a linear layer.

  Args:
    vocab_size: number of rows of both embedding tables.
    embedding_dim: size of each embedding vector (LSTM input size).
    hidden_size: LSTM hidden size per direction.
    n_layers: number of stacked LSTM layers.
    n_classes: number of output logits.
  """
  def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers, n_classes ):
    super(QAModel, self).__init__()
    # Separate embedding tables for question tokens and context tokens
    self.question_embedding = nn.Embedding(vocab_size, embedding_dim)
    self.context_embedding  = nn.Embedding(vocab_size, embedding_dim)

    # Bidirectional LSTM run over the concatenated question + context sequence
    self.lstm = nn.LSTM(
        embedding_dim, hidden_size,
        num_layers    = n_layers,
        batch_first   = True,
        bidirectional = True
    )

    # Two directions -> the classifier input has hidden_size * 2 features
    self.fc = nn.Linear(hidden_size * 2, n_classes)

  def forward(self, question, context):
    """Return logits of shape (batch, n_classes).

    Args:
      question: integer tensor of token ids, shape (batch, question_len).
      context: integer tensor of token ids, shape (batch, context_len).
    """
    question_embed = self.question_embedding(question)
    context_embed = self.context_embedding(context)

    # Concatenate along the sequence axis: (batch, q_len + c_len, embedding_dim)
    combined = torch.cat (
        (question_embed, context_embed),
        dim = 1
    )

    # h_n holds the final hidden state of every layer and direction.
    _, (h_n, _) = self.lstm(combined)

    # Use the final states of the top layer: h_n[-2] is the forward direction
    # (has read the whole sequence left-to-right) and h_n[-1] is the backward
    # direction (has read it right-to-left). Taking lstm_out[:, -1, :] instead
    # would pick the backward output after it has seen only the last token.
    final_state = torch.cat((h_n[-2], h_n[-1]), dim = 1)

    out = self.fc(final_state)

    return out




In [None]:
# Model hyper-parameters
EMBEDDING_DIM = 64
HIDDEN_SIZE = 128
VOCAB_SIZE = len(vocab)  # one embedding row per vocabulary entry
N_LAYERS = 2
N_CLASSES = len(classes)  # one output logit per distinct answer

model = QAModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, N_LAYERS, N_CLASSES)

# Smoke test with a batch of one random sequence per input (token ids in [0, 10))
input_context = torch.randint(0, 10, size=(1, MAX_CONTEXT_LEN))
input_question = torch.randint(0, 10, size=(1, MAX_QUESTION_LEN))

# Forward pass only: evaluation mode, no gradient tracking
model.eval()
with torch.no_grad():
  logits = model(input_question, input_context)

# Expected: one row of N_CLASSES logits
print(logits.shape)

torch.Size([1, 6])


In [None]:
# TODO: re-check the LSTM part (embedding dim vs. hidden size)

## **5. Training models**

In [None]:
# Training hyper-parameters
LR = 1e-3
EPOCHS = 20
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# Multi-class classification over the answer classes
criterion = nn.CrossEntropyLoss()

model.train()
# A named loop variable is used instead of `_`, which IPython reserves for the
# last cell output; the unused batch index from enumerate() is dropped.
for epoch in range(EPOCHS):
  for input_question_ids, input_context_ids, answer_id in train_loader:
    optimizer.zero_grad()
    outputs = model(input_question_ids, input_context_ids)
    loss = criterion(outputs, answer_id)
    loss.backward()
    optimizer.step()
    # One line per batch: the loss of the batch that was just processed
    print(loss.item())

1.8037892580032349
1.8395943641662598
1.7732521295547485
1.6463603973388672
1.7430801391601562
1.6934337615966797
1.633399486541748
1.670586109161377
1.490000605583191
1.4819023609161377
1.5229260921478271
1.4440586566925049
1.2778379917144775
1.3840556144714355
1.3430676460266113
1.093340516090393
1.230642318725586
1.090537428855896
1.0080044269561768
1.0771441459655762
0.6168181896209717
0.8835021257400513
0.4259416162967682
0.6235527992248535
0.6059258580207825
0.45278531312942505
0.19909530878067017
0.2641212046146393
0.2421870231628418
0.19094818830490112
0.1267157942056656
0.138309508562088
0.08771412819623947
0.052369602024555206
0.08635278046131134
0.03566078469157219
0.02458029054105282
0.02104884944856167
0.03563749045133591
0.015992309898138046
0.022911496460437775
0.011570308357477188
0.01159745454788208
0.00997006893157959
0.008980734273791313
0.00824988167732954
0.006879919208586216
0.006042861379683018
0.005948035977780819
0.005709173157811165
0.0038714581169188023
0.004

## **6. Test**

In [None]:
# Run the trained model on one record and print its predicted answer.
model.eval()
with torch.no_grad():
  sample = qa_dataset[1]
  # Read the fields by key instead of unpacking sample.values(), which would
  # silently swap them if a record listed its keys in a different order.
  context = sample['context']
  question = sample['question']

  question_ids, context_ids = vectorize(question, context)
  # Add a leading batch axis of size 1
  question_ids = question_ids.unsqueeze(0)
  context_ids = context_ids.unsqueeze(0)

  outputs = model(question_ids, context_ids)
  # Index of the highest logit = predicted class index
  predicted = outputs.argmax(dim = 1)

  print(f'Context: {context}')
  print(f'Question: {question}')
  print(f'Prediction: {idx_to_classes[predicted.item()]}')

Context: I love painting and my favorite artist is Vicent Van Gogh.
Question: What is my favorite activity?
Prediction: painting
