# **QA-LSTM**
As an alternative to running the scripts on the command line, this notebook shows how to train and evaluate a baseline LSTM model for non-factoid question answering.

In [0]:
# Fetch the FinBERT-QA repository; presumably it ships the `src` package and
# the `data/` files that the rest of this notebook reads via relative paths.
!git clone https://github.com/yuanbit/FinBERT-QA

# Work from the repository root so `src.*` imports and `data/...` paths resolve.
%cd FinBERT-QA/
# NOTE(review): the star imports presumably supply load_pickle,
# get_trained_model and evaluate used further down — confirm and consider
# importing those names explicitly.
from src.utils import *
from src.evaluate import *

In [2]:
import os
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torchtext
from tqdm import tqdm

# Select the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # torch.cuda.memory_cached is the deprecated name of memory_reserved;
    # prefer the new name and fall back only on PyTorch releases that lack it.
    _reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    print('Cached:   ', round(_reserved_bytes(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seeds manually for reproducibility.
# Python's `random` picks the positive answer for each question when the
# training pairs are built, so it has to be seeded as well as torch; numpy is
# seeded too so any numpy-based randomness is repeatable.
random.seed(1234)
np.random.seed(1234)
# Kept as the last expression so the cell still displays the torch Generator
torch.manual_seed(1234)

Using device: cuda

Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


<torch._C.Generator at 0x7f8e95b3ee50>

## **Configure hyperparameters and load data**

In [0]:
# Hyperparameters shared by data preparation, the model and the training loop
config = dict(
    max_seq_len=128,    # sequences are padded / truncated to this many token ids
    batch_size=64,      # DataLoader batch size
    n_epochs=3,         # passes over the training DataLoader
    lr=1e-3,            # Adam learning rate
    emb_dim=100,        # GloVe vector size, also the LSTM input size
    hidden_size=256,    # LSTM hidden size (per direction)
    dropout=0.2,        # dropout rate applied to the pooled representations
    margin=0.2,         # hinge loss margin
)

In [4]:
# Question splits. Each element is a list of the form
# [qid, list of positive docids, list of candidate docids]
train_set = load_pickle('data/data_pickle/train_set_50.pickle')
valid_set = load_pickle('data/data_pickle/valid_set_50.pickle')
test_set = load_pickle('data/data_pickle/test_set_50.pickle')

# Labels, handed to evaluate() in the evaluation section
labels = load_pickle('data/data_pickle/labels.pickle')

# Tokenizer lookups:
#   vocab                   - token -> id
#   qid_to_tokenized_text   - qid   -> tokenized question text
#   docid_to_tokenized_text - docid -> tokenized answer text
vocab = load_pickle("data/qa_lstm_tokenizer/word2index.pickle")
qid_to_tokenized_text = load_pickle('data/qa_lstm_tokenizer/qid_to_tokenized_text.pickle')
docid_to_tokenized_text = load_pickle('data/qa_lstm_tokenizer/docid_to_tokenized_text.pickle')

# One line per split
print(
    "Number of questions in the training set: {}".format(len(train_set)),
    "Number of questions in the validation set: {}".format(len(valid_set)),
    "Number of questions in the test set: {}".format(len(test_set)),
    sep="\n",
)

Number of questions in the training set: 5676
Number of questions in the validation set: 631
Number of questions in the test set: 333


# **Prepare data**
Create the required inputs and wrap them in DataLoader objects so that the model can be trained in batches.

**Required training inputs:**
1. question input ids
2. positive answer input ids
3. negative answer input ids

In [0]:
def pad_seq(seq_idx, max_seq_len):
    """Pads or truncates a sequence of token ids to a fixed length.

    The input list is never modified; a new list is always returned.

    Returns:
        seq: New list holding exactly max_seq_len token ids
    ----------
    Arguments:
        seq_idx: List of token ids
        max_seq_len: int
    """
    # Truncate first; slicing also gives us a copy, so the padding below
    # cannot leak into the caller's list.
    seq = list(seq_idx[:max_seq_len])
    # Pad each sequence to be the same length to process in batches
    # pad_token = 0
    seq += [0] * (max_seq_len - len(seq))

    return seq

def vectorize(seq, max_seq_len):
    """Maps tokens to vocabulary ids and fits the result to a fixed length.

    Returns:
        vectorized_seq: List of max_seq_len token ids (padded or truncated)
    ----------
    Arguments:
        seq: List of tokens in a sequence
        max_seq_len: int
    """
    # Look every token up in the global vocab (token -> id)
    token_ids = []
    for token in seq:
        token_ids.append(vocab[token])

    # Bring the ids to the common length
    return pad_seq(token_ids, max_seq_len)

In [0]:
def get_lstm_input_data(dataset, max_seq_len):
    """Creates input data for model.

    For every question one positive answer is sampled at random and paired
    with each negative candidate, yielding one (question, positive, negative)
    triple per negative candidate.

    Returns:
        q_input_ids: List of lists of vectorized question sequence
        pos_input_ids: List of lists of vectorized positive ans sequence
        neg_input_ids: List of lists of vectorized negative ans sequence
    ----------
    Arguments:
        dataset: List of lists in the form of [qid, [pos ans], [ans cands]]
        max_seq_len: int
    """
    q_input_ids = []
    pos_input_ids = []
    neg_input_ids = []

    for seq in tqdm(dataset):
        qid, ans_labels, cands = seq[0], seq[1], seq[2]

        # Remove the positive answers from the candidates. dict.fromkeys
        # drops duplicates while keeping the candidates' original order, so
        # the generated triples come out in a reproducible order.
        positives = set(ans_labels)
        filtered_cands = [c for c in dict.fromkeys(cands) if c not in positives]
        # Select a positive answer from the list of positive answers
        pos_docid = random.choice(ans_labels)
        # Map question id to text, then pad and vectorize it
        q_input_id = vectorize(qid_to_tokenized_text[qid], max_seq_len)

        # Nothing to pair the positive answer with
        if not filtered_cands:
            continue

        # The positive answer is the same for every negative of this
        # question, so vectorize it once instead of once per negative.
        pos_input_id = vectorize(docid_to_tokenized_text[pos_docid], max_seq_len)

        # For all the negative answers
        for neg_docid in filtered_cands:
            # Map the docid to text, then pad and vectorize it
            neg_input_id = vectorize(docid_to_tokenized_text[neg_docid], max_seq_len)

            q_input_ids.append(q_input_id)
            pos_input_ids.append(pos_input_id)
            neg_input_ids.append(neg_input_id)

    return q_input_ids, pos_input_ids, neg_input_ids

In [0]:
def get_dataloader(dataset, type, max_seq_len, batch_size):
    """Builds a DataLoader over (question, positive answer, negative answer)
    vectorized inputs.

    Returns:
        dataloader: DataLoader object, shuffled when type is 'train' and
                    sequential otherwise
    ----------
    Arguments:
        dataset: List of lists in the form of [qid, [pos ans], [ans cands]]
        type: str - 'train' or 'validation'
        max_seq_len: int
        batch_size: int
    """
    q_ids, pos_ids, neg_ids = get_lstm_input_data(dataset, max_seq_len)

    # One tensor per input, bundled so that row i of each belongs together
    triples = TensorDataset(
        *(torch.tensor(ids) for ids in (q_ids, pos_ids, neg_ids))
    )

    # Only the training data is visited in random order
    sampler_cls = RandomSampler if type == "train" else SequentialSampler

    return DataLoader(triples, sampler=sampler_cls(triples), batch_size=batch_size)

In [8]:
# Build one DataLoader per split: shuffled for training, sequential for validation
train_dataloader = get_dataloader(
    dataset=train_set,
    type='train',
    max_seq_len=config['max_seq_len'],
    batch_size=config['batch_size'],
)
validation_dataloader = get_dataloader(
    dataset=valid_set,
    type='validation',
    max_seq_len=config['max_seq_len'],
    batch_size=config['batch_size'],
)

# len() of a DataLoader is its number of batches
print(
    "\n\nSize of the training DataLoader: {}".format(len(train_dataloader)),
    "Size of the validation DataLoader: {}".format(len(validation_dataloader)),
    sep="\n",
)

100%|██████████| 5676/5676 [00:13<00:00, 414.70it/s]
100%|██████████| 631/631 [00:01<00:00, 472.74it/s]




Size of the training DataLoader: 4342
Size of the validation DataLoader: 483


# **Model**
Uses pre-trained GloVe embeddings and a biLSTM network with the cosine similarity measure.

In [0]:
class QA_LSTM(nn.Module):
    """
    QA-LSTM model: a single biLSTM encodes both the question and the answer,
    and the two max-pooled encodings are compared with cosine similarity.
    """
    def __init__(self, config):
        super().__init__()
        # Sizes taken from the config / global vocabulary
        self.emb_dim = config['emb_dim']
        self.hidden_size = config['hidden_size']
        dropout_rate = config['dropout']
        self.vocab_size = len(vocab)
        # Embedding layer initialised from GloVe
        self.embedding = self.create_emb_layer()
        # One biLSTM shared by the question and answer representations
        self.lstm = nn.LSTM(
            self.emb_dim,
            self.hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        # Cosine similarity along the feature dimension
        self.sim = nn.CosineSimilarity(dim=1)
        # Dropout applied to the pooled representations
        self.dropout = nn.Dropout(dropout_rate)

    def create_emb_layer(self):
        """Creates the embedding layer from pre-trained GloVe embeddings
        (6B tokens).

        Returns:
            emb_layer: Torch embedding layer
        """
        print("\n\nInitializing model...")
        print("\nDownloading pre-trained GloVe embeddings...\n")
        # GloVe vectors provided by torchtext
        glove = torchtext.vocab.GloVe("6B", dim=self.emb_dim)
        # One row per vocabulary id; rows of tokens missing from GloVe stay zero
        weights = np.zeros((self.vocab_size, self.emb_dim))
        n_found = 0

        for token, idx in vocab.items():
            # glove.stoi is the token -> index mapping of the GloVe vocabulary
            if token not in glove.stoi:
                continue
            weights[idx] = glove[token]
            n_found += 1

        print("\n")
        print(n_found, "words are found in GloVe")

        # numpy matrix -> float32 tensor
        weights = torch.from_numpy(weights).float()

        # Build the layer with the matrix' shape, then load the weights into it
        emb_layer = nn.Embedding(*weights.shape)
        emb_layer.load_state_dict({'weight': weights})

        return emb_layer

    def _encode(self, token_ids):
        """Encodes a batch of id sequences into one vector per sequence."""
        # (batch_size, max_seq_len, emb_dim)
        embedded = self.embedding(token_ids)
        # (batch_size, max_seq_len, 2*hidden_size); final states are not needed
        lstm_out, _ = self.lstm(embedded)
        # Max over the sequence dimension -> (batch_size, 2*hidden_size)
        pooled = lstm_out.max(dim=1)[0]
        return self.dropout(pooled)

    def forward(self, question, answer):
        """Encodes the question and the answer independently with the shared
        biLSTM and scores the pair with cosine similarity.

        Returns:
            similarity: Torch tensor with cosine similarity score.
        ----------
        Arguments:
            question: Torch tensor of vectorized question
            answer: Torch tensor of vectorized answer
        """
        question_output = self._encode(question)
        answer_output = self._encode(answer)

        # Similarity - (batch_size,)
        return self.sim(question_output, answer_output)

In [10]:
# Build the network (this prints progress and loads GloVe), then move it to
# the selected device
model = QA_LSTM(config)
model = model.to(device)



Initializing model...

Downloading pre-trained GloVe embeddings...



50456 words are found in GloVe


# **Training/Validation methods**

In [0]:
def hinge_loss(pos_sim, neg_sim, margin=None):
    """Element-wise hinge loss max(0, margin - pos_sim + neg_sim).

    Returns:
        loss: Tensor with hinge loss value
    ----------
    Arguments:
        pos_sim: Tensor with similarity of a question and a positive answer
        neg_sim: Tensor with similarity of a question and a negative answer
        margin: Optional float; defaults to config['margin'] when omitted
    """
    if margin is None:
        margin = config['margin']

    # clamp computes max(0, x) on whatever device the inputs live on, so no
    # zero tensor has to be created and moved to the GPU on every call.
    loss = torch.clamp(margin - pos_sim + neg_sim, min=0)
    return loss

In [0]:
def train(model, train_dataloader, optimizer):
    """Runs one training epoch and returns the mean batch loss.

    Returns:
        avg_loss: Float
    ----------
    Arguments:
        model: Torch model
        train_dataloader: DataLoader object
        optimizer: Optimizer object
    """
    # Training mode (enables dropout)
    model.train()
    # Sum of the per-batch losses
    running_loss = 0.0

    for batch in tqdm(train_dataloader):
        # Each batch holds three tensors: question, positive, negative ids
        question, pos_ans, neg_ans = (batch[k].to(device) for k in range(3))

        # Clear the gradients left over from the previous step
        model.zero_grad()
        # Score the positive pair, then the negative pair, and average the
        # per-example hinge loss
        batch_loss = hinge_loss(model(question, pos_ans),
                                model(question, neg_ans)).mean()
        # Backpropagate and update the parameters
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # Average over the number of batches
    return running_loss / len(train_dataloader)

In [0]:
def validate(model, validation_dataloader):
    """Evaluates the model on the validation data and returns the mean
    batch loss.

    Returns:
        avg_loss: Float
    ----------
    Arguments:
        model: Torch model
        validation_dataloader: DataLoader object
    """
    # Evaluation mode (disables dropout)
    model.eval()
    # Sum of the per-batch losses
    running_loss = 0.0

    # No gradients are needed anywhere in this loop
    with torch.no_grad():
        for batch in tqdm(validation_dataloader):
            # Move the three input tensors to the device and unpack them
            question, pos_ans, neg_ans = (t.to(device) for t in batch)
            # Score the positive pair, then the negative pair
            pos_sim = model(question, pos_ans)
            neg_sim = model(question, neg_ans)
            # Mean hinge loss of the batch
            running_loss += hinge_loss(pos_sim, neg_sim).mean().item()

    # Average over the number of batches
    return running_loss / len(validation_dataloader)

# **Train model**

In [0]:
# -p keeps the cell re-runnable: no error when the directory already exists
!mkdir -p model

In [15]:
# Adam over all model parameters
optimizer = optim.Adam(model.parameters(), lr=config['lr'])

# Lowest validation loss seen so far
best_valid_loss = float('inf')

for epoch in range(config['n_epochs']):
    # One pass over the training data, then one over the validation data
    train_loss = train(model, train_dataloader, optimizer)
    valid_loss = validate(model, validation_dataloader)

    # Checkpoint only the epochs that improve on the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model/'+str(epoch+1)+'_qa_lstm.pt')

    # Per-epoch report
    print(
        "\n\n Epoch {}:".format(epoch+1),
        "\t Train Loss: {0:.3f}".format(train_loss),
        "\t Validation Loss: {0:.3f}\n".format(valid_loss),
        sep="\n",
    )

100%|██████████| 4342/4342 [05:48<00:00, 12.44it/s]
100%|██████████| 483/483 [00:15<00:00, 31.28it/s]
  0%|          | 2/4342 [00:00<06:31, 11.09it/s]



 Epoch 1:
	 Train Loss: 0.073
	 Validation Loss: 0.269



100%|██████████| 4342/4342 [05:47<00:00, 12.48it/s]
100%|██████████| 483/483 [00:15<00:00, 31.27it/s]
  0%|          | 2/4342 [00:00<06:27, 11.19it/s]



 Epoch 2:
	 Train Loss: 0.022
	 Validation Loss: 0.267



100%|██████████| 4342/4342 [05:48<00:00, 12.46it/s]
100%|██████████| 483/483 [00:15<00:00, 31.41it/s]



 Epoch 3:
	 Train Loss: 0.012
	 Validation Loss: 0.258






# **Evaluate**

In [0]:
def get_rank(model, test_set, max_seq_len):
    """Re-ranks the answer candidates per question using trained model.

    All candidates of a question are scored in a single batched forward pass
    with gradient tracking disabled.

    Returns:
        qid_pred_rank: Dictionary
                key - qid
                value - Array of candidate docids, re-ranked by descending
                        similarity score
    -------------------
    Arguments:
        model - Trained PyTorch model
        test_set - List of lists
                Each element is a list containing
                [qid, list of pos docid, list of candidate docid]
        max_seq_len - int
    """
    # Dictionary - key: qid, value: ranked array of docids
    qid_pred_rank = {}
    # Set model to evaluation mode
    model.eval()
    # Inference only: skip building autograd graphs
    with torch.no_grad():
        # For each sample in the test set
        for seq in tqdm(test_set):
            # Extract input data (the positive answers are not needed here)
            ques, cands = seq[0], seq[2]
            cands_id = np.array(cands)

            # Nothing to rank; an empty batch cannot be fed to the model
            if len(cands) == 0:
                qid_pred_rank[ques] = cands_id
                continue

            # Vectorize the question and every candidate answer
            q_ids = vectorize(qid_to_tokenized_text[ques], max_seq_len)
            a_ids = [vectorize(docid_to_tokenized_text[c], max_seq_len)
                     for c in cands]

            # Repeat the question once per candidate so each row is a QA pair
            q_batch = torch.tensor([q_ids] * len(a_ids)).to(device)
            a_batch = torch.tensor(a_ids).to(device)
            # One similarity score per candidate
            scores = model(q_batch, a_batch).tolist()

            # Get the indices of the sorted (descending) similarity scores
            sorted_index = np.argsort(scores)[::-1]
            # Set the qid key to the re-ranked candidate docids
            qid_pred_rank[ques] = cands_id[sorted_index]

    return qid_pred_rank

In [0]:
# Download the trained QA-LSTM model.
# NOTE(review): get_trained_model is presumably provided by the star imports
# at the top of the notebook; its return value is appended to
# "model/trained/qa-lstm/" in the next cell, so it looks like a checkpoint
# file name — confirm.
checkpoint = get_trained_model('qa-lstm')

In [23]:
# Rank cut-off passed to evaluate() and shown in the report
k = 10

trained_model_path = "model/trained/qa-lstm/" + checkpoint
# Load model
# map_location remaps the stored tensors to the active device, so a
# checkpoint saved on a GPU also loads on a CPU-only runtime.
# NOTE(review): strict=False ignores missing/unexpected keys, which can hide
# a checkpoint that does not match this architecture — confirm it is needed.
model.load_state_dict(torch.load(trained_model_path, map_location=device),
                      strict=False)

# Get rank
qid_pred_rank = get_rank(model, test_set, config['max_seq_len'])

num_q = len(test_set)

# Evaluate
MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, labels, k)

print("\n\nAverage nDCG@{0} for {1} queries: {2:.3f}".format(k, num_q, average_ndcg))
print("MRR@{0} for {1} queries: {2:.3f}".format(k, num_q, MRR))
print("Average Precision@1 for {0} queries: {1:.3f}".format(num_q, precision))

100%|██████████| 333/333 [01:49<00:00,  3.04it/s]



Average nDCG@10 for 333 queries: 0.115
MRR@10 for 333 queries: 0.073
Average Precision@1 for 333 queries: 0.033



