# Question word prediction

> Group 12: Tristan Perrot & Romain Darous

Task is to train and evaluate a QWP model using any available QA-corpus, for instance, the [SQuAD corpus](https://rajpurkar.github.io/SQuAD-explorer/).


In [56]:
import json
import math
import os

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import requests
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm.notebook import tqdm, trange

In [57]:
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(i).name)

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
device

NVIDIA H100 80GB HBM3 MIG 1g.10gb


device(type='cuda')

## Data


In [58]:
data_dir = 'data'

In [59]:
if data_dir not in os.listdir():
    os.mkdir(data_dir)

if "squad_train.json" not in os.listdir(data_dir):
    # Download data at https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
    res = requests.get("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
    data = json.loads(res.text)

    # Save data to file
    with open(data_dir + "/squad_train.json", "w") as f:
        json.dump(data, f)

with open(data_dir + "/squad_train.json", "r") as f:
    data = json.load(f)

# Extract answer text and question text
answers = []
questions = []
for article in data["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            if qa["is_impossible"]:
                continue
            answers.append(qa["answers"][0]["text"])
            questions.append(qa["question"])

print("Number of questions:", len(questions))

# Print some examples
for i in range(5):
    print()
    print("Question:", questions[i])
    print("Answer:", answers[i])

Number of questions: 86821

Question: When did Beyonce start becoming popular?
Answer: in the late 1990s

Question: What areas did Beyonce compete in when she was growing up?
Answer: singing and dancing

Question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 2003

Question: In what city and state did Beyonce  grow up? 
Answer: Houston, Texas

Question: In which decade did Beyonce become famous?
Answer: late 1990s


In [60]:
# Tokenize questions
tokenized_questions = [nltk.word_tokenize(q) for q in questions]

# Tokenize answers
tokenized_answers = [nltk.word_tokenize(a) for a in answers]

for i in range(5):
    print()
    print("Question:", tokenized_questions[i])
    print("Answer:", tokenized_answers[i])


Question: ['When', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?']
Answer: ['in', 'the', 'late', '1990s']

Question: ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?']
Answer: ['singing', 'and', 'dancing']

Question: ['When', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?']
Answer: ['2003']

Question: ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?']
Answer: ['Houston', ',', 'Texas']

Question: ['In', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous', '?']
Answer: ['late', '1990s']


In [61]:
# Merge questions and answers
merged = [q + a for q, a in zip(tokenized_questions, tokenized_answers)]

for i in range(5):
    print()
    print("Merged:", merged[i])


Merged: ['When', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']

Merged: ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']

Merged: ['When', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']

Merged: ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']

Merged: ['In', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous', '?', 'late', '1990s']


In [62]:
# Create vocabulary
vocab = set()
for m in merged:
    vocab.update(m)

vocab = list(vocab)

# Add "<qw>" to vocabulary
vocab.append("<qw>")
vocab.append("<pad>")

# Create word to index and index to word mappings
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}

In [63]:
words_to_hide = 1

hidden_merged = []

for m in merged:
    hidden = m.copy()
    hidden[:words_to_hide] = ["<qw>"] * words_to_hide
    hidden_merged.append(hidden)

for i in range(5):
    print()
    print("Merged:", merged[i])
    print("Hidden:", hidden_merged[i])


Merged: ['When', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']
Hidden: ['<qw>', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']

Merged: ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']
Hidden: ['<qw>', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']

Merged: ['When', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']
Hidden: ['<qw>', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']

Merged: ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']
Hidden: ['<qw>', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']

Merged: ['In', 'which', 'decade', 'did', 'Beyonce', 'beco

In [64]:
x_words = [[word for word in hidden] for hidden in hidden_merged]
y_words = [word[:words_to_hide] for word in merged]

for i in range(5):
    print()
    print("X_words:", x_words[i])
    print("Y_words:", y_words[i])


X_words: ['<qw>', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']
Y_words: ['When']

X_words: ['<qw>', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']
Y_words: ['What']

X_words: ['<qw>', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']
Y_words: ['When']

X_words: ['<qw>', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']
Y_words: ['In']

X_words: ['<qw>', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous', '?', 'late', '1990s']
Y_words: ['In']


## Training

We will now our model to predict the first or the two first words of the question given the rest of the question and the answer. We will use a transformer model to do so.


### LSTM model

In [65]:
# Convert words to indices
def words_to_indices(words):
    return [word_to_idx[word] for word in words]

x = [words_to_indices(words) for words in x_words]
y = [words_to_indices(words) for words in y_words]

for i in range(5):
    print()
    print("X:", x[i])
    print("Y:", y[i])


X: [67298, 8984, 29517, 28632, 22220, 19157, 40097, 26292, 53678, 49363, 51350]
Y: [18804]

X: [67298, 19074, 8984, 29517, 57998, 26292, 50081, 36999, 40015, 17179, 11189, 40097, 53805, 56733, 41215]
Y: [39356]

X: [67298, 8984, 29517, 47630, 39684, 38256, 41866, 56733, 44054, 203, 18845, 61306, 40097, 30973]
Y: [18804]

X: [67298, 40043, 43611, 56733, 50593, 8984, 29517, 49550, 11189, 40097, 32581, 54593, 1101]
Y: [43311]

X: [67298, 59092, 35236, 8984, 29517, 44054, 23312, 40097, 49363, 51350]
Y: [43311]


In [66]:
# Pad sequences
max_len = max([len(words) for words in x])

for i in range(len(x)):
    x[i] += [word_to_idx["<pad>"]] * (max_len - len(x[i]))
    y[i] += [word_to_idx["<pad>"]] * (max_len - len(y[i]))

for i in range(5):
    print()
    print("X:", x[i])
    print("Y:", y[i])


X: [67298, 8984, 29517, 28632, 22220, 19157, 40097, 26292, 53678, 49363, 51350, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299]
Y: [18804, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299]

X: [67298, 19074, 8984, 29517, 57998, 26292, 50081, 36999, 40015, 17179, 11189, 40097, 53805, 56733, 41215, 6

In [67]:
# Reverse the order of the input sequence
x = [x_i[::-1] for x_i in x]
y = [y_i[::-1] for y_i in y]

In [68]:
# Convert to tensors
x = torch.tensor(x).to(device)
y = torch.tensor(y).to(device)

In [69]:
# Train val test split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
x_val, x_test, y_val, y_test = train_test_split(x_val, y_val, test_size=0.5)

print("Train size:", len(x_train))
print("Val size:", len(x_val))
print("Test size:", len(x_test))

# Create dataset
class WordDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    
train_dataset = WordDataset(x_train, y_train)
val_dataset = WordDataset(x_val, y_val)
test_dataset = WordDataset(x_test, y_test)

# Create dataloaders
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define model
class WordPredictor(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(WordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x
    
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 100

model = WordPredictor(vocab_size, embedding_dim, hidden_dim).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Train model
num_epochs = 10

for epoch in trange(num_epochs):
    model.train()
    train_loss = 0
    for x_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_pred = model(x_batch)
        loss = criterion(y_pred.view(-1, vocab_size), y_batch.view(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x_batch, y_batch in tqdm(val_loader):
            y_pred = model(x_batch)
            loss = criterion(y_pred.view(-1, vocab_size), y_batch.view(-1))
            val_loss += loss.item()
        val_loss /= len(val_loader)
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Evaluate model
model.eval()
test_loss = 0
with torch.no_grad():
    for x_batch, y_batch in tqdm(test_loader):
        y_pred = model(x_batch)
        loss = criterion(y_pred.view(-1, vocab_size), y_batch.view(-1))
        test_loss += loss.item()
    test_loss /= len(test_loader)

print(f"Test Loss: {test_loss:.4f}")

# Predict
def predict(model, words):
    x = torch.tensor(words_to_indices(words)).unsqueeze(0).to(device)
    y_pred = model(x)
    y_pred = torch.argmax(y_pred, dim=2).squeeze(0).cpu().numpy()
    return [idx_to_word[idx] for idx in y_pred]

for i in range(5):
    print()
    print("X:", x_words[i])
    print("Y:", y_words[i])
    print("Pred:", predict(model, x_words[i]))

Train size: 69456
Val size: 8682
Test size: 8683


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch 1/10, Train Loss: 0.1732, Val Loss: 0.0404


  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch 3/10, Train Loss: 0.0296, Val Loss: 0.0301


  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch 4/10, Train Loss: 0.0263, Val Loss: 0.0283


  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch 5/10, Train Loss: 0.0237, Val Loss: 0.0268


  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch 6/10, Train Loss: 0.0214, Val Loss: 0.0264


  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch 7/10, Train Loss: 0.0193, Val Loss: 0.0252


  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch 8/10, Train Loss: 0.0176, Val Loss: 0.0253


  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch 9/10, Train Loss: 0.0160, Val Loss: 0.0258


  0%|          | 0/2171 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

Epoch 10/10, Train Loss: 0.0146, Val Loss: 0.0260


  0%|          | 0/272 [00:00<?, ?it/s]

Test Loss: 0.0255

X: ['<qw>', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']
Y: ['When']
Pred: ['How', '<pad>', 'Reaction', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']

X: ['<qw>', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']
Y: ['What']
Pred: ['How', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']

X: ['<qw>', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']
Y: ['When']
Pred: ['How', '<pad>', 'Reaction', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']

X: ['<qw>', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']
Y: ['In']
Pred: ['How', 'Other', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>

## Evaluation
