# Question word prediction

> Group 12: Tristan Perrot & Romain Darous

Task is to train and evaluate a QWP model using any available QA-corpus, for instance, the [SQuAD corpus](https://rajpurkar.github.io/SQuAD-explorer/).


In [1]:
import json
import math
import os

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import requests
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm.notebook import tqdm, trange



In [2]:
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(i).name)

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
device

NVIDIA H100 80GB HBM3 MIG 1g.10gb


device(type='cuda')

## Data


In [3]:
data_dir = 'data'

In [4]:
if data_dir not in os.listdir():
    os.mkdir(data_dir)

if "squad_train.json" not in os.listdir(data_dir):
    # Download data at https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
    res = requests.get("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
    data = json.loads(res.text)

    # Save data to file
    with open(data_dir + "/squad_train.json", "w") as f:
        json.dump(data, f)

with open(data_dir + "/squad_train.json", "r") as f:
    data = json.load(f)

# Extract answer text and question text
answers = []
questions = []
for article in data["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            if qa["is_impossible"]:
                continue
            answers.append(qa["answers"][0]["text"])
            questions.append(qa["question"])

print("Number of questions:", len(questions))

# Print some examples
for i in range(5):
    print()
    print("Question:", questions[i])
    print("Answer:", answers[i])

Number of questions: 86821

Question: When did Beyonce start becoming popular?
Answer: in the late 1990s

Question: What areas did Beyonce compete in when she was growing up?
Answer: singing and dancing

Question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 2003

Question: In what city and state did Beyonce  grow up? 
Answer: Houston, Texas

Question: In which decade did Beyonce become famous?
Answer: late 1990s


In [5]:
# Tokenize questions
tokenized_questions = [nltk.word_tokenize(q) for q in questions]

# Tokenize answers
tokenized_answers = [nltk.word_tokenize(a) for a in answers]

for i in range(5):
    print()
    print("Question:", tokenized_questions[i])
    print("Answer:", tokenized_answers[i])


Question: ['When', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?']
Answer: ['in', 'the', 'late', '1990s']

Question: ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?']
Answer: ['singing', 'and', 'dancing']

Question: ['When', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?']
Answer: ['2003']

Question: ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?']
Answer: ['Houston', ',', 'Texas']

Question: ['In', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous', '?']
Answer: ['late', '1990s']


In [6]:
# Merge questions and answers
merged = [q + a for q, a in zip(tokenized_questions, tokenized_answers)]

for i in range(5):
    print()
    print("Merged:", merged[i])


Merged: ['When', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']

Merged: ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']

Merged: ['When', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']

Merged: ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']

Merged: ['In', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous', '?', 'late', '1990s']


In [7]:
# Create vocabulary
vocab = set()
for m in merged:
    vocab.update(m)

vocab = list(vocab)

# Add "<qw>" to vocabulary
vocab.append("<qw>")
vocab.append("<pad>")

# Create word to index and index to word mappings
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}

In [8]:
words_to_hide = 1

hidden_merged = []

for m in merged:
    hidden = m.copy()
    hidden[:words_to_hide] = ["<qw>"] * words_to_hide
    hidden_merged.append(hidden)

for i in range(5):
    print()
    print("Merged:", merged[i])
    print("Hidden:", hidden_merged[i])


Merged: ['When', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']
Hidden: ['<qw>', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']

Merged: ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']
Hidden: ['<qw>', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']

Merged: ['When', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']
Hidden: ['<qw>', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']

Merged: ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']
Hidden: ['<qw>', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']

Merged: ['In', 'which', 'decade', 'did', 'Beyonce', 'beco

In [9]:
x_words = [[word for word in hidden] for hidden in hidden_merged]
y_words = [word[:words_to_hide] for word in merged]

for i in range(5):
    print()
    print("X_words:", x_words[i])
    print("Y_words:", y_words[i])


X_words: ['<qw>', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']
Y_words: ['When']

X_words: ['<qw>', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']
Y_words: ['What']

X_words: ['<qw>', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']
Y_words: ['When']

X_words: ['<qw>', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']
Y_words: ['In']

X_words: ['<qw>', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous', '?', 'late', '1990s']
Y_words: ['In']


In [10]:
# Convert words to indices
def words_to_indices(words):
    return [word_to_idx[word] for word in words]

x = [words_to_indices(words) for words in x_words]
y = [words_to_indices(words) for words in y_words]

for i in range(5):
    print()
    print("X:", x[i])
    print("Y:", y[i])


X: [67298, 11478, 15940, 10782, 14113, 19339, 34989, 4722, 57418, 39821, 6750]
Y: [21451]

X: [67298, 2207, 11478, 15940, 1336, 4722, 5463, 64400, 60670, 17175, 32925, 34989, 18917, 25901, 36169]
Y: [649]

X: [67298, 11478, 15940, 27858, 7760, 24722, 43865, 25901, 39001, 13807, 51515, 19829, 34989, 67002]
Y: [21451]

X: [67298, 48729, 51383, 25901, 1586, 11478, 15940, 48742, 32925, 34989, 33111, 56260, 61968]
Y: [58777]

X: [67298, 18872, 63046, 11478, 15940, 39001, 46589, 34989, 39821, 6750]
Y: [58777]


In [11]:
# Pad sequences
max_len = max([len(words) for words in x])

for i in range(len(x)):
    x[i] += [word_to_idx["<pad>"]] * (max_len - len(x[i]))

for i in range(5):
    print()
    print("X:", x[i])
    print("Y:", y[i])


X: [67298, 11478, 15940, 10782, 14113, 19339, 34989, 4722, 57418, 39821, 6750, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299]
Y: [21451]

X: [67298, 2207, 11478, 15940, 1336, 4722, 5463, 64400, 60670, 17175, 32925, 34989, 18917, 25901, 36169, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299]
Y: [649]

X: [67298, 11478, 15940, 27858, 7760, 24722, 43865, 25901, 39001, 13807, 51515, 19829, 34989,

In [12]:
# Reverse the order of the input sequence
x = [x_i[::-1] for x_i in x]
y = [y_i[::-1] for y_i in y]

for i in range(5):
    print()
    print("X:", x[i])
    print("Y:", y[i])


X: [67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 6750, 39821, 57418, 4722, 34989, 19339, 14113, 10782, 15940, 11478, 67298]
Y: [21451]

X: [67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 36169, 25901, 18917, 34989, 32925, 17175, 60670, 64400, 5463, 4722, 1336, 15940, 11478, 2207, 67298]
Y: [649]

X: [67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299, 67299

In [13]:
# Convert to tensors
x = torch.tensor(x).to(device)
y = torch.tensor(y).to(device)

## Training

We will now our model to predict the first or the two first words of the question given the rest of the question and the answer. We will use a transformer model to do so.


### LSTM simple test model

In [14]:
# Train val test split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
x_val, x_test, y_val, y_test = train_test_split(x_val, y_val, test_size=0.5)

print("Train size:", len(x_train))
print("Val size:", len(x_val))
print("Test size:", len(x_test))

# Create dataset
class WordDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]


train_dataset = WordDataset(x_train, y_train)
val_dataset = WordDataset(x_val, y_val)
test_dataset = WordDataset(x_test, y_test)

# Create dataloaders
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Train size: 69456
Val size: 8682
Test size: 8683


In [17]:
# Define model of predicting next word
class NextWordModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(NextWordModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x
    
# Define model
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128

model = NextWordModel(vocab_size, embedding_dim, hidden_dim).to(device)

# Define optimizer
optimizer = optim.Adam(model.parameters())

# Define loss function
criterion = nn.CrossEntropyLoss()

# Train model
num_epochs = 10

for epoch in trange(num_epochs):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    for x_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_pred = model(x_batch)
        y_pred = y_pred.view(-1, vocab_size)
        y_batch = y_batch.view(-1)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_total += y_batch.size(0)
        train_correct += (y_pred.argmax(dim=1) == y_batch).sum().item()
    train_accuracy = train_correct / train_total

    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for x_batch, y_batch in tqdm(val_loader):
            y_pred = model(x_batch)
            y_pred = y_pred.view(-1, vocab_size)
            y_batch = y_batch.view(-1)
            loss = criterion(y_pred, y_batch)
            val_loss += loss.item()
            val_total += y_batch.size(0)
            val_correct += (y_pred.argmax(dim=1) == y_batch).sum().item()
    val_accuracy = val_correct / val_total

    print("Epoch:", epoch + 1)
    print("Train loss:", train_loss / len(train_loader))
    print("Train accuracy:", train_accuracy)
    print("Val loss:", val_loss / len(val_loader))
    print("Val accuracy:", val_accuracy)

# Test model
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for x_batch, y_batch in tqdm(test_loader):
        y_pred = model(x_batch)
        y_pred = y_pred.view(-1, vocab_size)
        y_batch = y_batch.view(-1)
        test_correct += (y_pred.argmax(dim=1) == y_batch).sum().item()
        test_total += y_batch.size(0)
test_accuracy = test_correct / test_total

print("Test accuracy:", test_accuracy)

# Predict next word
def predict_question_word(question):
    question = nltk.word_tokenize(question)
    question = ["<qw>"] + question
    question = [word_to_idx[word] for word in question]
    # Reverse the order of the input sequence
    question = question[::-1]
    question = torch.tensor(question).unsqueeze(0).to(device)
    with torch.no_grad():
        y_pred = model(question)
    y_pred = y_pred.squeeze(0)
    y_pred = y_pred[-1]
    y_pred = F.softmax(y_pred, dim=0)
    y_pred = y_pred.cpu().numpy()
    y_pred = y_pred.argsort()[::-1]
    y_pred = [idx_to_word[idx] for idx in y_pred]
    return y_pred

sentence = "<qw> is the capital of France? Paris"
predict_question_word(sentence)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2171 [00:00<?, ?it/s]

RuntimeError: NVML_SUCCESS == r INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":1154, please report a bug to PyTorch. 

In [None]:
# Model evaluation on test set

# True and predicted words
true_words = []
pred_words = []

model.eval()
with torch.no_grad():
    for x_batch, y_batch in tqdm(test_loader):
        y_pred = model(x_batch)
        y_pred = torch.argmax(y_pred, dim=2).cpu().numpy()
        for y_true, y_p in zip(y_batch.cpu().numpy(), y_pred):
            true_words.append([idx_to_word[idx] for idx in y_true])
            pred_words.append([idx_to_word[idx] for idx in y_p])

# Flatten lists
true_words = [word for words in true_words for word in words]
pred_words = [word for words in pred_words for word in words]

# Calculate accuracy
correct = sum([1 for t, p in zip(true_words, pred_words) if t == p])
accuracy = correct / len(true_words)
print(f"Accuracy: {accuracy:.4f}")

# Randomly sample 10 examples
for i in range(10):
    idx = torch.randint(len(true_words), (1,)).item()
    print()
    print("True:", true_words[idx])
    print("Pred:", pred_words[idx])

  0%|          | 0/272 [00:00<?, ?it/s]

Accuracy: 0.9947

True: <pad>
Pred: <pad>

True: <pad>
Pred: <pad>

True: <pad>
Pred: <pad>

True: <pad>
Pred: <pad>

True: <pad>
Pred: <pad>

True: <pad>
Pred: <pad>

True: <pad>
Pred: <pad>

True: <pad>
Pred: <pad>

True: <pad>
Pred: <pad>

True: <pad>
Pred: <pad>


In [None]:
if not os.path.exists("models"):
    os.mkdir("models")

# Save model
torch.save(model.state_dict(), "models/word_predictor_lstm_1st.pth")

# Load model
model = WordPredictor(vocab_size, embedding_dim, hidden_dim).to(device)
model.load_state_dict(torch.load("models/word_predictor_lstm_1st.pth"))

RuntimeError: Parent directory models does not exist.