# Question word prediction

> Group 12: Tristan Perrot & Romain Darous

The task is to train and evaluate a question word prediction (QWP) model using any available QA corpus, for instance the [SQuAD corpus](https://rajpurkar.github.io/SQuAD-explorer/).


In [41]:
import json
import math
import os

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import requests
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm.notebook import tqdm, trange

In [42]:
# List the name of every CUDA device visible to this process.
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(i).name)

# Select the compute device. The second GPU ("cuda:1") is preferred when it
# exists; on a single-GPU machine we fall back to "cuda:0" because "cuda:1"
# would name a device that is not there and fail on first use. Without CUDA
# we try Apple's MPS backend, and finally the CPU.
if torch.cuda.is_available():
    cuda_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{cuda_index}")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
device

device(type='cpu')

## Data


In [43]:
# Directory (relative to the working directory) that holds the dataset files.
data_dir = "data"

In [44]:
# Make sure the data directory exists (no error if it is already there).
os.makedirs(data_dir, exist_ok=True)

squad_path = os.path.join(data_dir, "squad_train.json")

if not os.path.isfile(squad_path):
    # Download data at https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
    res = requests.get(
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json",
        timeout=60,  # do not hang forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of parsing or caching an error page
    # as if it were the dataset.
    res.raise_for_status()
    data = res.json()

    # Save data to file so later runs can skip the download
    with open(squad_path, "w", encoding="utf-8") as f:
        json.dump(data, f)

with open(squad_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract answer text and question text.
# Questions flagged "is_impossible" are skipped; for the others only the
# first listed answer is kept, so `questions` and `answers` stay aligned.
answers = []
questions = []
for article in data["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            if qa["is_impossible"]:
                continue
            answers.append(qa["answers"][0]["text"])
            questions.append(qa["question"])

print("Number of questions:", len(questions))

# Print some examples (at most five, so a small corpus does not raise)
for question, answer in zip(questions[:5], answers[:5]):
    print()
    print("Question:", question)
    print("Answer:", answer)

Number of questions: 86821

Question: When did Beyonce start becoming popular?
Answer: in the late 1990s

Question: What areas did Beyonce compete in when she was growing up?
Answer: singing and dancing

Question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 2003

Question: In what city and state did Beyonce  grow up? 
Answer: Houston, Texas

Question: In which decade did Beyonce become famous?
Answer: late 1990s


In [45]:
# Split every question and every answer into word-level tokens with NLTK's
# word tokenizer; the two result lists stay index-aligned with their inputs.
tokenized_questions = list(map(nltk.word_tokenize, questions))
tokenized_answers = list(map(nltk.word_tokenize, answers))

# Show the first five tokenized question/answer pairs.
for i in range(5):
    print()
    print(f"Question: {tokenized_questions[i]}")
    print(f"Answer: {tokenized_answers[i]}")


Question: ['When', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?']
Answer: ['in', 'the', 'late', '1990s']

Question: ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?']
Answer: ['singing', 'and', 'dancing']

Question: ['When', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?']
Answer: ['2003']

Question: ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?']
Answer: ['Houston', ',', 'Texas']

Question: ['In', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous', '?']
Answer: ['late', '1990s']


In [46]:
# Build one token sequence per example: the question tokens followed
# directly by the answer tokens.
merged = []
for q_tokens, a_tokens in zip(tokenized_questions, tokenized_answers):
    merged.append([*q_tokens, *a_tokens])

# Show the first five merged sequences.
for i in range(5):
    print()
    print(f"Merged: {merged[i]}")


Merged: ['When', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']

Merged: ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']

Merged: ['When', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']

Merged: ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']

Merged: ['In', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous', '?', 'late', '1990s']


In [47]:
# Create vocabulary: every distinct token that occurs in the merged sequences.
tokens = {token for m in merged for token in m}

# The mask token is added explicitly below; drop it here so it can never
# appear twice, even if the corpus happens to contain the literal "<qw>".
tokens.discard("<qw>")

# Sort the tokens: iteration order of a set of strings changes between
# interpreter runs, which would give every run different token indices.
# Sorting keeps the indices reproducible.
vocab = sorted(tokens)

# Add "<qw>" to vocabulary (always the last index)
vocab.append("<qw>")

# Create word to index and index to word mappings
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))

In [48]:
# Number of leading tokens of each sequence that are replaced by the mask.
words_to_hide = 2

# Each masked sequence is a new list: `words_to_hide` copies of "<qw>"
# followed by the remaining tokens of the original sequence.
mask_prefix = ["<qw>"] * words_to_hide
hidden_merged = [mask_prefix + sequence[words_to_hide:] for sequence in merged]

# Show the first five sequences before and after masking.
for i in range(5):
    print()
    print(f"Merged: {merged[i]}")
    print(f"Hidden: {hidden_merged[i]}")


Merged: ['When', 'did', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']
Hidden: ['<qw>', '<qw>', 'Beyonce', 'start', 'becoming', 'popular', '?', 'in', 'the', 'late', '1990s']

Merged: ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']
Hidden: ['<qw>', '<qw>', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up', '?', 'singing', 'and', 'dancing']

Merged: ['When', 'did', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']
Hidden: ['<qw>', '<qw>', 'Beyonce', 'leave', 'Destiny', "'s", 'Child', 'and', 'become', 'a', 'solo', 'singer', '?', '2003']

Merged: ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']
Hidden: ['<qw>', '<qw>', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up', '?', 'Houston', ',', 'Texas']

Merged: ['In', 'which', 'decade', 'did', 'Beyonce', 'bec

## Training

We will now train our model to predict the first word, or the first two words, of the question given the rest of the question and the answer. We will use a transformer model to do so.


## Evaluation
