# Question word prediction

> Group 12: Tristan Perrot & Romain Darous

The task is to train and evaluate a **character-by-character Transformer model** using any available QA corpus, for instance the [SQuAD corpus](https://rajpurkar.github.io/SQuAD-explorer/).


# 0. Importing modules

In [1]:
import json
import math
import os

# Importing
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm, trange


  warn(


In [2]:
# List the name of every CUDA device PyTorch can see (prints nothing when
# there is none).
for gpu_index in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(gpu_index).name)

# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    backend_name = "cuda"
elif torch.backends.mps.is_available():
    backend_name = "mps"
else:
    backend_name = "cpu"

device = torch.device(backend_name)
device

device(type='cpu')

# 1. Data pre-processing

## 1.1. Loading the dataset

In [3]:
# Download (once) and load the SQuAD v2.0 training set, then collect the
# question text and the first reference answer of every answerable question
# into the parallel lists `questions` and `answers`.
data_dir = 'data'
squad_path = os.path.join(data_dir, "squad_train.json")
squad_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"

# exist_ok makes this safe to re-run and independent of what the current
# working directory happens to contain.
os.makedirs(data_dir, exist_ok=True)

if not os.path.isfile(squad_path):
    # Fail loudly on HTTP errors or a hung connection instead of caching an
    # error page as if it were the dataset.
    res = requests.get(squad_url, timeout=60)
    res.raise_for_status()
    data = json.loads(res.text)

    # Save data to file
    with open(squad_path, "w", encoding="utf-8") as f:
        json.dump(data, f)

with open(squad_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract answer text and question text
answers = []
questions = []
for article in data["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            # Unanswerable questions carry no answer text; also skip any
            # entry whose answer list is empty so indexing [0] cannot fail.
            # `is_impossible` is treated as optional so files without that
            # key (e.g. SQuAD v1.1) load as well.
            if qa.get("is_impossible", False) or not qa["answers"]:
                continue
            answers.append(qa["answers"][0]["text"])
            questions.append(qa["question"])

print("Number of questions:", len(questions))

# Print some examples (at most five, fewer if the corpus is smaller).
# The loop index is intentionally a plain module-level `i`: a later cell
# reads `questions[i]` / `answers[i]` after this loop has finished.
for i in range(min(5, len(questions))):
    print()
    print("Question:", questions[i])
    print("Answer:", answers[i])

Number of questions: 86821

Question: When did Beyonce start becoming popular?
Answer: in the late 1990s

Question: What areas did Beyonce compete in when she was growing up?
Answer: singing and dancing

Question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 2003

Question: In what city and state did Beyonce  grow up? 
Answer: Houston, Texas

Question: In which decade did Beyonce become famous?
Answer: late 1990s


In [6]:
# Show how one question and its answer look once joined by a single space.
print(f"{questions[i]} {answers[i]}")

In which decade did Beyonce become famous? late 1990s


## 1.2. Building a character vocabulary
The ``<BOS>`` token indicates the start of a sentence.

We will predict the sentence in reverse order, as we want to predict the question word, which appears at the beginning of the question.

In [15]:
# Build one lower-cased training string per QA pair: "<BOS> question answer".
dataset = [
    f"<BOS> {question.lower()} {answer.lower()}"
    for question, answer in zip(questions, answers)
]

In [17]:
# Sanity check: report the corpus size and show the first concatenated sample.
sample_count = len(dataset)
first_sample = dataset[0]
print(f"Size of the dataset : {sample_count}")
print(f"Example of datapoint : {first_sample}")

Size of the dataset : 86821
Example of datapoint : <BOS> when did beyonce start becoming popular? in the late 1990s


# 2. The model

## 2.2. Setting of hyperparameters

In [None]:
# ============= Hyper-parameters for training ============== #

class Config:
    """Hyper-parameters of the character-level model and of its training run.

    All values are plain class attributes, so they can be read either from
    the class itself or from an instance created with ``Config()``.
    """

    # --- Model architecture ---
    number_of_transformer_encoders = 1
    number_of_attention_heads = 1
    hidden_size = 64
    dropout_prob = 0.1

    # --- Optimisation ---
    batch_size = 64
    learning_rate = 3e-4
    weight_decay = 1e-6
    no_of_epochs = 100

# Context window: the number of characters considered when predicting the
# next character.
MAXLEN = 32

## 2.1. Training

In [None]:

# ======================= Training ======================= #
# Trains the character-level language model and, after every epoch, prints a
# greedy sample continued from a fixed prompt so progress can be judged by eye.
#
# NOTE(review): CharDataset, CharLM, char_to_id, id_to_char and PADDING_SYMBOL
# are not defined in the visible part of this notebook; they are assumed to
# come from another cell -- confirm.
# NOTE(review): the corpus file and the prompt below do not look related to
# the SQuAD data loaded earlier in the notebook -- confirm they are intended.

from datetime import datetime  # timestamps for the progress log

PROMPT = "he took out his wand and"
NUM_GENERATED_CHARS = 300

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Running on", device)

config = Config()
# Use MAXLEN rather than a literal so the dataset windows always match the
# context length the model is built with.
training_dataset = CharDataset('HP_book_1.txt', MAXLEN)
print("There are", len(training_dataset), "datapoints and", len(id_to_char), "unique characters in the dataset")
training_loader = DataLoader(training_dataset, batch_size=config.batch_size)

charlm = CharLM(config, len(id_to_char), MAXLEN).to(device)
criterion = nn.CrossEntropyLoss()
# weight_decay is part of Config; pass it on so the setting actually applies.
charlm_optimizer = optim.Adam(
    charlm.parameters(),
    lr=config.learning_rate,
    weight_decay=config.weight_decay,
)

charlm.train()
print(datetime.now().strftime("%X"), "Training starts")
for epoch in range(config.no_of_epochs):
    for input_tensor, label in training_loader:
        input_tensor, label = input_tensor.to(device), label.to(device)
        charlm_optimizer.zero_grad()
        logits = charlm(input_tensor)
        loss = criterion(logits.squeeze(1), label)
        loss.backward()
        charlm_optimizer.step()

    # The reported loss is that of the last mini-batch of the epoch.
    print(datetime.now().strftime("%X"), "End of epoch", epoch+1, ", loss=", loss.detach().item())

    # Generate NUM_GENERATED_CHARS characters starting from the prompt,
    # always picking the most likely next character.
    charlm.eval()
    try:
        # Keep only the last MAXLEN characters of the prompt as context.
        char_list = list(PROMPT[-MAXLEN:])
        # Sampling needs no gradients; skip building autograd graphs.
        with torch.no_grad():
            for _ in range(NUM_GENERATED_CHARS):
                # Right-pad the context with the padding symbol up to MAXLEN.
                padding = [char_to_id[PADDING_SYMBOL]] * (MAXLEN - len(char_list))
                input_tensor = torch.tensor(
                    [char_to_id[c] for c in char_list] + padding
                ).unsqueeze(0).to(device)
                logits = charlm(input_tensor).squeeze()
                _, new_character_tensor = logits.topk(1)
                new_character = id_to_char[new_character_tensor.item()]
                print(new_character, end='')
                # Slide the context window once it is full.
                if len(char_list) == MAXLEN:
                    char_list.pop(0)
                char_list.append(new_character)
        print()
    except KeyError as missing:
        # A character is missing from the vocabulary: skip this sample but
        # say so instead of failing silently.
        print()
        print("Sample skipped, character not in vocabulary:", missing)
    finally:
        # Always return to training mode. Previously a KeyError skipped this
        # and left the model in eval mode for every following epoch.
        charlm.train()
