# Question word prediction

> Group 12: Tristan Perrot & Romain Darous

Task is to train and evaluate a **char per char Transformer model** model using any available QA-corpus, for instance, the [SQuAD corpus](https://rajpurkar.github.io/SQuAD-explorer/).


# 0. Importing modules

In [1]:
import json
import math
import os

# Importing
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm, trange
import random


  warn(


In [2]:
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(i).name)

device = torch.device("cuda" if torch.cuda.is_available(
) else "mps" if torch.backends.mps.is_available() else "cpu")
device

device(type='cpu')

# 1. Data pre-processing

## 1.1. Loading the dataset
**Note :** we only want to be able te recover the beginning of a question. For that, it doesn"t matter whether the question is impossible to answer or not.

In [3]:
data_dir = 'data'
if data_dir not in os.listdir():
    os.mkdir(data_dir)

if "squad_train.json" not in os.listdir(data_dir):
    # Download data at https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
    res = requests.get(
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
    data = json.loads(res.text)

    # Save data to file
    with open(data_dir + "/squad_train.json", "w") as f:
        json.dump(data, f)

with open(data_dir + "/squad_train.json", "r") as f:
    data = json.load(f)

# Extract answer text and question text
answers = []
questions = []
for article in data["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            questions.append(qa["question"])
            if qa["is_impossible"]:
                answers.append("")
            else :
                answers.append(qa["answers"][0]["text"])

print("Number of questions:", len(questions))

# Print some examples
for i in range(5):
    print()
    print("Question:", questions[i])
    print("Answer:", answers[i])

Number of questions: 130319

Question: When did Beyonce start becoming popular?
Answer: in the late 1990s

Question: What areas did Beyonce compete in when she was growing up?
Answer: singing and dancing

Question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 2003

Question: In what city and state did Beyonce  grow up? 
Answer: Houston, Texas

Question: In which decade did Beyonce become famous?
Answer: late 1990s


In [4]:
print(questions[i] + ' ' + answers[i])

In which decade did Beyonce become famous? late 1990s


In [5]:
# Importing models
import char_dataset
import cpc_model

## 1.2. Making a suitable dataset
``<BOS>`` token. Indicates that the sentence is starting.

We will make the prediction of the sentence in reverse mode, as we want to predict the beginning of a question. We will use unidirectionnal attention as well.

In [6]:
# Concatenating questions and answers
dataset = [(questions[i].lower() + ' ' + answers[i].lower())[::-1] for i in range(len(questions))]
# Splitting into train, validation, and test sets
train_dataset, test_dataset = train_test_split(dataset, test_size = int(0.1*len(dataset)), train_size=int(0.9*len(dataset)))
train_dataset, val_dataset = train_test_split(train_dataset, train_size=int(0.85*len(train_dataset)), test_size = int(0.15*len(train_dataset)))

In [7]:
print(f"Size of the dataset : {len(dataset)}")
print(f"Size of the train, val and test sets : {len(train_dataset), len(val_dataset), len(test_dataset)}")
print(f"Example of original datapoint : {questions[0] + ' ' + answers[0]}")
print(f"Example of formatted datapoint : {dataset[0]}")

Size of the dataset : 130319
Size of the train, val and test sets : (99693, 17593, 13031)
Example of original datapoint : When did Beyonce start becoming popular? in the late 1990s
Example of formatted datapoint : s0991 etal eht ni ?ralupop gnimoceb trats ecnoyeb did nehw


## 1.3. Building a character dataset

In [8]:
# Computing MAXLEN parameter. It's the min length among all the questions when removing the two first words
# Will be used as the max window size for context
MAXLEN = len(" ".join(dataset[0].split(' ')[:-2]))

for i in range(1, len(dataset)) :
    tmp_len = len(" ".join(dataset[0].split(' ')[:-2]))
    if tmp_len < MAXLEN : MAXLEN = tmp_len

print(f"MAXLEN : {MAXLEN}")

MAXLEN : 49


In [9]:
# Updating models# Delete the modules from the namespace
del char_dataset
del cpc_model

# Unload the modules from memory
import sys
del sys.modules['char_dataset']
del sys.modules['cpc_model']

# Importing models
import char_dataset
import cpc_model

In [12]:
train_char_set = char_dataset.CharDataset(train_dataset, MAXLEN)
val_char_set = char_dataset.CharDataset(val_dataset, MAXLEN)
test_char_set = char_dataset.CharDataset(test_dataset, MAXLEN)

finished
[2, 3, 4, 5, 6, 7, 8, 1, 5, 6, 7, 8, 4, 1, 5, 15, 9, 24, 9, 27, 17, 24, 4, 5, 6, 7, 8, 1, 2, 9, 27, 9, 3, 10, 9, 22, 4, 17, 7, 8, 1, 14, 3, 14, 4, 15, 9, 7, 8, 1, 5, 6, 7, 8, 4, 15, 3, 1, 2, 3, 4, 5, 6, 7, 8, 1, 6, 9, 13, 6, 4, 5, 6, 7, 8, 1, 9, 9, 13, 7, 5, 4, 5, 6, 7, 8, 1, 14, 9, 5, 15, 3, 6, 18, 4, 17, 7, 8, 1, 2, 6, 8, 4, 17, 7, 8, 1, 15, 17, 3, 26, 9, 13, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 9, 7, 5, 4, 15, 3, 1, 14, 3, 14, 4, 15, 9, 7, 8, 1, 12, 15, 6, 24, 4, 8, 17, 7, 1, 24, 17, 14, 26, 15, 3, 21, 4, 7, 11, 3, 7, 8, 1, 2, 6, 8, 4, 9, 13, 9, 7, 8, 1, 17, 14, 4, 5, 6, 7, 8, 1, 5, 15, 9, 27, 9, 4, 5, 6, 7, 8, 1, 40, 39, 39, 29, 4, 15, 3, 1, 9, 18, 12, 5, 4, 5, 6, 7, 8, 1, 2, 6, 8, 4, 9, 13, 9, 7, 8, 1, 15, 9, 7, 8, 4, 9, 11, 15, 3, 2, 1, 12, 13, 5, 15, 25, 17, 11, 4, 5, 6, 7, 8, 1, 7, 2, 6, 13, 11, 4, 9, 7, 5, 1, 7, 5, 3, 8, 4, 26, 15, 17, 10, 6, 1, 2, 6, 8, 4, 8, 17, 7, 1, 9, 13, 9, 8, 4, 15, 9, 7, 8, 1, 14, 9, 22, 3, 13, 11, 2, 9, 14, 4, 17, 7, 8, 1, 14, 3, 6, 2, 4

In [11]:
len(train_char_set.lbls)

AttributeError: 'CharDataset' object has no attribute 'lbls'

# 2. The model

## 2.2. Setting of hyperparameters

In [None]:
# ============= Hyper-parameters for training ============== #

class Config :
    number_of_transformer_encoders = 1
    number_of_attention_heads = 1
    hidden_size = 64
    dropout_prob = 0.1
    batch_size = 64
    learning_rate = 0.0003
    weight_decay = 0.000001
    no_of_epochs = 100


## 2.1. Training

In [None]:

# ======================= Training ======================= #

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print( "Running on", device )

config = Config()
training_dataset = CharDataset('HP_book_1.txt', 32)
print( "There are", len(training_dataset), "datapoints and", len(id_to_char), "unique characters in the dataset" ) 
training_loader = DataLoader(training_dataset, batch_size=config.batch_size)

charlm = CharLM( config, len(id_to_char), MAXLEN).to(device)
criterion = nn.CrossEntropyLoss()
charlm_optimizer = optim.Adam( charlm.parameters(), lr=config.learning_rate )

charlm.train()
print( datetime.now().strftime("%X"), "Training starts" )
for epoch in range(config.no_of_epochs) :
    iteration = 0
    for input_tensor, label in training_loader :
        input_tensor, label = input_tensor.to(device), label.to(device)
        charlm_optimizer.zero_grad()
        logits = charlm(input_tensor).to(device)
        loss = criterion(logits.squeeze(1), label)
        loss.backward()
        charlm_optimizer.step()
        iteration += 1

    print( datetime.now().strftime("%X"), "End of epoch", epoch+1, ", loss=", loss.detach().item())
    charlm.eval()
    # Generate 50 characters starting from the input text
    try :
        char_list = list("he took out his wand and"[-MAXLEN:])
        for i in range(300) :
            input_tensor = torch.tensor( [char_to_id[c] for c in char_list] + [char_to_id[PADDING_SYMBOL]]*(MAXLEN-len(char_list))).unsqueeze(0).to(device)
            logits = charlm(input_tensor).squeeze().to(device)
            _, new_character_tensor = logits.topk(1)
            new_character = id_to_char[new_character_tensor.detach().item()]
            print( new_character, end='' )
            if len(char_list) == MAXLEN :
                char_list.pop(0)
            char_list.append( new_character )
        print()
    except KeyError :
        continue
    charlm.train()


Running on cpu


NameError: name 'id_to_char' is not defined