# Question word prediction

> Group 12: Tristan Perrot & Romain Darous

Task is to train and evaluate a **char per char Transformer model** model using any available QA-corpus, for instance, the [SQuAD corpus](https://rajpurkar.github.io/SQuAD-explorer/).


METTRE EN CONTEXTE LE DEBUT DE LA QUESTION EN CORRIGEANT LE CODE DEJA FAIT
ADAPTER POUR AUGMENTER LE NOMBRE DE TRANSFORMERS EVENTUELLEMENT ET FAIRE PLUSIEURS COUCHES

# 0. Importing modules

In [1]:
import json
import math
import os

# Importing
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm, trange
import random




In [2]:
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(i).name)

device = torch.device("cuda" if torch.cuda.is_available(
) else "mps" if torch.backends.mps.is_available() else "cpu")
device

NVIDIA H100 80GB HBM3 MIG 1g.10gb


device(type='cuda')

# 1. Data pre-processing

## 1.1. Loading the dataset
**Note :** we only want to be able te recover the beginning of a question. For that, it doesn"t matter whether the question is impossible to answer or not.

In [3]:
data_dir = 'data'
if data_dir not in os.listdir():
    os.mkdir(data_dir)

if "squad_train.json" not in os.listdir(data_dir):
    # Download data at https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
    res = requests.get(
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
    data = json.loads(res.text)

    # Save data to file
    with open(data_dir + "/squad_train.json", "w") as f:
        json.dump(data, f)

with open(data_dir + "/squad_train.json", "r") as f:
    data = json.load(f)

# Extract answer text and question text
answers = []
questions = []
for article in data["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            
            """if qa["is_impossible"] :
                continue
            answers.append(qa["answers"][0]["text"])"""
                
            if qa["is_impossible"] and len(qa["question"]) > 4:
                continue
            else :
                answers.append("")
            questions.append(qa["question"])
            

print("Number of questions:", len(questions))

# Print some examples
for i in range(5):
    print()
    print("Question:", questions[i])
    print("Answer:", answers[i])

Number of questions: 86821

Question: When did Beyonce start becoming popular?
Answer: 

Question: What areas did Beyonce compete in when she was growing up?
Answer: 

Question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 

Question: In what city and state did Beyonce  grow up? 
Answer: 

Question: In which decade did Beyonce become famous?
Answer: 


In [4]:
print(questions[i] + ' ' + answers[i])

In which decade did Beyonce become famous? 


In [5]:
# Importing models
import char_dataset
import cpc_model
from datetime import datetime

## 1.2. Making a suitable dataset
``<BOS>`` token. Indicates that the sentence is starting.

We will make the prediction of the sentence in reverse mode, as we want to predict the beginning of a question. We will use unidirectionnal attention as well.

In [6]:
# Concatenating questions and answers
dataset = [(questions[i].lower() + ' ' + answers[i].lower())[::-1] for i in range(len(questions))]

# Shuffle dataset
random.shuffle(dataset)

# Splitting into train, validation, and test sets
train_dataset, test_dataset = train_test_split(dataset, test_size = int(0.1*len(dataset)), train_size=int(0.9*len(dataset)))
train_dataset, val_dataset = train_test_split(train_dataset, train_size=int(0.85*len(train_dataset)), test_size = int(0.15*len(train_dataset)))

In [7]:
print(f"Size of the dataset : {len(dataset)}")
print(f"Size of the train, val and test sets : {len(train_dataset), len(val_dataset), len(test_dataset)}")
print(f"Example of original datapoint : {questions[0] + ' ' + answers[0]}")
print(f"Example of formatted datapoint : {dataset[0]}")

Size of the dataset : 86821
Size of the train, val and test sets : (66417, 11720, 8682)
Example of original datapoint : When did Beyonce start becoming popular? 
Example of formatted datapoint :  ?eveihca ailiartsua tsniaga evisneffo enirambus esenapaj eht did tahw


## 1.3. Building a character dataset

In [8]:
# ============= Hyper-parameters for training ============== #

class Config :
    number_of_transformer_encoders = 4
    number_of_attention_heads = 4
    hidden_size = 256
    dropout_prob = 0.1
    batch_size = 64
    learning_rate = 0.0002
    weight_decay = 0.000001
    no_of_epochs = 100
    is_causal = True # When True, the attention is causal
    seq_ctxt = True # When False, forces the context to take the beginning of the answer into account
    MAXLEN = 64

In [9]:
config = Config()

In [10]:
# Updating models# Delete the modules from the namespace
del char_dataset
del cpc_model

# Unload the modules from memory
import sys
del sys.modules['char_dataset']
del sys.modules['cpc_model']

# Importing models
import char_dataset
import cpc_model

In [11]:
# Building the datasets
train_char_set = char_dataset.CharDataset(train_dataset, config.MAXLEN, seq_ctxt=config.seq_ctxt)
val_char_set = char_dataset.CharDataset(val_dataset, config.MAXLEN, seq_ctxt=config.seq_ctxt)
test_char_set = char_dataset.CharDataset(test_dataset, config.MAXLEN, seq_ctxt=config.seq_ctxt)

# 2. The model

## 2.1. Training

In [12]:

# ======================= Training ======================= #

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print( "Running on", device )

training_loader = DataLoader(train_char_set, batch_size=config.batch_size)
validation_loader = DataLoader(val_char_set, batch_size=config.batch_size)

charlm = cpc_model.CharLM( config, len(char_dataset.CharDataset.id_to_char), config.MAXLEN, config.is_causal).to(device)
criterion = nn.CrossEntropyLoss()
charlm_optimizer = optim.Adam( charlm.parameters(), lr=config.learning_rate )


val_loss = 0.0
patience = 3
best_val_loss = torch.inf

charlm.train()
print( datetime.now().strftime("%X"), "Training starts" )

for epoch in tqdm(range(config.no_of_epochs)) :
    iteration = 0
    for input_tensor, label in training_loader :
        input_tensor, label = input_tensor.to(device), label.to(device)
        charlm_optimizer.zero_grad()
        logits = charlm(input_tensor).to(device)
        loss = criterion(logits.squeeze(1), label)
        loss.backward()
        charlm_optimizer.step()
        iteration += 1

    print( datetime.now().strftime("%X"), "End of epoch", epoch+1, ", loss=", loss.detach().item())
    
    # Validation phase with Early Stopping
    charlm.eval()
    
    with torch.no_grad():
        for input_tensor, label in validation_loader:
            input_tensor, label = input_tensor.to(device), label.to(device)
            logits = charlm(input_tensor).to(device)
            loss = criterion(logits.squeeze(1), label)
            val_loss += loss.item()

    val_loss /= len(validation_loader)
    print(datetime.now().strftime("%X"), "Validation loss=", val_loss)

    # Check early stopping condition
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience = 5
    else:
        patience -= 1

    if patience == 0:
        print("Early stopping at epoch", epoch + 1)
        break

    charlm.train()


Running on cuda
07:05:51 Training starts


  0%|          | 0/100 [00:00<?, ?it/s]

07:08:51 End of epoch 1 , loss= 1.3206621408462524
07:09:03 Validation loss= 1.186685533543071
07:12:03 End of epoch 2 , loss= 1.0584670305252075
07:12:16 Validation loss= 1.0656856736046816


KeyboardInterrupt: 

In [None]:
# Save the model
torch.save(charlm.state_dict(), './output/charlm_model_es_seq_64.pth')
print('Model saved successfully.')

In [None]:
"""# Load a model
charlm = cpc_model.CharLM( config, len(char_dataset.CharDataset.id_to_char), config.MAXLEN, config.is_causal).to(device)
charlm.load_state_dict(torch.load('./output/charlm_seq_model_early_stopping.pth'))
charlm.eval()"""

## 2.2. Metrics on test set

In [None]:
"""# Computing output on the test set
test_loader = DataLoader(test_char_set, batch_size=config.batch_size)

charlm.eval()
with torch.no_grad():
    for input_tensor, label in test_loader:
        input_tensor, label = input_tensor.to(device), label.to(device)
        logits = charlm(input_tensor).to(device)
        pred_label = torch.argmax(logits, axis = -1)[:,0]
        accuracy += (pred_label == label).sum() / len(pred_label)

print(f"Test accuracy : {torch.round(accuracy*100, decimals = 2)} %")"""

## 2.3. User interaction

New context

In [None]:
"""# ==================== User interaction ==================== #

while True:
    text = input("> ").strip()
    if text == "" :
        continue
    elif text == "QUIT" :
        break

    words_a, words_q = text.split("?")
    words_a = list(" ".join(words_a))
    words_q = list(" ".join(words_q))

    use_boa = False
    boa = [char_dataset.CharDataset.char_to_id[c] for c in words_a[MAXLEN//2:]]

    if len(words_q) > MAXLEN//2 : 
        use_boa = True
    else : 
        use_boa = False
        char_list = list(text[-MAXLEN:].lower())


    full_question = list(text.lower())[::-1]
    
    char_list = char_list[::-1]
    if char_list[-1] != ' ' : 
        char_list.append(' ')
        full_question.append(' ')
    new_character = char_list[-1]

    # Recovering the beginning of the question
    try :
        count = 0
        MAX_COUNT = 50
        while new_character != char_dataset.CharDataset.BOQ and count < MAX_COUNT :
            if use_boa :
                ctxt = boa + [char_dataset.CharDataset.char_to_id[c] for c in char_list]
            else :
                ctxt = [char_dataset.CharDataset.char_to_id[c] for c in char_list]
                
            input_tensor = torch.tensor([0]*(MAXLEN-len(ctxt)) + ctxt).unsqueeze(0).to(device)
            logits = charlm(input_tensor).squeeze().to(device)
            _, new_character_tensor = logits.topk(1)
            new_character = char_dataset.CharDataset.id_to_char[new_character_tensor.detach().item()]
            # Uploading context
            char_list = char_list[1:] + [new_character]
            full_question.append(new_character)
            count += 1
        full_question = "".join(full_question[::-1])
        print(f"Recovered question : {full_question}")
    except KeyError :
        print("ERROR")
        continue"""

Original context

In [None]:
# ==================== User interaction ==================== #

while True:
    text = input("> ").strip()
    if text == "" :
        continue
    elif text == "QUIT" :
        break
    char_list = list(text[-config.MAXLEN:].lower())

    full_question = list(text.lower())[::-1]
    
    char_list = char_list[::-1]
    
    if char_list[-1] != ' ' : 
        char_list.append(' ')
        char_list = char_list[1:]
        full_question.append(' ')
        
    new_character = char_list[-1]

    # Recovering the beginning of the question
    try :
        count = 0
        MAX_COUNT = 50
        while new_character != char_dataset.CharDataset.BOQ and count < MAX_COUNT :
            ctxt = [char_dataset.CharDataset.char_to_id[c] for c in char_list]
            input_tensor = torch.tensor( [0]*(config.MAXLEN-len(ctxt)) + ctxt).unsqueeze(0).to(device)
            logits = charlm(input_tensor).squeeze().to(device)
            _, new_character_tensor = logits.topk(1)
            new_character = char_dataset.CharDataset.id_to_char[new_character_tensor.detach().item()]
            # Uploading context
            char_list = char_list[1:] + [new_character]
            full_question.append(new_character)
            count += 1
        full_question = "".join(full_question[::-1])
        print(f"Recovered question : {full_question}")
    except KeyError :
        print("ERROR")
        continue