# Question word prediction

> Group 12: Tristan Perrot & Romain Darous

Task is to train and evaluate a **char per char Transformer model** model using any available QA-corpus, for instance, the [SQuAD corpus](https://rajpurkar.github.io/SQuAD-explorer/).


METTRE EN CONTEXTE LE DEBUT DE LA QUESTION EN CORRIGEANT LE CODE DEJA FAIT
ADAPTER POUR AUGMENTER LE NOMBRE DE TRANSFORMERS EVENTUELLEMENT ET FAIRE PLUSIEURS COUCHES

# 0. Importing modules

In [1]:
import json
import math
import os

# Importing
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm, trange
import random
from datetime import datetime
import numpy as np

# Importing models
import char_dataset
import cpc_model




In [2]:
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(i).name)

    device = torch.device("cuda" if torch.cuda.is_available(
    ) else "mps" if torch.backends.mps.is_available() else "cpu")
    device

NVIDIA H100 80GB HBM3 MIG 1g.10gb


# 1. Data pre-processing

## 1.1. Loading the dataset
**Note :** we only want to be able te recover the beginning of a question. For that, it doesn"t matter whether the question is impossible to answer or not.

In [3]:
data_dir = 'data'
if data_dir not in os.listdir():
    os.mkdir(data_dir)

if "squad_train.json" not in os.listdir(data_dir):
    # Download data at https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
    res = requests.get(
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
    data = json.loads(res.text)

    # Save data to file
    with open(data_dir + "/squad_train.json", "w") as f:
        json.dump(data, f)

with open(data_dir + "/squad_train.json", "r") as f:
    data = json.load(f)

# Extract answer text and question text
answers = []
questions = []
for article in data["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            
            """if qa["is_impossible"] :
                continue
            answers.append(qa["answers"][0]["text"])"""
                
            if qa["is_impossible"] and len(qa["question"]) > 4:
                continue
            else :
                answers.append("")
            questions.append(qa["question"])
            

print("Number of questions:", len(questions))

# Print some examples
for i in range(5):
    print()
    print("Question:", questions[i])
    print("Answer:", answers[i])

Number of questions: 86821

Question: When did Beyonce start becoming popular?
Answer: 

Question: What areas did Beyonce compete in when she was growing up?
Answer: 

Question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 

Question: In what city and state did Beyonce  grow up? 
Answer: 

Question: In which decade did Beyonce become famous?
Answer: 


In [4]:
print(questions[i] + ' ' + answers[i])

In which decade did Beyonce become famous? 


## 1.2. Making a suitable dataset
``<BOS>`` token. Indicates that the sentence is starting.

We will make the prediction of the sentence in reverse mode, as we want to predict the beginning of a question. We will use unidirectionnal attention as well.

In [5]:
# Concatenating questions and answers
dataset = [(questions[i].lower() + ' ' + answers[i].lower())[::-1] for i in range(len(questions))]

# Shuffle dataset
random.shuffle(dataset)

# Splitting into train, validation, and test sets
train_dataset, test_dataset = train_test_split(dataset, test_size = int(0.1*len(dataset)), train_size=int(0.9*len(dataset)))
train_dataset, val_dataset = train_test_split(train_dataset, train_size=int(0.85*len(train_dataset)), test_size = int(0.15*len(train_dataset)))

In [6]:
print(f"Size of the dataset : {len(dataset)}")
print(f"Size of the train, val and test sets : {len(train_dataset), len(val_dataset), len(test_dataset)}")
print(f"Example of original datapoint : {questions[0] + ' ' + answers[0]}")
print(f"Example of formatted datapoint : {dataset[0]}")

Size of the dataset : 86821
Size of the train, val and test sets : (66417, 11720, 8682)
Example of original datapoint : When did Beyonce start becoming popular? 
Example of formatted datapoint :  ?sevisneffo dnammoc rebmob rof ygetarts yramirp eht saw tahw


## 1.3. Building a character dataset
The dataset will be built in the ``train_charlm`` function as it depends on the desired configuration. The function that allow to build the dataset and their context are provided in the file ``char_dataset.py``.

# 2. The model

In [7]:
# Updating models# Delete the modules from the namespace
del char_dataset
del cpc_model

# Unload the modules from memory
import sys
del sys.modules['char_dataset']
del sys.modules['cpc_model']

# Importing models
import char_dataset
import cpc_model

## 2.0. Hyperparameters

In [8]:
# ============= Hyper-parameter class for training ============== #

class Config :
    def __init__(self, seq_ctxt = True, MAXLEN = 32) : 
        self.number_of_transformer_encoders = 4
        self.number_of_attention_heads = 4
        self.hidden_size = 256
        self.dropout_prob = 0.1
        self.batch_size = 64
        self.learning_rate = 0.0003
        self.weight_decay = 0.000001
        self.no_of_epochs = 100
        self.is_causal = True # When True, the attention is causal
        self.seq_ctxt = seq_ctxt # When False, forces the context to take the beginning of the answer into account
        self.MAXLEN = MAXLEN

## 2.1. Training
Defining functions for training and testing models.

In [9]:
def train_charlm(config) :
    start_time = datetime.now()

    # ==================== Building datasets ================ #
    train_char_set = char_dataset.CharDataset(train_dataset, config.MAXLEN, seq_ctxt=config.seq_ctxt)
    val_char_set = char_dataset.CharDataset(val_dataset, config.MAXLEN, seq_ctxt=config.seq_ctxt)

    # ======================= Training ======================= #

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print( "Running on", device )

    training_loader = DataLoader(train_char_set, batch_size=config.batch_size)
    validation_loader = DataLoader(val_char_set, batch_size=config.batch_size)

    charlm = cpc_model.CharLM( config, len(char_dataset.CharDataset.id_to_char), config.MAXLEN, config.is_causal).to(device)
    criterion = nn.CrossEntropyLoss()
    charlm_optimizer = optim.Adam( charlm.parameters(), lr=config.learning_rate )


    patience = 3
    best_val_loss = torch.inf

    charlm.train()
    print( datetime.now().strftime("%X"), "Training starts" )

    iteration = 0
    for epoch in tqdm(range(config.no_of_epochs)) :
        val_loss = 0.0
        for input_tensor, label in training_loader :
            input_tensor, label = input_tensor.to(device), label.to(device)
            charlm_optimizer.zero_grad()
            logits = charlm(input_tensor).to(device)
            loss = criterion(logits.squeeze(1), label)
            loss.backward()
            charlm_optimizer.step()
        iteration += 1

        print( datetime.now().strftime("%X"), "End of epoch", epoch+1, ", loss=", loss.detach().item())
        
        # Validation phase with Early Stopping
        charlm.eval()
        
        with torch.no_grad():
            for input_tensor, label in validation_loader:
                input_tensor, label = input_tensor.to(device), label.to(device)
                logits = charlm(input_tensor).to(device)
                loss = criterion(logits.squeeze(1), label)
                val_loss += loss.item()

        val_loss /= len(validation_loader)
        print(datetime.now().strftime("%X"), "Validation loss=", val_loss)

        # Check early stopping condition
        if val_loss <= best_val_loss:
            best_val_loss = val_loss
            patience = 3
        else:
            patience -= 1

        if patience == 0:
            print("Early stopping at epoch", epoch + 1)
            break

        charlm.train()
    
    end_time = datetime.now()

    return charlm, iteration + 1, end_time - start_time


In [10]:
def test_charlm(model, config) :
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print( "Running on", device )
    
    # Computing output on the test set
    test_char_set = char_dataset.CharDataset(test_dataset, config.MAXLEN, seq_ctxt=config.seq_ctxt)
    test_loader = DataLoader(test_char_set, batch_size=config.batch_size)

    accuracies = []
    
    model.eval()    
    with torch.no_grad():
        for input_tensor, label in test_loader:
            input_tensor, label = input_tensor.to(device), label.to(device)
            logits = model(input_tensor).to(device)
            _, pred_label = logits.topk(1, dim=-1)
            pred_label = pred_label.squeeze(-1)  # Squeeze to remove extra dimension
            accuracy = (pred_label == label).float().mean().item()  # Compute accuracy for this batch
            accuracies.append(accuracy)  # Append batch accuracy to list)

    total_acc = np.round(np.mean(np.array(accuracies))*100, 2)
    print(f"Test accuracy : {total_acc} %")
    return total_acc

## 2.2. Grid search on parameters

In [11]:
output = './output/'
model_names = ['charlm_seq_32_es', 'charlm_seq_64_es', 'charlm_nseq_32_es', 'charlm_nseq_64_es']
configs = [Config(seq_ctxt=True, MAXLEN=32), Config(seq_ctxt=True, MAXLEN=64),Config(seq_ctxt=False, MAXLEN=32), Config(seq_ctxt=False, MAXLEN=64)]
test_acc = []
models = []
train_time = []
epochs = []

for i, config in enumerate(configs) :
    # Training the model
    model, epoch, delta = train_charlm(config)
    try :
        accuracy = test_charlm(model, config)
    except :
        accuracy = 0
    models.append(model)
    test_acc.append(accuracy)
    train_time.append(delta)
    epochs.append(epoch)
    

    # Saving the model
    torch.save(model.state_dict(), f"{output}{model_names[i]}{epoch}")
    print("Model saved successfully")


Running on cuda
10:12:34 Training starts


  0%|          | 0/100 [00:00<?, ?it/s]

10:14:04 End of epoch 1 , loss= 1.4200148582458496
10:14:11 Validation loss= 1.2025063384308594
10:15:41 End of epoch 2 , loss= 1.3082795143127441
10:15:48 Validation loss= 1.1117177228698751
10:17:18 End of epoch 3 , loss= 1.2508339881896973
10:17:25 Validation loss= 1.052497255635538
10:18:55 End of epoch 4 , loss= 1.2338570356369019
10:19:02 Validation loss= 1.0516890178627103
10:20:32 End of epoch 5 , loss= 1.0291153192520142
10:20:39 Validation loss= 1.0062022358336278
10:22:09 End of epoch 6 , loss= 0.992608368396759
10:22:15 Validation loss= 1.0018501149073935
10:23:46 End of epoch 7 , loss= 1.079903483390808
10:23:53 Validation loss= 0.9960055497439602
10:25:23 End of epoch 8 , loss= 1.0827209949493408
10:25:29 Validation loss= 0.9687866622008352
10:27:00 End of epoch 9 , loss= 1.052593469619751
10:27:06 Validation loss= 0.9729253089660582
10:28:36 End of epoch 10 , loss= 0.9257869124412537
10:28:43 Validation loss= 0.9900809771652463
10:30:13 End of epoch 11 , loss= 1.07806599

../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [10,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [10,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [10,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [10,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [10,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [10,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [10,0,0], t

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Building a ``.csv`` file to store the results :

In [None]:
import pandas as pd

data = {
    'Running time' : train_time,
    'Last epoch' : epochs,
    'Type of context' : ['Sequential' if cfg.seq_ctxt else 'Split' for cfg in configs],
    'Context size' : [cfg.MAXLEN for cfg in configs],
    'Test accuracy (%)' : test_acc
}

# Create the DataFrame with row names
df = pd.DataFrame(data, index=model_names)
df.to_csv(output + 'metrics.csv')

## 2.2. Loading a model
Now that we trained several models, we can load them.

In [None]:
"""# Load a model
charlm = cpc_model.CharLM( config, len(char_dataset.CharDataset.id_to_char), config.MAXLEN, config.is_causal).to(device)
charlm.load_state_dict(torch.load('./output/charlm_seq_model_early_stopping.pth'))
charlm.eval()"""

# Getting the best model
best_model_idx = np.argmax(test_acc)
charlm = models[best_model_idx]
config = configs[best_model_idx]
print("Best model :", df.loc[model_names[best_model_idx]])

## 2.2. Evaluation on the best model

## 2.3. User interaction

In [None]:
# ==================== User interaction ==================== #
while True:
    text = input("> ").strip()
    if text == "" :
        continue
    elif text == "QUIT" :
        break

    # Will be used to output question
    full_question = list(text.lower())[::-1]

    if full_question[-1] != ' ' : 
        full_question.append(' ')
    if '?' not in full_question :
        full_question = ['?'] + full_question
        
    new_character = full_question[-1]

    # Recovering the beginning of the question
    try :
        count = 0
        MAX_COUNT = 50
        while new_character != char_dataset.CharDataset.BOQ and count < MAX_COUNT :
            # Building context
            char_list = []
            if config.seq_ctxt :
                char_list = full_question[-config.MAXLEN:]
            else :
                tmp = "".join(full_question)
                words_a, words_q = tmp.split("?")[0], tmp.split("?")[1]
                len_q = len(words_q) # counting the <BOS> token
                if len_q <= config.MAXLEN//2 or len(words_a) < config.MAXLEN//8 :
                    char_list = full_question[-config.MAXLEN:]
                else :
                    char_list = words_a[config.MAXLEN//2:] + full_question[-config.MAXLEN:]
            
            ctxt = [char_dataset.CharDataset.char_to_id[c] for c in char_list]
            
            
            # Computing the next character
            input_tensor = torch.tensor( [0]*(config.MAXLEN-len(ctxt)) + ctxt).unsqueeze(0).to(device)
            logits = charlm(input_tensor).squeeze().to(device)
            _, new_character_tensor = logits.topk(1)
            new_character = char_dataset.CharDataset.id_to_char[new_character_tensor.detach().item()]
            
            # Uploading the final output
            full_question.append(new_character)
            count += 1

        full_question = "".join(full_question[::-1])
        print(f"Recovered question : {full_question}")
    except KeyError :
        print("ERROR")
        continue