# Question word prediction

> Group 12: Tristan Perrot & Romain Darous

Task is to train and evaluate a **char per char Transformer model** model using any available QA-corpus, for instance, the [SQuAD corpus](https://rajpurkar.github.io/SQuAD-explorer/).


METTRE EN CONTEXTE LE DEBUT DE LA QUESTION EN CORRIGEANT LE CODE DEJA FAIT
ADAPTER POUR AUGMENTER LE NOMBRE DE TRANSFORMERS EVENTUELLEMENT ET FAIRE PLUSIEURS COUCHES

# 0. Importing modules

In [1]:
import json
import math
import os

# Importing
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm, trange
import random
from datetime import datetime
import numpy as np

# Importing models
import char_dataset
import cpc_model




# 1. Data pre-processing

## 1.1. Loading the dataset
**Note :** we only want to be able te recover the beginning of a question. For that, it doesn"t matter whether the question is impossible to answer or not.

In [2]:
data_dir = 'data'
if data_dir not in os.listdir():
    os.mkdir(data_dir)

if "squad_train.json" not in os.listdir(data_dir):
    # Download data at https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
    res = requests.get(
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
    data = json.loads(res.text)

    # Save data to file
    with open(data_dir + "/squad_train.json", "w") as f:
        json.dump(data, f)

with open(data_dir + "/squad_train.json", "r") as f:
    data = json.load(f)

# Extract answer text and question text
answers = []
questions = []
for article in data["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            
            """if qa["is_impossible"] :
                continue
            answers.append(qa["answers"][0]["text"])"""
                
            if qa["is_impossible"] and len(qa["question"]) > 4:
                continue
            else :
                answers.append("")
            questions.append(qa["question"])
            

print("Number of questions:", len(questions))

# Print some examples
for i in range(5):
    print()
    print("Question:", questions[i])
    print("Answer:", answers[i])

Number of questions: 86821

Question: When did Beyonce start becoming popular?
Answer: 

Question: What areas did Beyonce compete in when she was growing up?
Answer: 

Question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 

Question: In what city and state did Beyonce  grow up? 
Answer: 

Question: In which decade did Beyonce become famous?
Answer: 


In [3]:
print(questions[i] + ' ' + answers[i])

In which decade did Beyonce become famous? 


## 1.2. Making a suitable dataset
``<BOS>`` token. Indicates that the sentence is starting.

We will make the prediction of the sentence in reverse mode, as we want to predict the beginning of a question. We will use unidirectionnal attention as well.

In [4]:
# Concatenating questions and answers
dataset = [(questions[i].lower() + ' ' + answers[i].lower())[::-1] for i in range(len(questions))]

# Shuffle dataset
random.shuffle(dataset)

# Splitting into train, validation, and test sets
train_dataset, test_dataset = train_test_split(dataset, test_size = int(0.1*len(dataset)), train_size=int(0.9*len(dataset)))
train_dataset, val_dataset = train_test_split(train_dataset, train_size=int(0.85*len(train_dataset)), test_size = int(0.15*len(train_dataset)))

In [5]:
print(f"Size of the dataset : {len(dataset)}")
print(f"Size of the train, val and test sets : {len(train_dataset), len(val_dataset), len(test_dataset)}")
print(f"Example of original datapoint : {questions[0] + ' ' + answers[0]}")
print(f"Example of formatted datapoint : {dataset[0]}")

Size of the dataset : 86821
Size of the train, val and test sets : (66417, 11720, 8682)
Example of original datapoint : When did Beyonce start becoming popular? 
Example of formatted datapoint :  ?srewop taerg gnoma era srebmem 7g tahw


## 1.3. Building a character dataset
The dataset will be built in the ``train_charlm`` function as it depends on the desired configuration. The function that allow to build the dataset and their context are provided in the file ``char_dataset.py``.

# 2. The model

In [6]:
# Updating models# Delete the modules from the namespace
del char_dataset
del cpc_model

# Unload the modules from memory
import sys
del sys.modules['char_dataset']
del sys.modules['cpc_model']

# Importing models
import char_dataset
import cpc_model

## 2.0. Hyperparameters

In [7]:
# ============= Hyper-parameter class for training ============== #

class Config :
    def __init__(self, seq_ctxt = True, MAXLEN = 32) : 
        self.number_of_transformer_encoders = 4
        self.number_of_attention_heads = 4
        self.hidden_size = 256
        self.dropout_prob = 0.1
        self.batch_size = 64
        self.learning_rate = 0.0003
        self.weight_decay = 0.000001
        self.no_of_epochs = 100
        self.is_causal = True # When True, the attention is causal
        self.seq_ctxt = seq_ctxt # When False, forces the context to take the beginning of the answer into account
        self.MAXLEN = MAXLEN

## 2.1. Training
Defining functions for training and testing models.

In [8]:
def train_charlm(config, device) :
    start_time = datetime.now()

    # ==================== Building datasets ================ #
    train_char_set = char_dataset.CharDataset(train_dataset, config.MAXLEN, seq_ctxt=config.seq_ctxt)
    val_char_set = char_dataset.CharDataset(val_dataset, config.MAXLEN, seq_ctxt=config.seq_ctxt)

    # ======================= Training ======================= #

    

    training_loader = DataLoader(train_char_set, batch_size=config.batch_size)
    validation_loader = DataLoader(val_char_set, batch_size=config.batch_size)

    charlm = cpc_model.CharLM( config, len(char_dataset.CharDataset.id_to_char), config.MAXLEN, config.is_causal).to(device)
    criterion = nn.CrossEntropyLoss()
    charlm_optimizer = optim.Adam( charlm.parameters(), lr=config.learning_rate )


    patience = 3
    best_val_loss = float('inf')

    charlm.train()
    print( datetime.now().strftime("%X"), "Training starts" )

    iteration = 0
    for epoch in tqdm(range(config.no_of_epochs)) :
        val_loss = 0.0
        for input_tensor, label in training_loader :
            input_tensor, label = input_tensor.to(device), label.to(device)
            charlm_optimizer.zero_grad()
            logits = charlm(input_tensor).to(device)
            loss = criterion(logits.squeeze(1), label)
            loss.backward()
            charlm_optimizer.step()
        iteration += 1

        print( datetime.now().strftime("%X"), "End of epoch", epoch+1, ", loss=", loss.detach().item())
        
        # Validation phase with Early Stopping
        charlm.eval()
        
        with torch.no_grad():
            for input_tensor, label in validation_loader:
                input_tensor, label = input_tensor.to(device), label.to(device)
                logits = charlm(input_tensor).to(device)
                loss = criterion(logits.squeeze(1), label)
                val_loss += loss.item()

        val_loss /= len(validation_loader)
        print(datetime.now().strftime("%X"), "Validation loss=", val_loss)

        # Check early stopping condition
        if val_loss <= best_val_loss:
            best_val_loss = val_loss
            patience = 3
        else:
            patience -= 1

        if patience == 0:
            print("Early stopping at epoch", epoch + 1)
            break

        charlm.train()
    
    end_time = datetime.now()

    return charlm, iteration + 1, end_time - start_time


In [9]:
def test_charlm(model, config, device) :

    # Computing output on the test set
    test_char_set = char_dataset.CharDataset(test_dataset, config.MAXLEN, seq_ctxt=config.seq_ctxt)
    test_loader = DataLoader(test_char_set, batch_size=config.batch_size)

    accuracies = []
    model.to(device)
    model.eval()    
    with torch.no_grad():
        for input_tensor, label in test_loader:
            input_tensor, label = input_tensor.to(device), label.to(device)
            logits = model(input_tensor).to(device)
            _, pred_label = logits.topk(1, dim=-1)
            pred_label = pred_label.squeeze(-1)  # Squeeze to remove extra dimension
            accuracy = (pred_label == label).float().mean().item()  # Compute accuracy for this batch
            accuracies.append(accuracy)  # Append batch accuracy to list)

    total_acc = np.round(np.mean(np.array(accuracies))*100, 2)
    print(f"Test accuracy : {total_acc} %")
    return total_acc

## 2.2. Grid search on parameters

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print( "Running on", device )

Running on cuda


In [10]:
output = './output/'
model_names = ['charlm_seq_32_es', 'charlm_seq_64_es', 'charlm_nseq_32_es', 'charlm_nseq_64_es']
configs = [Config(seq_ctxt=True, MAXLEN=32), Config(seq_ctxt=True, MAXLEN=64),Config(seq_ctxt=False, MAXLEN=32), Config(seq_ctxt=False, MAXLEN=64)]
test_acc = []
models = []
train_time = []
epochs = []

for i, config in enumerate(configs) :
    # Training the model
    model, epoch, delta = train_charlm(config)
    models.append(model)
    train_time.append(delta)
    epochs.append(epoch)


Running on cuda
12:04:35 Training starts


  0%|          | 0/100 [00:00<?, ?it/s]

12:06:07 End of epoch 1 , loss= 1.2433515787124634
12:06:14 Validation loss= 1.1618087292031238 Patience= 3
12:07:45 End of epoch 2 , loss= 1.1608622074127197
12:07:51 Validation loss= 1.1126961008184835 Patience= 3
12:09:21 End of epoch 3 , loss= 1.2662925720214844
12:09:28 Validation loss= 1.0625196494397364 Patience= 3
12:10:59 End of epoch 4 , loss= 1.2371270656585693
12:11:05 Validation loss= 1.0439458171788014 Patience= 3
12:12:36 End of epoch 5 , loss= 1.2506815195083618
12:12:43 Validation loss= 1.015936790045939 Patience= 3
12:14:13 End of epoch 6 , loss= 1.1749731302261353
12:14:20 Validation loss= 0.9965516393906192 Patience= 3
12:15:51 End of epoch 7 , loss= 0.9733778238296509
12:15:57 Validation loss= 0.9873681045833387 Patience= 3
12:17:28 End of epoch 8 , loss= 0.9789430499076843
12:17:35 Validation loss= 1.0152576620641507 Patience= 3
12:19:05 End of epoch 9 , loss= 0.9719316363334656
12:19:12 Validation loss= 1.0001349645068771 Patience= 2
12:20:42 End of epoch 10 , lo

  0%|          | 0/100 [00:00<?, ?it/s]

12:23:53 End of epoch 1 , loss= 1.0322864055633545
12:24:06 Validation loss= 1.3472124413126394 Patience= 3
12:27:05 End of epoch 2 , loss= 0.9432955980300903
12:27:18 Validation loss= 1.1841833987675214 Patience= 3
12:30:18 End of epoch 3 , loss= 0.9477601647377014
12:30:31 Validation loss= 1.1363989107389199 Patience= 3
12:33:33 End of epoch 4 , loss= 0.8078376650810242
12:33:46 Validation loss= 1.1083167186693141 Patience= 3
12:36:49 End of epoch 5 , loss= 0.826664388179779
12:37:02 Validation loss= 1.0922841598015083 Patience= 3
12:40:05 End of epoch 6 , loss= 0.8080551028251648
12:40:18 Validation loss= 1.0929652162445218 Patience= 3
12:43:21 End of epoch 7 , loss= 0.6627166271209717
12:43:34 Validation loss= 1.0791062601459653 Patience= 2
12:46:37 End of epoch 8 , loss= 0.7871485352516174
12:46:51 Validation loss= 1.0920877129310056 Patience= 3
12:49:53 End of epoch 9 , loss= 0.7136619687080383
12:50:06 Validation loss= 1.0786210323007484 Patience= 2
12:53:09 End of epoch 10 , lo

  0%|          | 0/100 [00:00<?, ?it/s]

13:01:32 End of epoch 1 , loss= 1.2065460681915283
13:01:39 Validation loss= 1.1724534233306583 Patience= 3
13:03:13 End of epoch 2 , loss= 1.1050829887390137
13:03:20 Validation loss= 1.1017080041766167 Patience= 3
13:04:54 End of epoch 3 , loss= 1.0310667753219604
13:05:01 Validation loss= 1.0623697236650869 Patience= 3
13:06:35 End of epoch 4 , loss= 0.9561647772789001
13:06:42 Validation loss= 1.0110177026610625 Patience= 3
13:08:14 End of epoch 5 , loss= 0.8977027535438538
13:08:21 Validation loss= 1.013851368803727 Patience= 3
13:09:53 End of epoch 6 , loss= 0.9272994995117188
13:10:00 Validation loss= 0.980973627630033 Patience= 2
13:11:34 End of epoch 7 , loss= 0.8726040124893188
13:11:41 Validation loss= 0.9777043786017519 Patience= 3
13:13:12 End of epoch 8 , loss= 0.7050909399986267
13:13:19 Validation loss= 0.9898916538294993 Patience= 3
13:14:53 End of epoch 9 , loss= 0.7543450593948364
13:14:59 Validation loss= 0.9864856296777725 Patience= 2
13:16:32 End of epoch 10 , los

  0%|          | 0/100 [00:00<?, ?it/s]

13:19:45 End of epoch 1 , loss= 1.3499958515167236
13:19:57 Validation loss= 1.4288318817082204 Patience= 3
13:22:57 End of epoch 2 , loss= 1.2666515111923218
13:23:10 Validation loss= 1.2310902211226915 Patience= 3
13:26:10 End of epoch 3 , loss= 0.9956637024879456
13:26:23 Validation loss= 1.1848894146398494 Patience= 3
13:29:23 End of epoch 4 , loss= 1.1225696802139282
13:29:36 Validation loss= 1.11692898019364 Patience= 3
13:32:35 End of epoch 5 , loss= 1.1762259006500244
13:32:48 Validation loss= 1.1328753693166531 Patience= 3
13:35:48 End of epoch 6 , loss= 1.3823902606964111
13:36:01 Validation loss= 1.1064005705243662 Patience= 2
13:39:00 End of epoch 7 , loss= 1.161044955253601
13:39:13 Validation loss= 1.091712067786016 Patience= 3
13:42:13 End of epoch 8 , loss= 1.0066156387329102
13:42:26 Validation loss= 1.0901869733239475 Patience= 3
13:45:27 End of epoch 9 , loss= 1.118972659111023
13:45:40 Validation loss= 1.1030500642720023 Patience= 3
13:48:41 End of epoch 10 , loss= 

In [11]:
for i, config in enumerate(configs) :
  accuracy = test_charlm(models[i], config)
  test_acc.append(accuracy)
  # Saving the model
  torch.save(models[i].state_dict(), f"{output}{model_names[i]}{epoch}.pth")
  print("Model saved successfully")

Running on cuda


../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [113,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [113,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [113,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [113,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [113,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [113,0,0], thread: [5,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [113,0,0], 

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Building a ``.csv`` file to store the results :

In [None]:
import pandas as pd

data = {
    'Running time' : train_time,
    'Last epoch' : epochs,
    'Type of context' : ['Sequential' if cfg.seq_ctxt else 'Split' for cfg in configs],
    'Context size' : [cfg.MAXLEN for cfg in configs],
    'Test accuracy (%)' : test_acc
}

# Create the DataFrame with row names
df = pd.DataFrame(data, index=model_names)
df.to_csv(output + 'metrics.csv')

## 2.2. Loading a model
Now that we trained several models, we can load them.

In [13]:
"""# Load a model
charlm = cpc_model.CharLM( config, len(char_dataset.CharDataset.id_to_char), config.MAXLEN, config.is_causal).to(device)
charlm.load_state_dict(torch.load('./output/charlm_seq_model_early_stopping.pth'))
charlm.eval()"""

# Getting the best model
best_model_idx = 3
charlm = models[best_model_idx]
config = configs[best_model_idx]
#print("Best model :", df.loc[model_names[best_model_idx]])

## 2.2. Evaluation on the best model

## 2.3. User interaction

In [16]:
# ==================== User interaction ==================== #
while True:
    text = input("> ").strip()
    if text == "" :
        continue
    elif text == "QUIT" :
        break

    # Will be used to output question
    full_question = list(text.lower())[::-1]

    if full_question[-1] != ' ' : 
        full_question.append(' ')
    if '?' not in full_question :
        full_question = ['?'] + full_question
        
    new_character = full_question[-1]

    # Recovering the beginning of the question
    try :
        count = 0
        MAX_COUNT = 50
        while new_character != char_dataset.CharDataset.BOQ and count < MAX_COUNT :
            # Building context
            char_list = []
            if config.seq_ctxt :
                char_list = full_question[-config.MAXLEN:]
            else :
                tmp = "".join(full_question)
                words_a, words_q = tmp.split("?")[0], tmp.split("?")[1]
                len_q = len(words_q) # counting the <BOS> token
                if len_q <= config.MAXLEN//2 or len(words_a) < config.MAXLEN//8 :
                    char_list = full_question[-config.MAXLEN:]
                else :
                    char_list = words_a[config.MAXLEN//2:] + full_question[-config.MAXLEN:]
            
            ctxt = [char_dataset.CharDataset.char_to_id[c] for c in char_list]
            
            
            # Computing the next character
            input_tensor = torch.tensor( [0]*(config.MAXLEN-len(ctxt)) + ctxt).unsqueeze(0).to(device)
            logits = charlm(input_tensor).squeeze().to(device)
            _, new_character_tensor = logits.topk(1)
            new_character = char_dataset.CharDataset.id_to_char[new_character_tensor.detach().item()]
            
            # Uploading the final output
            full_question.append(new_character)
            count += 1

        full_question = "".join(full_question[::-1])
        print(f"Recovered question : {full_question}")
    except KeyError :
        print("ERROR")
        continue

>  are you ? Fine


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
