## Evaluation Notebook
> Set the name of the fine-tuned model to evaluate in the cell below.

In [2]:
# Name of the fine-tuned model to evaluate. Change it to switch models: it selects
# the checkpoint directory ./models/{model_name}/ that is loaded below and the
# ./results/{model_name}/ directory that predictions and metrics are written to.
model_name = "t5-base"

In [16]:
"""Train T5 model on a given dataset, for answer aware question generation task. (Basically a sequence to sequence task)"""

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"
import gc
import datasets
import transformers
import torch
import evaluate
from tqdm import tqdm

# Marker strings wrapped around each field of the prompt / target text.
# NOTE(review): every "end" marker is the same string as its "start" marker
# (there is no '</...>' form). Presumably this matches the format the model was
# fine-tuned on -- confirm before changing any of them.
TOKEN_QUESTION = TOKEN_END_QUESTION = '<question>'
TOKEN_CONTEXT = TOKEN_END_CONTEXT = '<context>'
TOKEN_ANSWER = TOKEN_END_ANSWER = '<answer>'

# Seed constant; it is not referenced by any of the cells shown in this notebook.
SPLIT_SEED = 42
# Worker-process count handed to `Dataset.map` inside preprocess_squad_dataset.
NPROC = 32

## Load fine-tuned model

In [4]:
# Restore the fine-tuned checkpoint and its tokenizer from ./models/{model_name}.
model = transformers.T5ForConditionalGeneration.from_pretrained(
    f"./models/{model_name}/",
    device_map='auto',  # let the loader choose the device placement
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    f"./models/{model_name}"
)
# Hand cached-but-unused CUDA memory back once loading has finished.
torch.cuda.empty_cache()

## Create SQuAD preprocessing pipeline

In [5]:
def preprocess_squad_dataset(dataset_name='squad', split='train'):
    """Load a dataset split and add the prompt / target text columns.

    Two string columns are added to every example:

    * ``'inputs'`` -- ``generate question:`` followed by the first answer text
      and the context, each wrapped in its marker tokens.
    * ``'target'`` -- the reference question wrapped in the question markers.

    The ``'answers'``, ``'context'`` and ``'question'`` columns are then
    removed; all other columns are left in place.

    Args:
        dataset_name: Name passed to ``datasets.load_dataset``.
        split: Split name passed to ``datasets.load_dataset``.

    Returns:
        The mapped dataset with the three source columns removed.
    """
    def _to_text_pair(e):
        # Only the first listed answer is used to build the prompt.
        first_answer = e["answers"]["text"][0]
        prompt = f'generate question: {TOKEN_ANSWER} {first_answer} {TOKEN_END_ANSWER} {TOKEN_CONTEXT} {e["context"]} {TOKEN_END_CONTEXT}'
        reference = f'{TOKEN_QUESTION} {e["question"]} {TOKEN_END_QUESTION}'
        return {'inputs': prompt, 'target': reference}

    raw_split = datasets.load_dataset(dataset_name, split=split)
    with_text = raw_split.map(_to_text_pair, num_proc=NPROC)
    # Drop the source fields now that they are folded into 'inputs' / 'target'.
    return with_text.remove_columns(['answers', 'context', 'question'])

# create a tokenizer function
def tokenize_function(example, max_context_length=512, max_question_length=32):
    """Tokenize the 'inputs' and 'target' text of an example (or batch).

    Both fields are padded to, and truncated at, their maximum length and are
    requested as PyTorch tensors from the module-level ``tokenizer``.

    Args:
        example: Mapping with 'inputs' and 'target' entries.
        max_context_length: Maximum token length for 'inputs'.
        max_question_length: Maximum token length for 'target'.

    Returns:
        Dict with 'input_ids' (from 'inputs') and 'labels' (the input ids of
        'target'). Nothing else from the tokenizer output is returned.
    """
    shared_kwargs = dict(return_tensors="pt", padding="max_length", truncation=True)
    encoded_inputs = tokenizer(example['inputs'], max_length=max_context_length, **shared_kwargs)
    encoded_target = tokenizer(example['target'], max_length=max_question_length, **shared_kwargs)
    return {
        "input_ids": encoded_inputs["input_ids"],
        "labels": encoded_target["input_ids"],
    }

In [6]:
# Build the text-formatted SQuAD splits. Each split stays reachable under both
# of its names (dataset / train and valid_dataset / validation).
train = dataset = preprocess_squad_dataset(dataset_name='squad', split='train')
validation = valid_dataset = preprocess_squad_dataset(dataset_name='squad', split='validation')

# Tokenize the whole training split, dropping every non-tensor column.
tokenized_dataset_train = train.map(
    tokenize_function,
    batched=True,
    num_proc=32,
    remove_columns=['inputs', 'target', 'title', 'id'],
)

# Tokenize only the first 1000 validation examples.
tokenized_dataset_validation = (
    validation
    .select(range(1000))
    .map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=['inputs', 'target', 'title', 'id'],
    )
)

## Evaluate

In [17]:
# Generated questions are collected in this module-level list;
# evaluate_question_generation() appends one entry per example.
predictions = []
# Metric objects loaded through the `evaluate` library; used by the scoring cells below.
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

def generate_question(example, max_length=32):
    """Generate one question for a single prompt string using beam search.

    Args:
        example: Prompt text (the 'inputs' field of a preprocessed example).
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to text, with special tokens
        skipped.
    """
    ids = tokenizer.encode(example, return_tensors="pt", padding=True, truncation=True)
    # The tokenizer returns CPU tensors while the model was loaded with
    # device_map='auto'; move the ids to the model's device so generate() does
    # not receive inputs on a different device than the model.
    ids = ids.to(model.device)
    outputs = model.generate(input_ids=ids, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def evaluate_question_generation(dataset, max_length=32):
    """Generate a question for every example and record it in ``predictions``.

    Iterates ``dataset`` behind a tqdm progress bar, calls
    ``generate_question`` on each example's 'inputs' text and appends the
    result to the module-level ``predictions`` list.

    Args:
        dataset: Iterable of examples that each have an 'inputs' entry.
        max_length: Forwarded to ``generate_question``.

    Returns:
        None; results are appended to ``predictions``.
    """
    predictions.extend(
        generate_question(row['inputs'], max_length=max_length)
        for row in tqdm(dataset)
    )


# Evaluate on validation examples 1000-1999.
# NOTE: this rebinds `dataset`, which the data-loading cell bound to the train
# split; the scoring cells below read `dataset['target']` from this subset.
dataset = preprocess_squad_dataset(dataset_name='squad', split='validation').select(range(1000, 2000))    
# Fills the module-level `predictions` list, one generated question per example.
evaluate_question_generation(dataset, max_length=32)



In [None]:
# save predictions (one per line; these are the raw generations, written before
# the marker clean-up in the next cell)
predictions_dir = f"./results/{model_name}"
# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(predictions_dir, exist_ok=True)
# Explicit UTF-8 so non-ASCII text is written the same way on every platform.
with open(f"{predictions_dir}/predictions.txt", "w", encoding="utf-8") as f:
    f.writelines(pred + "\n" for pred in predictions)

In [94]:
# Strip the question marker text from generated and reference questions before scoring.
# The complete '<question>' tag is removed first; 'question>' then covers decoded
# outputs in which the leading '<' is missing. (Doing it the other way round leaves
# a stray '<' behind and makes the full-tag replace unreachable.)
predictions = [prediction.replace('<question>', '').replace('question>', '') for prediction in predictions]
targets = [target.replace('<question>', '') for target in dataset['target']]

In [95]:
# Spot-check: the cleaned generated question at index 6.
predictions[6]

' When was Super Bowl 50 played? '

In [96]:
# Spot-check: the cleaned reference question at the same index.
targets[6]

' What day was the game played on? '

In [97]:
# BERTScore of the generated questions against the references (English model).
score = bertscore.compute(predictions=predictions, references=targets, lang='en')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [101]:
# Show which entries the BERTScore result contains.
score.keys()

dict_keys(['precision', 'recall', 'f1', 'hashcode'])

### Eval results
- T5-SMALL on SQuAD: mean BERTScore F1 = 0.89937

In [104]:
import numpy as np
# Average the BERTScore F1 values stored in `score` into one headline number.
np.mean(score['f1'])

0.8993794137835502

In [99]:
# ROUGE scores of the cleaned predictions against the cleaned references.
rouge.compute(predictions=predictions, references=targets)

{'rouge1': 0.3940472830525471,
 'rouge2': 0.2026262090775079,
 'rougeL': 0.3640860483908057,
 'rougeLsum': 0.36411097278438986}

In [None]:
# Save ROUGE and BERTScore results to ./results/{model_name}/metrics.json.
# Both results go into ONE JSON object: two back-to-back json.dump() calls on
# the same file would write "{...}{...}", which json.load() cannot parse.
import json

# Compute everything before opening the file, so a failing metric cannot leave
# a truncated metrics.json behind.
metrics = {
    "rouge": rouge.compute(predictions=predictions, references=targets),
    "bertscore": bertscore.compute(predictions=predictions, references=targets, lang='en'),
}
# open(..., "w") does not create missing parent directories.
os.makedirs(f"./results/{model_name}", exist_ok=True)
with open(f"./results/{model_name}/metrics.json", "w") as f:
    json.dump(metrics, f)
