## Evaluation Notebook
> Set the name of the model to evaluate in the cell below.

In [5]:
# Name of the fine-tuned checkpoint to evaluate. Later cells load it from
# ./models/{model_name}/ and write its results to ./results/{model_name}.json.
# Change this value to evaluate a different model.
model_name = "t5-small-hl"

In [2]:
"""Evaluate a fine-tuned T5 model on answer-aware question generation (a sequence-to-sequence task)."""

import os
# Set the visible GPUs before torch is imported further down.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import gc
import datasets
import transformers
import torch
import evaluate
from tqdm import tqdm
import json
import numpy as np
# When True, get_inputs_target surrounds the answer span inside the context with HIGHLIGHT_ANSWER.
HIGHLIGHT = True
# Marker strings wrapped around each segment of the model input / target text.
# NOTE(review): every TOKEN_END_* value is identical to its opening marker. Presumably this mirrors
# the format the checkpoint was fine-tuned on; confirm before changing any of them.
TOKEN_QUESTION = '<question>'
TOKEN_END_QUESTION = '<question>'
TOKEN_CONTEXT = '<context>'
TOKEN_END_CONTEXT = '<context>'
TOKEN_ANSWER = '<answer>'
TOKEN_END_ANSWER = '<answer>'  
HIGHLIGHT_ANSWER = '<hl>'
# NOTE(review): SPLIT_SEED is not referenced by any cell visible in this notebook.
SPLIT_SEED = 42
# Number of worker processes passed to Dataset.map during preprocessing.
NPROC = 32

  from .autonotebook import tqdm as notebook_tqdm


## Load fine-tuned model

In [4]:
# Show which checkpoint name is about to be loaded.
print(model_name)

t5-base-hl


In [6]:
# Load the tokenizer and the fine-tuned weights from the checkpoint folder.
tokenizer = transformers.AutoTokenizer.from_pretrained(f"./models/{model_name}")
model = transformers.T5ForConditionalGeneration.from_pretrained(
    f"./models/{model_name}/",
    device_map='auto',
)
torch.cuda.empty_cache()

## Create SQuAD preprocessing pipeline

In [41]:
def get_inputs_target(e):
    """Build the model input and target strings for one SQuAD example.

    Only the first answer of the example is used. When ``HIGHLIGHT`` is set,
    ``e["context"]`` is overwritten with a copy in which that answer span is
    surrounded by ``HIGHLIGHT_ANSWER`` markers, each padded with single spaces.

    Args:
        e: Mapping with ``"context"`` and ``"question"`` strings and an
            ``"answers"`` entry holding ``"text"`` and ``"answer_start"`` lists.

    Returns:
        Dict with ``"inputs"`` (task prefix, answer, context) and ``"target"``
        (the question), each segment wrapped in its marker strings.
    """
    span_begin = e["answers"]["answer_start"][0]
    answer_text = e["answers"]["text"][0]
    span_end = span_begin + len(answer_text)

    if HIGHLIGHT:
        plain_context = e["context"]
        # Joining with single spaces gives "<before> <hl> <answer> <hl> <after>".
        e["context"] = " ".join(
            [
                plain_context[:span_begin],
                HIGHLIGHT_ANSWER,
                plain_context[span_begin:span_end],
                HIGHLIGHT_ANSWER,
                plain_context[span_end:],
            ]
        )

    # answer + context
    model_input = (
        f'generate question: {TOKEN_ANSWER} {answer_text} {TOKEN_END_ANSWER} '
        f'{TOKEN_CONTEXT} {e["context"]} {TOKEN_END_CONTEXT}'
    )
    # question
    question_target = f'{TOKEN_QUESTION} {e["question"]} {TOKEN_END_QUESTION}'
    return {"inputs": model_input, "target": question_target}

def preprocess_squad_dataset(dataset_name='squad', split='train'):
    """Load one dataset split and convert it to inputs/target text pairs.

    Args:
        dataset_name: Name passed to ``datasets.load_dataset``.
        split: Split to load.

    Returns:
        The loaded dataset after mapping ``get_inputs_target`` over it with
        ``NPROC`` processes and dropping the ``answers``, ``context`` and
        ``question`` columns.
    """
    raw_columns = ['answers', 'context', 'question']
    return (
        datasets.load_dataset(dataset_name, split=split)
        .map(get_inputs_target, num_proc=NPROC)
        .remove_columns(raw_columns)
    )

def tokenize_function(example, max_context_length=512, max_question_length=32):
    """Tokenize the ``inputs`` and ``target`` text columns.

    Intended for ``Dataset.map(..., batched=True)``, where each column of
    ``example`` is a list with one entry per row of the batch.

    Args:
        example: Mapping with ``"inputs"`` and ``"target"`` text.
        max_context_length: Length every input sequence is padded/truncated to.
        max_question_length: Length every target sequence is padded/truncated to.

    Returns:
        Dict with ``"input_ids"`` and ``"labels"`` token ids, one row per
        entry of the batch.
    """
    # Pass the whole column, not just its first entry, so that input_ids has
    # exactly one row for every row of labels.
    inputs = tokenizer(
        example['inputs'],
        max_length=max_context_length,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
    )
    labels = tokenizer(
        example['target'],
        max_length=max_question_length,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
    )
    return {"input_ids": inputs["input_ids"], "labels": labels["input_ids"]}

In [None]:
# load dataset
dataset = preprocess_squad_dataset(dataset_name='squad', split='train')
valid_dataset = preprocess_squad_dataset(dataset_name='squad', split='validation')
train, validation = dataset, valid_dataset

# Columns dropped after tokenization so that only input_ids / labels remain.
columns_to_drop = ['inputs', 'target', 'title', 'id']

# tokenize dataset
tokenized_dataset_train = train.map(
    tokenize_function,
    batched=True,
    num_proc=NPROC,  # same worker count as the preprocessing step
    remove_columns=columns_to_drop,
)

# Only the first 1000 validation rows are tokenized here.
tokenized_dataset_validation = validation.select(range(1000)).map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=columns_to_drop,
)

In [22]:
# Peek at the first preprocessed training input to check the prompt format.
train.select(range(1))['inputs']

['generate question: <answer> Saint Bernadette Soubirous <answer> <context> Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to  <hl> Saint Bernadette Soubirous <hl>  in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary. <context>']

## Evaluate

In [42]:
# Container for the generated questions, plus the two metric objects.
# NOTE(review): the next cell begins by re-creating these same three objects, so this cell is redundant.
predictions = []
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

In [45]:
# Start from an empty list: evaluate_question_generation appends to `predictions`,
# so resetting it here keeps a re-run of this cell from piling onto earlier output.
predictions = []
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

def generate_question(example, max_length=32):
    """Generate one question for a single preprocessed input string.

    Args:
        example: Input text handed to the tokenizer.
        max_length: ``max_length`` forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded to text, with special tokens skipped.
    """
    encoded = tokenizer.encode(
        example,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # The ids must sit on the same device as the model before generation.
    on_device = encoded.to(model.device)
    generated = model.generate(
        input_ids=on_device,
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def evaluate_question_generation(dataset, max_length=32):
    """Generate a question for every row of ``dataset``.

    Each generated question is appended, in iteration order, to the
    module-level ``predictions`` list; the function itself returns nothing.

    Args:
        dataset: Iterable of rows, each exposing an ``"inputs"`` entry.
        max_length: Forwarded to ``generate_question``.
    """
    for row in tqdm(dataset):
        question = generate_question(row['inputs'], max_length=max_length)
        predictions.append(question)


# Generate questions for validation rows 1000-1999.
# NOTE(review): this rebinds `dataset`, which an earlier cell used for the training split;
# the later cell that builds `targets` reads `dataset['target']` from this validation slice.
dataset = preprocess_squad_dataset(dataset_name='squad', split='validation').select(range(1000, 2000))    
evaluate_question_generation(dataset, max_length=32)

  2%|▎         | 25/1000 [00:21<14:04,  1.15it/s]


KeyboardInterrupt: 

In [10]:
# Strip the marker text from the generated questions and from the reference targets.
# The full '<question>' marker has to be removed before the bare 'question>' fragment:
# in the opposite order the fragment is removed first, a stray '<' is left behind and
# the full-marker replace can never match.
predictions = [
    prediction.replace('<question>', '').replace('question>', '').replace('<hl>', '')
    for prediction in predictions
]
targets = [target.replace('<question>', '') for target in dataset['target']]

In [None]:
# save predictions and targets
res = {
    'predictions': predictions,
    'targets': targets
}

# Create the output folder first: open(..., 'w') does not create missing parent directories.
os.makedirs('./results', exist_ok=True)
with open(f'./results/{model_name}.json', 'w') as f:
    json.dump(res, f)

In [48]:
# Display the model name that the result paths in the following cells are built from.
model_name

't5-small-hl'

In [49]:
# evaluate
# Reload the saved predictions and targets so scoring can run without regenerating them.
results_path = f'./results/{model_name}.json'
with open(results_path, 'r') as f:
    print(results_path)
    res = json.load(f)

predictions = res['predictions']
targets = res['targets']

./results/t5-small-hl.json


In [50]:
# Re-create the metric objects for the scoring step.
rouge = evaluate.load('rouge')
# NOTE(review): `device_map` does not appear to be a documented `evaluate.load` argument; BERTScore
# normally takes its device through `compute(..., device=...)`. Confirm this really keeps the metric on CPU.
bertscore = evaluate.load('bertscore', device_map='cpu')

# Score the generated questions against the reference questions.
bscore = bertscore.compute(predictions=predictions, references=targets, lang='en')
rougescore = rouge.compute(predictions=predictions, references=targets)

# Report the mean of the per-example BERTScore F1 values and the ROUGE-1 score.
print(f"BERTScore: {np.mean(bscore['f1'])}")
print(f"ROUGE: {rougescore['rouge1']}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: 0.9083551833033562
ROUGE: 0.42308826740676375


### Eval results
T5-SMALL SQUAD: 0.89937

| Model | BERTScore (mean F1) | ROUGE-1 |
| --- | --- | --- |
| T5-small no hl | 0.8905211106538773 | 0.35782651017737593 |
| T5-small hl | 0.9083551822900772 | 0.42310833048410884 |
| T5-base no hl | 0.9106274239420891 | 0.43880052930298297 |
| T5-base hl | 0.9065315473079681 | 0.4382283047193676 |



In [51]:
# Bundle the raw metric outputs together with the texts they were computed on.
metrics = dict(
    bertscore=bscore,
    rouge=rougescore,
    predictions=predictions,
    targets=targets,
)

# save results in a json file into ./models/{model_name}/metrics.json
metrics_path = f"./models/{model_name}/metrics.json"
with open(metrics_path, "w") as f:
    json.dump(metrics, f)

In [52]:
# NOTE(review): this overrides the model_name chosen at the top of the notebook.
model_name = "t5-small-hl"

# read metrics from json file
with open(f"./models/{model_name}/metrics.json", "r") as f:
    metrics = json.load(f)

# print metrics
mean_f1 = np.mean(metrics['bertscore']['f1'])
print(f"BERTScore: {mean_f1}")
rouge1 = metrics['rouge']['rouge1']
print(f"ROUGE: {rouge1}")


BERTScore: 0.9083551833033562
ROUGE: 0.42308826740676375
