## Evaluation Notebook
> Set the name of the model to evaluate in the first code cell below.

In [1]:
# Name of the fine-tuned model to evaluate; change it to switch models.
# It is interpolated into the ./models/, ./predictions/ and ./results/ paths used below.
model_name = "t5-base-hl-ppo"

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
import gc
import datasets
import transformers
import torch
import evaluate
from tqdm import tqdm
import json
import numpy as np
# When True, get_inputs_target surrounds the answer span in the context with HIGHLIGHT_ANSWER.
HIGHLIGHT = True
# Marker strings placed around the question, context and answer segments.
# NOTE(review): each *_END_* marker is the same string as its opening marker
# (there is no closing "</...>" form) — presumably this mirrors the training
# setup; confirm before changing, since the markers are part of the model's input format.
TOKEN_QUESTION = '<question>'
TOKEN_END_QUESTION = '<question>'
TOKEN_CONTEXT = '<context>'
TOKEN_END_CONTEXT = '<context>'
TOKEN_ANSWER = '<answer>'
TOKEN_END_ANSWER = '<answer>'  
# Marker inserted on both sides of the answer span inside the context.
HIGHLIGHT_ANSWER = '<hl>'
# Seed passed to the dataset shuffle so the evaluation subset is reproducible.
SPLIT_SEED = 42
# Number of worker processes used by the dataset map step.
NPROC = 32

  from .autonotebook import tqdm as notebook_tqdm


## Load fine-tuned model

In [3]:
# Echo the configured model name so the saved notebook records which model was loaded.
print(model_name)

t5-base-hl-ppo


In [4]:

# Load the fine-tuned T5 checkpoint onto the first visible GPU.
model = transformers.T5ForConditionalGeneration.from_pretrained(
    f"./models/{model_name}/",
    device_map='cuda:0',
)
# The tokenizer is stored alongside the model weights.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    f"./models/{model_name}"
)
# Release cached, currently unused CUDA memory after loading.
torch.cuda.empty_cache()

## Create SQuAD preprocessing pipeline

In [5]:
def get_inputs_target(e):
    """Build the model input and target strings for one SQuAD example.

    Only the first listed answer is used. When HIGHLIGHT is enabled,
    ``e["context"]`` is overwritten in place with a version in which the
    answer span is surrounded by HIGHLIGHT_ANSWER markers (space separated).

    Args:
        e: a SQuAD-style example with "context", "question" and "answers"
           (holding parallel "text" and "answer_start" lists).

    Returns:
        dict with "inputs" (prompt holding the answer and the context) and
        "target" (the question wrapped in question markers).
    """
    span_start = e["answers"]["answer_start"][0]
    first_answer = e["answers"]["text"][0]
    span_end = span_start + len(first_answer)

    if HIGHLIGHT:
        raw_context = e["context"]
        # Same result as concatenating the five pieces with single spaces between them.
        e["context"] = " ".join(
            [
                raw_context[:span_start],
                HIGHLIGHT_ANSWER,
                raw_context[span_start:span_end],
                HIGHLIGHT_ANSWER,
                raw_context[span_end:],
            ]
        )

    # answer + context
    model_input = f'generate question: {TOKEN_ANSWER} {first_answer} {TOKEN_END_ANSWER} {TOKEN_CONTEXT} {e["context"]} {TOKEN_END_CONTEXT}'
    # question
    model_target = f'{TOKEN_QUESTION} {e["question"]} {TOKEN_END_QUESTION}'
    return {"inputs": model_input, "target": model_target}

def preprocess_squad_dataset(dataset_name="squad", split="train", start=1000, stop=2000):
    """Load a slice of a SQuAD-style dataset and add the "inputs"/"target" columns.

    Args:
        dataset_name: name passed to ``datasets.load_dataset``.
        split: dataset split to load.
        start: index of the first row to keep (inclusive). Defaults to 1000,
            the value that used to be hard-coded.
        stop: index one past the last row to keep (exclusive). Defaults to 2000,
            the value that used to be hard-coded.

    Returns:
        The selected rows after mapping ``get_inputs_target`` over them with
        NPROC worker processes.
    """
    dataset = datasets.load_dataset(dataset_name, split=split)
    # Restrict to the requested row window before the (parallel) preprocessing step.
    dataset = dataset.select(range(start, stop))
    dataset = dataset.map(get_inputs_target, num_proc=NPROC)
    return dataset

# create a tokenizer function
def tokenize_function(example, max_context_length=512, max_question_length=32):
    """Tokenize the "inputs" and "target" fields of an example.

    Both fields are padded to their maximum length, truncated if longer, and
    returned as PyTorch tensors.

    Args:
        example: mapping with "inputs" and "target" text.
        max_context_length: padded/truncated length for the input text.
        max_question_length: padded/truncated length for the target text.

    Returns:
        dict with "input_ids" (encoded inputs) and "labels" (encoded target).
    """
    # Options shared by both tokenizer calls.
    shared_options = dict(return_tensors="pt", padding="max_length", truncation=True)
    # Input side: answer + context prompt.
    encoded_inputs = tokenizer(example['inputs'], max_length=max_context_length, **shared_options)
    # Label side: the question.
    encoded_target = tokenizer(example['target'], max_length=max_question_length, **shared_options)
    return {
        "input_ids": encoded_inputs["input_ids"],
        "labels": encoded_target["input_ids"],
    }

In [6]:
# Build the evaluation subset: preprocess the SQuAD validation split,
# shuffle it with a fixed seed, and keep the first 100 examples.
validation = (
    preprocess_squad_dataset(dataset_name='squad', split='validation')
    .shuffle(seed=SPLIT_SEED)
    .select(range(100))
)


## Evaluate

In [7]:
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')


def generate_question(example, max_length=32):
    """Generate one question for a single preprocessed input string.

    Args:
        example: the prompt text (the "inputs" field of a dataset row).
        max_length: maximum length of the generated sequence.

    Returns:
        The decoded generation with special tokens skipped.
    """
    ids = tokenizer.encode(example, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Beam search with 4 beams, stopping early once the beams are finished.
    outputs = model.generate(input_ids=ids, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def evaluate_question_generation(dataset, max_length=32):
    """Generate a question for every row of ``dataset``.

    The results are returned as a new list instead of being appended to a
    module-level list, so calling this more than once cannot accumulate
    duplicate predictions that no longer line up with the dataset rows.

    Args:
        dataset: iterable of rows, each with an "inputs" field.
        max_length: maximum length of each generated sequence.

    Returns:
        List of generated strings, in dataset order.
    """
    return [
        generate_question(example['inputs'], max_length=max_length)
        for example in tqdm(dataset)
    ]


dataset = validation
predictions = evaluate_question_generation(dataset, max_length=32)
# Later cells index predictions and dataset rows in parallel.
assert len(predictions) == len(dataset)

100%|██████████| 100/100 [02:00<00:00,  1.21s/it]


In [10]:
# Strip the marker strings from the generated questions.
# '<question>' has to be removed before the bare 'question>' fragment:
# in the opposite order the shorter pattern matches first, the full marker
# never matches afterwards, and a stray '<' is left in the text.
predictions = [
    prediction.replace('<question>', '').replace('question>', '').replace('<hl>', '')
    for prediction in predictions
]
# Reference questions with the question markers removed (used for scoring).
targets = [target.replace('<question>', '') for target in dataset['target']]
# Per-example fields exported next to each prediction.
contexes = [d['context'] for d in dataset]
answers = [d['answers']['text'][0] for d in dataset]
references = [d['question'] for d in dataset]


In [11]:
# export to json file. create a list of json objects, each object is a dict with keys: context, answer, question, prediction
json_list = [
    {'context': contexes[i], 'answer': answers[i], 'question': references[i], 'prediction': predictions[i]}
    for i in range(len(predictions))
]
# Create the output directory first so a missing folder does not discard
# the results of the (slow) generation step.
os.makedirs('./predictions', exist_ok=True)
with open(f'./predictions/{model_name}.json', 'w') as outfile:
    json.dump(json_list, outfile)

In [None]:
# save predictions and targets, together with the context, answer and
# original question of every example, as column-wise lists
res = {
    'predictions': predictions,
    'targets': targets,
    'contexes': contexes,
    'answers': answers,
    'question': references
}

# Create the output directory first so the write cannot fail on a missing folder.
os.makedirs('./results', exist_ok=True)
with open(f'./results/{model_name}.json', 'w') as f:
    json.dump(res, f)

In [48]:
# Display the model name currently bound in the kernel.
# NOTE(review): the saved output of this cell shows a different value than the
# one assigned at the top of the notebook, so model_name was changed or cells
# were run out of order; re-run from a fresh kernel before trusting the results.
model_name

't5-small-hl'

In [49]:
# evaluate
# Reload the saved predictions and targets for the current model from disk.
with open(f'./results/{model_name}.json', 'r') as results_file:
    # Show which results file is being scored.
    print(f'./results/{model_name}.json')
    res = json.load(results_file)

predictions, targets = res['predictions'], res['targets']

./results/t5-small-hl.json


In [50]:
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

# BERTScore selects its device in compute(); a device keyword given to
# evaluate.load() is not forwarded to the scorer. Run the scoring model on the
# CPU explicitly so it does not compete with the generation model for GPU memory.
bscore = bertscore.compute(predictions=predictions, references=targets, lang='en', device='cpu')
rougescore = rouge.compute(predictions=predictions, references=targets)

# Reported numbers: mean BERTScore F1 over all examples, and ROUGE-1.
print(f"BERTScore: {np.mean(bscore['f1'])}")
print(f"ROUGE: {rougescore['rouge1']}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: 0.9083551833033562
ROUGE: 0.42308826740676375


### Evaluation results (SQuAD v1)

| Model | BERTScore (mean F1) | ROUGE-1 |
| --- | --- | --- |
| T5-small no hl | 0.89052 | 0.35782 |
| T5-small hl | 0.908355 | 0.42310 |
| T5-small hl, PPO | 0.9039 | 0.41878 |
| T5-base no hl | 0.9106 | 0.43880 |
| T5-base hl | 0.91270 | 0.454453 |
| T5-base hl, PPO | ???? | ???? |



## Evaluate on SQuAD v2

In [51]:
# Bundle the scores together with the texts they were computed from.
metrics = dict(
    bertscore=bscore,
    rouge=rougescore,
    predictions=predictions,
    targets=targets,
)

# save results in a json file into ./models/{model_name}/metrics.json
with open(f"./models/{model_name}/metrics.json", "w") as metrics_file:
    json.dump(metrics, metrics_file)

In [4]:
# NOTE: rebinds model_name, so any cell run after this one uses this value.
model_name = "t5-base-hl"

# read metrics from json file
with open(f"./models/{model_name}/metrics.json", "r") as metrics_file:
    metrics = json.load(metrics_file)

# print metrics: mean BERTScore F1 first, then ROUGE-1
mean_bertscore_f1 = np.mean(metrics['bertscore']['f1'])
print(f"BERTScore: {mean_bertscore_f1}")
rouge1_score = metrics['rouge']['rouge1']
print(f"ROUGE: {rouge1_score}")


BERTScore: 0.9127049925327301
ROUGE: 0.45445331974723385
