In [None]:
# from evaluate import load
# gleu_metric = load('glue', 'mrpc')


In [1]:
from nltk.translate.gleu_score import corpus_gleu

In [2]:
from datasets import load_dataset
eval_dataset = load_dataset("csv", data_files=["data/eval_data.csv"])

In [11]:
eval_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 1201
    })
})

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
checkpoint = "model/gec-t5-base-40ep"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [9]:
def preprocess_func(examples):
    inputs = [source for source in examples['input']]
    targets = [target for target in examples['target']]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=1024, truncation=True)
    return model_inputs


In [10]:
tokenized_data = eval_dataset.map(preprocess_func, batched=True)

Map:   0%|          | 0/1201 [00:00<?, ? examples/s]

In [12]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1201
    })
})

In [None]:
tokenized_data['train']['input_ids']

In [None]:
tokenized_data

In [None]:
len(eval_dataset['train'])

In [None]:
eval_dataset['train'][0]

In [None]:
from transformers import pipeline
corrector = pipeline("text2text-generation", model=checkpoint)

In [15]:
from tqdm import tqdm
references = []
predictions = []
for i , data in tqdm(enumerate(eval_dataset['train']), total=len(eval_dataset['train'])):
    input_s = eval_dataset['train'][i]['input']
    # print("Input: ", input_s)
    input_ids = tokenizer(input_s, return_tensors="pt").input_ids
    gen = model.generate(input_ids, max_new_tokens=1024)
    pred = tokenizer.decode(gen[0], skip_special_tokens=True)
    target = eval_dataset['train'][i]['target']
    # print("Target: ", target)
    # correction = corrector(input_s)
    # pred = correction[0]['generated_text']
    references.append(target)
    predictions.append(pred)

gleu_score = corpus_gleu([[ref] for ref in references], [pred for pred in predictions], max_len=1024)
print(f"GLEU score: ", gleu_score)   


100%|██████████| 1201/1201 [16:34<00:00,  1.21it/s]


GLEU score:  0.45690885316021235


In [None]:

predictions =[]
references = []
model.eval()
input_ids = tokenized_data['train']['input_ids']
target_ids = tokenized_data['train']['labels']
outputs = model.generate(input_ids=input_ids, max_length=1024)

predictions.extend(tokenizer.batch_decode(output, skip_special_tokens=True) for output in outputs)
references.extend(tokenizer.decode(target_ids[i], skip_special_tokens=True) for i in range(len(target_ids)))
        
gleu_score = corpus_gleu([[ref] for ref in references], [pred for pred in predictions])

print(f"GLEU Score: {gleu_score}")

In [None]:
for epoch in range(8):
    test_loss = []
    predictions =[]
    references = []
    model.eval()
    for batch in tokenized_data:
        input_ids = batch['train']['input_ids']
        target_ids = batch['train']['labels']
        outputs = model.generate(input_ids=input_ids, max_length=1024)
        
        predictions.extend(tokenizer.decode(output, skip_special_tokens=True) for output in outputs)
        references.extend(tokenizer.decode(target_ids[i], skip_special_tokens=True) for i in range(len(target_ids)))
        
    gleu_score = corpus_gleu([[ref] for ref in references], [pred for pred in predictions])
    print(f"Epoch: {epoch+1} GLUE: {gleu_score}")
        

In [None]:
# Then create a function that passes your predictions and labels to compute to calculate the SacreBLEU score:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result