In [4]:
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, default_data_collator, Seq2SeqTrainingArguments, Seq2SeqTrainer
from loguru import logger
import random

In [5]:
# # // load local dataset from json
# dataset = load_dataset('json', data_files={'train': 'data/anli_train_r1_with__rationale.json'})
# train_split, val_split = dataset.train_test_split(test_size=0.2)
ds = load_dataset('json', data_files={'train_r1': 'data/modified_anli_train_r1.json'})
test = load_dataset('json', data_files={'test_r1': 'data/modified_anli_test_r1.json'})
dev = load_dataset('json', data_files={'val_r1': 'data/modified_anli_dev_r1.json'})

print(ds)
print(test)
print(dev)


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test_r1 split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating val_r1 split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train_r1: Dataset({
        features: ['prompt_id', 'rationale', 'uid', 'premise', 'prompt', 'label', 'hypothesis'],
        num_rows: 10
    })
})
DatasetDict({
    test_r1: Dataset({
        features: ['prompt_id', 'hypothesis', 'uid', 'label', 'rationale', 'prompt', 'premise'],
        num_rows: 10
    })
})
DatasetDict({
    val_r1: Dataset({
        features: ['prompt_id', 'hypothesis', 'uid', 'label', 'rationale', 'prompt', 'premise'],
        num_rows: 10
    })
})


In [6]:
model_name= "t5-base"
# model_name= "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
dataset = dataset.map(
    lambda example: {'target': 'Label: ' + example['label']  + ' Rationale: ' + example['rationale']},
    remove_columns=['label', 'rationale'],
)

dataset = dataset.map(
    lambda example: {'input': 'Premise: ' + example['premise'] + ' Hypothesis: ' + example['hypothesis']},
    remove_columns=['premise', 'hypothesis'],
)


NameError: name 'dataset' is not defined

In [None]:
max_input_length = 600
max_target_length = 400

def preprocess_data(inputs):

    model_inputs = tokenizer(inputs['input'], max_length=max_input_length, truncation=True,  padding='max_length')
    # print("Model Inputs: {}".format(model_inputs))
    labels = tokenizer([str(label) for label in inputs['target']], max_length=max_target_length, truncation=True,  padding='max_length')
    model_inputs["labels"] = labels["input_ids"]
    # print(model_inputs)
    return model_inputs

In [None]:
processed_dataset = dataset.map(
    function=preprocess_data,
    batched=True)
# print(processed_dataset)
# print(processed_dataset['input'][0])
# print(processed_dataset['target'][0])

In [None]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    learning_rate=5e-5,
    output_dir="./t5-base-train"
    # remove_unused_columns=False
)

In [None]:
def compute_exact_match(eval_prediction):
    predictions = eval_prediction.predictions
    labels = eval_prediction.label_ids
    preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
    refs = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]
    exact_matches = [1 if pred == ref else 0 for pred, ref in zip(preds, refs)]
    accuracy = np.mean(exact_matches)
    return {"exact_match_accuracy": accuracy}

In [None]:
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

def compute_metrics(eval_prediction):
    predictions, label_ids = eval_prediction
    preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
    refs = [tokenizer.decode(label, skip_special_tokens=True) for label in label_ids]

    label_accuracy = []
    rationale_scores = []

    for pred, ref in zip(preds, refs):
        # Check if the Eos token is present 
        if " Rationale: " in pred:

            pred_label, pred_rationale = pred.split(" Rationale: ")
            ref_label, ref_rationale = ref.split(" Rationale: ")

             # Calculate label accuracy
            label_accuracy.append(int(pred_label.strip() == ref_label.strip()))

            # Calculate rationale quality (using BLEU score as an example)
            rationale_scores.append(sentence_bleu([ref_rationale.strip().split()], pred_rationale.strip().split()))

        else:
            # Handle cases where Eos token is not present
            # You might want to log these cases or handle them differently
            logger.warning("Could notsplit rationale from label in prediction or reference")
            # print("Eos token not found in prediction or reference")

    return {
        "label_accuracy": np.mean(label_accuracy) if label_accuracy else 0,
        "rationale_bleu_score": np.mean(rationale_scores) if rationale_scores else 0
    }


In [None]:
processed_dataset = processed_dataset.shuffle(seed=42)
train_split, val_split = processed_dataset.train_test_split(test_size=0.1).values()

NameError: name 'processed_dataset' is not defined

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    # compute_metrics=compute_exact_match,
    train_dataset=train_split,
    eval_dataset=val_split,
    data_collator=default_data_collator,
    
)

NameError: name 'Seq2SeqTrainer' is not defined

In [None]:
trainer.train()


  0%|          | 0/900 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
test = processed_dataset
logger.debug(test)
# // print the average token length of input and target tokens
input_token_lens = []
target_token_lens = []
for i in range(len(dataset)):
    # print(dataset[i])
    # print(dataset[i]['input'])
    # print(dataset[i]['target'])
    input_token_lens.append(len(dataset[i]['input']))
    target_token_lens.append(len(dataset[i]['target']))
print("Average input token length: {}".format(np.mean(input_token_lens)))
print("Average target token length: {}".format(np.mean(target_token_lens)))
print("Max input token length: {}".format(np.max(input_token_lens)))
print("Max target token length: {}".format(np.max(target_token_lens)))


[32m2024-01-28 14:14:16.663[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1mDataset({
    features: ['prompt', 'prompt_id', 'uid', 'target', 'input', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})[0m


Average input token length: 408.403
Average target token length: 230.212
Max input token length: 685
Max target token length: 394


In [None]:
def manual_evaluation_of_scores(pred, ref):
    if " Rationale: " in pred:

            pred_label, pred_rationale = pred.split(" Rationale: ")
            ref_label, ref_rationale = ref.split(" Rationale: ")

             # Calculate label accuracy
            label_accuracy = int(pred_label.strip() == ref_label.strip())

            # Calculate rationale quality (using BLEU score as an example)
            rationale_scores = sentence_bleu([ref_rationale.strip().split()], pred_rationale.strip().split())

    else:
        # Handle cases where Eos token is not present
        # You might want to log these cases or handle them differently
        logger.warning("Could notsplit rationale from label in prediction or reference")
        # print("Eos token not found in prediction or reference")

    return {
        "label_accuracy": np.mean(label_accuracy) if label_accuracy else 0,
        "rationale_bleu_score": np.mean(rationale_scores) if rationale_scores else 0
    }

In [None]:
# // load the model from the output directory
model = T5ForConditionalGeneration.from_pretrained('./t5-train/checkpoint-4500')
tokenizer = T5Tokenizer.from_pretrained('./t5-train/checkpoint-4500')

ind = random.randint(0, len(dataset))

test_input = dataset['input'][ind]
test_label = dataset['target'][ind]

input_ids = tokenizer.encode(test_input, return_tensors='pt')

output = model.generate(input_ids,max_new_tokens=400)
output_decoded = tokenizer.decode(output[0], skip_special_tokens=True)
# print amount of output tokens
manual_eval = manual_evaluation_of_scores(output_decoded, test_label)

# Log the results
logger.debug(f"Input: {test_input}")
logger.debug(f"Output of T5: {output_decoded} \n with token amount of {len(output_decoded.split())}")
logger.debug(f"Target: {test_label}")
logger.debug(f"Test accuracy: {manual_eval}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[32m2024-01-28 14:14:18.494[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [34m[1mInput: Premise: Buddleja 'Pink Pagoda' (sold as in the USA) is a sterile hybrid cultivar raised in 1998 by Peter Moore, Chief Propagator at the Longstock Park Nursery in Hampshire, England, and named by Roy Lancaster. The cultivar is the result of a crossing of "Buddleja" 'Pink Delight' with "Buddleja × weyeriana" 'Golden Glow'. Plant Breeders' Rights were awarded in 2008. Hypothesis: Peter Moore is the Chief Propagator at the Blackmoor Nursery in Hampshire, England.[0m
[32m2024-01-28 14:14:18.495[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [34m[1mOutput of T5: Label: Contradiction Rationale: Who: Peter Moore, Roy Lancaster What: Buddleja 'Pink Pagoda' (sterile hybrid cultivar) When: raised in 1998 Where: Longstock