In [1]:
import os
import re
import string
import torch
import datasets
from datasets import load_from_disk, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from evaluate import load
import numpy as np
from nltk.stem import PorterStemmer
from scipy.spatial.distance import cosine
import nltk
import matplotlib.pyplot as plt
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
t5_model_name = 't5-base'

In [3]:
# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
meteor = load('meteor')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\YoonesVaezi(DataMLOp\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\YoonesVaezi(DataMLOp\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\YoonesVaezi(DataMLOp\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\YoonesVaezi(DataMLOp\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\YoonesVaezi(DataMLOp\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Us

In [4]:
# Initialize the stemmer for stemming words during text normalization
stemmer = PorterStemmer()
# Define a consistent max_length for tokenization
max_length = 128

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# 1. Define the custom Seq2SeqDataset class (if needed)
class Seq2SeqDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=max_length, max_target_length=max_length):
        self.data = data  # The dataset containing the text examples
        self.tokenizer = tokenizer  # The tokenizer for converting text to tokens
        self.max_input_length = max_input_length  # Max length for input sequences
        self.max_target_length = max_target_length  # Max length for target sequences

    def __len__(self):
        return len(self.data)  # Returns the total number of examples in the dataset

    def __getitem__(self, index):
        # Fetch the original and rephrased questions for the given index
        input_text = self.data['original_question'][index]
        target_text = self.data['rephrased_question'][index]

        # Tokenize the input and target texts
        input_ids = self.tokenizer(input_text, padding='max_length', max_length=self.max_input_length, truncation=True, return_tensors='pt').input_ids
        target_ids = self.tokenizer(target_text, padding='max_length', max_length=self.max_target_length, truncation=True, return_tensors='pt').input_ids

        # Return a dictionary with input and target token IDs
        return {'input_ids': input_ids.squeeze(), 'labels': target_ids.squeeze()}

In [6]:
# # 2. Load and split the dataset
# def load_and_split_dataset(dataset_path, split_ratios=(0.8, 0.1, 0.1)):
#     # Load the full dataset from the specified path
#     dataset = load_from_disk(dataset_path)

#     # Split dataset into training and test sets first
#     split_dataset = dataset.train_test_split(test_size=split_ratios[2])

#     # Further split the training set into train and validation sets
#     temp_dataset = split_dataset['train'].train_test_split(test_size=split_ratios[1] / (split_ratios[0] + split_ratios[1]))

#     # Combine the splits into a final DatasetDict
#     final_splits = DatasetDict({
#         'train': temp_dataset['train'],
#         'validation': temp_dataset['test'],
#         'test': split_dataset['test']
#     })

#     # Return the split datasets
#     return final_splits

In [7]:
def load_and_split_dataset(dataset_path, split_ratios=(0.8, 0.1, 0.1)):
    # Load the dataset from the specified path
    dataset = load_from_disk(dataset_path)
    
    # Check if the dataset already has 'train', 'validation', and 'test' keys
    if all(key in dataset.keys() for key in ['train', 'validation', 'test']):
        print("Dataset already contains 'train', 'validation', and 'test' splits.")
        return dataset  # Return the existing splits
    
    # If not, perform the splits manually
    print("Splitting the dataset into 'train', 'validation', and 'test' sets.")
    
    # Split dataset into training and test sets first
    split_dataset = dataset.train_test_split(test_size=split_ratios[2])

    # Further split the training set into train and validation sets
    temp_dataset = split_dataset['train'].train_test_split(test_size=split_ratios[1] / (split_ratios[0] + split_ratios[1]))

    # Combine the splits into a final DatasetDict
    final_splits = DatasetDict({
        'train': temp_dataset['train'],
        'validation': temp_dataset['test'],
        'test': split_dataset['test']
    })

    # Return the split datasets
    return final_splits

In [8]:
# Ensure consistent tokenization settings across the script
def tokenize_input(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

In [9]:
# 3. Preprocess the data using tokenizer
def preprocess_data(tokenizer, dataset, max_input_length=max_length, max_target_length=max_length):
    # This function tokenizes the input and target sequences in the dataset
    def tokenize_function(examples):
        # Tokenize the original question
        model_inputs = tokenizer(
            examples["original_question"], 
            max_length=max_input_length, 
            truncation=True, 
            padding="max_length"
        )
        # Tokenize the rephrased question and store it as labels
        labels = tokenizer(
            examples["rephrased_question"], 
            max_length=max_target_length, 
            truncation=True, 
            padding="max_length"
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Apply the tokenization function to the dataset
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset

In [10]:
# Text normalization function
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove extra whitespaces (e.g., multiple spaces) and trim leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Apply stemming to reduce words to their root form
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    # Return the normalized text
    return text

In [11]:
# Function to compute normalized edit distance
def compute_normalized_edit_distance(pred, label):
    # Calculate the raw Edit Distance
    raw_edit_distance = nltk.edit_distance(pred, label)
    
    # Normalize by the length of the longer string
    normalized_edit_distance = raw_edit_distance / max(len(pred), len(label))
    
    return normalized_edit_distance

In [12]:
# 4. Define compute_metrics function with proper handling of text-based and embedding-based metrics
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions.argmax(-1)
    
    # Load the tokenizer to decode the prediction and label IDs back to text
    # tokenizer = T5Tokenizer.from_pretrained("t5-base")
    
    # Decode the predictions and labels into strings
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    
    # Normalize the decoded strings
    pred_str_norm = [normalize_text(text) for text in pred_str]
    labels_str_norm = [normalize_text(text) for text in labels_str]
    
    # Compute text-based metrics
    # bleu_score = np.mean([sentence_bleu([label], pred) for label, pred in zip(labels_str_norm, pred_str_norm)])
    
    # rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    # rouge_scores = [rouge_scorer_obj.score(label, pred) for label, pred in zip(labels_str_norm, pred_str_norm)]
    # avg_rouge = {metric: np.mean([score[metric].fmeasure for score in rouge_scores]) for metric in rouge_scores[0].keys()}
    
 
    # meteor_scores = meteor.compute(predictions=pred_str_norm, references=labels_str_norm)
    
    # edit_distance = np.mean([nltk.edit_distance(pred, label) for pred, label in zip(pred_str_norm, labels_str_norm)])

    # Normalized Edit Distance
    # normalized_edit_distance = np.mean([compute_normalized_edit_distance(pred, label) for pred, label in zip(pred_str_norm, labels_str_norm)])

    # Compute embedding-based metrics
    # Tokenize for embedding-based metrics
    # pred_embeddings = tokenize_input(pred_str)['input_ids']
    # labels_embeddings = tokenize_input(labels_str)['input_ids']

    # Get embeddings for predicted and reference strings using consistent tokenization
    pred_embeddings = tokenizer(pred_str, padding=True, truncation=True, max_length=max_length, return_tensors='pt')['input_ids']
    labels_embeddings = tokenizer(labels_str, padding=True, truncation=True, max_length=max_length, return_tensors='pt')['input_ids']

    # Move embeddings to GPU if available
    # if torch.cuda.is_available():
    #     pred_embeddings = pred_embeddings.cuda()
    #     labels_embeddings = labels_embeddings.cuda()

    # Cosine similarity between embeddings
    # cosine_sim_scores = [1 - cosine(pred_embedding.cpu(), label_embedding.cpu()) for pred_embedding, label_embedding in zip(pred_embeddings, labels_embeddings)]
    # avg_cosine_similarity = np.mean(cosine_sim_scores)

    # Compute cosine similarity directly on the GPU using PyTorch
    cosine_sim_scores = F.cosine_similarity(pred_embeddings.float(), labels_embeddings.float(), dim=-1)
    avg_cosine_similarity = torch.mean(cosine_sim_scores).item()
    
    # Perplexity (logits-based, not embedding-based)
    # perplexity = torch.exp(torch.mean(pred.predictions))

    # Return the computed metrics as a dictionary
    return {
        'cosine_similarity': avg_cosine_similarity,
        # 'bleu': bleu_score,
        # 'rouge1': avg_rouge['rouge1'],
        # 'rougeL': avg_rouge['rougeL'],
        # 'meteor': meteor_scores['meteor'],
        # 'perplexity': perplexity.item(),
        # 'edit_distance': edit_distance,
        # 'normalized_edit_distance': normalized_edit_distance
    }

In [13]:
# 5. Set Up the Model and Training Arguments
def setup_model_and_args(model_name="t5-small", output_dir="./results", **kwargs):
    # Load the T5 model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    
    # Move model to GPU if available
    # if torch.cuda.is_available():
    #     model.cuda()
    # Default training arguments in a dictionary
    training_args_dict = {
        "output_dir":output_dir,  # Output directory for model checkpoints
        "eval_strategy":'steps',  # Evaluate model at the end of each epoch
        "save_strategy":'steps',  # Save the model every few steps
        "learning_rate":5e-5,  # Learning rate for optimizer
        "warmup_steps":300,
        "per_device_train_batch_size":16,  # Batch size for training
        "per_device_eval_batch_size":16,  # Batch size for evaluation
        "weight_decay":0.01,  # Weight decay to avoid overfitting
        "num_train_epochs":1,  # Number of training epochs
        "fp16":False,#torch.cuda.is_available(),  # Enable mixed precision training if a GPU is available
        "logging_dir":'./logs',  # Directory for storing logs
        "logging_steps":50,  # Log training progress every 10 steps
        "eval_steps":50,  # Evaluate the model every 500 steps
        "save_steps":100,  # Save model checkpoint every 500 steps
        "load_best_model_at_end":True,  # Load the best model found during training
        # "metric_for_best_model": "cosine_similarity",  # Example: Monitor BLEU score to save the best model
        # "greater_is_better": True,  # Set to False if a lower value is better (e.g., for loss)
        "metric_for_best_model": "eval_loss",
        "greater_is_better": False,
    }

    # Update the dictionary with any provided kwargs
    training_args_dict.update(kwargs)

    # Pass the updated dictionary to TrainingArguments
    training_args = TrainingArguments(**training_args_dict)
    
    # Return the model, tokenizer, and training arguments
    return model, tokenizer, training_args

In [14]:
# 6. Initialize Trainer and Data Collator
def initialize_trainer(model, tokenizer, tokenized_dataset, training_args):
    # Define the data collator to dynamically pad the sequences during batching
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    
    # Initialize the Trainer class with the model, arguments, datasets, tokenizer, and data collator
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],  # Training dataset
        eval_dataset=tokenized_dataset["validation"],  # Validation dataset
        tokenizer=tokenizer,  # Tokenizer to decode the predictions
        data_collator=data_collator,  # Data collator for padding
        # compute_metrics=compute_metrics  # Function to compute evaluation metrics
    )
    
    # Return the initialized Trainer
    return trainer

In [15]:
# 7. Train the Model and Evaluate
def train_and_evaluate_model(trainer):
    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results

# 8. Save the Model and Tokenizer
def save_model_and_tokenizer(trainer, tokenizer, output_dir="./fine_tuned_t5"):
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

In [16]:
# 9. Plot training and validation losses
def plot_train_validation_losses(trainer):
    train_loss = trainer.state.log_history
    steps = range(len(train_loss))
    train_losses = [x['loss'] for x in train_loss if 'loss' in x]
    eval_losses = [x['eval_loss'] for x in train_loss if 'eval_loss' in x]

    plt.figure(figsize=(12, 6))

    # Training loss
    plt.subplot(1, 2, 1)
    plt.plot(steps[:len(train_losses)], train_losses, label='Training Loss')
    plt.xlabel('Steps')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.legend()

    # Validation loss
    plt.subplot(1, 2, 2)
    plt.plot(steps[:len(eval_losses)], eval_losses, label='Validation Loss', color='orange')
    plt.xlabel('Steps')
    plt.ylabel('Loss')
    plt.title('Validation Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()

# 10. Plot evaluation metrics over iterations
def plot_evaluation_metrics(trainer):
    log_history = trainer.state.log_history

    # Separate metrics
    eval_steps = [x['step'] for x in log_history if 'eval_loss' in x]
    bleu_scores = [x['eval_bleu'] for x in log_history if 'eval_bleu' in x]
    rouge1_scores = [x['eval_rouge1'] for x in log_history if 'eval_rouge1' in x]
    rougeL_scores = [x['eval_rougeL'] for x in log_history if 'eval_rougeL' in x]
    meteor_scores = [x['eval_meteor'] for x in log_history if 'eval_meteor' in x]
    cosine_similarity_scores = [x['eval_cosine_similarity'] for x in log_history if 'eval_cosine_similarity' in x]
    perplexity_scores = [x['eval_perplexity'] for x in log_history if 'eval_perplexity' in x]
    edit_distances = [x['eval_edit_distance'] for x in log_history if 'eval_edit_distance' in x]

    plt.figure(figsize=(14, 7))

    # Subplot 1: Evaluation metrics except Edit Distance
    plt.subplot(1, 2, 1)
    plt.plot(eval_steps, bleu_scores, label='BLEU', color='blue')
    plt.plot(eval_steps, rouge1_scores, label='ROUGE-1', color='green')
    plt.plot(eval_steps, rougeL_scores, label='ROUGE-L', color='red')
    plt.plot(eval_steps, meteor_scores, label='METEOR', color='purple')
    plt.plot(eval_steps, cosine_similarity_scores, label='Cosine Similarity', color='brown')
    plt.plot(eval_steps, perplexity_scores, label='Perplexity', color='magenta')
    plt.xlabel('Evaluation Steps')
    plt.ylabel('Score')
    plt.title('Evaluation Metrics Over Iterations (Excluding Edit Distance)')
    plt.legend()

    # Subplot 2: Edit Distance
    plt.subplot(1, 2, 2)
    plt.plot(eval_steps, edit_distances, label='Edit Distance', color='orange')
    plt.xlabel('Evaluation Steps')
    plt.ylabel('Edit Distance')
    plt.title('Edit Distance Over Iterations')
    plt.legend()

    plt.tight_layout()
    plt.show()

In [17]:
# Main function to orchestrate the fine-tuning process
def fine_tune_t5(dataset_path, output_dir="./fine_tuned_t5",model_name='t5_small',split_ratios=(0.8, 0.1, 0.1), **training_args):
    dataset_splits = load_and_split_dataset(dataset_path, split_ratios)
    model, tokenizer, training_args = setup_model_and_args(
        model_name=model_name,
        output_dir=output_dir, 
        **training_args
    )
    tokenized_dataset = preprocess_data(tokenizer, dataset_splits)
    trainer = initialize_trainer(model, tokenizer, tokenized_dataset, training_args)

    # Train and Evaluate
    eval_results = train_and_evaluate_model(trainer)
    print(f"Evaluation results: {eval_results}")

    # Save Model and Tokenizer
    save_model_and_tokenizer(trainer, tokenizer, output_dir)

    # Plot losses
    # plot_train_validation_losses(trainer)

    # Plot evaluation metrics
    # plot_evaluation_metrics(trainer)

    return trainer, eval_results

In [18]:
# Function to plot training and evaluation metrics after training
def plot_training_and_evaluation_results(trainer):
    # Plot training and validation losses
    plot_train_validation_losses(trainer)

    # Plot evaluation metrics
    plot_evaluation_metrics(trainer)

In [19]:
# trainer, results = fine_tune_t5("msmarco_dataset", output_dir="./fine_tuned_t5_masmarco", num_train_epochs=1, split_ratios=(0.8, 0.1, 0.1),
#                  learning_rate=3e-5, per_device_train_batch_size=32,per_device_eval_batch_size=32)

trainer, results = fine_tune_t5(
        "hotpotqa_dataset", 
        output_dir="./fine_tuned_t5_hotpotqa", 
        model_name = "t5-small",
        split_ratios=(0.8, 0.1, 0.1),
        num_train_epochs=5, 
        use_cpu = False,
        learning_rate=3e-5,  # Override the default learning rate
        per_device_train_batch_size=16, 
        per_device_eval_batch_size=4
    )

Dataset already contains 'train', 'validation', and 'test' splits.


  0%|          | 50/28160 [00:55<8:29:30,  1.09s/it]

{'loss': 11.9363, 'grad_norm': 50.36502456665039, 'learning_rate': 4.9999999999999996e-06, 'epoch': 0.01}




In [None]:
# plot_training_and_evaluation_results(trainer)