In [1]:
import torch
import evaluate
import numpy as np

from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

In [2]:
# Retraining: load from the base checkpoint by pointing `model_name` at it here.
# The final save cell writes to `model_name` as well, so set it back to this
# directory before saving.
model_name = './lora_trained/lora-flan-t5-rte/'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset('glue','rte')

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

Some weights of the model checkpoint at ./lora_trained/lora-flan-t5-rte/ were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.0.SelfAttention.q.base_layer.weight', 'decoder.block.0.layer.0.SelfAttention.q.lora_A.default.weight', 'decoder.block.0.layer.0.SelfAttention.q.lora_B.default.weight', 'decoder.block.0.layer.0.SelfAttention.v.base_layer.weight', 'decoder.block.0.layer.0.SelfAttention.v.lora_A.default.weight', 'decoder.block.0.layer.0.SelfAttention.v.lora_B.default.weight', 'decoder.block.0.layer.1.EncDecAttention.q.base_layer.weight', 'decoder.block.0.layer.1.EncDecAttention.q.lora_A.default.weight', 'decoder.block.0.layer.1.EncDecAttention.q.lora_B.default.weight', 'decoder.block.0.layer.1.EncDecAttention.v.base_layer.weight', 'decoder.block.0.layer.1.EncDecAttention.v.lora_A.default.weight', 'decoder.block.0.layer.1.EncDecAttention.v.lora_B.default.weight', 'decoder.block.1.layer.0.SelfAttention.q.base_layer.weight', 'decoder.block.1.layer.0.Self

cuda


In [3]:
# Parameter count times 4 -- presumably the model size in bytes at 4 bytes per
# parameter (fp32); confirm against the dtype the model was loaded in.
total_params = sum(param.numel() for param in model.parameters())
print(total_params * 4)

# Iterating the loaded dataset yields its split names; enumerate adds an index.
for split_index, split_name in enumerate(dataset):
    print(split_index, split_name)

990311424
0 train
1 validation
2 test


In [8]:
def data_preprocessing(batch):
    """Tokenize one batch of RTE examples into model inputs and text labels.

    Each example becomes the prompt ``"premise: ... hypothesis: ..."`` and its
    integer label is mapped to the target string ``"not entailment"`` (0) or
    ``"entailment"`` (1). Rows with any other label value are dropped.

    Args:
        batch: Mapping with parallel sequences under the keys ``'sentence1'``,
            ``'sentence2'`` and ``'label'``.

    Returns:
        A mapping with ``input_ids`` and ``attention_mask`` (padded/truncated
        to 512 tokens) and ``labels`` (padded/truncated to 16 tokens). Padding
        positions in ``labels`` are set to -100 so they are ignored by the
        loss instead of being trained on as real targets. If no row has a
        valid label, all three columns are empty lists.
    """
    label_map = {0: "not entailment", 1: "entailment"}

    # Build prompts and targets together so they can never get out of step.
    valid_inputs = []
    valid_labels = []
    for premise, hypothesis, label in zip(batch['sentence1'], batch['sentence2'], batch['label']):
        if label not in label_map:
            continue
        valid_inputs.append(f"premise: {premise} hypothesis: {hypothesis}")
        valid_labels.append(label_map[label])

    # Nothing to tokenize: return empty columns rather than calling the
    # tokenizer on an empty list.
    if not valid_inputs:
        return {"input_ids": [], "attention_mask": [], "labels": []}

    model_inputs = tokenizer(valid_inputs, max_length=512, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    tokenized_labels = tokenizer(
        text_target=valid_labels, max_length=16, truncation=True, padding="max_length"
    )

    # Labels are padded to a fixed length here, so the data collator will not
    # mask them later; replace pad ids with the ignore index ourselves.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in tokenized_labels["input_ids"]
    ]
    return model_inputs

def process_dataset_in_batches(dataset, batch_size=32):
    """Run ``data_preprocessing`` over ``dataset`` slice by slice and collect the results.

    Args:
        dataset: Object supporting ``len()`` and slice indexing; each slice is
            passed unchanged to ``data_preprocessing``.
        batch_size: Number of rows per slice.

    Returns:
        A dict with the keys ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"``, each holding the concatenation of that column across
        all processed slices.
    """
    columns = ("input_ids", "attention_mask", "labels")
    collected = {column: [] for column in columns}

    for start in range(0, len(dataset), batch_size):
        encoded = data_preprocessing(dataset[start:start + batch_size])
        for column in columns:
            collected[column].extend(encoded[column])

    return collected

# Tokenize the two labelled splits up front.
train_processed = process_dataset_in_batches(dataset["train"], batch_size=32)
validation_processed = process_dataset_in_batches(dataset["validation"], batch_size=32)

# Wrap the three tokenized columns of each split in a Dataset for the trainer.
train_dataset = Dataset.from_dict(
    {column: train_processed[column] for column in ("input_ids", "attention_mask", "labels")}
)
validation_dataset = Dataset.from_dict(
    {column: validation_processed[column] for column in ("input_ids", "attention_mask", "labels")}
)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")


Training dataset size: 2490
Validation dataset size: 277


In [5]:
# NOTE(review): this metric object is not referenced by any other line visible in
# this notebook; compute_metrics below calculates exact-match accuracy by hand.
accuracy_metric = evaluate.load("accuracy")
def compute_metrics(eval_preds):
    """Compute exact-match accuracy between decoded predictions and decoded labels.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` arrives as a tuple, its first element is used.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose decoded, whitespace-stripped prediction equals the decoded
        label. Returns an accuracy of 0.0 when there are no examples.
    """
    predictions, labels = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index and is not a valid token id, so it must be
    # swapped for the pad token before decoding. This applies to predictions
    # as well as labels: generated batches can be padded with -100 when they
    # are concatenated.
    pad_token_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = [
        text.strip() for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # Avoid dividing by zero on an empty evaluation set.
    if not decoded_preds:
        return {"accuracy": 0.0}

    matches = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels))
    return {
        "accuracy": matches / len(decoded_preds)
    }

In [9]:

# LoRA adapters on the modules named "q" and "v". Declaring the task type makes
# PEFT return its sequence-to-sequence wrapper instead of the generic one.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

lora_model = get_peft_model(model, lora_config)
lora_model.to(device)

# Built once, against the wrapped model that is handed to the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=lora_model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=1e-5,  # Learning rate
    per_device_train_batch_size=12,  # Training batch size
    per_device_eval_batch_size=12,  # Evaluation batch size
    weight_decay=0.01,  # Weight decay to avoid overfitting
    save_total_limit=3,  # Save up to 3 model checkpoints
    num_train_epochs=3,  # Number of epochs to train
    predict_with_generate=True,  # Generate predictions during evaluation
    logging_dir='./logs',  # Directory for logs
    logging_steps=500,  # How often to log during training
    # Mixed precision is only requested when the CUDA device selected earlier
    # is in use; the notebook otherwise falls back to CPU, where fp16 is rejected.
    fp16=(device == 'cuda'),
)

# Define Trainer
trainer = Seq2SeqTrainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,  # The preprocessed training dataset
    eval_dataset=validation_dataset,  # The preprocessed validation dataset
    data_collator=data_collator,  # Handles batch padding
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune the LoRA model.
# Uncomment the line below to train; when retraining, first point model_name in
# the loading cell at the base model.
# trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [7]:

# Evaluate with the trainer configured above (validation split, generation-based
# predictions scored by compute_metrics) and print whatever it returns.
results = trainer.evaluate()
print(results)




  0%|          | 0/24 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
# get_peft_model() injected the LoRA layers into `model` in place, so saving
# `model` directly writes `base_layer` / `lora_A` / `lora_B` parameter names.
# AutoModelForSeq2SeqLM.from_pretrained() cannot match those names (it reports
# them as unused weights), so the reloaded model loses its q/v projections.
# Fold the adapters into the base weights first so the checkpoint is a plain
# T5 model that the loading cell can read back.
# Note: merging strips the adapters from `lora_model`; re-run the LoRA setup
# cell before training further in the same session.
merged_model = lora_model.merge_and_unload()
merged_model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

('./lora_trained/lora-flan-t5-rte/tokenizer_config.json',
 './lora_trained/lora-flan-t5-rte/special_tokens_map.json',
 './lora_trained/lora-flan-t5-rte/tokenizer.json')