<a href="https://www.kaggle.com/code/tarekyahia/lora-fine-tune-flan-t5-gen-question-answer-marco?scriptVersionId=168285324" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# install 

%pip install --upgrade pip

!pip install --upgrade transformers
%pip install \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer,AutoTokenizer

import torch

from datasets import DatasetDict, Dataset,load_dataset

import pandas as pd
import numpy as np

import nltk

from evaluate import load
# ROUGE metric object; compute_metrics calls metric.compute(...) during evaluation.
metric = load("rouge")

# Shared batch size: used for the tokenization map and for the per-device
# train/eval batch sizes in the training arguments.
batch_size = 8

# load the model


In [None]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "google/flan-t5-base"

# Load the seq2seq model (weights requested as bfloat16) and its tokenizer.
# NOTE(review): the training arguments later in this notebook set fp16=True while
# the weights are loaded as bfloat16 — confirm this precision combination is intended.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype = torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Report how many of the model's parameters are trainable.
def number_of_trainable(model):
    """Summarise how many of a model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where every parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share as a percentage rounded to
        two decimals. A model without any parameters reports ``0.0%``
        instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: a model with zero parameters must not divide by zero.
    if all_model_params:
        percentage = round((trainable_params / all_model_params) * 100, 2)
    else:
        percentage = 0.0

    return f"trainable = {trainable_params}\nall params = {all_model_params}\npercentage of trainable params = {percentage}%"
    

# prepare the dataset

In [None]:
# Load the MS MARCO train and validation splits from the Kaggle input directory
# and preview the first rows of the training frame.
train = pd.read_csv("/kaggle/input/ms-marco-dataset/train.csv")
val = pd.read_csv("/kaggle/input/ms-marco-dataset/valid.csv")
train.head()

# For a quick test run, uncomment the lines below to subsample both splits.
#train = train.sample(n=5000)
#val = val.sample(n=2000)

In [None]:
# Convert the training DataFrame to a Dataset after dropping rows with missing values.
# NOTE(review): `train` and `val` are rebound from DataFrames to Datasets here, so
# re-running this cell without re-running the loading cell will likely fail.
train = Dataset.from_pandas(train.dropna().reset_index(drop=True))

# Same conversion for the validation DataFrame.
val = Dataset.from_pandas(val.dropna().reset_index(drop=True))

# Create a DatasetDict containing both datasets under the keys "train" and "val"
dds = DatasetDict({"train": train, "val": val})

# Show the split names stored in the DatasetDict
print(dds.keys())  # Output: dict_keys(['train', 'val'])

In [None]:
# Measure the tokenized prompt length of every example so the splits can be
# sorted by length afterwards (to work within the limits of GPU memory).

def lens(df):
    """Build the prompt for each example of a batch and measure its token length.

    Args:
        df: Batch mapping with the columns ``'finalpassage'`` and ``'query'``
            (sequences of equal length). The function is used with
            ``map(..., batched=True)``.

    Returns:
        ``{'lens': [...]}`` holding one token count per example.

    Side effects:
        The generated prompts are stored in ``df['prompt']``.
    """
    start_prompt = "Answer the following question based on the context: \n "
    question_prompt =  "Question: \n "
    context_prompt = " \n Context: \n "
    end_prompt = " \n Answer: "

    prompts = [
        start_prompt + question_prompt + str(question) + context_prompt + str(context) + end_prompt
        for context, question in zip(df['finalpassage'], df['query'])
    ]
    df['prompt'] = prompts

    # Nothing to tokenize for an empty batch.
    if not prompts:
        return {'lens': []}

    # A single batched tokenizer call replaces one call per prompt. No padding or
    # truncation is requested, so every row keeps its own length.
    encoded = tokenizer(prompts)['input_ids']
    return {'lens': [len(ids) for ids in encoded]}



# Add the 'lens' column (prompt token counts) to every split.
dds_len =dds.map(lens, batched = True)

# Replace each split in dds with a copy sorted by its 'lens' column.
for key, dataset in dds_len.items():
    dds[key] = dataset.sort("lens")

    
# Peek at the first ten prompt lengths of the sorted training split.
dds['train']['lens'][:10]

In [None]:
# Total number of prompt tokens in the training split (sum of the 'lens' column).
sum(dds['train']['lens'])

In [None]:
def tokenize_prompt(example):
    """Build the instruction prompts for a batch and tokenize inputs and targets.

    Args:
        example: Batch mapping with the columns ``'finalpassage'``, ``'query'``
            and ``'answers'`` (sequences of equal length). The function is
            used with ``map(..., batched=True)``.

    Returns:
        A dict with
        ``'prompt'``    – the generated prompt strings,
        ``'input_ids'`` – token ids of the prompts (truncated, not padded),
        ``'labels'``    – token ids of the answers (truncated, not padded).
    """
    # prepare
    start_prompt = "Answer the following question based on the context:\n"
    question_prompt =  "Question:\n"
    context_prompt = "\nContext:\n"
    end_prompt = "\nAnswer: "
    prompts = [
        start_prompt + question_prompt + str(question) + context_prompt + str(context) + end_prompt
        for context, question in zip(example['finalpassage'], example['query'])
    ]

    # tokenize
    # No padding at this stage. Padding here would bake pad ids into the stored
    # rows: because no attention mask is returned, a mask built later at
    # collation time would treat that padding as real tokens, and padded label
    # positions would carry the pad id instead of the ignore index, so they
    # would count towards the loss. Padding is left to the data collator.
    input_id = tokenizer(prompts, truncation=True)
    label = tokenizer(example['answers'], truncation=True)

    # 'prompt' is returned explicitly so that the column exists in the mapped
    # dataset without relying on in-place changes to the incoming batch.
    return {'prompt': prompts,
            'input_ids': input_id['input_ids'],
            'labels': label['input_ids']
            }
    
    
# Tokenize every split, processing `batch_size` examples per call.
tokenized_dds = dds.map(tokenize_prompt, batched = True,batch_size=batch_size)
# Drop the raw-text and helper columns that are not needed for training.
tokenized_dds = tokenized_dds.remove_columns(['answers','query','finalpassage','prompt','lens'])
tokenized_dds

# fine-tune with LoRA

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter configuration for a sequence-to-sequence language model.
lora_config = LoraConfig(
    r=32,  # rank of the LoRA update matrices
    lora_alpha = 32,  # LoRA scaling factor
    target_modules = ["q","v"],  # modules to adapt; presumably T5's query/value attention projections — confirm
    lora_dropout = 0.05,  # dropout applied inside the LoRA layers
    bias="none",  # bias setting "none"; presumably no bias terms are trained — confirm
    task_type = TaskType.SEQ_2_SEQ_LM
)

In [None]:
# Wrap the base model with the LoRA configuration and report how many of its
# parameters are trainable.
peft_model = get_peft_model(model, lora_config)
print(number_of_trainable(peft_model))

In [None]:
import os
# Set WANDB_DISABLED before the trainer is created; presumably this turns off
# Weights & Biases logging — confirm for the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# Collator handed to the Seq2SeqTrainer below; it is built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Positions
            equal to -100 in either array are treated as padding. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        Dict mapping every key returned by the ROUGE metric to its value scaled
        to a percentage, plus ``'gen_len'`` (mean number of non-pad tokens per
        prediction). All values are rounded to four decimals.
    """
    predictions, labels = eval_pred
    # Some trainer versions hand over a tuple; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # Replace -100 in predictions and labels as we can't decode them.
    # Predictions can carry -100 too when batches are padded to a common length.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence.
    # NOTE(review): nltk.sent_tokenize needs NLTK's sentence tokenizer data to be
    # available in the environment — confirm it is installed.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Scale the scores to percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length (padding, including former -100 slots, is not counted).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Directory handed to the training arguments as the output location.
output_dir = '/kaggle/working/'

# Training configuration: evaluate and save once per epoch, use generation
# during evaluation (predict_with_generate) so compute_metrics receives
# generated token ids, and load the best model when training ends.
# NOTE(review): fp16=True is combined with a model that was loaded with
# torch_dtype=torch.bfloat16 earlier in this notebook — confirm this is intended.
# NOTE(review): `evaluation_strategy` appears to have been renamed in newer
# transformers releases; transformers is installed without a version pin, so
# confirm the installed version still accepts this argument.
peft_args = Seq2SeqTrainingArguments(
    output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    num_train_epochs=2,
    predict_with_generate=True,
    load_best_model_at_end=True
)

# Trainer over the LoRA-wrapped model, using the tokenized train/val splits,
# the seq2seq collator and the ROUGE-based compute_metrics.
peft_trainer = Seq2SeqTrainer(
    model = peft_model,
    args = peft_args,
    train_dataset=tokenized_dds['train'],
    eval_dataset= tokenized_dds['val'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# Run the fine-tuning.
peft_trainer.train()

# Save The Model.

In [None]:
# Target directory for the fine-tuned model and the tokenizer files.
peft_model_path = '/kaggle/working/peft_models/'

# Create exactly the directory that is written to below. The previous check
# created the relative path 'peft_models', which is only the same location when
# the working directory happens to be /kaggle/working. exist_ok=True makes the
# cell safe to re-run.
os.makedirs(peft_model_path, exist_ok=True)

# Save the trained model held by the trainer, and the tokenizer next to it.
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)