<a href="https://colab.research.google.com/github/suman520-git/Large_Language_Model/blob/main/Fine_Tune_Any_LLM_Model_Modular_Coding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install accelerate transformers peft bitsandbytes datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2

In [2]:
from datasets import load_dataset, Dataset
import torch

In [3]:
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer,BitsAndBytesConfig,DataCollatorForLanguageModeling
    )

In [4]:
from peft import LoraConfig, get_peft_model, PeftModel

In [5]:
import pandas as pd

In [30]:
class LoRAFineTuner:
  """
  QLoRA fine-tuning pipeline: 4-bit quantized base model + LoRA adapters,
  trained with the Hugging Face ``Trainer``.

  ``run()`` executes every step in order; the individual steps can also be
  called one by one, in the same order as ``run()`` calls them.
  """

  def __init__(self,model_name,dataset_name,output_dir,dataset_config="main",target_modules=None):
    """
    Store the run configuration; nothing is downloaded or loaded here.

    Args:
      model_name: checkpoint id or path handed to ``from_pretrained``.
      dataset_name: dataset id or path handed to ``load_dataset``.
      output_dir: directory for Trainer checkpoints and the saved LoRA adapter.
      dataset_config: configuration name handed to ``load_dataset``. Defaults
        to ``"main"`` (the value that used to be hard-coded); pass ``None``
        for datasets that have no named configuration.
      target_modules: module names LoRA is attached to. Defaults to
        ``["q_proj", "v_proj"]`` (the value that used to be hard-coded).
    """
    print("params initlized")

    self.model_name=model_name
    self.dataset_name=dataset_name
    self.output_dir=output_dir
    self.dataset_config=dataset_config
    self.target_modules=["q_proj", "v_proj"] if target_modules is None else target_modules
    self.tokenizer=None
    self.model=None
    self.tokenized_data=None

  def load_tokenizer(self):
    """
    Load the tokenizer for ``model_name`` and make sure it has a pad token.
    """
    print("load_tokenizer")
    self.tokenizer=AutoTokenizer.from_pretrained(self.model_name,trust_remote_code=True)
    # Only fall back to EOS when the tokenizer ships without a pad token;
    # a tokenizer's own pad token should not be overwritten.
    if self.tokenizer.pad_token is None:
      self.tokenizer.pad_token=self.tokenizer.eos_token

  def load_model(self):
    """
    Load ``model_name`` as a 4-bit quantized causal LM.

    Quantization settings: NF4, double quantization, float16 compute dtype.
    """
    print("load_model")

    bnb_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
        )

    # Quantized model; device_map={"":0} maps the whole model to device 0.
    self.model=AutoModelForCausalLM.from_pretrained(
        self.model_name,
        device_map={"":0},
        trust_remote_code=True,
        quantization_config=bnb_config
        )

  def apply_lora(self):
    """
    Wrap the quantized model with LoRA adapters (r=16, alpha=32, dropout=0.05)
    on ``self.target_modules`` and print the trainable-parameter summary.

    Raises:
      RuntimeError: if ``load_model()`` has not been called yet.
    """
    print("apply_lora")
    if self.model is None:
      raise RuntimeError("apply_lora() needs a model: call load_model() first")

    config=LoraConfig(
          r=16,
          lora_alpha=32,
          target_modules=self.target_modules,
          lora_dropout=0.05,
          bias="none",
          task_type="CAUSAL_LM"
        )

    # LoRA on top of the quantized model -> QLoRA model.
    self.model=get_peft_model(self.model,config)
    self.model.print_trainable_parameters()

  @staticmethod
  def _with_text_column(data_df):
    """
    Return a copy of ``data_df`` with a ``text`` column to train on.

    Precedence: ``question`` + ``answer`` columns are combined into
    ``"question: ... answer: ..."``; otherwise an existing ``text`` column is
    used; otherwise the first column is used. The input frame is not mutated.

    Raises:
      ValueError: if ``data_df`` has no columns.
    """
    if len(data_df.columns) == 0:
      raise ValueError("dataset has no columns to build a text field from")

    if "question" in data_df.columns and "answer" in data_df.columns:
      text=[
          f"question: {q} answer: {a}"
          for q, a in zip(data_df["question"], data_df["answer"])
      ]
    elif "text" in data_df.columns:
      text=data_df["text"].astype(str)
    else:
      # Unknown schema: fall back to the first column, cast so the tokenizer gets strings.
      text=data_df[data_df.columns[0]].astype(str)

    return data_df.assign(text=text)

  def load_and_tokenize_dataset(self):
    """
    Load the train split of ``dataset_name``, build a ``text`` column and
    tokenize it (truncated to 512 tokens) into ``self.tokenized_data``.

    Raises:
      RuntimeError: if ``load_tokenizer()`` has not been called yet.
    """
    print("load_and_tokenize_dataset")
    if self.tokenizer is None:
      raise RuntimeError("load_and_tokenize_dataset() needs a tokenizer: call load_tokenizer() first")

    data=load_dataset(self.dataset_name,self.dataset_config,split="train")

    data_df=data.to_pandas()
    print(data_df.head())

    data_df=self._with_text_column(data_df)
    print(data_df.columns[0])  # first column = fallback text source

    # Convert back to Hugging Face dataset
    data = Dataset.from_pandas(data_df)

    # Tokenize dataset. No padding here: the data collator used in train()
    # pads each training batch, so padding every example to the longest
    # sequence of a whole map() chunk would only waste compute.
    def tokenize(sample):
      return self.tokenizer(sample["text"], truncation=True, max_length=512)

    self.tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)

  def train(self,epochs: int = 1, batch_size: int = 4, learning_rate: float = 2e-4, max_steps: int = 1000):
    """
    Train the LoRA adapters and save them (plus the tokenizer) to ``output_dir``.

    Args:
      epochs: forwarded as ``num_train_epochs``. NOTE: ``TrainingArguments``
        lets a positive ``max_steps`` override it, so with the default
        ``max_steps=1000`` this value has no effect; pass ``max_steps=-1`` to
        train by epochs.
      batch_size: per-device train batch size.
      learning_rate: peak learning rate of the cosine schedule.
      max_steps: total number of optimizer steps.

    Raises:
      RuntimeError: if the model, tokenizer or tokenized dataset is missing.
    """
    print("train started")
    if self.model is None or self.tokenizer is None or self.tokenized_data is None:
      raise RuntimeError(
          "train() needs a model, tokenizer and tokenized dataset: call load_tokenizer(), "
          "load_model(), apply_lora() and load_and_tokenize_dataset() first"
      )

    training_args = TrainingArguments(
            output_dir=self.output_dir,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=1,
            learning_rate=learning_rate,
            lr_scheduler_type="cosine",
            save_strategy="epoch",
            logging_steps=100,
            max_steps=max_steps,
            num_train_epochs=epochs,
            push_to_hub=False,
            report_to="none"
        )

    trainer=Trainer(
        model=self.model,
        train_dataset=self.tokenized_data,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        )
    trainer.train()

    # Trainer checkpoints live in checkpoint-* subfolders; write the final
    # adapter to output_dir itself so save_model() can load it from there.
    trainer.save_model(self.output_dir)
    self.tokenizer.save_pretrained(self.output_dir)

  def save_model(self):
    """
    Merge the trained adapter from ``output_dir`` into a freshly loaded
    float32 copy of the base model and save the result to
    ``<output_dir>/merged``.

    Returns:
      Path of the directory the merged model was written to.
    """
    import os  # local import: keeps this method independent of the notebook's import cells

    print("save_model")
    # Reload the base model unquantized so the LoRA weights are merged into
    # full-precision weights rather than into the 4-bit training copy.
    base_model=AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True, torch_dtype=torch.float32)
    peft_model = PeftModel.from_pretrained(base_model, self.output_dir)
    merged_model = peft_model.merge_and_unload()

    print("saving the model")
    merged_dir=os.path.join(self.output_dir, "merged")
    merged_model.save_pretrained(merged_dir)
    if self.tokenizer is not None:
      # Ship the tokenizer with the merged weights so the folder is loadable on its own.
      self.tokenizer.save_pretrained(merged_dir)
    return merged_dir

  def run(self):
    """
    Run the whole pipeline: tokenizer -> model -> LoRA -> dataset -> train -> save.
    """
    print("starting finetuning process")
    self.load_tokenizer()
    print("tokenizer loaded")

    self.load_model()
    print("model loaded")

    self.apply_lora()
    print("lora applied")

    self.load_and_tokenize_dataset()
    print("dataset loaded and tokenized")

    self.train()
    print("model trained")

    self.save_model()
    print("model saved")



In [31]:
# Run configuration consumed by LoRAFineTuner: base checkpoint, dataset id,
# and the directory used as the fine-tuner's output_dir.
model_name, dataset_name, output_dir = (
    "microsoft/phi-1_5",
    "gsm8k",
    "phi-1_5-finetuned",
)

In [32]:
# Build the pipeline object; this only stores the configuration (no downloads yet).
fine_tuner = LoRAFineTuner(
    model_name=model_name,
    dataset_name=dataset_name,
    output_dir=output_dir,
)

params initlized


In [None]:
# Run every step in order: tokenizer, model, LoRA, dataset, training, saving.
fine_tuner.run()

starting finetuning process
load_tokenizer
tokenizer loaded
load_model
model loaded
apply_lora
trainable params: 3,145,728 || all params: 1,421,416,448 || trainable%: 0.2213
lora applied
load_and_tokenize_dataset
                                            question  \
0  Natalia sold clips to 48 of her friends in Apr...   
1  Weng earns $12 an hour for babysitting. Yester...   
2  Betty is saving money for a new wallet which c...   
3  Julie is reading a 120-page book. Yesterday, s...   
4  James writes a 3-page letter to 2 different fr...   

                                              answer  
0  Natalia sold 48/2 = <<48/2=24>>24 clips in May...  
1  Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...  
2  In the beginning, Betty has only 100 / 2 = $<<...  
3  Maila read 12 x 2 = <<12*2=24>>24 pages today....  
4  He writes each friend 3*2=<<3*2=6>>6 pages a w...  
question


Tokenizing data:   0%|          | 0/7473 [00:00<?, ? examples/s]

dataset loaded and tokenized
train started


Step,Training Loss
100,1.1528
200,1.0656
300,1.0328
400,1.0424
500,1.0469
600,1.0264
700,0.9835
800,1.0129
900,1.027
1000,1.027


model trained
save_model


In [21]:
# Scratch check: load the same GSM8K train split the fine-tuner uses.
data = load_dataset(path="gsm8k", name="main", split="train")

In [22]:
# Convert the Hugging Face dataset to a pandas DataFrame for inspection.
data_df = data.to_pandas()

In [23]:
# Preview the first rows of the DataFrame.
print(data_df.head())

                                            question  \
0  Natalia sold clips to 48 of her friends in Apr...   
1  Weng earns $12 an hour for babysitting. Yester...   
2  Betty is saving money for a new wallet which c...   
3  Julie is reading a 120-page book. Yesterday, s...   
4  James writes a 3-page letter to 2 different fr...   

                                              answer  
0  Natalia sold 48/2 = <<48/2=24>>24 clips in May...  
1  Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...  
2  In the beginning, Betty has only 100 / 2 = $<<...  
3  Maila read 12 x 2 = <<12*2=24>>24 pages today....  
4  He writes each friend 3*2=<<3*2=6>>6 pages a w...  


In [24]:
# List the DataFrame's column names.
data_df.columns

Index(['question', 'answer'], dtype='object')

In [25]:
# Name of the first column.
data_df.columns[0]

'question'

In [26]:
# Keep the first column's name as the fallback text source.
text_column = data_df.columns[0]

In [27]:
# Show which column name was picked as the fallback.
print(text_column)

question


In [28]:
# Build the training text: combine question/answer pairs when both columns
# exist, otherwise reuse the fallback column chosen earlier.
if {"question", "answer"}.issubset(data_df.columns):
    data_df["text"] = [
        f"question: {question} answer: {answer}"
        for question, answer in zip(data_df["question"], data_df["answer"])
    ]
else:
    data_df["text"] = data_df[text_column]

In [29]:
# Inspect the first formatted training example (row label 0).
data_df.loc[0, "text"]

'question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'