<a href="https://colab.research.google.com/github/vivek09thakur/Fine-Tuning-LLaMA-2/blob/main/Colab%20Notebook/Fine_Tuning_LLaMA2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Fine-Tuning LLaMA**

**Installing Dependencies**

In [1]:
!pip install accelerate peft bitsandbytes transformers trl
!pip install flask-ngrok
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
!./ngrok authtoken 2c2CD7vXkTJYULz1dDGFSViTqY9_6KDavnrbYSNxp3TPJReYa # you token here
!nohup python -m flask run &

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.8.2-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.7.10-py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from trl)
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m47.3 MB/s[0m eta [36m0

### **Getting Started**

**Imports and the `FineTuned_LlaMA` helper class**

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

import warnings
# Suppress all Python warnings for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

# Colab-specific: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

class FineTuned_LlaMA:
    """Fine-tune a 4-bit quantized causal language model with LoRA and query it.

    Intended call order::

        load_model() -> load_tokenizer() -> model_trainer() -> create_pipeline() -> inference(prompt)

    ``model_trainer`` needs the model and tokenizer loaded by the first two
    methods; ``inference`` needs the pipeline built by ``create_pipeline``.
    """

    def __init__(self,base_model,training_dataset,new_model,num_of_token):
        """Store the configuration, load the training split and build the static configs.

        Args:
            base_model: Model identifier passed to ``from_pretrained`` for both
                the model and the tokenizer.
            training_dataset: Dataset identifier passed to ``load_dataset``;
                its ``"train"`` split is loaded immediately.
            new_model: Path/name under which the trained adapter and the
                tokenizer are saved by ``model_trainer``.
            num_of_token: Value forwarded as ``max_length`` to the
                text-generation pipeline.
        """
        self.base_model = base_model
        self.training_dataset = training_dataset
        self.new_model = new_model
        self.num_of_token = num_of_token
        self.dataset = load_dataset(training_dataset, split="train")

        # 4-bit quantization config
        self.compute_dtype = torch.float16
        self.quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=self.compute_dtype,
            bnb_4bit_use_double_quant=False,
        )

        # LoRA adapter config. Built here (rather than as a side effect of
        # load_tokenizer) so model_trainer does not silently depend on the
        # tokenizer having been loaded first just to find this attribute.
        self.peft_params = LoraConfig(
            lora_alpha=16,
            lora_dropout=0.1,
            r=64,
            bias="none",
            task_type="CAUSAL_LM",
        )

    def load_model(self):
        """Load the base model with the 4-bit quantization config into ``self.model``."""
        self.model = AutoModelForCausalLM.from_pretrained(
            self.base_model,
            quantization_config=self.quant_config,
            device_map={"": 0}  # map the whole model (root module "") to device index 0
        )
        # NOTE(review): presumably disabled because the KV cache is not useful
        # while training - confirm before relying on it for inference speed.
        self.model.config.use_cache = False
        self.model.config.pretraining_tp = 1

    def load_tokenizer(self):
        """Load the base model's tokenizer into ``self.tokenizer``.

        The EOS token is reused as the padding token and padding is applied on
        the right.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.base_model,
            trust_remote_code=True,
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"

    def model_trainer(self):
        """Fine-tune the loaded model on ``self.dataset`` and save the result.

        Builds the training arguments and an ``SFTTrainer``, runs training,
        then saves the trained model and the tokenizer under ``self.new_model``.
        Requires ``load_model`` and ``load_tokenizer`` to have been called.
        """
        self.training_params = TrainingArguments(
            # The output directory is where the model
            # predictions and checkpoints will be stored.
            output_dir="./results",
            num_train_epochs=1, # One training epoch.
            per_device_train_batch_size=4,
            gradient_accumulation_steps=1,
            optim="paged_adamw_32bit",
            save_steps=25,
            logging_steps=25,
            learning_rate=2e-4,
            weight_decay=0.001,
            fp16=False,
            bf16=False,
            max_grad_norm=0.3,
            max_steps=-1,
            warmup_ratio=0.03,
            group_by_length=True,
            lr_scheduler_type="constant",
            report_to="tensorboard"
        )

        self.trainer = SFTTrainer(
            model=self.model,
            train_dataset=self.dataset,
            peft_config=self.peft_params,
            dataset_text_field="text",
            max_seq_length=None,
            tokenizer=self.tokenizer,
            args=self.training_params,
            packing=False,
        )

        # Actually run the fine-tuning. Without this call the adapter saved
        # below would contain only its freshly initialised, untrained weights.
        self.trainer.train()

        self.trainer.model.save_pretrained(self.new_model)
        self.trainer.tokenizer.save_pretrained(self.new_model)

    def create_pipeline(self):
        """Build the text-generation pipeline used by ``inference`` into ``self.pipe``."""
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_length=self.num_of_token,
        )

    def inference(self,prompt):
        """Run the pipeline on ``prompt`` wrapped in the ``<s>[INST] ... [/INST]`` template.

        Args:
            prompt: Text inserted into the instruction template.

        Returns:
            Whatever the pipeline returns; callers in this notebook read
            ``result[0]['generated_text']``. The value is also kept in
            ``self.results``.
        """
        self.results = self.pipe(f"<s>[INST] {prompt} [/INST]")
        return self.results

Mounted at /content/drive


### **Driver Code**

In [4]:
# Run configuration: model/dataset identifiers, output name and generation length.
BASE_MODEL = "NousResearch/Llama-2-7b-chat-hf"
GUANANCO_DATASET = "mlabonne/guanaco-llama2-1k"
NEW_MODEL = "llama2-7b-vivek"
TOKENS = 200  # forwarded to the pipeline as max_length

# Build the helper, then run its stages in the order they depend on each other:
# model -> tokenizer -> trainer -> generation pipeline.
fine_tuned_model = FineTuned_LlaMA(
    base_model=BASE_MODEL,
    training_dataset=GUANANCO_DATASET,
    new_model=NEW_MODEL,
    num_of_token=TOKENS,
)
fine_tuned_model.load_model()
fine_tuned_model.load_tokenizer()
fine_tuned_model.model_trainer()
fine_tuned_model.create_pipeline()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

**Main loop: interactive chat in the notebook**

In [5]:
# Interactive chat: keep prompting until the cell is interrupted or input ends.
while True:
    try:
        prompt = input("\n<You>: ")
        # Must be exactly the text FineTuned_LlaMA.inference sends to the
        # pipeline (no trailing space). The pipeline echoes it at the start of
        # 'generated_text'; a prefix that is one character too long would
        # silently eat the first character of the model's answer.
        prefix = f"<s>[INST] {prompt} [/INST]"
        response = fine_tuned_model.inference(prompt)[0]['generated_text']
        if response.startswith(prefix):
            response = response[len(prefix):]
        # Drop the whitespace the model emits right after the instruction tag.
        print(f"\n<LlaMA> : {response.lstrip()}")
    except (KeyboardInterrupt, EOFError):
        # EOFError: input() has no more data (e.g. non-interactive execution).
        print("Bye!")
        break


<You>: hey

<LlaMA> :  Hey there! How's it going?
Bye!

<You>: 


**Main app: Flask server exposed through ngrok**

In [6]:
from flask import Flask,request
from flask_ngrok import run_with_ngrok

# Flask application that serves the model over HTTP.
app = Flask(__name__)
# flask_ngrok hook: registers the app so that starting it also opens an ngrok
# tunnel (the run output below shows a public *.ngrok-free.app URL).
run_with_ngrok(app)

@app.route("/")
def home():
    """Landing route: return a small HTML banner confirming the server is up."""
    status_banner = "<h1>Server Looks Good!</h1>"
    return status_banner

@app.route('/generate_response', methods=['POST'])
def receive_data():
    """Generate a model reply for the prompt posted as JSON.

    Expects a JSON object whose ``'key'`` field holds the prompt.

    Returns:
        ``(text, 200)`` with the model's reply (prompt template removed), or
        ``(message, 400)`` when the body is not a JSON object or has no
        ``'key'`` field.
    """
    # silent=True: yield None instead of raising when the body is missing or
    # is not valid JSON, so the client gets one consistent 400 message below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'key' not in data:
        return "Request body must be a JSON object with a 'key' field.", 400

    prompt = data['key']
    # Must be exactly the text FineTuned_LlaMA.inference sends to the pipeline
    # (no trailing space); a longer prefix would cut into the model's answer.
    prefix = f"<s>[INST] {prompt} [/INST]"
    response = fine_tuned_model.inference(prompt)[0]['generated_text']
    if response.startswith(prefix):
        response = response[len(prefix):]
    # Drop the whitespace the model emits right after the instruction tag.
    cleaned_response = response.lstrip()
    return cleaned_response, 200

In [7]:
# Start the Flask development server; this call blocks the cell while serving requests.
app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://b195-35-231-41-208.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 22:54:25] "POST /generate_response HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 22:55:12] "POST /generate_response HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 22:56:44] "POST /generate_response HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 22:58:44] "POST /generate_response HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 23:00:53] "POST /generate_response HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 23:03:10] "POST /generate_response HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 23:03:53] "POST /generate_response HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 23:07:35] "POST /generate_response HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 23:11:01] "POST /generate_response HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2024 23:12:17] "POST /generate_response HTTP/1.1" 200 -
