In [None]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

In [None]:
# Setting up packages
import os
import torch
import transformers
import logging
import json 
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer
from google.colab import userdata

# Huggingface token: copy the Colab secret into the environment so that the
# Hugging Face libraries and the LargeLanguageModel class can read it.
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

# Setup device: first CUDA GPU when available, otherwise CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Setup logging: write timestamped INFO messages to logfile.txt.
# force=True replaces any handlers already attached to the root logger;
# without it basicConfig() is a silent no-op whenever the kernel or an
# imported library has configured logging first, and nothing reaches the file.
logging.basicConfig(format='%(asctime)s %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    filename='logfile.txt',
                    level=logging.INFO,
                    force=True)

In [None]:
class LargeLanguageModel():
    """Causal language model wrapper with text generation and LoRA fine-tuning helpers.

    Hyperparameters are read from ``params.json`` in the working directory. The
    file must contain a ``"training"`` section (keys:
    ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
    ``warmup_steps``, ``max_steps``, ``learning_rate``, ``fp16``,
    ``logging_steps``, ``output_dir``, ``optim``) and a ``"lora"`` section
    (keys: ``r``, ``target_modules``, ``task_type``).
    """

    def __init__(self,model_id,bnb_config = False):
        """Load the model, its tokenizer and the LoRA configuration.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.
            bnb_config: Quantization config forwarded as ``quantization_config``.
                Any falsy value (including the default ``False``) means
                "load without quantization".

        Raises:
            FileNotFoundError: If ``params.json`` does not exist.
            KeyError: If ``params.json`` lacks a required section or key.
        """
        self.model_id = model_id

        # hyperparameters
        with open('params.json', 'r', encoding='utf-8') as file:
            params = json.load(file)
        self.train_params = params["training"]
        lora_params = params["lora"]

        # None (variable unset) lets the Hugging Face libraries fall back to
        # their own credential lookup instead of failing here with a KeyError.
        self.token = os.environ.get('HF_TOKEN')

        # from_pretrained expects a quantization config object or None; the
        # legacy default ``False`` is not a valid config, so normalise it.
        self.bnb_config = bnb_config or None

        # Model object (placed entirely on GPU 0 via device_map)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_id,
                                             quantization_config=self.bnb_config,
                                             device_map={"":0},
                                             token=self.token)
        # Tokenizer object
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, token=self.token)

        # Lora object
        self.lora_config = LoraConfig(
                                    r=lora_params["r"],
                                    target_modules=lora_params["target_modules"],
                                    task_type=lora_params["task_type"],
                                )

        logging.info("LLM class instantiated")
        logging.info(f"Hyperparameters: \n{params}")


    def generate_example(self,text):
        """Generate up to 50 new tokens continuing ``text`` and return the decoded string.

        Args:
            text: Prompt to tokenize and feed to the model.

        Returns:
            The decoded first output sequence with special tokens removed.
        """
        # Move the inputs to wherever the model actually lives rather than
        # relying on a notebook-level ``device`` global.
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(**inputs, max_new_tokens=50)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    @staticmethod
    def _get_column(example, name):
        """Return ``example[name]``, falling back to the lowercase spelling of ``name``."""
        for key in (name, name.lower()):
            if key in example:
                return example[key]
        raise KeyError(
            f"Column {name!r} (or {name.lower()!r}) not found; "
            f"available columns: {list(example)}"
        )

    @staticmethod
    def formatting_func(example):
        """Render instruction/response pairs as user/model turn strings.

        SFTTrainer calls this on batches (a mapping of column name to a list
        of values) and expects one string per example. Every row of the batch
        is formatted; formatting only element ``[0]`` would silently drop the
        rest of each batch from training.

        Args:
            example: Mapping with ``INSTRUCTION``/``RESPONSE`` columns (the
                lowercase spellings are accepted as well). Values may be lists
                (a batch) or plain strings (a single example).

        Returns:
            A list of formatted prompt strings, one per example.

        Raises:
            KeyError: If neither spelling of a required column is present.
        """
        instructions = LargeLanguageModel._get_column(example, "INSTRUCTION")
        responses = LargeLanguageModel._get_column(example, "RESPONSE")

        # A single, un-batched example carries plain strings; wrap them so the
        # loop below does not iterate over individual characters.
        if isinstance(instructions, str):
            instructions, responses = [instructions], [responses]

        return [
            f"<start_of_turn>user\n{instruction}<end_of_turn> <start_of_turn>model\n{response}<end_of_turn>"
            for instruction, response in zip(instructions, responses)
        ]

    def finetune(self,dataset):
        """Fine-tune the model with LoRA on the ``train`` split of ``dataset``.

        Args:
            dataset: Object whose ``data`` attribute is a mapping containing a
                ``"train"`` split.
        """
        logging.info("Finetuning started")
        trainer = SFTTrainer(
            model = self.model,
            train_dataset = dataset.data["train"],
            args=transformers.TrainingArguments(
                per_device_train_batch_size = self.train_params["per_device_train_batch_size"],
                gradient_accumulation_steps = self.train_params["gradient_accumulation_steps"],
                warmup_steps = self.train_params["warmup_steps"],
                max_steps = self.train_params["max_steps"],
                learning_rate = self.train_params["learning_rate"],
                fp16 = self.train_params["fp16"],
                logging_steps = self.train_params["logging_steps"],
                output_dir = self.train_params["output_dir"],
                optim = self.train_params["optim"]
            ),
            peft_config = self.lora_config,
            formatting_func = self.formatting_func,
        )
        trainer.train()
        logging.info("Finetuning completed")


class Dataset():
    """Holder for a dataset loaded with ``load_dataset``."""

    def __init__(self):
        # Populated by load_data(); stays None until then.
        self.data = None

    def load_data(self, dataset_id):
        """Load ``dataset_id`` via ``load_dataset`` and store the result in ``self.data``.

        Args:
            dataset_id: Dataset identifier passed to ``load_dataset``.
        """
        logging.info("Loading dataset")
        self.data = load_dataset(dataset_id)
        logging.info("Loading dataset completed")


    def print_dataset_values(self):
        """Print the ``train`` split of the loaded dataset.

        Raises:
            RuntimeError: If called before ``load_data``.
        """
        # Fail with an actionable message instead of the opaque
        # "'NoneType' object is not subscriptable".
        if self.data is None:
            raise RuntimeError("Dataset not loaded; call load_data() first")
        print(self.data['train'])

In [None]:
# Defining model and dataset
model_id = "google/gemma-2b"
dataset_id = "databricks/databricks-dolly-15k"

# Setting up Quantization: 4-bit NF4 weights with bfloat16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Generating objects
llm = LargeLanguageModel(model_id, bnb_config)
dataset = Dataset()
dataset.load_data(dataset_id)
# Dataset defines print_dataset_values(); the previously called
# return_dataset_values() does not exist and raised AttributeError.
dataset.print_dataset_values()

In [None]:
# Run LoRA fine-tuning of the loaded model on the dataset's train split
llm.finetune(dataset=dataset)