# Phi3 Fine Tuning QLoRA

In [None]:
!pip install -qqq --upgrade bitsandbytes transformers peft accelerate datasets trl flash_attn

In [None]:
!pip install huggingface_hub
!pip install python-dotenv

In [None]:
!pip install wandb -qqq

In [None]:
!pip install absl-py nltk rouge_score

In [None]:
!pip list | grep transformers.

In [None]:
from random import randrange

import torch
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
    pipeline
)
from trl import SFTTrainer

In [None]:
# Run parameters: model, dataset, quantization and LoRA hyperparameters.
model_id = "microsoft/Phi-3-mini-4k-instruct"
# Later cells use both names; keep them pointing at the same checkpoint.
model_name = model_id

# Dataset repository on the Hugging Face Hub and the split to load.
dataset_name = "wonik-hi/korea_summary_Thesis"
dataset_split = "train"

# Name of the fine-tuned model and the Hub repository id derived from it.
new_model = "phi-3-mini-QLoRA"
hf_model_repo = "wonik-hi/" + new_model

# Load Model on GPU
device_map = {"": 0}

# 4-bit quantization settings.
use_4bit = True
# Backward-compatible alias for the previously misspelled name.
se_4bit = use_4bit
bnb_4bit_compute_dtype = "bfloat16"
bnb_4bit_quant_type = "nf4"
use_double_quant = True

# LoRA configuration for the model
lora_r = 16
lora_alpha = 16
lora_dropout = 0.05
target_modules = ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]

# Seed the random number generators once for the whole notebook.
set_seed(1234)

In [None]:
# Interactive Hugging Face Hub login from inside the notebook.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import os

from dotenv import load_dotenv
from huggingface_hub import login

# Non-interactive login: call load_dotenv() first, then pass whatever the
# HF_HUB_TOKEN environment variable holds (None when it is unset).
load_dotenv()
login(token=os.environ.get("HF_HUB_TOKEN"))

In [None]:
# Load the dataset
dataset = load_dataset(dataset_name, split=dataset_split)
# Demo subset: keep at most the first 500 rows (delete this line for a real
# training run). min() keeps the cell working when the split is smaller.
dataset = dataset.select(range(min(500, len(dataset))))
print(f"dataset size: {len(dataset)}")
# Print one randomly chosen row as a sanity check.
print(dataset[randrange(len(dataset))])

In [None]:
# Display the dataset object.
dataset

In [None]:
# Print another randomly chosen row.
print(dataset[randrange(len(dataset))])

# 데이터셋 준비를 위한 Tokenizer 로드 

In [None]:
# Load the base model's tokenizer; format_dataset_chatml uses it to render
# the chat template.
tokenizer_id = model_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.padding_side = 'right' # avoid warnings

In [None]:
# Chat messages
def create_message_column(row):
    """Build a two-turn chat transcript from one dataset row.

    Args:
        row: Mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        dict: ``{"messages": [user_turn, assistant_turn]}``; each turn is a
        dict with "content" and "role" keys.
    """
    # The user turn joins the instruction and the input text.
    user_turn = {
        "content": f"{row['instruction']}\n Input: {row['input']}",
        "role": "user",
    }
    # The assistant turn carries the expected output.
    assistant_turn = {"content": f"{row['output']}", "role": "assistant"}
    return {"messages": [user_turn, assistant_turn]}

def format_dataset_chatml(row):
    """Render a row's chat messages into one training string.

    Uses the module-level ``tokenizer`` and its chat template.

    Args:
        row: Mapping whose "messages" entry is the list of chat turns.

    Returns:
        dict: ``{"text": rendered}`` where ``rendered`` is the untokenized
        template output, produced without a generation prompt.
    """
    rendered = tokenizer.apply_chat_template(
        row["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

In [None]:
# Add the "messages" column, then render it into the "text" column.
dataset_chatml = dataset.map(create_message_column).map(format_dataset_chatml)

# Inspect the first processed row.
dataset_chatml[0]

In [None]:
# Hold out 5% of the rows as the 'test' split; the fixed seed makes the split repeatable.
dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)
# Display the resulting splits.
dataset_chatml

# QLoRA and trl

In [None]:
# GPU detection - training requires a GPU.
# Use bfloat16 with FlashAttention-2 when CUDA is present and reports bf16
# support; otherwise fall back to float16 with SDPA attention. Checking
# is_available() first means a machine without CUDA takes the fallback branch
# instead of depending on how is_bf16_supported() behaves without a device.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

print(attn_implementation)
print(compute_dtype)

## Fine Tuning을 위한 Tokenizer 불러오기

In [None]:
# Tokenizer used for fine-tuning.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, add_eos_token=True, use_fast=True)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'left'

# compute_dtype comes from the GPU-detection cell and is deliberately not
# overridden here, so the quantization compute dtype, the model dtype and the
# fp16/bf16 flags passed to TrainingArguments all agree.

# 4-bit quantization; quant type and double-quantization come from the
# parameter cell so there is a single place to change them.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_double_quant,
)

# Load the quantized base model with the attention implementation chosen earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map=device_map,
    attn_implementation=attn_implementation,
)

# Prepare the quantized model for k-bit training before LoRA is attached.
model = prepare_model_for_kbit_training(model)

In [None]:
# Trainer settings for the QLoRA run.
args = TrainingArguments(
    # Output and checkpointing
    output_dir="./phi-3-mini-QLoRA",
    save_strategy="epoch",
    # Batching and schedule
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=1,  # alternative kept from the notebook: num_train_epochs=3
    # Optimizer and learning-rate schedule
    optim="adamw_torch",
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    # Mixed precision: bf16 when the GPU reports support, fp16 otherwise
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    # Evaluation
    # NOTE(review): newer transformers releases appear to name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    do_eval=True,
    eval_steps=100,
    # Logging
    log_level="debug",
    logging_steps=100,
    report_to="wandb",
    seed=42,
)

# LoRA adapter settings, taken from the parameter cell.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
)


In [None]:
# Visualization (Weights & Biases)
import os

import wandb

# The mapping is os.environ; the misspelling "envrion" raises AttributeError.
os.environ["PROJECT"] = "phi-3-mini-QLoRA"
# WANDB_PROJECT is the variable name W&B documents for selecting the project.
os.environ["WANDB_PROJECT"] = "phi-3-mini-QLoRA"

In [None]:
project_name = "phi-3-mini-QLoRA"

# Start the W&B run; the run name is the same string as the project name.
wandb.init(name=project_name, project=project_name)

In [None]:
# Supervised fine-tuning trainer: quantized base model plus the LoRA settings
# from peft_config, trained on the rendered "text" column.
# NOTE(review): recent trl releases appear to move dataset_text_field /
# max_seq_length into SFTConfig - confirm against the installed trl version.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=args,
    train_dataset=dataset_chatml['train'],
    eval_dataset=dataset_chatml['test'],
    dataset_text_field="text",
    max_seq_length=512,
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

# Save the trained model locally (save_model() is called without an explicit path).
trainer.save_model()

In [None]:
# Intended Hub repository id for the LoRA adapter.
hf_adapter_repo="wonik-hi/phi-3-mini-QLoRA-adapter"

In [None]:
# NOTE(review): the first positional parameter of Trainer.push_to_hub looks
# like the commit message rather than a repository id - confirm against the
# transformers docs. A later cell loads the adapter from
# "wonik-hi/phi-3-mini-QLoRA", so verify which repository this actually fills.
trainer.push_to_hub(hf_adapter_repo)

In [None]:
import gc

# Drop the references to the training objects, then run the garbage
# collector twice.
del model, trainer
gc.collect()
gc.collect()

In [None]:
torch.cuda.empty_cache() # ask PyTorch to empty its CUDA cache after the objects above were deleted

In [None]:
# Repository the adapter is loaded from in the merge step below; this
# replaces the "-adapter" id assigned earlier.
hf_adapter_repo = "wonik-hi/phi-3-mini-QLoRA"

# Display the values the merge step will use.
model_name, hf_adapter_repo, compute_dtype

In [None]:
peft_model_id = hf_adapter_repo
tr_model_id = model_name

# Reload the base model (no quantization config is passed here), attach the
# trained adapter, and fold the adapter weights into the base model.
model = AutoModelForCausalLM.from_pretrained(
    tr_model_id,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(model, peft_model_id).merge_and_unload()

In [None]:
# Load the tokenizer from the same repository as the adapter.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

In [None]:
# Display the target repository id for the merged model.
hf_model_repo

In [None]:
# Upload the merged model and its tokenizer to the same Hub repository.
merged_model_id = hf_model_repo
model.push_to_hub(merged_model_id)
tokenizer.push_to_hub(merged_model_id)

In [None]:
# Display the repository id that was just pushed to.
hf_model_repo

In [None]:
# Repository id of the merged model used for inference below.
hf_model_repo='wonik-hi/phi-3-mini-QLoRA'

In [None]:
# Display the device map and dtype used when loading the model for inference.
device_map, compute_dtype

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

# Fix the random seeds for reproducibility.
set_seed(1234)

# Load the merged model and its tokenizer from the Hub repository.
tokenizer = AutoTokenizer.from_pretrained(hf_model_repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    hf_model_repo,
    device_map=device_map,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)  # alternatives noted in the notebook: torch_dtype "auto", device_map "cuda"

In [None]:
## prepare the dataset
# Rebuild the chat-formatted dataset with the tokenizer that is now loaded.
dataset_chatml = dataset.map(create_message_column)
dataset_chatml = dataset_chatml.map(format_dataset_chatml)
# Reuse the seed of the training-time split so that 'test' is the same
# held-out 5% and does not contain rows the model was trained on.
dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)
dataset_chatml

In [None]:
# Text-generation pipeline around the merged model and its tokenizer.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Preview the prompt string built from the first test row's user message,
# with the generation prompt appended.
pipe.tokenizer.apply_chat_template([{"role": "user", "content": dataset_chatml['test'][0]['messages'][0]['content']}], tokenize=False, add_generation_prompt=True)

In [None]:
def test_inference(prompt):
    """Generate a reply to one user prompt with the module-level ``pipe``.

    Args:
        prompt: Text of the user message.

    Returns:
        str: The generated text with the first ``len(prompt_text)``
        characters (the rendered prompt) sliced off and surrounding
        whitespace stripped.
    """
    # Wrap the message in the chat template and append the generation prompt.
    chat_prompt = pipe.tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    outputs = pipe(
        chat_prompt,
        max_new_tokens=256,
        do_sample=True,
        num_beams=1,
        temperature=0.3,
        top_k=50,
        top_p=0.95,
        max_time=180,
    )
    full_text = outputs[0]['generated_text']
    return full_text[len(chat_prompt):].strip()

In [None]:
# '%%time' is a magic command in Jupyter Notebook that measures the execution time of the cell it is placed in.
%%time

test_inference(dataset_chatml['test'][0]['messages'][0]['content'])

## 성능 평가

In [None]:
# Load the ROUGE metric used by the evaluation cells below.
# NOTE(review): `load_metric` may no longer exist in newer `datasets`
# releases - confirm against the installed version.
from datasets import load_metric
rouge_metric = load_metric("rouge", trust_remote_code=True)

def calculate_rogue(row):
    """Generate a reply for one row and score it against the reference.

    Args:
        row: Mapping with "messages" (the first entry is the user turn) and
            "output" (the reference text).

    Returns:
        dict: One entry per metric key holding ``mid.fmeasure * 100``, plus
        a 'response' entry with the generated text.
    """
    user_prompt = row['messages'][0]['content']
    response = test_inference(user_prompt)
    scores = rouge_metric.compute(
        predictions=[response],
        references=[row['output']],
        use_stemmer=True,
    )
    # Keep only the mid F-measure of every metric, scaled by 100.
    result = {}
    for key, value in scores.items():
        result[key] = value.mid.fmeasure * 100
    result['response'] = response
    return result

In [None]:
%%time
# Score the first 25 test rows one at a time (demo-sized run).
metricas = dataset_chatml['test'].select(range(0,25)).map(calculate_rogue, batched=False)
# For a real run, uncomment the line below and delete the line above.
#metricas = dataset_chatml['test'].select(range(0,500)).map(calculate_rogue, batched=False)


In [None]:
import numpy as np

In [None]:
# Average each ROUGE column over the scored rows and print it.
for label, column in (
    ("Rouge 1 Mean: ", 'rouge1'),
    ("Rouge 2 Mean: ", 'rouge2'),
    ("Rouge L Mean: ", 'rougeL'),
    ("Rouge Lsum Mean: ", 'rougeLsum'),
):
    print(label, np.mean(metricas[column]))

In [None]:
# Display the reference output of the first test row.
dataset_chatml['test'][0]['output']

In [None]:
# Number of test rows used for the batched evaluation below.
# Alternative kept from the notebook for a larger run:
#num_samples=500
num_samples=5

In [None]:
%%time
# '%%time' is a magic command in Jupyter Notebook that measures the execution time of the cell it is placed in.
prompts = [pipe.tokenizer.apply_chat_template([{"role": "user", "content": dataset_chatml['test'][i]['messages'][0]['content']}],
                                              tokenize=False, add_generation_prompt=True)
                                              for i in range(num_samples)]

In [None]:
# Generate replies for all prompts in batches of four.
outputs = pipe(
    prompts,
    batch_size=4,
    max_new_tokens=256,
    do_sample=True,
    num_beams=1,
    temperature=0.3,
    top_k=50,
    top_p=0.95,
    max_time=180,
)
# Keep the text that follows the first assistant marker of each generation.
preds = [
    generations[0]['generated_text'].split("<|assistant|>\n")[1].strip()
    for generations in outputs
]
# Reference outputs for the same rows, in the same order.
references = [dataset_chatml['test'][idx]['output'] for idx in range(len(outputs))]
# Queue the prediction/reference pairs on the metric for a later compute().
rouge_metric.add_batch(predictions=preds, references=references)

In [None]:
# Compute the ROUGE scores, with stemming enabled, for the pairs added via add_batch, and keep them in `result`.
result = rouge_metric.compute(use_stemmer=True)

In [None]:
# Each entry of `result` is an aggregate score object, not a list of numbers,
# so averaging it with np.mean would blend unrelated values. Report the mid
# F-measure scaled by 100, the same quantity calculate_rogue extracts.
print("Rouge 1 Mean: ", result['rouge1'].mid.fmeasure * 100)
print("Rouge 2 Mean: ", result['rouge2'].mid.fmeasure * 100)
print("Rouge L Mean: ", result['rougeL'].mid.fmeasure * 100)
print("Rouge Lsum Mean: ", result['rougeLsum'].mid.fmeasure * 100)

In [None]:

# ROUGE-1 is a metric for evaluating automatic summarization and machine translation: it measures the overlap of unigrams (single words) between the system output and the reference summaries.
# Display the full ROUGE-1 entry of `result`.
result['rouge1']