# Import Libraries

In [None]:
import unicodedata

import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset

# PEFT (LoRA adapter configuration)
from peft import LoraConfig

# CONFIG

In [None]:
class CFG:
    """Central configuration: dataset paths, base model id, LoRA and training settings."""

    # Paths to the CSV datasets
    EMBEDDING_CSV_FULL_TRAIN = './datas/stf_e5_base_full_train.csv'
    EMBEDDING_CSV_TRAIN = './datas/stf_e5_base_train.csv'
    EMBEDDING_CSV_VAL = './datas/stf_e5_base_val.csv'

    # Identifier of the pre-trained model that is fine-tuned
    PRETRAINING_MODEL = "rtzr/ko-gemma-2-9b-it"

    # LoRA hyperparameters
    LoRA_RANK = 16
    LoRA_ALPHA = 32
    LoRA_DROPOUT = 0.05

    # Training output directory. The misspelled name is kept because existing
    # code reads it; TRAINING_RESULT_DIR is the correctly spelled alias.
    TRAINING_RESUTL_DIR = './finetune_models/training_result'
    TRAINING_RESULT_DIR = TRAINING_RESUTL_DIR

    # Batch sizes and epoch count
    PER_TRAIN_BATCH_SIZE = 2
    PER_EVAL_BATCH_SIZE = 2
    NUM_TRAIN_EPOCHS = 5

    # Logging / checkpoint cadence (in steps) and log directory
    LOGGING_DIR = './finetune_models/training_logs'
    LOGGING_STEPS = 1000
    SAVE_STEPS = 1000

    # Where the fine-tuned model and tokenizer are written
    SAVING_FINETUNING_MODEL_DIR = "./finetune_models/gemma_ko_9b_ver1.01"

### Loading default Pre-trained Model

In [None]:
# 4-bit quantization settings: NF4 quantization type, double quantization
# enabled, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Model ID (taken from the central configuration)
model_id = CFG.PRETRAINING_MODEL

# Load and configure the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Turn off the tokenizer's default-system-prompt flag
tokenizer.use_default_system_prompt = False

# Load the model with the quantization config applied.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository — confirm this checkpoint actually requires it.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

### Show what the model looks like

In [None]:
# Print every parameter name of the loaded model together with its
# requires_grad flag.
for name, param in model.named_parameters():
    print(name, param.requires_grad)

### Preparing Parameters for Finetuning

In [None]:
# LoRA configuration; rank, alpha and dropout come from CFG.
peft_config = LoraConfig(
    r=CFG.LoRA_RANK,
    lora_alpha=CFG.LoRA_ALPHA,
    lora_dropout=CFG.LoRA_DROPOUT,
    bias="none",
    # Only the token embedding module is targeted. The "able"/"unable" tags
    # below are the author's notes — presumably whether LoRA could be attached
    # to that module; TODO confirm.
    target_modules=[
    "model.embed_tokens", # able
    #"model.layers.0.input_layernorm", # unable
    #"model.layers.0.post_attention_layernorm", # unable
    #"model.layers.0.pre_feedforward_layernorm", # unable
    #"model.layers.0.post_feedforward_layernorm", # unable
    #"model.norm" # unable
    ],
    task_type="CAUSAL_LM",
)

# Trainer hyperparameters, all sourced from CFG.
# No evaluation strategy is set: the trainer is created without an
# eval_dataset, and requesting step-wise evaluation without one makes
# Trainer raise. Leaving the argument out keeps the default (no evaluation)
# and also avoids the deprecated `evaluation_strategy` keyword.
training_args = TrainingArguments(
    output_dir=CFG.TRAINING_RESUTL_DIR,
    per_device_train_batch_size=CFG.PER_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=CFG.PER_EVAL_BATCH_SIZE,
    num_train_epochs=CFG.NUM_TRAIN_EPOCHS,
    logging_dir=CFG.LOGGING_DIR,
    logging_steps=CFG.LOGGING_STEPS,
    save_steps=CFG.SAVE_STEPS,
)

### Finetuning and Saving Finetuned Model

In [None]:
def Finetune_llm_with_SFT_Trainer():
    """Fine-tune the loaded model with SFTTrainer and save the result.

    Uses the module-level ``model``, ``tokenizer``, ``training_args`` and
    ``peft_config``. Training data is the CSV at
    ``CFG.EMBEDDING_CSV_FULL_TRAIN``; no evaluation dataset is passed to the
    trainer. After training, the trainer's model and the tokenizer are saved
    to ``CFG.SAVING_FINETUNING_MODEL_DIR``.
    """

    # Prompt layout: instruction line, context, question, answer.
    # Built from explicit "\n"-terminated pieces so the trailing spaces of the
    # original template survive editors that strip whitespace.
    # NOTE(review): the "# " prefix on every line after the first is kept
    # exactly as in the original template; it looks like a leftover from
    # commenting the cell out — confirm it matches the inference-time prompt.
    prompt_template = (
        "다음 정보를 바탕으로 질문에 답하세요. 답변은 꼭 문장으로 하세요. 주어를 꼭 적으세요. :\n"
        "# {context}\n"
        "# \n"
        "# 질문: {question}\n"
        "# \n"
        "# 답변: {answer}\n"
        "# "
    )

    def normalize_string(s):
        """Return *s* in Unicode NFC form."""
        return unicodedata.normalize('NFC', s)

    def build_prompt(context, question, answer):
        """Fill the template for one row and NFC-normalize the result."""
        filled = prompt_template.format(
            context=context, question=question, answer=answer
        )
        return normalize_string(filled)

    def formatting_prompts_func(example):
        """Turn dataset rows into training prompts.

        The original template was a plain string, so its placeholders were
        never substituted, and the loop ran over ``len(example)`` (the number
        of columns). Here each row's Context/Question/Answer is interpolated.

        Accepts either a batch (each column maps to a list; returns a list of
        prompts) or a single row (each column maps to a scalar; returns one
        prompt), because SFTTrainer has called ``formatting_func`` in either
        mode depending on the TRL release — verify against the installed one.
        """
        contexts = example['Context']
        questions = example['Question']
        answers = example['Answer']

        if isinstance(questions, (list, tuple)):
            return [
                build_prompt(context, question, answer)
                for context, question, answer in zip(contexts, questions, answers)
            ]
        return build_prompt(contexts, questions, answers)

    # Train on the full training CSV. To hold out a validation split instead,
    # load CFG.EMBEDDING_CSV_TRAIN / CFG.EMBEDDING_CSV_VAL and pass the latter
    # to the trainer as eval_dataset.
    train_dataset = load_dataset('csv', data_files=CFG.EMBEDDING_CSV_FULL_TRAIN)['train']

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        peft_config=peft_config,
        formatting_func=formatting_prompts_func,
        train_dataset=train_dataset,
    )

    # Train model
    trainer.train()

    # Save trained model and the tokenizer side by side
    trainer.model.save_pretrained(CFG.SAVING_FINETUNING_MODEL_DIR)
    tokenizer.save_pretrained(CFG.SAVING_FINETUNING_MODEL_DIR)

In [None]:
# Run the fine-tuning routine defined above (trains, then saves the model and tokenizer).
Finetune_llm_with_SFT_Trainer()