## LLaMA-Factory 코드 및 Dataset 준비

In [None]:
!git clone https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory/data
!wget -O ko_civil_service_inst.json https://github.com/superdom/blog/raw/main/052024-llamafactory/ko_civil_service_inst.json
%ls | grep ko_civil_service_int.json
%cd ..
%cd /content/LLaMA-Factory/

## LLaMA-Factory 및 Unsloth 설치

In [None]:
# Install Unsloth (with its `colab-new` extra) directly from the GitHub repository.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install xformers on its own; --no-deps stops pip from pulling in or changing its dependencies.
!pip install --no-deps xformers
# Install the package in the current directory with its `bitsandbytes` extra
# (assumes the working directory is the LLaMA-Factory checkout — TODO confirm).
!pip install .[bitsandbytes]

## Custom Dataset 추가

In [None]:
# Install jq non-interactively (-y); it is used to edit ./data/dataset_info.json.
!apt-get install jq -y

In [4]:
# Register the custom dataset by merging a `ko_civil_service_inst` entry (pointing at
# ko_civil_service_inst.json) with the existing contents of dataset_info.json.
# The result goes to a temp file first and is then moved into place, because
# redirecting straight onto the input file would truncate it before jq reads it.
!jq '. |= {"ko_civil_service_inst": {"file_name": "ko_civil_service_inst.json"}} + .' ./data/dataset_info.json > ./data/tmp.json && mv ./data/tmp.json ./data/dataset_info.json

In [None]:
# Show the first lines of dataset_info.json to check the result of the jq edit.
!head ./data/dataset_info.json

## Training 환경 구성

In [6]:
import os

# Ensure the directory that will hold the training YAML exists; a no-op when it is already there.
os.makedirs(name="config", exist_ok=True)

In [7]:
# Training configuration for QLoRA supervised fine-tuning of the 4-bit
# Llama-3-8B-Instruct checkpoint with Unsloth enabled.
# The chat template is `llama3` so the prompt format used for training matches
# the one used by the inference and export cells; `use_unsloth` is declared
# exactly once to avoid a duplicate YAML key.
config_yaml = """
model_name_or_path: unsloth/llama-3-8b-Instruct-bnb-4bit
quantization_bit: 4
use_unsloth: true

### method
stage: sft
do_train: true
flash_attn: auto
finetuning_type: lora
lora_target: all
lora_rank: 8
lora_alpha: 16
lora_dropout: 0

### dataset
dataset: ko_civil_service_inst
template: llama3
cutoff_len: 1024
#max_samples: 1000
#overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: output/llama-3-8b-Instruct-bnb-4bit/qlora
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 4
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
fp16: true
report_to: none

### eval
val_size: 0.1
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 100
"""

In [8]:
# Persist the YAML text to disk so it can be passed to `llamafactory-cli train`.
config_path = "config/llama3-8b-instruct-bnb-4bit-unsloth.yaml"
with open(config_path, mode="w") as config_file:
    config_file.write(config_yaml)

## Training!

In [None]:
# Launch training with the YAML configuration written in the previous cell.
!llamafactory-cli train config/llama3-8b-instruct-bnb-4bit-unsloth.yaml

## Chat 모드로 Inference Test

In [None]:
from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc

# Load the 4-bit base model together with the LoRA adapter saved by the training run.
args = {
  "model_name_or_path": "unsloth/llama-3-8b-Instruct-bnb-4bit",
  "adapter_name_or_path": "output/llama-3-8b-Instruct-bnb-4bit/qlora",
  "template": "llama3",
  "finetuning_type": "lora",
  "quantization_bit": 4,
}
chat_model = ChatModel(args)

# Interactive chat loop: `exit` leaves the loop, `clear` drops the conversation history.
messages = []
print("Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.")
while True:
  user_text = input("\nUser: ")
  command = user_text.strip()

  if command == "exit":
    break

  if command == "clear":
    messages = []
    torch_gc()
    print("History has been removed.")
    continue

  # The raw (unstripped) input is what gets stored in the history.
  messages.append({"role": "user", "content": user_text})
  print("Assistant: ", end="", flush=True)

  # Echo each streamed piece as it arrives and keep the pieces for the history entry.
  chunks = []
  for chunk in chat_model.stream_chat(messages):
    print(chunk, end="", flush=True)
    chunks.append(chunk)
  print()

  response = "".join(chunks)
  messages.append({"role": "assistant", "content": response})

torch_gc()

## 모델을 병합하여 저장

In [None]:
# Log in to the Hugging Face Hub
# (presumably required to download the meta-llama base model used by the merge step — TODO confirm).
!huggingface-cli login

In [13]:
import json

# Export settings for merging the trained LoRA adapter into the base model.
args = dict(
  model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct", # the original Llama model, not Unsloth's quantized checkpoint
  adapter_name_or_path="output/llama-3-8b-Instruct-bnb-4bit/qlora",
  template="llama3",
  finetuning_type="lora",
  export_dir="output/Meta-Llama-3-8B-Instruct",
  export_size=2,                                  # size in GB of each shard the saved model is split into
  export_device="cpu",                            # device used to perform the merge (cpu or cuda)
  #export_hub_model_id="your_id/your_model",
)

# Use a context manager so the file is flushed and closed before the export
# command in the next cell reads it.
with open("merge_llama3.json", "w", encoding="utf-8") as merge_config_file:
  json.dump(args, merge_config_file, indent=2)

In [None]:
# Run the export (merge) step using the JSON settings written in the previous cell.
!llamafactory-cli export merge_llama3.json