In [None]:
!pip install bitsandbytes==0.43.1
!pip install accelerate==0.30.1
!pip install transformers==4.39.3
!pip install peft==0.10.0
!pip install datasets

# Llama3 Model 양자화
- 파인튜닝에는 최소 L4 또는 A100급 GPU가 필요함

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

# Quantization settings (requires the bitsandbytes package): weights are loaded
# in 4-bit NF4 with double quantization, and compute runs in bfloat16.
# Named `bnb_config` rather than `config` because a later cell assigns its LoRA
# settings to `config`; with a shared name, re-running this cell afterwards
# would pass the LoRA object as the quantization config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_id = "beomi/Llama-3-Open-Ko-8B-Instruct-preview"

# A tokenizer has no weights to quantize, so it is loaded without the
# quantization config (the original passed it here, where it has no meaning).
tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE(review): trust_remote_code=True lets code shipped in the model repository
# run locally; it is presumably unnecessary for a stock Llama architecture —
# confirm and drop it if so.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="cuda:0",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the loaded model before handing it to PEFT.
model.gradient_checkpointing_enable()
# PEFT helper that readies a quantized model for training; presumably it freezes
# the base weights and upcasts some layers for stability — confirm in the PEFT docs.
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` provides ``numel()`` and
            ``requires_grad`` (for example a ``torch.nn.Module``).

    Returns:
        None. The trainable count, the total count and the trainable
        percentage are written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # bitsandbytes packs 4-bit weights into wider storage elements, so
        # numel() reports the packed size. PEFT's own counter corrects this by
        # assuming two 4-bit values per storage byte; the same rule is used here.
        if param.__class__.__name__ == "Params4bit":
            storage_bytes = param.element_size() if hasattr(param, "element_size") else 1
            num_params *= 2 * storage_bytes
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params}\nall params: {all_param}\ntrainable%: {trainable_pct}"
    )

In [None]:
# With full fine-tuning about 4 billion parameters would have to be trained;
# with LoRA fine-tuning only about 3.4 million are.
# NOTE(review): the model id names an 8B model, so the 4-billion figure
# presumably comes from counting packed 4-bit weights — confirm.
from peft import LoraConfig, get_peft_model

# Kept under its own name instead of `config`, which the model-loading cell
# already uses for the quantization settings; sharing the name would make
# re-running that cell pick up the LoRA settings by mistake.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    task_type="CAUSAL_LM"
)

# Wrap the prepared base model with LoRA adapters, then report parameter counts.
# NOTE(review): this rebinds `model`, so running the cell twice wraps the model
# twice — restart from the model-loading cell before re-running.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

# HuggingFace Login

In [None]:
import huggingface_hub
from huggingface_hub import notebook_login

# Open the Hugging Face login prompt inside the notebook.
# NOTE(review): nothing visible in this notebook pushes to the Hub; presumably the
# token is needed for gated/private downloads or a later upload — confirm.
notebook_login()

# Load RAG dataset

In [None]:
from datasets import load_dataset

# Fetch the Korean Dolly-15k instruction dataset.
dataset = load_dataset("nlpai-lab/databricks-dolly-15k-ko")

# Categories kept for RAG-style training (presumably the ones whose rows come
# with a context passage — confirm against the dataset card).
categories = ['closed_qa', 'information_extraction', 'summarization']


def _in_selected_category(example):
    """Return True when the example's category is listed in ``categories``."""
    return example['category'] in categories


filtered_dataset = dataset['train'].filter(_in_selected_category)

# Summary of the filtered split.
print(filtered_dataset)

In [None]:
# Show the first record of the unfiltered train split to inspect the row fields.
dataset['train'][0]

In [None]:
# Print the distinct category values that remain after filtering.
print(set(filtered_dataset['category']))

In [None]:
# Convert the data into a form that Llama 3 can recognize.
# The system prompt is taken from one used when running Llama 3 in a RAG setup.
SYSTEM_PROMPT = (
    "You are an assistant for answering questions. "
    "You are given the extracted parts of a long document and a question. "
    "Provide a conversational answer. "
    "Don't make up an answer."
)

# Reuse the EOS token as the padding token.
# NOTE(review): the training cell uses DataCollatorForLanguageModeling, which per
# the Transformers docs ignores pad-token positions in the loss; with pad == EOS
# that presumably hides EOS tokens from the loss as well — confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token

def get_rag_train_prompt(row):
    """Tokenize one dataset row as a system/user/assistant chat.

    Args:
        row: Mapping with 'instruction', 'context' and 'response' entries.

    Returns:
        dict with the single key "input_ids", holding whatever
        ``tokenizer.apply_chat_template`` returns for the conversation.
    """
    # The instruction is prefixed with "Context에 따르면, "
    # ("According to the context, ") to tie the question to the passage.
    question = "Context에 따르면, " + row['instruction']
    user_prompt = '###Context:{}\n###Question:{}'.format(row['context'], question)

    # System prompt, grounded user question, then the reference answer as the
    # assistant turn.
    conversation = [
        dict(role="system", content=SYSTEM_PROMPT),
        dict(role="user", content=user_prompt),
        dict(role="assistant", content=row['response']),
    ]

    token_ids = tokenizer.apply_chat_template(
        conversation,
        padding=True,
        truncation=True,
    )
    return dict(input_ids=token_ids)

In [None]:
# Run the prompt builder over every filtered row; the later cells read its
# "input_ids" output from the resulting dataset.
new_dataset = filtered_dataset.map(get_rag_train_prompt)

In [None]:
# Decode one tokenized example back to text to check the chat formatting by eye.
print(tokenizer.decode(new_dataset[1]['input_ids']))

# Training Model
- L4, A100 등에서 학습 가능하며, 약 2시간 반 정도 걸림

In [None]:
import transformers

# Hyperparameters for the run; checkpoints are written to "outputs" every 200
# steps and at most three are kept.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    # Schedule
    max_steps=2400,
    warmup_steps=10,
    learning_rate=2e-4,
    # Memory / precision
    auto_find_batch_size=True,
    gradient_checkpointing=True,
    fp16=False,
    bf16=True,
    optim="paged_adamw_8bit",
    # Logging / checkpointing
    logging_steps=10,
    save_steps=200,
    save_total_limit=3,
)

# mlm=False selects causal language modeling rather than masked-LM batches.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=new_dataset,
    data_collator=causal_lm_collator,
)

# Switch off the model's use_cache flag before training (presumably because
# caching is not useful with gradient checkpointing — confirm).
model.config.use_cache = False
trainer.train()