In [1]:
import json
from datasets import Dataset
from transformers import AutoTokenizer

# 설정
MODEL_NAME = "MLP-KTLim/llama-3-Korean-Bllossom-8B"
JSON_PATH = "./classified_Empathy_Supervisor.json"
MAX_LENGTH = 512

# JSON 파일 로딩 (content만 추출)
with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# 'content'만으로 Dataset 구성
texts = [item["content"] for item in data]
dataset = Dataset.from_dict({"text": texts})

In [2]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=MAX_LENGTH)

tokenized_dataset = dataset.map(tokenize)

Map:   0%|          | 0/1918 [00:00<?, ? examples/s]

In [3]:
import torch
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848


In [4]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

OUTPUT_DIR = "./empathy_adapter"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    report_to="none"
)

# GPT류 모델용 LM Collator (MLM=False 필수)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model.cpu(),
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,3.0396
20,2.7566
30,2.531
40,2.4436
50,2.3394
60,2.2638
70,2.1811
80,2.2546
90,2.3985
100,2.1382


TrainOutput(global_step=720, training_loss=2.0050131479899087, metrics={'train_runtime': 998.835, 'train_samples_per_second': 5.761, 'train_steps_per_second': 0.721, 'total_flos': 1.1640207972114432e+16, 'train_loss': 2.0050131479899087, 'epoch': 3.0})

In [5]:
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

('./empathy_adapter/tokenizer_config.json',
 './empathy_adapter/special_tokens_map.json',
 './empathy_adapter/chat_template.jinja',
 './empathy_adapter/tokenizer.json')