In [1]:
import os
import json
import glob
import torch
import torch.nn as nn
from datasets import Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForLanguageModeling, Trainer

os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"


In [None]:
class CustomTrainer(Trainer) :
    def compute_loss(self, model, inputs, return_outputs = False, **kwargs) :
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        labels = labels.to(logits.device)
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.vocab_size), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
    
class QnADataset :
    def __init__(self, data_paths, tokenizer, max_length=32786) :
        self.data = self.load_data(data_paths)
        self.max_length = max_length
        self.tokenizer = tokenizer

    def load_data(self, data_paths) :
        all_loaded_data = []
        for path in data_paths :
            with open(path, 'r', encoding="utf-8") as f :
                data = json.load(f)
                all_loaded_data.extend(data)
        return all_loaded_data
    
    def prepare_input_output(self, item) :
        input_text = f"질문 : {item['question']}\n문서 : {item['context']}\n답변 : "
        output_text = item["answer"]

        return input_text, output_text
    
    def tokenize_data(self) :
        tokenized_samples = []
        for item in self.data :
            input_text, output_text = self.prepare_input_output(item)

            input_tokens_ids = self.tokenizer(
                input_text,
                add_special_tokens=False
            )["input_ids"]

            output_tokens_ids = self.tokenizer(
                output_text,
                add_special_tokens=False
            )["input_ids"]

            full_sequence_ids = input_tokens_ids + output_tokens_ids
            labels = [-100] * len(input_tokens_ids) + output_tokens_ids

            if self.tokenizer.eos_token_id is not None :
                full_sequence_ids.append(self.tokenizer.eos_token_id)
                labels.append(self.tokenizer.eos_token_id)

            if len(full_sequence_ids) > self.max_length :
                full_sequence_ids = full_sequence_ids[:self.max_length]
                labels = labels[:self.max_length]

            attention_mask = [1] * len(full_sequence_ids)

            tokenized_samples.append({
                "input_ids" : full_sequence_ids,
                "labels" : labels,
                "attention_mask" : attention_mask
            })

        dataset = Dataset.from_list(tokenized_samples)

        return dataset
    
def setup_model_and_tokenizer(model_name) :
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quants=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        # device_map={"":0},
        device_map="auto",
        trust_remote_code=True
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side="right"

    return model, tokenizer

def setup_dora_config() :
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        use_dora=True
    )

    return lora_config

def create_sample_data() :
    sample_data = [
        {
            "question": "파이썬에서 리스트를 어떻게 정렬하나요?",
            "context": "파이썬 리스트는 sort() 메서드나 sorted() 함수를 사용하여 정렬할 수 있습니다. sort()는 원본 리스트를 수정하고, sorted()는 새로운 정렬된 리스트를 반환합니다.",
            "answer": "파이썬에서 리스트를 정렬하는 방법은 두 가지입니다. 1) list.sort() - 원본 리스트를 직접 수정하여 정렬합니다. 2) sorted(list) - 원본을 유지하고 새로운 정렬된 리스트를 반환합니다."
        },
        {
            "question": "딥러닝에서 과적합이란 무엇인가요?",
            "context": "과적합(Overfitting)은 모델이 훈련 데이터에 너무 특화되어 새로운 데이터에 대한 일반화 성능이 떨어지는 현상입니다. 훈련 정확도는 높지만 검증 정확도가 낮은 특징을 보입니다.",
            "answer": "과적합은 모델이 훈련 데이터에만 과도하게 맞춰져서 새로운 데이터에 대한 예측 성능이 떨어지는 현상입니다. 드롭아웃, 정규화, 조기 종료 등의 방법으로 방지할 수 있습니다."
        }
    ]

    if not os.path.exists("./data") :
        os.makedirs("./data")
    
    with open("./data/qna_data.json", 'w', encoding="utf-8") as f :
        json.dump(sample_data, f, ensure_ascii=False, indent=2)

def main() :
    model_name = "../model/LLM/deepseek-qwen-bllossom-32b"
    data_dir = "../data/HarryPotterQA"
    output_dir = "../model/finetuned-qwen-harrypotter"

    all_json_files = glob.glob(os.path.join(data_dir, "*.json"))
    if not all_json_files :
        print(f"Error: No Json files found : {data_dir}")
        return
    
    model, tokenizer = setup_model_and_tokenizer(model_name)

    if tokenizer.pad_token is None :
        tokenizer.pad_token = tokenizer.eos_token

    model = prepare_model_for_kbit_training(model)

    dora_config = setup_dora_config()
    model = get_peft_model(model, dora_config)

    model.print_trainable_parameters()

    dataset_handler = QnADataset(all_json_files, tokenizer)
    train_dataset = dataset_handler.tokenize_data()

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="no",
        warmup_steps=100,
        lr_scheduler_type="cosine",
        remove_unused_columns=False,
        dataloader_pin_memory=False
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    print("DoRA 시작")
    trainer.train()

    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    print(f"DoRA 파인튜닝 완료 : {output_dir}")

if __name__ == "__main__" : 
    main()

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

trainable params: 34,340,864 || all params: 32,798,217,216 || trainable%: 0.1047


  trainer = CustomTrainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


DoRA 시작




Step,Training Loss


KeyboardInterrupt: 

: 