In [1]:
import json
import torch
from datasets import Dataset

class QnADataset :
    def __init__(self, data_path, tokenizer, max_length=512) :
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(data_path)

    def load_data(self, data_path) :
        with open(data_path, 'r', encoding="utf-8") as f :
            data = json.load(f)
        return data
    
    def prepare_input_output(self, item) :
        input_text = f"질문 : {item['question']}\n문서 : {item['context']}\m답변 : "

        output_text = item["answer"]
        
        return input_text, output_text
    
    def tokenize_data(self) :
        input_ids_list = []
        attention_mask_list = []
        labels_list = []

        for item in self.data :
            input_text, output_text = self.prepare_input_output(item)

            input_tokens = self.tokenizer(
                input_text,
                add_special_tokens=False,
                return_tensors="pt"
            )["input_ids"].squeeze()

            output_tokens = self.tokenizer(
                output_text,
                add_special_tokens=False,
                return_tensors="pt"
            )["input_ids"].squeeze()

            full_sequence = torch.cat([input_tokens, output_tokens])

            labels = torch.cat([
                torch.full_like(input_tokens, -100),
                output_tokens
            ])

            if len(full_sequence) > self.max_length :
                full_sequence = full_sequence[:self.max_length]
                labels = labels[:self.max_length]

            attention_mask = torch.ones_like(full_sequence)

            input_ids_list.append(full_sequence)
            attention_mask_list.append(attention_mask)
            labels_list.append(labels)

        input_ids_list = torch.nn.utils.rnn.pad_sequence(
            input_ids_list, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )

        attention_mask_list = torch.nn.utils.rnn.pad_sequence(
            attention_mask_list, batch_first=True, padding_value=0
        )

        labels_list = torch.nn.utils.rnn.pad_sequence(
            labels_list, batch_first=True, padding_value=-100
        )

        dataset = Dataset.from_dict({
            "input_ids" : input_ids_list,
            "attention_mask" : attention_mask_list,
            "labels" : labels_list
        })

        return dataset

from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import os
import torch

def setup_model_and_tokenizer(model_name) :
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    # GPU 개수 확인
    num_gpus = torch.cuda.device_count()
    print(f"사용 가능한 GPU 개수: {num_gpus}")
    
    # 첫 번째 GPU에 로드하고 나중에 DataParallel 적용
    device_map = {"": 0}
    print(f"모델을 GPU 0에 로드 후 DataParallel 적용 예정")

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side="right"

    return model, tokenizer

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
def setup_lora_config() :
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )

    return lora_config

def create_sample_data() :
    sample_data = [
        {
            "question": "파이썬에서 리스트를 어떻게 정렬하나요?",
            "context": "파이썬 리스트는 sort() 메서드나 sorted() 함수를 사용하여 정렬할 수 있습니다. sort()는 원본 리스트를 수정하고, sorted()는 새로운 정렬된 리스트를 반환합니다.",
            "answer": "파이썬에서 리스트를 정렬하는 방법은 두 가지입니다. 1) list.sort() - 원본 리스트를 직접 수정하여 정렬합니다. 2) sorted(list) - 원본을 유지하고 새로운 정렬된 리스트를 반환합니다."
        },
        {
            "question": "딥러닝에서 과적합이란 무엇인가요?",
            "context": "과적합(Overfitting)은 모델이 훈련 데이터에 너무 특화되어 새로운 데이터에 대한 일반화 성능이 떨어지는 현상입니다. 훈련 정확도는 높지만 검증 정확도가 낮은 특징을 보입니다.",
            "answer": "과적합은 모델이 훈련 데이터에만 과도하게 맞춰져서 새로운 데이터에 대한 예측 성능이 떨어지는 현상입니다. 드롭아웃, 정규화, 조기 종료 등의 방법으로 방지할 수 있습니다."
        }
    ]

    with open("./data/qna_data.json", 'w', encoding="utf-8") as f :
        json.dump(sample_data, f, ensure_ascii=False, indent=2)

    print("샘플 데이터 생성")

from transformers import TrainingArguments, DataCollatorForLanguageModeling, Trainer
import os
import torch.nn as nn

def main() :
    model_name = "./model/LLM/deepseek-qwen-bllossom-32b"
    data_path = "./data/qna_data.json"
    output_dir = "./model/finetuned-model"

    model, tokenizer = setup_model_and_tokenizer(model_name)

    model = prepare_model_for_kbit_training(model)

    lora_config = setup_lora_config()
    model = get_peft_model(model, lora_config)

    # Multi-GPU 설정: DataParallel 적용
    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        print(f"DataParallel 적용: {num_gpus}개 GPU 사용")
        model = nn.DataParallel(model)
        # GPU별 배치 크기 조정
        per_device_batch_size = 1
        gradient_accumulation_steps = max(8 // num_gpus, 1)
    else:
        print("Single GPU 모드")
        per_device_batch_size = 1
        gradient_accumulation_steps = 8

    dataset_handler = QnADataset(data_path, tokenizer)
    train_dataset = dataset_handler.tokenize_data()
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="no",
        warmup_steps=100,
        lr_scheduler_type="cosine",
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        dataloader_num_workers=0,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    print("파인튜닝 시작")
    trainer.train()

    # DataParallel 사용 시 모델 저장 방법 수정
    if num_gpus > 1:
        # DataParallel의 경우 .module로 원본 모델에 접근
        trainer.model.module.save_pretrained(output_dir)
    else:
        trainer.save_model()
    
    tokenizer.save_pretrained(output_dir)

    print(f"파인튜닝 완료 : {output_dir}")

if __name__ == "__main__" :
    create_sample_data()
    main()

샘플 데이터 생성
사용 가능한 GPU 개수: 3
모델을 GPU 0에 로드 후 DataParallel 적용 예정


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

DataParallel 적용: 3개 GPU 사용


  trainer = Trainer(


파인튜닝 시작




Step,Training Loss
10,0.5785
20,0.6361




파인튜닝 완료 : ./model/finetuned-model


In [None]:
import json
import torch
from datasets import Dataset

class QnADataset :
    def __init__(self, data_path, tokenizer, max_length=512) :
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(data_path)

    def load_data(self, data_path) :
        with open(data_path, 'r', encoding="utf-8") as f :
            data = json.load(f)
        return data
    
    def prepare_input_output(self, item) :
        input_text = f"질문 : {item['question']}\n문서 : {item['context']}\m답변 : "

        output_text = item["answer"]
        
        return input_text, output_text
    
    def tokenize_data(self) :
        input_ids_list = []
        attention_mask_list = []
        labels_list = []

        for item in self.data :
            input_text, output_text = self.prepare_input_output(item)

            input_tokens = self.tokenizer(
                input_text,
                add_special_tokens=False,
                return_tensors="pt"
            )["input_ids"].squeeze()

            output_tokens = self.tokenizer(
                output_text,
                add_special_tokens=False,
                return_tensors="pt"
            )["input_ids"].squeeze()

            full_sequence = torch.cat([input_tokens, output_tokens])

            labels = torch.cat([
                torch.full_like(input_tokens, -100),
                output_tokens
            ])

            if len(full_sequence) > self.max_length :
                full_sequence = full_sequence[:self.max_length]
                labels = labels[:self.max_length]

            attention_mask = torch.ones_like(full_sequence)

            input_ids_list.append(full_sequence)
            attention_mask_list.append(attention_mask)
            labels_list.append(labels)

        input_ids_list = torch.nn.utils.rnn.pad_sequence(
            input_ids_list, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )

        attention_mask_list = torch.nn.utils.rnn.pad_sequence(
            attention_mask_list, batch_first=True, padding_value=0
        )

        labels_list = torch.nn.utils.rnn.pad_sequence(
            labels_list, batch_first=True, padding_value=-100
        )

        dataset = Dataset.from_dict({
            "input_ids" : input_ids_list,
            "attention_mask" : attention_mask_list,
            "labels" : labels_list
        })

        return dataset

from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import torch

def setup_model_and_tokenizer(model_name, force_multi_gpu=True) :
    # GPU 개수 확인
    num_gpus = torch.cuda.device_count()
    print(f"사용 가능한 GPU 개수: {num_gpus}")
    
    if num_gpus > 1 and force_multi_gpu:
        print("=== 진짜 Multi-GPU 모드 ===")
        print("Quantization 없이 모델을 여러 GPU에 분산 로드")
        
        # GPU 메모리 설정 (각 GPU당 사용할 최대 메모리)
        max_memory = {}
        for i in range(num_gpus):
            max_memory[i] = "20GiB"  # 각 GPU당 20GB까지 사용
        
        print(f"GPU 메모리 설정: {max_memory}")
        
        # 모델을 여러 GPU에 자동 분산
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",  # 자동으로 여러 GPU에 분산
            max_memory=max_memory,
            trust_remote_code=True,
            torch_dtype=torch.float16,  # quantization 대신 float16 사용
            low_cpu_mem_usage=True
        )
        
        print("모델 분산 완료! nvidia-smi로 확인해보세요.")
        
    else:
        print("=== Single GPU 모드 (Quantization 사용) ===")
        from transformers import BitsAndBytesConfig
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
        
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map={"": 0},
            trust_remote_code=True,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side="right"

    return model, tokenizer

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def setup_lora_config() :
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )

    return lora_config

def create_sample_data() :
    sample_data = [
        {
            "question": "파이썬에서 리스트를 어떻게 정렬하나요?",
            "context": "파이썬 리스트는 sort() 메서드나 sorted() 함수를 사용하여 정렬할 수 있습니다. sort()는 원본 리스트를 수정하고, sorted()는 새로운 정렬된 리스트를 반환합니다.",
            "answer": "파이썬에서 리스트를 정렬하는 방법은 두 가지입니다. 1) list.sort() - 원본 리스트를 직접 수정하여 정렬합니다. 2) sorted(list) - 원본을 유지하고 새로운 정렬된 리스트를 반환합니다."
        },
        {
            "question": "딥러닝에서 과적합이란 무엇인가요?",
            "context": "과적합(Overfitting)은 모델이 훈련 데이터에 너무 특화되어 새로운 데이터에 대한 일반화 성능이 떨어지는 현상입니다. 훈련 정확도는 높지만 검증 정확도가 낮은 특징을 보입니다.",
            "answer": "과적합은 모델이 훈련 데이터에만 과도하게 맞춰져서 새로운 데이터에 대한 예측 성능이 떨어지는 현상입니다. 드롭아웃, 정규화, 조기 종료 등의 방법으로 방지할 수 있습니다."
        }
    ]

    # 디렉토리 생성
    os.makedirs("./data", exist_ok=True)
    
    with open("./data/qna_data.json", 'w', encoding="utf-8") as f :
        json.dump(sample_data, f, ensure_ascii=False, indent=2)

    print("샘플 데이터 생성")

from transformers import TrainingArguments, DataCollatorForLanguageModeling, Trainer
import os

def main() :
    model_name = "./model/LLM/deepseek-qwen-bllossom-32b"
    data_path = "./data/qna_data.json"
    output_dir = "./model/finetuned-model"

    # GPU 개수 확인
    num_gpus = torch.cuda.device_count()
    use_multi_gpu = num_gpus > 1
    
    # Multi-GPU 사용 여부 물어보기
    if use_multi_gpu:
        print(f"\n🔥 {num_gpus}개 GPU 감지됨!")
        print("1. Multi-GPU 모드 (quantization 없음, 더 빠름)")
        print("2. Single GPU 모드 (quantization 사용, 메모리 절약)")
        choice = input("선택하세요 (1 또는 2, 기본값: 1): ").strip()
        
        if choice == "2":
            use_multi_gpu = False
            print("Single GPU 모드 선택")
        else:
            use_multi_gpu = True
            print("Multi-GPU 모드 선택")

    model, tokenizer = setup_model_and_tokenizer(model_name, use_multi_gpu)

    # LoRA 설정
    if use_multi_gpu:
        # Multi-GPU에서는 quantization 없이 바로 LoRA 적용
        lora_config = setup_lora_config()
        model = get_peft_model(model, lora_config)
        print("Multi-GPU: quantization 없이 LoRA 적용")
    else:
        # Single GPU에서는 quantization 후 LoRA 적용
        model = prepare_model_for_kbit_training(model)
        lora_config = setup_lora_config()
        model = get_peft_model(model, lora_config)
        print("Single GPU: quantization 후 LoRA 적용")

    dataset_handler = QnADataset(data_path, tokenizer)
    train_dataset = dataset_handler.tokenize_data()

    # 훈련 설정
    if use_multi_gpu:
        per_device_batch_size = 1
        gradient_accumulation_steps = max(8 // num_gpus, 1)
        print(f"Multi-GPU 훈련 설정: 배치={per_device_batch_size}, 누적={gradient_accumulation_steps}")
    else:
        per_device_batch_size = 1
        gradient_accumulation_steps = 8
        print(f"Single GPU 훈련 설정: 배치={per_device_batch_size}, 누적={gradient_accumulation_steps}")
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="no",
        warmup_steps=100,
        lr_scheduler_type="cosine",
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        dataloader_num_workers=0,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    print("\n🚀 파인튜닝 시작!")
    if use_multi_gpu:
        print("📊 nvidia-smi를 다른 터미널에서 실행해서 모든 GPU 사용량을 확인해보세요!")
    
    trainer.train()

    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    print(f"✅ 파인튜닝 완료: {output_dir}")

if __name__ == "__main__" :
    create_sample_data()
    main()

샘플 데이터 생성

🔥 3개 GPU 감지됨!
1. Multi-GPU 모드 (quantization 없음, 더 빠름)
2. Single GPU 모드 (quantization 사용, 메모리 절약)
Multi-GPU 모드 선택
사용 가능한 GPU 개수: 3
=== 진짜 Multi-GPU 모드 ===
Quantization 없이 모델을 여러 GPU에 분산 로드
GPU 메모리 설정: {0: '20GiB', 1: '20GiB', 2: '20GiB'}


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.


모델 분산 완료! nvidia-smi로 확인해보세요.


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Multi-GPU: quantization 없이 LoRA 적용
Multi-GPU 훈련 설정: 배치=1, 누적=2

🚀 파인튜닝 시작!
📊 nvidia-smi를 다른 터미널에서 실행해서 모든 GPU 사용량을 확인해보세요!


RuntimeError: Function MmBackward0 returned an invalid gradient at index 1 - expected device meta but got cuda:0

: 

In [1]:
import json
import torch
from datasets import Dataset
import os

class QnADataset :
    def __init__(self, data_path, tokenizer, max_length=512) :
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(data_path)

    def load_data(self, data_path) :
        with open(data_path, 'r', encoding="utf-8") as f :
            data = json.load(f)
        return data
    
    def prepare_input_output(self, item) :
        input_text = f"질문 : {item['question']}\n문서 : {item['context']}\m답변 : "
        output_text = item["answer"]
        return input_text, output_text
    
    def tokenize_data(self) :
        input_ids_list = []
        attention_mask_list = []
        labels_list = []

        for item in self.data :
            input_text, output_text = self.prepare_input_output(item)

            input_tokens = self.tokenizer(
                input_text,
                add_special_tokens=False,
                return_tensors="pt"
            )["input_ids"].squeeze()

            output_tokens = self.tokenizer(
                output_text,
                add_special_tokens=False,
                return_tensors="pt"
            )["input_ids"].squeeze()

            full_sequence = torch.cat([input_tokens, output_tokens])

            labels = torch.cat([
                torch.full_like(input_tokens, -100),
                output_tokens
            ])

            if len(full_sequence) > self.max_length :
                full_sequence = full_sequence[:self.max_length]
                labels = labels[:self.max_length]

            attention_mask = torch.ones_like(full_sequence)

            input_ids_list.append(full_sequence)
            attention_mask_list.append(attention_mask)
            labels_list.append(labels)

        input_ids_list = torch.nn.utils.rnn.pad_sequence(
            input_ids_list, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )

        attention_mask_list = torch.nn.utils.rnn.pad_sequence(
            attention_mask_list, batch_first=True, padding_value=0
        )

        labels_list = torch.nn.utils.rnn.pad_sequence(
            labels_list, batch_first=True, padding_value=-100
        )

        dataset = Dataset.from_dict({
            "input_ids" : input_ids_list,
            "attention_mask" : attention_mask_list,
            "labels" : labels_list
        })

        return dataset

from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer

def setup_model_and_tokenizer_fsdp_qlora(model_name):
    num_gpus = torch.cuda.device_count()
    
    if num_gpus > 1:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_storage=torch.bfloat16,
        )
        
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        
    else:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
        
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map={"": 0},
            trust_remote_code=True,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer, num_gpus > 1

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def setup_lora_config():
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )
    return lora_config

def create_sample_data():
    sample_data = [
        {
            "question": "파이썬에서 리스트를 어떻게 정렬하나요?",
            "context": "파이썬 리스트는 sort() 메서드나 sorted() 함수를 사용하여 정렬할 수 있습니다. sort()는 원본 리스트를 수정하고, sorted()는 새로운 정렬된 리스트를 반환합니다.",
            "answer": "파이썬에서 리스트를 정렬하는 방법은 두 가지입니다. 1) list.sort() - 원본 리스트를 직접 수정하여 정렬합니다. 2) sorted(list) - 원본을 유지하고 새로운 정렬된 리스트를 반환합니다."
        },
        {
            "question": "딥러닝에서 과적합이란 무엇인가요?",
            "context": "과적합(Overfitting)은 모델이 훈련 데이터에 너무 특화되어 새로운 데이터에 대한 일반화 성능이 떨어지는 현상입니다. 훈련 정확도는 높지만 검증 정확도가 낮은 특징을 보입니다.",
            "answer": "과적합은 모델이 훈련 데이터에만 과도하게 맞춰져서 새로운 데이터에 대한 예측 성능이 떨어지는 현상입니다. 드롭아웃, 정규화, 조기 종료 등의 방법으로 방지할 수 있습니다."
        }
    ]

    os.makedirs("./data", exist_ok=True)
    
    with open("./data/qna_data.json", 'w', encoding="utf-8") as f :
        json.dump(sample_data, f, ensure_ascii=False, indent=2)

def training_function():
    from transformers import TrainingArguments, DataCollatorForLanguageModeling, Trainer
    from accelerate import Accelerator
    
    accelerator = Accelerator()
    
    model_name = "./model/LLM/deepseek-qwen-bllossom-32b"
    data_path = "./data/qna_data.json"
    output_dir = "./model/finetuned-model"

    model, tokenizer, is_multi_gpu = setup_model_and_tokenizer_fsdp_qlora(model_name)

    if not is_multi_gpu:
        model = prepare_model_for_kbit_training(model)
    
    lora_config = setup_lora_config()
    model = get_peft_model(model, lora_config)

    dataset_handler = QnADataset(data_path, tokenizer)
    train_dataset = dataset_handler.tokenize_data()

    num_gpus = accelerator.num_processes
    
    if is_multi_gpu:
        per_device_batch_size = 1
        gradient_accumulation_steps = max(8 // num_gpus, 1)
    else:
        per_device_batch_size = 1
        gradient_accumulation_steps = 8

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=3,
        learning_rate=2e-4,
        bf16=is_multi_gpu,
        fp16=not is_multi_gpu,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="no",
        warmup_steps=100,
        lr_scheduler_type="cosine",
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        dataloader_num_workers=0,
        fsdp="full_shard auto_wrap" if is_multi_gpu else "",
        fsdp_transformer_layer_cls_to_wrap="LlamaDecoderLayer" if is_multi_gpu else None,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    trainer.train()

    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

def main():
    create_sample_data()
    
    num_gpus = torch.cuda.device_count()
    
    if num_gpus > 1:
        try:
            from accelerate import notebook_launcher
            notebook_launcher(training_function, args=(), num_processes=num_gpus)
        except ImportError:
            training_function()
    else:
        training_function()

if __name__ == "__main__":
    main()

Launching training on 3 GPUs.


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  trainer = Trainer(
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f5385eb4d50>>
Traceback (most recent call last):
  File "/opt/conda/envs/sangwon/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

  File "/opt/conda/envs/sangwon/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
    raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
torch.distributed.elastic.multiprocessing.api.SignalException: Process 646956 got signal: 15
No label_names provided for

ChildFailedError: 
============================================================
training_function FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-06-05_08:25:31
  host      : f559870eb492
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 646957)
  error_file: /tmp/torchelastic_v9gjn4d9/none_c5egbe90/attempt_0/1/error.json
  traceback : Traceback (most recent call last):
    File "/opt/conda/envs/sangwon/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
      return f(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^
    File "/tmp/ipykernel_646841/2942075242.py", line 219, in training_function
      trainer.train()
    File "/opt/conda/envs/sangwon/lib/python3.11/site-packages/transformers/trainer.py", line 2245, in train
      return inner_training_loop(
             ^^^^^^^^^^^^^^^^^^^^
    File "/opt/conda/envs/sangwon/lib/python3.11/site-packages/transformers/trainer.py", line 2362, in _inner_training_loop
      self._fsdp_qlora_plugin_updates()
    File "/opt/conda/envs/sangwon/lib/python3.11/site-packages/transformers/trainer.py", line 5227, in _fsdp_qlora_plugin_updates
      self.accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(self.model)
                                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/opt/conda/envs/sangwon/lib/python3.11/site-packages/peft/utils/other.py", line 533, in fsdp_auto_wrap_policy
      raise Exception("Could not find the transformer layer class to wrap in the model.")
  Exception: Could not find the transformer layer class to wrap in the model.
  
============================================================