In [1]:
!pip install --upgrade transformers peft accelerate datasets bitsandbytes

[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl (411 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m89.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl (67.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[0mInstalling collected packages: dataset

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import torch

In [20]:
# 1. 모델 및 토크나이저 로딩
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")

Fetching 2 files: 100%|██████████| 2/2 [09:37<00:00, 288.71s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.81it/s]


In [21]:
# 2. QLoRA 설정
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, peft_config)

In [8]:
# 3. 데이터셋 로딩 및 전처리
# dataset = load_dataset("codeparrot/github-code", split="train[:5000]")  # 일부만 사용 (예제 목적)

dataset = load_dataset(
    path="/datasets/github-code/github-code-clean",
    data_dir="/datasets/github-code/hf_data",
    cache_dir="/datasets/github-code/hf_cache",
    trust_remote_code=True
)

In [22]:
dataset

IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        num_shards: 880
    })
})

In [23]:
# 4. 코드 전용 프롬프트 포맷
def format_example(example):
    code = example["code"]
    return {"text": f"# Python code snippet:\n{code.strip()}"}

dataset = dataset.map(format_example)


In [24]:
# 5. 토크나이징
tokenized_dataset = dataset.map(
    lambda e: tokenizer(e["text"], truncation=True, padding="max_length", max_length=512),
    batched=True
)

In [25]:
# 6. 학습 설정
training_args = TrainingArguments(
    output_dir="./gemma-2b-code-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    fp16=True,
    optim="paged_adamw_32bit",
    report_to="none"
)


In [26]:
# 7. Trainer 구성
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [27]:
# 8. 학습 시작
trainer.train()


KeyError: 0

In [29]:
print(type(tokenized_dataset))

<class 'datasets.dataset_dict.IterableDatasetDict'>


In [None]:

# 9. 모델 저장
model.save_pretrained("./gemma-2b-code-finetuned")
tokenizer.save_pretrained("./gemma-2b-code-finetuned")