In [None]:
!pip install --upgrade transformers peft accelerate datasets bitsandbytes

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting peft
  Downloading peft-0.15.2-py3-none-any.whl (411 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl (6

In [None]:
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [None]:
# 1. Load the tokenizer and the 4-bit quantized base model
model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Passing `load_in_4bit=True` straight to `from_pretrained` is deprecated;
# quantization options belong in an explicit BitsAndBytesConfig.
# NF4 is the 4-bit data type used by QLoRA, and the compute dtype is set to
# float16 to match `torch_dtype` below and the fp16 training configuration.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)

In [None]:
# 2. QLoRA setup
# A k-bit quantized base model should go through PEFT's preparation helper
# before LoRA adapters are attached. NOTE: by default the helper also enables
# gradient checkpointing, trading some speed for lower memory use.
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=8,                # rank of the LoRA update matrices
    lora_alpha=16,      # LoRA scaling factor
    lora_dropout=0.1,   # dropout applied inside the LoRA layers
    bias="none",        # bias terms are not trained
    task_type=TaskType.CAUSAL_LM,
)

# NOTE: this rebinds `model`, so re-running the cell without re-running the
# model-loading cell would wrap an already-wrapped model.
model = get_peft_model(model, peft_config)

In [None]:
# 3. Load a small, Python-only sample of the dataset
# The corpus is very large and multi-language. A non-streaming
# `split="train[:5000]"` still prepares the full dataset before slicing, so the
# data is streamed instead and only the first 5000 examples are materialized.
# `languages=["Python"]` keeps the samples consistent with the
# "# Python code snippet:" prompt prefix applied later.
# `trust_remote_code=True` is needed because the dataset is defined by a
# loading script; it executes code from the dataset repository.
github_code_stream = load_dataset(
    "codeparrot/github-code",
    split="train",
    streaming=True,
    languages=["Python"],
    trust_remote_code=True,
)

# Materialize into a regular in-memory Dataset so the downstream `.map(...)`
# calls and the Trainer (which needs a dataset with a length) behave as before.
dataset = Dataset.from_list(list(github_code_stream.take(5000)))  # small subset (example purposes)

In [None]:
# 4. Code-specific prompt format
def format_example(example: dict) -> dict:
    """Build the training text for one dataset record.

    Args:
        example: A record with a "code" entry holding the source text.

    Returns:
        A dict with a single "text" entry: a fixed header line followed by
        the code with leading/trailing whitespace stripped.
    """
    snippet = example["code"].strip()
    prompt = f"# Python code snippet:\n{snippet}"
    return {"text": prompt}

# Add the formatted "text" column to every record.
dataset = dataset.map(function=format_example)


In [None]:
# 5. Tokenization
def tokenize_batch(batch):
    """Tokenize a batch of records' "text" values.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    return tokenizer(
        batch["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )


tokenized_dataset = dataset.map(tokenize_batch, batched=True)

In [None]:
# 6. Training configuration
training_args = TrainingArguments(
    # Checkpointing: where to write, how often, and how many to keep.
    output_dir="./gemma-2b-code-finetuned",
    save_steps=100,
    save_total_limit=2,
    # Batching: 4 samples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Schedule, optimizer and precision.
    num_train_epochs=2,
    learning_rate=2e-4,
    optim="paged_adamw_32bit",
    fp16=True,
    # Logging: report every 10 steps, no external tracker.
    logging_steps=10,
    report_to="none",
)


In [None]:
# 7. Build the Trainer
# mlm=False: no masked-language-model masking is applied by the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [None]:
# 8. Start training
trainer.train()


In [None]:

# 9. Save the model and tokenizer side by side in the same directory
save_dir = "./gemma-2b-code-finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)