In [None]:
!pip install --upgrade bitsandbytes transformers peft accelerate wandb

from huggingface_hub import login
import os

hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

!wandb login $os.environ["WANDB_API_KEY"]

from google.colab import drive
drive.mount('/content/drive')

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import torch.nn.utils.prune as prune
from datasets import Dataset, load_dataset
import time

model_id = "google/gemma-3-1b-pt"

# Load the base model.
# The weights are loaded in bfloat16 so they match the precision the Trainer is
# configured with further down in this file (bf16=True, fp16=False). float16 has
# a much narrower exponent range than bfloat16, which makes overflow more likely.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="eager",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Apply unstructured pruning.
# For every decoder layer, zero the 10% of entries with the smallest L1
# magnitude in each MLP projection weight, then drop the pruning
# re-parametrization so the zeroed values are baked into `weight`.
MLP_PROJECTIONS = ("gate_proj", "up_proj", "down_proj")

for decoder_layer in model.model.layers:
    for proj_name in MLP_PROJECTIONS:
        projection = getattr(decoder_layer.mlp, proj_name)
        prune.l1_unstructured(projection, name="weight", amount=0.1)
        prune.remove(projection, "weight")

# Apply LoRA.
# Adapters are attached to the four attention projections and the three MLP
# projections; biases are left untouched.
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj",
    "gate_proj",
    "down_proj",
    "up_proj",
]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=LORA_TARGET_MODULES,
)

# Wrap the base model; from here on `model` is the PEFT wrapper.
model = get_peft_model(model, lora_config)

# Load the dataset.
# The file is parsed with the "json" loader; the loader exposes it as a single
# "train" split, which is then divided 80/20 with a fixed seed.
file_path = "/content/drive/MyDrive/tomato_qa_3000.txt"
raw_splits = load_dataset("json", data_files=file_path)

dataset = raw_splits["train"].train_test_split(test_size=0.2, seed=42)

# Tokenize the dataset.
def preprocess_function(ex, max_length=128):
    """Turn one QA record into a causal-LM training example.

    A decoder-only model is trained on a single sequence: ``labels`` must
    line up position-by-position with ``input_ids`` (the model applies the
    next-token shift itself). The prompt and the answer are therefore
    concatenated into one sequence, and the prompt positions are masked with
    -100 so that only the answer tokens contribute to the loss.

    No padding is applied here; examples are returned at their natural
    length so the data collator can pad each batch dynamically.

    Args:
        ex: Mapping with an ``"instruction"`` (question) and an ``"output"``
            (answer) string.
        max_length: Token budget applied separately to the prompt and to the
            answer (the answer budget includes the trailing EOS token).

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels``, all
        lists of the same length.
    """
    prompt = f"question: {ex['instruction']} answer: "
    answer = ex["output"]

    # The prompt keeps the tokenizer's default special tokens so it is encoded
    # exactly as it would be when tokenized on its own at inference time.
    prompt_ids = tokenizer(prompt, truncation=True, max_length=max_length)["input_ids"]

    # The answer is a continuation of the prompt, so it must not receive its
    # own leading special tokens. One slot is reserved for EOS.
    answer_ids = tokenizer(answer, add_special_tokens=False)["input_ids"]
    answer_ids = answer_ids[: max(max_length - 1, 0)]
    if tokenizer.eos_token_id is not None:
        # Ending on EOS lets the model learn where an answer stops.
        answer_ids = answer_ids + [tokenizer.eos_token_id]

    input_ids = prompt_ids + answer_ids

    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        # -100 is the ignore index of the loss: prompt tokens are not scored.
        "labels": [-100] * len(prompt_ids) + answer_ids,
    }

# Run the preprocessing function over both the train and the test split.
tokenized_dataset = dataset.map(preprocess_function)

# Pads input_ids/attention_mask per batch and pads labels with -100 so padded
# label positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100,
)

# Training (Trainer)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # effective batch size of 16 per device
    num_train_epochs=5,
    learning_rate=5e-5,
    logging_steps=10,
    # Without an evaluation strategy the held-out split passed as
    # `eval_dataset` below is never used; evaluate once per epoch.
    eval_strategy="epoch",
    bf16=True,
    fp16=False,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

# Final save.
# `model` is the PEFT wrapper at this point, so saving it directly would write
# only the LoRA adapter. Reloading such a directory rebuilds the model from the
# base checkpoint named in the adapter config, i.e. the original un-pruned
# weights, and the pruning done earlier in this file would be lost. Merging the
# adapter into the (pruned) base first produces a complete, standalone
# checkpoint.
# NOTE: merging adds the LoRA deltas onto the MLP projection weights, so
# entries zeroed by pruning are not guaranteed to remain exactly zero.
# NOTE(review): if this directory still holds adapter_config.json from an
# earlier run, remove it - presumably the loader would otherwise still treat
# the directory as an adapter checkpoint; verify.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("/content/drive/MyDrive/results")
tokenizer.save_pretrained("/content/drive/MyDrive/results")

# Load the original Gemma model as the baseline for comparison.
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3-1b-pt",
    attn_implementation="eager",
)

base_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-pt")

# Load the lightweight (pruned + fine-tuned) model saved above.
model = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/results",
    attn_implementation="eager",
)

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/results")

text = "question: How should nitrogen be managed after flowering begins? answer: "


def _timed_generate(lm, lm_tokenizer, prompt, max_new_tokens=128):
    """Generate a continuation of `prompt` and time the `generate` call.

    The prompt is tokenized with `lm_tokenizer` and the resulting tensors are
    moved to the CPU before generation. Only the `generate` call is timed;
    tokenization is excluded.

    Args:
        lm: Model exposing `generate`.
        lm_tokenizer: Tokenizer paired with `lm`.
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A `(elapsed_seconds, output_ids)` tuple.
    """
    encoded = lm_tokenizer(prompt, return_tensors="pt").to("cpu")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the wall clock is adjusted.
    started = time.perf_counter()
    generated = lm.generate(**encoded, max_new_tokens=max_new_tokens)
    elapsed_seconds = time.perf_counter() - started

    return elapsed_seconds, generated


# Timing for the original model.
elapsed, outputs = _timed_generate(base_model, base_tokenizer, text)

print(f"{elapsed}초 걸림\n")
print(base_tokenizer.decode(outputs[0], skip_special_tokens=True))

# Timing for the lightweight model.
elapsed, outputs = _timed_generate(model, tokenizer, text)

print(f"{elapsed}초 걸림\n")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))