**프루닝 및 baseline 모델과의 속도 차이 비교**

In [None]:
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, time
import torch.nn as nn

# Install the required packages (notebook shell magic).
# NOTE(review): the imports at the top of this cell execute before this
# install line; on a fresh runtime the install may need to run first.
!pip install -q "torch>=2.4.0" "transformers>=4.51.3" accelerate sentencepiece

# Log in to Hugging Face through the CLI.
!huggingface-cli login

# Load the tokenizer and the model.
model_id = "google/gemma-3-1b-pt"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float32,
    low_cpu_mem_usage=True,
    device_map="auto",
    attn_implementation="eager"
)

# Check the layer names: print the name of every nn.Linear submodule.
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        print(name)

# Switch to evaluation mode, then print the model class, type and hidden size.
model.eval()
print(model.__class__.__name__, model.config.model_type, model.config.hidden_size)


**추론 함수 및 프루닝 전 모델 추론 속도 테스트**

In [None]:
def quick_infer(model, tokenizer, prompt="Hello", max_new_tokens=30):
    """Generate a greedy completion on CPU, print it, and return the elapsed time.

    Args:
        model: Causal LM exposing ``.to()`` and ``.generate()``.
        tokenizer: Callable tokenizer that also provides ``.decode()``.
        prompt: Text to complete.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Seconds spent inside ``generate`` (tokenization and decoding are
        excluded from the measurement).

    Note:
        The model is moved with ``model.to("cpu")``. ``nn.Module.to`` moves
        the module in place, so the caller's model is on CPU afterwards and
        must be moved back before any GPU work.
        The printed token count is the requested ``max_new_tokens`` budget;
        generation may stop earlier.
    """
    model_cpu = model.to("cpu")  # run inference on CPU

    inputs = tokenizer(prompt, return_tensors="pt")
    # perf_counter is monotonic and high-resolution, which is what an
    # elapsed-time measurement needs (time.time can jump with clock changes).
    start = time.perf_counter()
    with torch.no_grad():
        # Greedy decoding: temperature/top_p only take effect when sampling,
        # so they are not passed together with do_sample=False.
        output = model_cpu.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    elapsed = time.perf_counter() - start
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"\n--- Prompt ---\n{prompt}\n")
    print(f"--- Output ---\n{text}\n")
    print(f"[Elapsed] {elapsed:.2f} sec for {max_new_tokens} tokens on CPU\n")
    return elapsed

# Example run: baseline measurement before pruning.
quick_infer(model, tokenizer, prompt="Q: What is the ideal daytime temperature for tomato plants?\nA", max_new_tokens=40)

In [None]:
def find_transformer_layers(model):
    """Return the ``layers`` attribute of the model's inner container, or None.

    The container is the first truthy value among ``model.model`` and
    ``model.transformer``; when neither is present (or truthy) the model
    itself is used.
    """
    container = model
    for attr_name in ("model", "transformer"):
        candidate = getattr(model, attr_name, None)
        if candidate:
            container = candidate
            break
    return getattr(container, "layers", None)

def get_mlp(block):
    """Return ``(gate_proj, up_proj, down_proj)`` taken from ``block.mlp``."""
    feed_forward = block.mlp
    return tuple(
        getattr(feed_forward, proj_name)
        for proj_name in ("gate_proj", "up_proj", "down_proj")
    )

@torch.no_grad()
def _slice_linear(old, sel_out=None, sel_in=None):
    """Build an nn.Linear holding the selected output rows / input columns of ``old``.

    Args:
        old: Source ``nn.Linear``.
        sel_out: Optional index tensor of output units (weight rows) to keep.
        sel_in: Optional index tensor of input units (weight columns) to keep.

    Returns:
        A new ``nn.Linear`` on the same device and dtype as ``old``.
    """
    weight = old.weight
    if sel_out is not None:
        weight = weight[sel_out, :]
    if sel_in is not None:
        weight = weight[:, sel_in]
    # Derive the new shape from the sliced weight so it cannot disagree
    # with the index selection.
    out_features, in_features = weight.shape
    new = nn.Linear(in_features, out_features, bias=old.bias is not None,
                    dtype=weight.dtype, device=weight.device)
    new.weight.copy_(weight)
    if old.bias is not None:
        # Only the output selection affects the bias vector.
        bias = old.bias if sel_out is None else old.bias[sel_out]
        new.bias.copy_(bias)
    return new


@torch.no_grad()
def structured_prune_mlp(model, keep_ratio=0.7):
    """Shrink every MLP's intermediate width by dropping low-norm units.

    For each transformer layer, intermediate units are scored by the sum of
    the L2 norms of their ``up_proj`` and ``gate_proj`` weight rows. The
    top-scoring units are kept (in their original order) and the three MLP
    projections are replaced by smaller ``nn.Linear`` modules. Finally
    ``model.config.intermediate_size`` is set to the new width.

    Args:
        model: Model whose layers are located by ``find_transformer_layers``.
        keep_ratio: Fraction of intermediate units to keep. The resulting
            width is at least 16 and never larger than the current width.

    Raises:
        ValueError: If no transformer layers can be found on ``model``.
    """
    layers = find_transformer_layers(model)
    if layers is None or len(layers) == 0:
        raise ValueError("structured_prune_mlp: no transformer layers found on the model")

    # The width of the first layer decides the target for all layers.
    _, _, down0 = get_mlp(layers[0])
    inter = down0.in_features
    # Clamp to the existing width: topk cannot select more units than exist
    # (keep_ratio > 1, or a layer narrower than the floor of 16).
    new_inter = min(inter, max(16, int(round(inter * keep_ratio))))
    print(f"[INFO] intermediate {inter} → {new_inter} (keep {keep_ratio*100:.0f}%)")

    for li, block in enumerate(layers):
        gate, up, down = get_mlp(block)
        # One score per intermediate unit (row of up/gate weights).
        score = torch.norm(up.weight, dim=1) + torch.norm(gate.weight, dim=1)
        # Sort the kept indices so surviving units keep their relative order.
        keep_idx = torch.topk(score, k=new_inter).indices.sort()[0]

        new_gate = _slice_linear(gate, sel_out=keep_idx)
        new_up = _slice_linear(up, sel_out=keep_idx)
        new_down = _slice_linear(down, sel_in=keep_idx)

        block.mlp.gate_proj, block.mlp.up_proj, block.mlp.down_proj = new_gate, new_up, new_down
        if li % 4 == 0:
            print(f"  - layer {li}: kept {new_inter}/{inter}")

    model.config.intermediate_size = new_inter
    print("[DONE] structured pruning complete.")

In [None]:
# Pruning only rewrites weights, so use the GPU when one is attached and
# fall back to CPU instead of failing on a hard-coded "cuda".
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
structured_prune_mlp(model, keep_ratio=0.7)

**프루닝된 모델 추론 속도 테스트**

In [None]:
# Same prompt and token budget as the baseline run, now on the pruned model.
quick_infer(model, tokenizer, "Q: What is the ideal daytime temperature for tomato plants?\nA", max_new_tokens=40)

**프루닝된 모델을 LoRA 파인튜닝**

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# quick_infer moved the model to CPU; bring it back to the GPU for training.
model = model.to("cuda")

# LoRA adapters on the attention projections and on the MLP projections
# (gate/up/down are the layers replaced by structured_prune_mlp).
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "up_proj", "down_proj", "gate_proj"]
)
# Wrap the pruned model so that the LoRA adapters are attached to it.
model = get_peft_model(model, lora_config)

In [None]:
from datasets import load_dataset

# Load the JSONL file.
dataset = load_dataset("json", data_files="/content/tomato_qa_3000.jsonl")["train"]
# Fixed seed so the split is reproducible.
dataset = dataset.train_test_split(test_size=0.2, seed=42)   # train:eval = 80:20

train_dataset = dataset["train"]
eval_dataset  = dataset["test"]

def tokenize_fn(examples):
    """Tokenize a batch of Q/A rows (for use with ``map(..., batched=True)``).

    ``examples`` maps column names to lists. The "instruction" and "output"
    columns are paired row by row into one ``Q: ...\\nA: ...`` string each,
    followed by the tokenizer's EOS token text. The strings are tokenized
    with truncation and ``max_length`` padding at a length of 128.
    """
    texts = []
    for ins, out in zip(examples["instruction"], examples["output"]):
        texts.append(f"Q: {ins}\nA: {out} {tokenizer.eos_token}")
    return tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_fn, batched=True)
eval_dataset  = eval_dataset.map(tokenize_fn, batched=True)

print(train_dataset[0])  # inspect one tokenized example

In [None]:
import wandb
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling, EarlyStoppingCallback
from transformers.trainer_utils import IntervalStrategy

# Authenticate with Weights & Biases and open the run that receives the logs.
wandb.login()
wandb.init(project="tomato-qa", name="tomato-qa-pruning-lora")

# mlm=False: collate for causal language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./lora-tomato",
    logging_dir="./logs",
    # 4 samples x 2 accumulation steps = 8 samples per optimizer step per device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_strategy="steps",
    logging_steps=50,
    # Checkpoints and evaluations use the same 500-step interval, so every
    # saved checkpoint has a matching eval_loss for best-model selection.
    save_strategy="steps",
    save_steps=500,
    eval_strategy=IntervalStrategy.STEPS,
    eval_steps=500,
    report_to="wandb",
    run_name="tomato-qa-pruning-lora",
    # NOTE(review): fp16 mixed precision needs a GPU; confirm the loss stays
    # finite for this model, otherwise consider bf16.
    fp16=True,
    # Reload the checkpoint with the lowest eval_loss when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    # Stop once eval_loss has failed to improve for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Print the W&B dashboard URL before the training run starts.
print("🔗 대시보드 URL:", wandb.run.get_url())
trainer.train()

In [None]:
# Same prompt again after LoRA fine-tuning (quick_infer moves the model to CPU).
quick_infer(model, tokenizer, "Q: What is the ideal daytime temperature for tomato plants?\nA", max_new_tokens=40)

**구글 드라이브에 사본 저장**

In [None]:
from google.colab import drive
# Mount Google Drive, then copy the Trainer output directory into it.
drive.mount('/content/drive')

!cp -r ./lora-tomato /content/drive/MyDrive/