환경 세팅

In [45]:
!pip install -U transformers accelerate trl peft datasets safetensors --no-cache-dir



구글 드라이브 연결 및 모델 복사

In [46]:
from google.colab import drive
drive.mount('/content/drive')

!cp -r /content/drive/MyDrive/pruned-model /content/
!ls -lh /content/pruned-model

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
total 3.3G
-rw------- 1 root root   35 Oct  7 16:54 added_tokens.json
-rw------- 1 root root 1.5K Oct  7 16:54 config.json
-rw------- 1 root root  210 Oct  7 16:54 generation_config.json
-rw------- 1 root root 3.2G Oct  7 16:55 model.safetensors
-rw------- 1 root root 579K Oct  7 16:54 pruned_structure.json
-rw------- 1 root root  150 Oct  7 16:54 pruning_metadata.json
-rw------- 1 root root  662 Oct  7 16:54 special_tokens_map.json
-rw------- 1 root root 1.2M Oct  7 16:54 tokenizer_config.json
-rw------- 1 root root  32M Oct  7 16:54 tokenizer.json
-rw------- 1 root root 4.5M Oct  7 16:54 tokenizer.model


모델 준비

In [None]:
import os, torch, json, re
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- 경로 설정 ---
MODEL_DIR = "/content/pruned-model"                # 프루닝된 Gemma3-1B-pt 모델 폴더
DATA_FILE = "/content/finetuning_lora_data2.jsonl"  # LoRA 학습 데이터셋 경로

OUT_DIR    = "/content/lora-out"       # 학습 출력 경로
MERGED_DIR = "/content/lora-merged"    # 병합 모델 저장 경로

os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MERGED_DIR, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Environment ready. Using device: {device}")

✅ Environment ready. Using device: cuda


토크나이저 로드

In [None]:
from transformers import AutoTokenizer
import json, os

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"ℹ pad_token was None → set to eos_token ({tokenizer.eos_token})")

# 선택: pruned-model/added_tokens.json 있으면 반영
added_token_path = os.path.join(MODEL_DIR, "added_tokens.json")
if os.path.exists(added_token_path):
    try:
        with open(added_token_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            n_added = tokenizer.add_tokens(list(set(data)))
            print(f"Added {n_added} tokens from added_tokens.json")
        else:
            print("ℹ added_tokens.json is not a list. Skipped.")
    except Exception as e:
        print("added_tokens.json parse failed:", e)
else:
    print("ℹ No added_tokens.json found.")
print(f"✅Tokenizer ready — vocab size: {len(tokenizer)}")

ℹ added_tokens.json is not a list. Skipped.
✅ Tokenizer ready — vocab size: 262145


프루닝 구조 반영 및 가중치 로드

In [None]:
from transformers import AutoConfig
from safetensors.torch import load_file
import torch.nn as nn
import json, os

pruned_path     = os.path.join(MODEL_DIR, "pruned_structure.json")
state_dict_path = os.path.join(MODEL_DIR, "model.safetensors")

with open(pruned_path, "r") as f:
    pruned_info = json.load(f)

if "layer_structure" not in pruned_info:
    raise ValueError("❌ 'layer_structure' not found in pruned_structure.json")

layer_sizes = {int(k): v["intermediate_size"] for k, v in pruned_info["layer_structure"].items()}
print(f"Detected {len(layer_sizes)} layers from pruning info.")

# config → from_pretrained (trust_remote_code 필요시 True)
config = AutoConfig.from_pretrained(MODEL_DIR, trust_remote_code=True)

# 베이스 모델(구조) 만들기
base_model = AutoModelForCausalLM.from_config(config)
# 레이어별 MLP 리사이즈
for i, layer in enumerate(base_model.model.layers):
    if i in layer_sizes:
        new_dim = layer_sizes[i]
        in_dim  = layer.mlp.up_proj.weight.shape[1]
        # gate/up/down 교체
        layer.mlp.gate_proj = nn.Linear(in_dim, new_dim, bias=False)
        layer.mlp.up_proj   = nn.Linear(in_dim, new_dim, bias=False)
        layer.mlp.down_proj = nn.Linear(new_dim, in_dim, bias=False)

print("Structure rebuilt. Loading pruned weights...")
state_dict = load_file(state_dict_path)
missing, unexpected = base_model.load_state_dict(state_dict, strict=False)
print(f"Weights loaded (missing={len(missing)}, unexpected={len(unexpected)})")

# 토크나이저 토큰 수 반영
base_model.resize_token_embeddings(len(tokenizer))
base_model.to(device)

print("Model structure aligned with pruning and weights loaded.")

✅ Detected 26 layers from pruning info.
🔧 Structure rebuilt. Loading pruned weights...
✅ Weights loaded (missing=1, unexpected=0)
🚀 Model structure aligned with pruning and weights loaded.


LoRA 적용

In [None]:
import torch.nn as nn
import re

CANDIDATES = [
    "q_proj","k_proj","v_proj","o_proj",
    "gate_proj","up_proj","down_proj",
    "wi","wo","wq","wk","wv","out_proj",
    "fc_in","fc_out"
]

def infer_target_modules(model):
    present = set()
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            for cand in CANDIDATES:
                if re.search(rf"\b{re.escape(cand)}\b", name):
                    present.add(cand)
    present = [m for m in sorted(present) if m != "lm_head"]
    # 최소 세트 보장
    if not present:
        present = ["q_proj","v_proj","o_proj"]
    return sorted(set(present))

target_modules = infer_target_modules(base_model)
print("LoRA target modules:", target_modules)

🎯 LoRA target modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']


In [51]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

trainable params: 11,038,496 || all params: 866,402,592 || trainable%: 1.2741


데이터 로드 및 전처리

In [None]:
from datasets import load_dataset

raw = load_dataset("json", data_files={"train": DATA_FILE})["train"]

def to_text(example):
    instr = example["instruction"].strip()
    out   = example["output"].strip()
    return {"text": f"### Instruction:\n{instr}\n\n### Response:\n{out}\n"}

train_dataset = raw.map(to_text, remove_columns=raw.column_names)
print("Dataset size:", len(train_dataset))
print(train_dataset[0]["text"][:250])

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

✅ Dataset size: 2210
### Instruction:
[STATUS: OK] Pesticide storage safety requirements?

### Response:
[STATUS: OK] Store in locked, ventilated area; separate from food; maintain 10-30°C temperature; keep original labels; inventory annually.



학습 설정 및 학습

In [53]:
from trl import SFTTrainer, SFTConfig

sft_config = SFTConfig(
    output_dir=OUT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1.5e-4,
    num_train_epochs=2,
    fp16=True,
    save_steps=200,
    logging_steps=25,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    formatting_func=lambda ex: ex["text"],
)

trainer.train()

Applying formatting function to train dataset:   0%|          | 0/2210 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/2210 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2210 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2210 [00:00<?, ? examples/s]

Step,Training Loss
25,2.5912
50,1.8302
75,1.6774
100,1.6218
125,1.4964
150,1.4935
175,1.4183
200,1.3633
225,1.3882
250,1.3815




TrainOutput(global_step=554, training_loss=1.3601495229810583, metrics={'train_runtime': 919.8363, 'train_samples_per_second': 4.805, 'train_steps_per_second': 0.602, 'total_flos': 1079829923640192.0, 'train_loss': 1.3601495229810583, 'entropy': 1.1178389054078322, 'num_tokens': 285138.0, 'mean_token_accuracy': 0.7477906346321106, 'epoch': 2.0})

LoRA 병합 및 저장

In [None]:
from peft import PeftModel

# 이미 프루닝 구조로 복원된 base_model 사용
model = PeftModel.from_pretrained(base_model, "/content/lora-out")

# LoRA 병합
merged_model = model.merge_and_unload()

# 병합 모델 저장 (.safetensors 생성됨)
merged_model.save_pretrained("/content/lora-merged", safe_serialization=True)
tokenizer.save_pretrained("/content/lora-merged")

print("Fully merged model saved to /content/lora-merged (from pruned base_model)")



✅ Fully merged model saved to /content/lora-merged (from pruned base_model)


하이브리드 규칙 기반 추론 래퍼

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM
from safetensors.torch import load_file
import torch.nn as nn
import torch, json, os

# --- 프루닝 구조 복원 ---
pruned_path = os.path.join(MODEL_DIR, "pruned_structure.json")   # MODEL_DIR에서 불러옴
state_dict_path = os.path.join(MERGED_DIR, "model.safetensors")

with open(pruned_path, "r") as f:
    pruned_info = json.load(f)

if "layer_structure" not in pruned_info:
    raise ValueError("❌ 'layer_structure' key not found in pruned_structure.json")

layer_sizes = {
    int(k): v["intermediate_size"]
    for k, v in pruned_info["layer_structure"].items()
}

config = AutoConfig.from_pretrained(MERGED_DIR, trust_remote_code=True)
merged_model = AutoModelForCausalLM.from_config(config)

# --- 각 레이어별로 MLP 크기 조정 ---
for i, layer in enumerate(merged_model.model.layers):
    if i in layer_sizes:
        new_dim = layer_sizes[i]
        in_dim = layer.mlp.up_proj.weight.shape[1]
        layer.mlp.gate_proj = nn.Linear(in_dim, new_dim, bias=False)
        layer.mlp.up_proj = nn.Linear(in_dim, new_dim, bias=False)
        layer.mlp.down_proj = nn.Linear(new_dim, in_dim, bias=False)
        print(f"🔧 Layer {i}: MLP resized to {new_dim}")

# --- 가중치 로드 전 ---
merged_model.resize_token_embeddings(len(tokenizer))

# --- 가중치 로드 ---
state_dict = load_file(state_dict_path)
missing, unexpected = merged_model.load_state_dict(state_dict, strict=False)
print(f"Loaded weights (missing={len(missing)}, unexpected={len(unexpected)})")

device = "cuda" if torch.cuda.is_available() else "cpu"
merged_model.to(device)
merged_model.eval()

print("Pruned structure successfully restored for inference.")


🔧 Layer 0: MLP resized to 5357
🔧 Layer 1: MLP resized to 5357
🔧 Layer 2: MLP resized to 3802
🔧 Layer 3: MLP resized to 3802
🔧 Layer 4: MLP resized to 3802
🔧 Layer 5: MLP resized to 3802
🔧 Layer 6: MLP resized to 3802
🔧 Layer 7: MLP resized to 3802
🔧 Layer 8: MLP resized to 3802
🔧 Layer 9: MLP resized to 6413
🔧 Layer 10: MLP resized to 3802
🔧 Layer 11: MLP resized to 3802
🔧 Layer 12: MLP resized to 4879
🔧 Layer 13: MLP resized to 4907
🔧 Layer 14: MLP resized to 5368
🔧 Layer 15: MLP resized to 4924
🔧 Layer 16: MLP resized to 5093
🔧 Layer 17: MLP resized to 6082
🔧 Layer 18: MLP resized to 6912
🔧 Layer 19: MLP resized to 6912
🔧 Layer 20: MLP resized to 6912
🔧 Layer 21: MLP resized to 6912
🔧 Layer 22: MLP resized to 6912
🔧 Layer 23: MLP resized to 6912
🔧 Layer 24: MLP resized to 6912
🔧 Layer 25: MLP resized to 6912
✅ Loaded weights (missing=1, unexpected=0)
🚀 Pruned structure successfully restored for inference.


규칙 기반 하이브리드 추론 함수

In [62]:
def hybrid_infer(user_question, temp=None, hum=None, co2=None, light=None, max_new_tokens=128):
    RANGES = {"temp": (20,25), "hum": (65,75), "co2": (800,1000), "light": (45000,70000)}
    alerts = []
    if temp is not None:
        lo, hi = RANGES["temp"]
        if temp > hi: alerts.append(f"TEMP_HIGH({int(temp)})")
        elif temp < lo: alerts.append(f"TEMP_LOW({int(temp)})")
    if hum is not None:
        lo, hi = RANGES["hum"]
        if hum > hi: alerts.append(f"HUM_HIGH({int(hum)})")
        elif hum < lo: alerts.append(f"HUM_LOW({int(hum)})")
    if co2 is not None:
        lo, hi = RANGES["co2"]
        if co2 > hi: alerts.append(f"CO2_HIGH({int(co2)})")
        elif co2 < lo: alerts.append(f"CO2_LOW({int(co2)})")
    if light is not None:
        lo, hi = RANGES["light"]
        if light < lo: alerts.append(f"LIGHT_LOW({int(light)})")

    prefix = f"[STATUS: {alerts[0]}]" if alerts else "[STATUS: OK]"
    prompt = f"### Instruction:\n{prefix} {user_question}\n\n### Response:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = merged_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.25,
            top_p=0.9,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = decoded.split("### Response:")[-1].strip()

    print(f"\n[STATUS PROMPT] {prefix}")
    print("🧩 Question:", user_question)
    print("💬 Model Response:\n", response)

추론 테스트

In [63]:
print("\n[정상 상태 예시]")
hybrid_infer("How should I care for my tomato plants this week?", temp=23, hum=70, co2=900, light=50000)

print("\n[온도 높은 상황 예시]")
hybrid_infer("temperature is good?", temp=3335, hum=70, co2=900, light=50000)

print("\n[습도 낮은 상황 예시]")
hybrid_infer("is the enviroment okay?", temp=22, hum=1240, co2=850, light=48000)


[정상 상태 예시]

[STATUS PROMPT] [STATUS: OK]
🧩 Question: How should I care for my tomato plants this week?
💬 Model Response:
 [STATUS: OK] Maintain 60-70% humidity with minimum 20,000 lux supplemental lighting. Add calcium foliar spray at 150ppm and magnesium sulfate at 0.5%.

[온도 높은 상황 예시]

[STATUS PROMPT] [STATUS: TEMP_HIGH(3335)]
🧩 Question: temperature is good?
💬 Model Response:

[습도 낮은 상황 예시]

[STATUS PROMPT] [STATUS: HUM_HIGH(1240)]
🧩 Question: is the enviroment okay?
💬 Model Response:
