In [7]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] =  "3"
# cuBLAS deterministic mode (use one of the two options) — must be set before importing torch
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"   # the larger workspace

# (Optional) other reproducibility-related settings
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ["PYTHONHASHSEED"] = "42"
import random, numpy as np, torch

# Fully deterministic mode: with warn_only=False, non-deterministic ops raise
# instead of merely warning.
torch.use_deterministic_algorithms(True, warn_only=False)

# cuDNN / TF32 settings
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

# Fix the seed of every random number generator used below
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

import argparse, io, base64, re, requests
from PIL import Image
import pandas as pd
from tqdm import tqdm
from transformers import (
    AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig, AutoTokenizer,
    Trainer, TrainingArguments
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from qwen_vl_utils import process_vision_info
from dataclasses import dataclass
from typing import List, Dict, Any

# System prompt used as the "system" message of every sample (see
# build_messages / build_messages_from_row). It consists of global output
# format rules followed by per-task style rules for the five task names
# captioning, vqa, math_reasoning, summarization and text_qa.
# The adjacent string literals are concatenated into a single string; this is
# runtime text sent to the model, so any edit changes the prompt itself.
SYSTEM_RULES = (
    "You are a multimodal assistant.\n"
    "\n"
    "GLOBAL FORMAT RULES:\n"
    "- Output exactly ONE final block with NO role tags, headers, lists, or extra sections.\n"
    "- Never enumerate (no (i)/(ii), numbers, bullets). Do NOT echo instructions.\n"
    "- Do NOT wrap output in quotes. No leading/trailing spaces or trailing newline.\n"
    "- Preserve original casing, punctuation, and spacing from the source when answering spans.\n"
    "\n"
    "TASK STYLES (use [TASK_HINT] when present; otherwise infer from inputs):\n"
    "1) captioning (image, no explicit question):\n"
    "   - Begin the caption with: 'The image is ...'\n"
    "   - Write a detailed description in 2–6 sentences (paragraph style), no labels.\n"
    "   - No quotes. No list formatting.\n"
    "\n"
    "2) vqa (image + question):\n"
    "   - Return ONLY the exact short answer span as plain text (one line).\n"
    "   - Do NOT add words, punctuation, or explanations. Do NOT normalize case or numbers.\n"
    "\n"
    "3) math_reasoning (word problems):\n"
    "   - You MUST output step-by-step reasoning FIRST, then a blank line, then the final line.\n"
    "   - Use the numbers from the problem in the steps. Keep units only if required by the answer.\n"
    "   - Do NOT output the final line until after the steps are written.\n"
    "   - Write ONLY 4–8 SHORT plain sentences, NO bullets/numbers/latex. NO introductions like 'To determine...' and NO variables such as x.\n"
    "   - Each sentence MUST include at least one inline computation marker like '<<a+b=c>>', '<<a-b=c>>', '<<a*b=c>>', or '<<a/b=c>>'. Use the given numbers directly.\n"
    "   - Keep the style compact as in the dataset examples.\n"
    "   - After the sentences, add ONE blank line and then the final line EXACTLY: '#### <final_answer>' (digits only; decimal allowed; no commas/units).\n"
    "   - Examples:\n"
    "     Dad gave Olaf 10 toy cars, Mom gave 10 + 5 = <<10+5=15>>15. Auntie gave 6, Uncle gave 6 - 1 = <<6-1=5>>5, Grandpa gave 2 * 5 = <<2*5=10>>10. All gifts total 10 + 15 + 6 + 5 + 10 = <<10+15+6+5+10=46>>46. Olaf now has 150 + 46 = <<150+46=196>>196.\n"
    "     \n"
    "     #### 196\n"
    "     He has 6 - 2 = <<6-2=4>>4 cats. He has 4 - 1 = <<4-1=3>>3 parrots. He has 4 + 6 = <<4+6=10>>10 snakes. Total pets 2 + 4 + 3 + 10 = <<2+4+3+10=19>>19.\n"
    "     \n"
    "     #### 19\n"
    "     She spend $56 because 7 * 8 = <<7*8=56>>56. She has 100 - 56 = <<100-56=44>>44 left. She can get 44 / 5 = <<44/5=8.8>>8.8 five-dollar bills → $40 because 8 * 5 = <<8*5=40>>40. Money left 44 - 40 = <<44-40=4>>4.\n"
    "     \n"
    "     #### 4\n"
    "     Mimi picked up 2 dozen seashells, which is <<2*12=24>> shells. Kyle found twice as many, so <<24*2=48>> shells. Leigh grabbed one-third of Kyle's shells, so <<48/3=16>> shells.\n"
    "     \n"
    "     #### 16\n"
    "\n"
    "4) summarization (long text, no direct question):\n"
    "   - If a formal act title exists (e.g., explicit short title line or '<TITLE> of <YEAR>'), OUTPUT EXACTLY '<TITLE> - <summary>' on ONE line.\n"
    "   - The part AFTER ' - ' MUST be a NON-EMPTY summary of 20–60 words (1–3 sentences). Title-only output is INVALID.\n"
    "   - If no clear title exists, output a single-line summary of 20–60 words (no line breaks).\n"
    "   - NEVER include raw headings like 'SECTION 1. SHORT TITLE.' or 'SEC.' in the output. Do NOT write 'Title - ...'. Start with the actual title text only.\n"
    "   - Do NOT mention 'image' or 'document'. Focus on the bill’s purpose, actions (Directs/Requires/Establishes/Authorizes), scope, and any key targets/timelines.\n"
    "   - If your draft lacks the ' - ' or has fewer than 20 words after it, treat it as INVALID and rewrite before finishing.\n"
    "   - Examples:\n"
    "     'Veterans Access to Timely Medical Appointments Act - Directs VA to implement a standardized scheduling policy with 7-day primary care and 14-day specialty targets, require reliable wait-time data, strengthen training and oversight, improve phone access, and publish performance reports.'\n"
    "     'King Holiday and Service Act of 1994 - Extends and revises the commission honoring Martin Luther King, Jr., expands service opportunities and grant authority, updates reporting and membership requirements, and aligns provisions in the National and Community Service Act.'\n"
    "     'Establishes the Commission To Assess the Nuclear Activities of the Islamic Republic of Iran which shall assess the status of, the relationship between, and the intentions behind the military and the civilian nuclear activities of the Islamic Republic of Iran.\n\nTerminates the Commission 60 days after submission of the report required under this Act.'\n"
    "\n"
    "5) text_qa (multiple questions provided):\n"
    "   - Return a Python-style dict string with ONLY one key: 'input_text'.\n"
    "   - Its value is a JSON-like list of answers in the SAME order as the questions.\n"
    "   - Example: {'input_text': ['ans1', 'ans2', 'ans3']}\n"
)


def build_prompt(row: dict) -> str:
    """Assemble the user-turn text for one sample.

    Tagged lines are emitted in a fixed order, each only when its source
    field is truthy:

    * ``[TASK_HINT]: <task>`` from ``row["task"]``
    * ``[QUESTION]: <question>`` from ``row["question"]``
    * ``[TEXT]: <input>`` from ``row["input"]``, only when
      ``row["input_type"]`` equals ``"text"``

    A fixed closing instruction is always appended last.

    Args:
        row: Mapping holding the sample fields; missing keys are tolerated.

    Returns:
        The prompt lines joined with newline characters.
    """
    task_hint = row.get("task")
    question = row.get("question")
    text_body = row.get("input")

    segments = []
    if task_hint:
        segments.append(f"[TASK_HINT]: {task_hint}")
    if question:
        segments.append(f"[QUESTION]: {question}")
    # The input type is checked before the payload, so a non-text payload is
    # never evaluated for truthiness.
    if row.get("input_type") == "text" and text_body:
        segments.append(f"[TEXT]: {text_body}")
    segments.append("Respond with the single best answer only.")
    return "\n".join(segments)

# CHANGED: the messages carry only an image placeholder; the PIL image itself
# is not embedded in them.
def build_messages(row, has_image: bool):
    """Build system + user chat messages with an optional image placeholder.

    Args:
        row: Sample mapping forwarded to ``build_prompt``.
        has_image: When true, a bare ``{"type": "image"}`` entry (no image
            payload) is placed ahead of the text entry in the user content.

    Returns:
        A two-element list: the system message wrapping ``SYSTEM_RULES`` as a
        text part, and the user message.
    """
    text_part = {"type": "text", "text": build_prompt(row)}
    placeholder_parts = [{"type": "image"}] if has_image else []  # placeholder only
    return [
        {"role": "system", "content": [{"type": "text", "text": SYSTEM_RULES}]},
        {"role": "user", "content": placeholder_parts + [text_part]},
    ]

def build_messages_from_row(row: dict) -> list:
    """Create the chat ``messages`` (list[dict]) for a single sample.

    When ``row["input_type"]`` equals ``"image"``, ``row["input"]`` is handed
    to ``load_image_any``; if that yields an image, the image object itself is
    placed in the user content ahead of the text prompt. Exactly one image is
    handled per row.

    Args:
        row: Sample mapping; also forwarded to ``build_prompt``.

    Returns:
        A two-element list: the system message, whose content is the
        ``SYSTEM_RULES`` string as-is, and the user message.
    """
    # Image part of the prompt
    picture = None
    if row.get("input_type") == "image":
        picture = load_image_any(row.get("input"))
    # The image object is embedded directly, not a placeholder.
    image_parts = [] if picture is None else [{"type": "image", "image": picture}]

    # Text part of the prompt
    text_part = {"type": "text", "text": build_prompt(row)}

    return [
        {"role": "system", "content": SYSTEM_RULES},  # plain string
        {"role": "user", "content": image_parts + [text_part]},
    ]

def load_image_any(x: str):
    """Best-effort loader turning a URL or base64 string into an RGB image.

    Args:
        x: Either an ``http://`` / ``https://`` URL, or a strictly validated
            base64 string holding encoded image bytes.

    Returns:
        The decoded image converted to ``"RGB"``, or ``None`` when ``x`` is
        not a string or when fetching/decoding fails for any reason.
    """
    if not isinstance(x, str):
        return None
    try:
        if x.startswith(("http://", "https://")):
            response = requests.get(x, timeout=15)
            response.raise_for_status()
            payload = response.content
        else:
            payload = base64.b64decode(x, validate=True)
        return Image.open(io.BytesIO(payload)).convert("RGB")
    except Exception:
        # Deliberately best-effort: any failure is reported as "no image".
        return None

def find_id_col(df: pd.DataFrame) -> str:
    """Return the name of the ID column, creating one from the index if absent.

    If ``df`` already has a column named ``"ID"`` or ``"id"`` (checked in that
    order), its name is returned and ``df`` is left untouched. Otherwise
    ``df`` is modified in place: the current index values are stored in a new
    first column named ``"ID"`` and the index is replaced by a default one.

    The index values are captured explicitly rather than relying on
    ``reset_index()`` producing a column literally called ``"index"``. That
    assumption breaks for a named index (no ``"ID"`` column would be created)
    and for frames that already contain an ``"index"`` column (that data
    column would be renamed by mistake).

    Args:
        df: Frame to inspect; mutated in place when no ID column exists.

    Returns:
        ``"ID"`` or ``"id"`` — always a column that exists in ``df`` on return.
    """
    for candidate in ("ID", "id"):
        if candidate in df.columns:
            return candidate

    # Copy the index values before the index is dropped.
    index_values = df.index.to_numpy(copy=True)
    df.reset_index(drop=True, inplace=True)
    df.insert(0, "ID", index_values)
    return "ID"

# Guess the answer column used for training: probe candidates in priority order.
def get_target_text(row: dict) -> str:
    """Return the first non-blank string answer found in ``row``.

    The keys ``"output"``, ``"answer"``, ``"label"`` and ``"target"`` are
    probed in that order. Values that are not ``str`` and strings that are
    empty after stripping are skipped.

    Args:
        row: Sample mapping.

    Returns:
        The stripped answer text, or ``""`` when no candidate qualifies.
    """
    for key in ("output", "answer", "label", "target"):
        candidate = row.get(key)
        if not isinstance(candidate, str):
            continue
        stripped = candidate.strip()
        if stripped:
            return stripped
    # Nothing usable: return an empty string (original note: "no loss").
    return ""

In [8]:
from transformers import Qwen2_5_VLForConditionalGeneration

# -----------------------------
# Paths / model / settings
# -----------------------------
# Repeats the assignment already made in the first cell.
# NOTE(review): presumably has no effect once CUDA has been initialised — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] =  "3"
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"

DATA_PATH  = "../data/train.parquet"   # training data
VAL_PATH   = "../data/val.parquet"   # (optional) validation data
OUTPUT_DIR = "./qwen2_5_vl_7b_lora_2"

# -----------------------------
# 4-bit quantization
# -----------------------------
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
device_map = "auto"

# -----------------------------
# Processor & Model
# -----------------------------
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    #attn_implementation="flash_attention_2",
    trust_remote_code=True,
)
print(f"Loaded model: {MODEL_NAME} on {base.device}")


# -----------------------------
# Generation parameters (deterministic: sampling disabled)
# -----------------------------
# Not referenced anywhere else in the visible part of this file.
gen_kwargs = dict(
    max_new_tokens=352,
    temperature=None,
    top_p=None,
    do_sample=False,
    repetition_penalty=1.05,
    use_cache=True,
)


# -----------------------------
# QLoRA preparation
# -----------------------------
# 1) Prepare the quantized model for k-bit training
# NOTE(review): prepare_model_for_kbit_training may already enable gradient
# checkpointing and input gradients, which would make the two explicit calls
# below redundant — confirm against the installed PEFT version.
base = prepare_model_for_kbit_training(base)
base.config.use_cache = False
base.enable_input_require_grads()
base.gradient_checkpointing_enable()

# 2) LoRA targets: LLM + projector (the key part)
# NOTE(review): target names are presumably matched against module-name
# suffixes, so the "LLM" names could also match same-named layers outside the
# language model — verify with print_trainable_parameters / named_modules.
target_modules = [
    # LLM
    "q_proj","k_proj","v_proj","o_proj",
    "gate_proj","up_proj","down_proj",
    # Projector (visual.merger.mlp: 5120->3584)
    "merger.mlp.0","merger.mlp.2",
]

lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=target_modules,
    task_type=TaskType.CAUSAL_LM,
    bias="none"
)

model = get_peft_model(base, lora_cfg)
# NOTE(review): the base model was loaded with device_map="auto", so this
# explicit move looks redundant and may conflict with a multi-device
# placement — confirm before running on more than one GPU.
model.to("cuda")
model.print_trainable_parameters()
# -----------------------------
# Dataset
# -----------------------------
class VLParquetDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns parquet rows into SFT training samples.

    The parquet file is read once (pyarrow engine) in ``__init__``. The ID
    column is resolved with ``find_id_col``, which may add an ``"ID"`` column
    to the frame in place. Prompt rendering uses the module-level
    ``processor``.
    """

    def __init__(self, parquet_path: str):
        """Load ``parquet_path`` into memory and resolve the ID column."""
        self.df = pd.read_parquet(parquet_path, engine="pyarrow")
        self.id_col = find_id_col(self.df)

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build one training sample from the row at position ``idx``.

        Returns:
            A dict with the keys:

            * ``"messages"``    – chat messages (used to regenerate the prompt)
            * ``"prompt_text"`` – rendered prompt incl. the assistant header
              (used to measure the prefix length)
            * ``"full_text"``   – prompt + answer + ``<|im_end|>`` (the actual
              model input)
            * ``"image"``       – the decoded image, or ``None``
            * ``"id"``          – value of the ID column for this row
        """
        row = self.df.iloc[idx].to_dict()
        messages = build_messages_from_row(row)

        # Training text: prompt (+ assistant header) + answer + <|im_end|>
        prompt_text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        target = get_target_text(row)
        # An empty target is tolerated, but real SFT data should have
        # non-empty answers.
        full_text = f"{prompt_text}{target}<|im_end|>"

        # Reuse the image that build_messages_from_row already decoded and
        # embedded in the user message (the last message) instead of decoding
        # or downloading the same payload a second time.
        image = next(
            (
                part.get("image")
                for part in messages[-1]["content"]
                if isinstance(part, dict) and part.get("type") == "image"
            ),
            None,
        )
        return {
            "messages": messages,           # for regenerating the prompt
            "prompt_text": prompt_text,     # for computing the prefix length
            "full_text": full_text,         # actual model input
            "image": image,
            "id": row[self.id_col],
        }

# Build the training dataset; the validation dataset is created only when the
# validation parquet file exists on disk, otherwise eval_ds is None.
train_ds = VLParquetDataset(DATA_PATH)
eval_ds  = VLParquetDataset(VAL_PATH) if os.path.exists(VAL_PATH) else None
print("DATASET LOADED")

# -----------------------------
# Collator: batch encoding + label masking
# -----------------------------
@dataclass
class VLSFTCollator:
    """Collate dataset samples into a model-ready batch with masked labels.

    Each feature must provide ``"messages"``, ``"full_text"`` and
    ``"prompt_text"`` (as produced by the dataset in this file). The returned
    batch holds everything the processor emits for the full texts plus a
    ``"labels"`` tensor in which prompt tokens and padding are set to -100 so
    that only answer tokens contribute to the loss.
    """

    processor: Any
    max_length: int = None  # optional cap on the token count; None = no cap

    @staticmethod
    def _build_labels(input_ids, attention_mask, prompt_lens):
        """Return a copy of ``input_ids`` with prompt and padding set to -100.

        Works for both right- and left-padded batches: the prompt span of each
        row starts at that row's first attended position.

        Args:
            input_ids: Integer tensor of token ids, one row per sample.
            attention_mask: Tensor of the same shape; 1 marks real tokens,
                0 marks padding.
            prompt_lens: Per-row number of prompt (prefix) tokens.

        Returns:
            A new tensor; ``input_ids`` itself is not modified.
        """
        labels = input_ids.clone()
        # Padding must never contribute to the loss.
        labels[attention_mask == 0] = -100
        for row, n_prompt in enumerate(prompt_lens):
            attended = torch.nonzero(attention_mask[row], as_tuple=True)[0]
            if n_prompt <= 0 or attended.numel() == 0:
                continue
            start = int(attended[0])
            labels[row, start:start + int(n_prompt)] = -100
        return labels

    def __call__(self, features: List[Dict[str,Any]]) -> Dict[str, torch.Tensor]:
        """Encode ``features`` and attach prompt/padding-masked ``labels``."""
        # 1) Collect the vision inputs referenced by the message lists
        messages_list = [f["messages"] for f in features]
        image_inputs, _ = process_vision_info(messages_list)

        # 2) Gather the texts
        texts_full   = [f["full_text"] for f in features]
        texts_prompt = [f["prompt_text"] for f in features]

        # 3) Batch-encode the actual training input (prompt + answer)
        inputs_full = self.processor(
            text=texts_full,
            images=image_inputs,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # 4) Encode the prompts alone, with the same images, to measure how
        #    many tokens each prompt occupies.
        inputs_pref = self.processor(
            text=texts_prompt,
            images=image_inputs,
            padding="max_length",
            max_length=inputs_full["input_ids"].shape[1],
            truncation=True,
            return_tensors="pt",
        )
        # Only the token counts are used, so the padding side of the prompt
        # encoding cannot shift the mask.
        prompt_lens = inputs_pref["attention_mask"].sum(dim=1).tolist()

        # 5) Labels: -100 on prompt tokens and on padding
        labels = self._build_labels(
            inputs_full["input_ids"], inputs_full["attention_mask"], prompt_lens
        )

        batch = dict(inputs_full)
        batch["labels"] = labels
        return batch

# Collator instance handed to the Trainer; max_length=None is passed through
# to the processor as-is, i.e. no explicit token cap is set here.
collator = VLSFTCollator(processor, max_length=None)



Loading checkpoint shards: 100%|██████████| 5/5 [00:22<00:00,  4.41s/it]


Loaded model: Qwen/Qwen2.5-VL-7B-Instruct on cuda:0
trainable params: 47,892,480 || all params: 8,340,059,136 || trainable%: 0.5742
DATASET LOADED


In [9]:
# -----------------------------
# Training configuration
# -----------------------------
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,          # adjust to the available VRAM
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=1,
    learning_rate=1e-4,                     # 1e-4 ~ 2e-4 is commonly used for LoRA
    num_train_epochs=2,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    # NOTE(review): the evaluation strategy below is commented out, so
    # eval_steps presumably has no effect and eval_ds is never evaluated
    # during training — confirm.
    #evaluation_strategy="steps" if eval_ds is not None else "no",
    save_strategy="steps",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    bf16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",               # bitsandbytes optimizer
    ddp_find_unused_parameters=False,
    report_to="none",
    # Keep the custom keys produced by the dataset ("messages", "full_text",
    # ...) so the collator still receives them.
    remove_unused_columns=False
)

# Disable the KV cache (avoids the conflict with gradient checkpointing)
model.config.use_cache = False

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=collator,
)


print("Starting training...")
trainer.train()

# Save the LoRA adapter under OUTPUT_DIR/lora_adapter and the processor
# files under OUTPUT_DIR
trainer.model.save_pretrained(os.path.join(OUTPUT_DIR, "lora_adapter"))
processor.save_pretrained(OUTPUT_DIR)

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
50,6.4441
100,4.9539
150,4.7287
200,4.2692
250,4.0603
300,4.2744
350,4.0
400,4.2225
450,4.1456
500,4.0159


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

[]