In [None]:
import os
# cuBLAS deterministic mode (use one of the two supported values) — must be set before importing torch
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"   # the larger-workspace option

# (Optional) other reproducibility-related settings
# NOTE(review): assigning PYTHONHASHSEED here, after the interpreter has started,
# presumably only affects child processes, not this process's own str hashing — confirm.
os.environ["PYTHONHASHSEED"] = "42"
import random, numpy as np, torch

# Fully deterministic mode: warn_only=False asks torch to raise instead of warn
# when an operation without a deterministic implementation is used.
torch.use_deterministic_algorithms(True, warn_only=False)

# cuDNN / TF32 settings: deterministic kernels, no autotuning, TF32 disabled
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

# Fix the seed of every random number generator used here (Python, NumPy, torch CPU/CUDA)
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


import argparse, io, base64, re, requests
from PIL import Image
import pandas as pd
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig, AutoTokenizer


# System prompt text: global output-format rules followed by per-task styles
# (captioning, vqa, math_reasoning, summarization, text_qa). The prompt tells the
# model to use the [TASK_HINT] tag when present and to infer the task otherwise.
# NOTE: every character below is runtime input to the model; editing the text
# changes the prompt, so treat it as code rather than documentation.
SYSTEM_RULES = (
    "You are a multimodal assistant.\n"
    "\n"
    "GLOBAL FORMAT RULES:\n"
    "- Output exactly ONE final block with NO role tags, headers, lists, or extra sections.\n"
    "- Never enumerate (no (i)/(ii), numbers, bullets). Do NOT echo instructions.\n"
    "- Do NOT wrap output in quotes. No leading/trailing spaces or trailing newline.\n"
    "- Preserve original casing, punctuation, and spacing from the source when answering spans.\n"
    "\n"
    "TASK STYLES (use [TASK_HINT] when present; otherwise infer from inputs):\n"
    "1) captioning (image, no explicit question):\n"
    "   - Begin the caption with: 'The image is ...'\n"
    "   - Write a detailed description in 2–6 sentences (paragraph style), no labels.\n"
    "   - No quotes. No list formatting.\n"
    "\n"
    "2) vqa (image + question):\n"
    "   - Return ONLY the exact short answer span as plain text (one line).\n"
    "   - Do NOT add words, punctuation, or explanations. Do NOT normalize case or numbers.\n"
    "\n"
    "3) math_reasoning (word problems):\n"
    "   - You MUST output step-by-step reasoning FIRST, then a blank line, then the final line.\n"
    "   - Use the numbers from the problem in the steps. Keep units only if required by the answer.\n"
    "   - Do NOT output the final line until after the steps are written.\n"
    "   - Write ONLY 4–8 SHORT plain sentences, NO bullets/numbers/latex. NO introductions like 'To determine...' and NO variables such as x.\n"
    "   - Each sentence MUST include at least one inline computation marker like '<<a+b=c>>', '<<a-b=c>>', '<<a*b=c>>', or '<<a/b=c>>'. Use the given numbers directly.\n"
    "   - Keep the style compact as in the dataset examples.\n"
    "   - After the sentences, add ONE blank line and then the final line EXACTLY: '#### <final_answer>' (digits only; decimal allowed; no commas/units).\n"
    "   - Examples:\n"
    "     Dad gave Olaf 10 toy cars, Mom gave 10 + 5 = <<10+5=15>>15. Auntie gave 6, Uncle gave 6 - 1 = <<6-1=5>>5, Grandpa gave 2 * 5 = <<2*5=10>>10. All gifts total 10 + 15 + 6 + 5 + 10 = <<10+15+6+5+10=46>>46. Olaf now has 150 + 46 = <<150+46=196>>196.\n"
    "     \n"
    "     #### 196\n"
    "     He has 6 - 2 = <<6-2=4>>4 cats. He has 4 - 1 = <<4-1=3>>3 parrots. He has 4 + 6 = <<4+6=10>>10 snakes. Total pets 2 + 4 + 3 + 10 = <<2+4+3+10=19>>19.\n"
    "     \n"
    "     #### 19\n"
    "     She spend $56 because 7 * 8 = <<7*8=56>>56. She has 100 - 56 = <<100-56=44>>44 left. She can get 44 / 5 = <<44/5=8.8>>8.8 five-dollar bills → $40 because 8 * 5 = <<8*5=40>>40. Money left 44 - 40 = <<44-40=4>>4.\n"
    "     \n"
    "     #### 4\n"
    "     Mimi picked up 2 dozen seashells, which is <<2*12=24>> shells. Kyle found twice as many, so <<24*2=48>> shells. Leigh grabbed one-third of Kyle's shells, so <<48/3=16>> shells.\n"
    "     \n"
    "     #### 16\n"
    "\n"
    "4) summarization (long text, no direct question):\n"
    "   - If a formal act title exists (e.g., explicit short title line or '<TITLE> of <YEAR>'), OUTPUT EXACTLY '<TITLE> - <summary>' on ONE line.\n"
    "   - The part AFTER ' - ' MUST be a NON-EMPTY summary of 20–60 words (1–3 sentences). Title-only output is INVALID.\n"
    "   - If no clear title exists, output a single-line summary of 20–60 words (no line breaks).\n"
    "   - NEVER include raw headings like 'SECTION 1. SHORT TITLE.' or 'SEC.' in the output. Do NOT write 'Title - ...'. Start with the actual title text only.\n"
    "   - Do NOT mention 'image' or 'document'. Focus on the bill’s purpose, actions (Directs/Requires/Establishes/Authorizes), scope, and any key targets/timelines.\n"
    "   - If your draft lacks the ' - ' or has fewer than 20 words after it, treat it as INVALID and rewrite before finishing.\n"
    "   - Examples:\n"
    "     'Veterans Access to Timely Medical Appointments Act - Directs VA to implement a standardized scheduling policy with 7-day primary care and 14-day specialty targets, require reliable wait-time data, strengthen training and oversight, improve phone access, and publish performance reports.'\n"
    "     'King Holiday and Service Act of 1994 - Extends and revises the commission honoring Martin Luther King, Jr., expands service opportunities and grant authority, updates reporting and membership requirements, and aligns provisions in the National and Community Service Act.'\n"
    "     'Establishes the Commission To Assess the Nuclear Activities of the Islamic Republic of Iran which shall assess the status of, the relationship between, and the intentions behind the military and the civilian nuclear activities of the Islamic Republic of Iran.\n\nTerminates the Commission 60 days after submission of the report required under this Act.'\n"
    "\n"
    "5) text_qa (multiple questions provided):\n"
    "   - Return a Python-style dict string with ONLY one key: 'input_text'.\n"
    "   - Its value is a JSON-like list of answers in the SAME order as the questions.\n"
    "   - Example: {'input_text': ['ans1', 'ans2', 'ans3']}\n"
)


def build_prompt(row: dict) -> str:
    """Render the user-turn text for one dataset row.

    Tagged lines are emitted in a fixed order, each only when its value is
    truthy: ``[TASK_HINT]`` from ``row["task"]``, ``[QUESTION]`` from
    ``row["question"]`` and, for rows whose ``input_type`` equals ``"text"``,
    ``[TEXT]`` from ``row["input"]``. A closing instruction line is always
    appended.

    Args:
        row: Mapping with the optional keys ``task``, ``question``,
            ``input_type`` and ``input``.

    Returns:
        The prompt lines joined with newline characters.
    """
    parts = []

    task_hint = row.get("task")
    if task_hint:
        parts.append(f"[TASK_HINT]: {task_hint}")

    question = row.get("question")
    if question:
        parts.append(f"[QUESTION]: {question}")

    # The raw input is inlined only for text rows; other input types are
    # never placed into the prompt text by this function.
    if row.get("input_type") == "text":
        text_body = row.get("input")
        if text_body:
            parts.append(f"[TEXT]: {text_body}")

    return "\n".join([*parts, "Respond with the single best answer only."])

# <<< CHANGED: messages carry only an image placeholder; the PIL image itself is not put into them
def build_messages(row, has_image: bool):
    """Assemble the chat messages (system turn + user turn) for one row.

    Args:
        row: Dataset row forwarded to ``build_prompt`` to produce the user text.
        has_image: When truthy, an ``{"type": "image"}`` placeholder entry is
            placed before the text entry of the user turn. Only the
            placeholder is added; no image data is stored in the messages.

    Returns:
        A two-element list: the system message carrying ``SYSTEM_RULES`` and
        the user message carrying the optional placeholder plus the prompt.
    """
    image_part = [{"type": "image"}] if has_image else []
    text_part = {"type": "text", "text": build_prompt(row)}

    system_turn = {"role": "system", "content": [{"type":"text", "text": SYSTEM_RULES}]}
    user_turn = {"role": "user", "content": [*image_part, text_part]}
    return [system_turn, user_turn]

def load_image_any(x: str):
    """Best-effort loader that turns a URL or base64 string into an RGB image.

    Args:
        x: Either an ``http://`` / ``https://`` URL to download, or a
            strictly validated base64 string holding encoded image bytes.

    Returns:
        The decoded image converted to ``"RGB"``, or ``None`` when ``x`` is
        not a string or when downloading / decoding fails for any reason.
    """
    if not isinstance(x, str):
        return None

    is_url = x.startswith(("http://", "https://"))
    try:
        if is_url:
            response = requests.get(x, timeout=15)
            response.raise_for_status()
            payload = response.content
        else:
            payload = base64.b64decode(x, validate=True)
        return Image.open(io.BytesIO(payload)).convert("RGB")
    except Exception:
        # Deliberate best-effort: any failure is reported as "no image".
        return None

def find_id_col(df: pd.DataFrame):
    """Return the name of the identifier column, creating one if needed.

    An existing ``"ID"`` column is preferred, then ``"id"``. When neither is
    present, the current index values are copied into a new leading ``"ID"``
    column and the index is replaced by a default one. ``df`` is modified
    in place in that case.

    The fallback writes the column explicitly instead of relying on
    ``reset_index`` naming its output ``"index"``. That assumption fails for a
    named index (no ``"ID"`` column would be produced) and for frames that
    already hold a data column called ``"index"`` (that column would be
    renamed to ``"ID"`` by mistake).

    Args:
        df: Frame to inspect; mutated only when no id column exists.

    Returns:
        ``"ID"`` or ``"id"``; the returned name is always a column of ``df``.
    """
    for candidate in ("ID", "id"):
        if candidate in df.columns:
            return candidate

    # Materialise the index as the identifier, whatever the index is named.
    df.insert(0, "ID", df.index.to_numpy())
    # The old index now lives in "ID"; drop it rather than adding a second copy.
    df.reset_index(drop=True, inplace=True)
    return "ID"



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from peft import PeftModel

# Inference script: loads the 4-bit quantized base model plus a LoRA adapter,
# generates one answer per row of the test parquet file, and writes the
# (id, output) pairs to a CSV file.
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
data_path = '../data/deep_chal_multitask_dataset_test.parquet'
out_path = "finetune_submission.csv"

# 6560  -- NOTE(review): meaning of this number is not evident from this file; confirm or remove
LoRA_DIR = "./qwen2_5_vl_7b_lora_2"

# Load in 4-bit (saves memory): NF4 quantization with bfloat16 compute.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
device_map = "auto"

processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
base = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    trust_remote_code=True,
)
# Wrap the base model with the adapter stored under <LoRA_DIR>/lora_adapter.
model = PeftModel.from_pretrained(base, os.path.join(LoRA_DIR, "lora_adapter"))
model.eval()
print(f"Loaded model: {MODEL_NAME} on {model.device}")


df = pd.read_parquet(data_path, engine='fastparquet')
# find_id_col may add an "ID" column to df in place when none exists.
id_col = find_id_col(df)
print('Data loaded:', data_path)


# NOTE(review): do_sample=False is passed together with temperature/top_p;
# those two are presumably ignored in non-sampling decoding — confirm.
gen_kwargs = dict(
    max_new_tokens=352,
    temperature=0.1,
    top_p=1.0,
    do_sample=False,
    repetition_penalty=1.05,
)

out_rows = []
# One generate() call per row (no batching).
for _, r in tqdm(df.iterrows(), total=len(df)):
    row = r.to_dict()
    
    # Load the PIL image (only for rows whose input_type is "image"); None if loading fails
    image = load_image_any(row.get("input")) if row.get("input_type") == "image" else None
    
    # Pass only the image placeholder in the messages, not the image itself
    messages = build_messages(row, has_image=image is not None)
    
    # Render the chat template to a prompt string (tokenize=False);
    # tokenization happens in the processor call below.
    text_in = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Supply the image through the `images` argument instead of the message content
    inputs = processor(
        text=text_in,
        images=[image] if image is not None else None,
        padding=True,
        return_tensors="pt"
    ).to(model.device)
    

    with torch.inference_mode():
        generated_ids = model.generate(**inputs, **gen_kwargs)
        
    # Drop the first len(in_ids) ids of each output sequence (presumably the
    # echoed prompt), keeping only the newly generated ids.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    
    # Remove special tokens so only the assistant's answer text remains
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0].strip()

    
    # The output key is always "id", whichever column name id_col resolved to.
    out_rows.append({"id": r[id_col], "output": output_text})
    
    
pd.DataFrame(out_rows).to_csv(out_path, index=False, encoding='utf-8')
print(f"Saved: {out_path}")

Loading checkpoint shards: 100%|██████████| 5/5 [00:32<00:00,  6.52s/it]


Loaded model: Qwen/Qwen2.5-VL-7B-Instruct on cuda:0
Data loaded: ../data/deep_chal_multitask_dataset_test.parquet


100%|██████████| 2493/2493 [7:12:16<00:00, 10.40s/it]  

Saved: finetune_submission.csv



