In [1]:
#!/usr/bin/env python
# coding: utf-8
#
# 檔名: Carl_VLM_test_optimized.py
# 說明: 此版本根據效能回饋進行了優化。
#      移除了有問題的批次處理邏輯，改回經過驗證更為高效的「單張循序處理」模式。
#      預期執行速度將與 CAR_VLM訓練.py 相當 (約 3-4 秒/張)。
#

# --- 0. 基本函式庫導入 ---
import os
import pandas as pd
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig
from tqdm.notebook import tqdm # 在 Jupyter Notebook 中提供美觀的進度條
import glob
import time
from PIL import Image # 使用 PIL 進行高效的圖片讀取

# ====================================================================
# --- 1. 使用者參數設定 (請在此處修改) ---
# ====================================================================

# 選擇要處理的資料集類型：'train' 或 'test'
DATASET_TYPE = 'test'

# BATCH_SIZE 參數在此版本中不再使用，但保留以便未來切換
# BATCH_SIZE = 8 # (已停用)

# ====================================================================
# --- 2. 組態設定 (通常不需修改) ---
# ====================================================================
class Config:
    BASE_DATA_DIR = "./"
    MODEL_ID = "llava-hf/llava-1.5-7b-hf"
    PROMPT = (
        "You are an expert in traffic safety analysis. "
        "Describe this dashcam image in one sentence focusing on potential hazards, "
        "such as nearby vehicles' actions, pedestrians, road conditions, and the ego car's movement. "
        "If no immediate hazard is visible, describe the general traffic scene."
    )
    # --- 優化設定 ---
    RESIZE_WIDTH = 800
    # 在 Windows 上安裝 flash-attn 有困難，建議設為 False
    USE_FLASH_ATTENTION_2 = False


# --- 3. 模型載入函式 ---
def load_model_and_processor(config):
    """載入 LLaVA 模型與處理器到 GPU。"""
    print("正在載入模型，請稍候...")
    if not torch.cuda.is_available():
        raise ConnectionError("沒有偵測到 CUDA GPU。此模型需要在 GPU 上運行。")
    print(f"偵測到 CUDA 設備: {torch.cuda.get_device_name(0)}")

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    attn_implementation = "flash_attention_2" if config.USE_FLASH_ATTENTION_2 else "eager"

    model = LlavaForConditionalGeneration.from_pretrained(
        config.MODEL_ID,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        quantization_config=quantization_config,
        attn_implementation=attn_implementation,
    )
    processor = AutoProcessor.from_pretrained(config.MODEL_ID)
    print(f"模型載入完成！(Attention Implementation: {attn_implementation})")
    return model, processor

# --- 4. 圖片描述生成函式 (修改為高效的單張處理模式) ---
def generate_caption_for_image(model, processor, image_path, prompt, resize_width):
    """
    為單張圖片生成描述。
    此函式直接採用 CAR_VLM訓練.py 中經過驗證的高效寫法。
    """
    try:
        # 使用 PIL 直接讀取和轉換圖片，更為高效
        raw_image = Image.open(image_path).convert('RGB')

        if resize_width:
            width, height = raw_image.size
            if width > resize_width:
                new_width = resize_width
                new_height = int(new_width * height / width)
                raw_image = raw_image.resize((new_width, new_height))

        final_prompt = f"USER: <image>\n{prompt}\nASSISTANT:"
        # 處理單張圖片，不需 padding，且明確指定 float16 型別
        inputs = processor(text=final_prompt, images=raw_image, return_tensors="pt").to("cuda", torch.float16)
        
        output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        
        decoded_text = processor.decode(output[0], skip_special_tokens=True)
        caption = decoded_text.split("ASSISTANT:")[-1].strip()
        return caption
    except Exception as e:
        print(f"處理圖片 {image_path} 時發生錯誤: {e}")
        return "Error generating caption."

# --- 5. 資料集處理主函式 (修改為循序處理邏輯) ---
def process_dataset(model, processor, config, csv_path, data_type, data_folder, output_csv_path):
    """處理單一資料集 (road 或 freeway) 的主流程。"""
    if not os.path.exists(csv_path):
        print(f"警告：找不到 CSV 檔案 {csv_path}。跳過此資料集處理。")
        return

    print(f"\n--- 開始處理 {data_type} ({data_folder} set) ---")
    df = pd.read_csv(csv_path)

    # 斷點續傳邏輯
    if os.path.exists(output_csv_path):
        print(f"偵測到已存在的輸出檔 {output_csv_path}。將從上次中斷處繼續。")
        output_df = pd.read_csv(output_csv_path)
        if 'captions' not in output_df.columns: output_df['captions'] = pd.NA
        if 'image_count' not in output_df.columns: output_df['image_count'] = 0
    else:
        output_df = df.copy()
        output_df['image_count'] = 0
        output_df['captions'] = pd.NA

    # 外層迴圈，處理每個影片切成的圖片資料夾
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"處理 {data_type} 進度"):
        # 檢查是否已處理過
        if pd.notna(output_df.at[index, 'captions']) and output_df.at[index, 'captions'] != "":
            continue

        file_name = row['file_name']
        image_dir = os.path.join(config.BASE_DATA_DIR, data_type, data_folder, file_name)

        if not os.path.isdir(image_dir):
            output_df.at[index, 'captions'] = "Directory not found."
            continue

        image_files = sorted(glob.glob(os.path.join(image_dir, '*.[jJ][pP][gG]')))
        output_df.at[index, 'image_count'] = len(image_files)

        if not image_files:
            output_df.at[index, 'captions'] = "No images found."
            continue

        all_captions = []
        
        # 內層迴圈：一張一張處理圖片，並顯示進度
        # 使用 leave=False 讓內層進度條在完成後消失，保持介面整潔
        for image_path in tqdm(image_files, desc=f"  -> {file_name}", leave=False):
            caption = generate_caption_for_image(model, processor, image_path, config.PROMPT, config.RESIZE_WIDTH)
            all_captions.append(caption)

        combined_captions = " | ".join(all_captions)
        output_df.at[index, 'captions'] = combined_captions

        # 每處理完一個 file_name 就存檔一次，確保進度不遺失
        output_df.to_csv(output_csv_path, index=False)

    print(f"\n--- {data_type} ({data_folder} set) 處理完成！結果已儲存至: {output_csv_path} ---")


# ====================================================================
# --- 6. 主程式執行區塊 ---
# ====================================================================
if __name__ == '__main__':
    try:
        # 根據使用者設定的參數決定檔案名稱
        data_folder = DATASET_TYPE
    
        road_csv_path = os.path.join(Config.BASE_DATA_DIR, f'road_{data_folder}.csv')
        freeway_csv_path = os.path.join(Config.BASE_DATA_DIR, f'freeway_{data_folder}.csv')
    
        output_road_csv_path = os.path.join(Config.BASE_DATA_DIR, f'road_{data_folder}_with_captions.csv')
        output_freeway_csv_path = os.path.join(Config.BASE_DATA_DIR, f'freeway_{data_folder}_with_captions.csv')
    
        # 載入模型
        main_model, main_processor = load_model_and_processor(Config)
    
        print(f"\n*** 開始處理 {data_folder.upper()} 資料集 (採用高效循序模式) ***")
    
        # 呼叫處理函式
        process_dataset(main_model, main_processor, Config, road_csv_path, 'road', data_folder, output_road_csv_path)
        process_dataset(main_model, main_processor, Config, freeway_csv_path, 'freeway', data_folder, output_freeway_csv_path)
    
        print("\n*** 所有任務執行完畢！ ***")
    
    except Exception as e:
        import traceback
        print(f"\n程式執行時發生嚴重錯誤: {e}")
        traceback.print_exc()

正在載入模型，請稍候...
偵測到 CUDA 設備: NVIDIA GeForce RTX 3060 Ti


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


模型載入完成！(Attention Implementation: eager)

*** 開始處理 TEST 資料集 (採用高效循序模式) ***

--- 開始處理 road (test set) ---
偵測到已存在的輸出檔 ./road_test_with_captions.csv。將從上次中斷處繼續。


處理 road 進度:   0%|          | 0/121 [00:00<?, ?it/s]


--- road (test set) 處理完成！結果已儲存至: ./road_test_with_captions.csv ---

--- 開始處理 freeway (test set) ---
偵測到已存在的輸出檔 ./freeway_test_with_captions.csv。將從上次中斷處繼續。


處理 freeway 進度:   0%|          | 0/122 [00:00<?, ?it/s]

  -> freeway_0180:   0%|          | 0/89 [00:00<?, ?it/s]

  output_df.at[index, 'captions'] = combined_captions


  -> freeway_0181:   0%|          | 0/95 [00:00<?, ?it/s]

  -> freeway_0182:   0%|          | 0/102 [00:00<?, ?it/s]

  -> freeway_0183:   0%|          | 0/109 [00:00<?, ?it/s]

  -> freeway_0184:   0%|          | 0/111 [00:00<?, ?it/s]

  -> freeway_0185:   0%|          | 0/159 [00:00<?, ?it/s]

  -> freeway_0186:   0%|          | 0/118 [00:00<?, ?it/s]

  -> freeway_0187:   0%|          | 0/90 [00:00<?, ?it/s]

  -> freeway_0188:   0%|          | 0/45 [00:00<?, ?it/s]

  -> freeway_0189:   0%|          | 0/73 [00:00<?, ?it/s]

  -> freeway_0190:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0191:   0%|          | 0/67 [00:00<?, ?it/s]

  -> freeway_0192:   0%|          | 0/97 [00:00<?, ?it/s]

  -> freeway_0193:   0%|          | 0/97 [00:00<?, ?it/s]

  -> freeway_0194:   0%|          | 0/21 [00:00<?, ?it/s]

  -> freeway_0195:   0%|          | 0/90 [00:00<?, ?it/s]

  -> freeway_0196:   0%|          | 0/70 [00:00<?, ?it/s]

  -> freeway_0197:   0%|          | 0/104 [00:00<?, ?it/s]

  -> freeway_0198:   0%|          | 0/47 [00:00<?, ?it/s]

  -> freeway_0199:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0200:   0%|          | 0/106 [00:00<?, ?it/s]

  -> freeway_0201:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0202:   0%|          | 0/103 [00:00<?, ?it/s]

  -> freeway_0203:   0%|          | 0/102 [00:00<?, ?it/s]

  -> freeway_0204:   0%|          | 0/111 [00:00<?, ?it/s]

  -> freeway_0205:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0206:   0%|          | 0/33 [00:00<?, ?it/s]

  -> freeway_0207:   0%|          | 0/48 [00:00<?, ?it/s]

  -> freeway_0208:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0209:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0210:   0%|          | 0/64 [00:00<?, ?it/s]

  -> freeway_0211:   0%|          | 0/152 [00:00<?, ?it/s]

  -> freeway_0212:   0%|          | 0/66 [00:00<?, ?it/s]

  -> freeway_0213:   0%|          | 0/70 [00:00<?, ?it/s]

  -> freeway_0214:   0%|          | 0/134 [00:00<?, ?it/s]

  -> freeway_0215:   0%|          | 0/112 [00:00<?, ?it/s]

  -> freeway_0216:   0%|          | 0/133 [00:00<?, ?it/s]

  -> freeway_0217:   0%|          | 0/75 [00:00<?, ?it/s]

  -> freeway_0218:   0%|          | 0/86 [00:00<?, ?it/s]

  -> freeway_0219:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0220:   0%|          | 0/94 [00:00<?, ?it/s]

  -> freeway_0221:   0%|          | 0/106 [00:00<?, ?it/s]

  -> freeway_0222:   0%|          | 0/108 [00:00<?, ?it/s]

  -> freeway_0223:   0%|          | 0/96 [00:00<?, ?it/s]

  -> freeway_0224:   0%|          | 0/40 [00:00<?, ?it/s]

  -> freeway_0225:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0226:   0%|          | 0/107 [00:00<?, ?it/s]

  -> freeway_0227:   0%|          | 0/81 [00:00<?, ?it/s]

  -> freeway_0228:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0229:   0%|          | 0/124 [00:00<?, ?it/s]

  -> freeway_0230:   0%|          | 0/162 [00:00<?, ?it/s]

  -> freeway_0231:   0%|          | 0/65 [00:00<?, ?it/s]

  -> freeway_0232:   0%|          | 0/37 [00:00<?, ?it/s]

  -> freeway_0233:   0%|          | 0/48 [00:00<?, ?it/s]

  -> freeway_0234:   0%|          | 0/88 [00:00<?, ?it/s]

  -> freeway_0235:   0%|          | 0/29 [00:00<?, ?it/s]

  -> freeway_0236:   0%|          | 0/23 [00:00<?, ?it/s]

  -> freeway_0237:   0%|          | 0/63 [00:00<?, ?it/s]

  -> freeway_0238:   0%|          | 0/123 [00:00<?, ?it/s]

  -> freeway_0239:   0%|          | 0/84 [00:00<?, ?it/s]

  -> freeway_0240:   0%|          | 0/54 [00:00<?, ?it/s]

  -> freeway_0241:   0%|          | 0/128 [00:00<?, ?it/s]

  -> freeway_0242:   0%|          | 0/136 [00:00<?, ?it/s]

  -> freeway_0243:   0%|          | 0/109 [00:00<?, ?it/s]

  -> freeway_0244:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0245:   0%|          | 0/25 [00:00<?, ?it/s]

  -> freeway_0246:   0%|          | 0/117 [00:00<?, ?it/s]

  -> freeway_0247:   0%|          | 0/69 [00:00<?, ?it/s]

  -> freeway_0248:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0249:   0%|          | 0/20 [00:00<?, ?it/s]

  -> freeway_0250:   0%|          | 0/51 [00:00<?, ?it/s]

  -> freeway_0251:   0%|          | 0/69 [00:00<?, ?it/s]

  -> freeway_0252:   0%|          | 0/127 [00:00<?, ?it/s]

  -> freeway_0253:   0%|          | 0/114 [00:00<?, ?it/s]

  -> freeway_0254:   0%|          | 0/10 [00:00<?, ?it/s]

  -> freeway_0255:   0%|          | 0/140 [00:00<?, ?it/s]

  -> freeway_0256:   0%|          | 0/125 [00:00<?, ?it/s]

  -> freeway_0257:   0%|          | 0/73 [00:00<?, ?it/s]

  -> freeway_0258:   0%|          | 0/53 [00:00<?, ?it/s]

  -> freeway_0259:   0%|          | 0/127 [00:00<?, ?it/s]

  -> freeway_0260:   0%|          | 0/129 [00:00<?, ?it/s]

  -> freeway_0261:   0%|          | 0/133 [00:00<?, ?it/s]

  -> freeway_0262:   0%|          | 0/65 [00:00<?, ?it/s]

  -> freeway_0263:   0%|          | 0/17 [00:00<?, ?it/s]

  -> freeway_0264:   0%|          | 0/69 [00:00<?, ?it/s]

  -> freeway_0265:   0%|          | 0/111 [00:00<?, ?it/s]

  -> freeway_0266:   0%|          | 0/105 [00:00<?, ?it/s]

  -> freeway_0267:   0%|          | 0/72 [00:00<?, ?it/s]

  -> freeway_0268:   0%|          | 0/26 [00:00<?, ?it/s]

  -> freeway_0269:   0%|          | 0/136 [00:00<?, ?it/s]

  -> freeway_0270:   0%|          | 0/101 [00:00<?, ?it/s]

  -> freeway_0271:   0%|          | 0/44 [00:00<?, ?it/s]

  -> freeway_0272:   0%|          | 0/71 [00:00<?, ?it/s]

  -> freeway_0273:   0%|          | 0/90 [00:00<?, ?it/s]

  -> freeway_0274:   0%|          | 0/67 [00:00<?, ?it/s]

  -> freeway_0275:   0%|          | 0/32 [00:00<?, ?it/s]

  -> freeway_0276:   0%|          | 0/34 [00:00<?, ?it/s]

  -> freeway_0277:   0%|          | 0/7 [00:00<?, ?it/s]

  -> freeway_0278:   0%|          | 0/6 [00:00<?, ?it/s]

  -> freeway_0279:   0%|          | 0/32 [00:00<?, ?it/s]

  -> freeway_0280:   0%|          | 0/122 [00:00<?, ?it/s]

  -> freeway_0281:   0%|          | 0/97 [00:00<?, ?it/s]

  -> freeway_0282:   0%|          | 0/131 [00:00<?, ?it/s]

  -> freeway_0283:   0%|          | 0/106 [00:00<?, ?it/s]

  -> freeway_0284:   0%|          | 0/153 [00:00<?, ?it/s]

  -> freeway_0285:   0%|          | 0/136 [00:00<?, ?it/s]

  -> freeway_0286:   0%|          | 0/122 [00:00<?, ?it/s]

  -> freeway_0287:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0288:   0%|          | 0/98 [00:00<?, ?it/s]

  -> freeway_0289:   0%|          | 0/90 [00:00<?, ?it/s]

  -> freeway_0290:   0%|          | 0/20 [00:00<?, ?it/s]

  -> freeway_0291:   0%|          | 0/63 [00:00<?, ?it/s]

  -> freeway_0292:   0%|          | 0/76 [00:00<?, ?it/s]

  -> freeway_0293:   0%|          | 0/51 [00:00<?, ?it/s]

  -> freeway_0294:   0%|          | 0/132 [00:00<?, ?it/s]

  -> freeway_0295:   0%|          | 0/15 [00:00<?, ?it/s]

  -> freeway_0296:   0%|          | 0/33 [00:00<?, ?it/s]

  -> freeway_0297:   0%|          | 0/110 [00:00<?, ?it/s]

  -> freeway_0298:   0%|          | 0/119 [00:00<?, ?it/s]

  -> freeway_0299:   0%|          | 0/70 [00:00<?, ?it/s]

  -> freeway_0300:   0%|          | 0/104 [00:00<?, ?it/s]

  -> freeway_0301:   0%|          | 0/121 [00:00<?, ?it/s]


--- freeway (test set) 處理完成！結果已儲存至: ./freeway_test_with_captions.csv ---

*** 所有任務執行完畢！ ***
