# V-CoT: Visual Chain-of-Thought Training

**Grounding Reasoning in Vision-Language Models**

This notebook provides an end-to-end pipeline for:
1. **Setup**: Install dependencies and mount Google Drive
2. **Data Generation**: Distill reasoning traces from GPT-4o
3. **Training**: Fine-tune Qwen2.5-VL with QLoRA using Unsloth
4. **Evaluation**: Measure IoU and accuracy metrics
5. **Demo**: Interactive Gradio interface

**Requirements:**
- Google Colab with GPU (T4/A100 recommended)
- OpenAI API key (for data generation)
- HuggingFace token (optional, for gated models)

---
## 1. Setup & Configuration

In [1]:
#@title 1.1 Configuration { display-mode: "form" }
#@markdown ### Project Settings
PROJECT_NAME = "V-CoT"  #@param {type:"string"}
SAVE_TO_DRIVE = True  #@param {type:"boolean"}

#@markdown ### API Keys (stored securely)
USE_COLAB_SECRETS = True  #@param {type:"boolean"}
#@markdown If False, you'll be prompted to enter keys manually

#@markdown ### Training Settings
MAX_STEPS = 600  #@param {type:"integer"}
BATCH_SIZE = 4  #@param {type:"integer"}
LEARNING_RATE = 2e-4  #@param {type:"number"}
LORA_RANK = 16  #@param {type:"integer"}

#@markdown ### Data Generation Settings
MAX_SAMPLES = 10332  #@param {type:"integer"}
GENERATE_NEW_DATA = False  #@param {type:"boolean"}

#@markdown ### Resume Training
RESUME_FROM_CHECKPOINT = True  #@param {type:"boolean"}
#@markdown Automatically resume from last checkpoint if available

import os

# Two possible storage roots: the mounted Drive folder (persistent) and the
# runtime's local disk (ephemeral).
DRIVE_BASE = f"/content/drive/MyDrive/Colab Notebooks/{PROJECT_NAME}"
LOCAL_BASE = f"/content/{PROJECT_NAME}"

# Pick the checkpoint / data / log locations according to SAVE_TO_DRIVE.
# Note the local layout differs from the Drive layout (outputs/checkpoints,
# data/processed) rather than simply swapping the root.
if SAVE_TO_DRIVE:
    CHECKPOINT_DIR = f"{DRIVE_BASE}/checkpoints"
    DATA_DIR = f"{DRIVE_BASE}/data"
    LOG_DIR = f"{DRIVE_BASE}/logs"
else:
    CHECKPOINT_DIR = f"{LOCAL_BASE}/outputs/checkpoints"
    DATA_DIR = f"{LOCAL_BASE}/data/processed"
    LOG_DIR = f"{LOCAL_BASE}/logs"

# Echo the effective configuration so it is visible in the cell output.
print(f"Project: {PROJECT_NAME}")
print(f"Checkpoint directory: {CHECKPOINT_DIR}")
print(f"Data directory: {DATA_DIR}")
print(f"Resume from checkpoint: {RESUME_FROM_CHECKPOINT}")

Project: V-CoT
Checkpoint directory: /content/drive/MyDrive/Colab Notebooks/V-CoT/checkpoints
Data directory: /content/drive/MyDrive/Colab Notebooks/V-CoT/data
Resume from checkpoint: True


In [2]:
#@title 1.2 Mount Google Drive { display-mode: "form" }
# Mounts Drive when SAVE_TO_DRIVE is set, makes sure the checkpoint / data / log
# directories exist in either storage mode, and lists any checkpoints already present.
import os


def _checkpoint_step(name):
    """Sort key: numeric step for 'checkpoint-<N>' names, any other suffix last."""
    suffix = name[len('checkpoint-'):]
    return (0, int(suffix), name) if suffix.isdigit() else (1, 0, name)


if SAVE_TO_DRIVE:
    # Imported lazily so that local-only runs do not require google.colab.
    from google.colab import drive

    drive.mount('/content/drive')
else:
    print("Saving to local storage (will be lost when runtime disconnects)")

# Create project directories (Drive or local, depending on SAVE_TO_DRIVE).
# Previously this only happened in Drive mode, leaving local paths missing.
for project_dir in (CHECKPOINT_DIR, DATA_DIR, LOG_DIR):
    os.makedirs(project_dir, exist_ok=True)

if SAVE_TO_DRIVE:
    print(f"\n‚úì Google Drive mounted")
    print(f"‚úì Created directories in: {DRIVE_BASE}")

# List existing checkpoints, ordered by training step rather than lexicographically
# (so checkpoint-200 is shown before checkpoint-1000).
checkpoints = sorted(
    (d for d in os.listdir(CHECKPOINT_DIR) if d.startswith('checkpoint-')),
    key=_checkpoint_step,
)
if checkpoints:
    print(f"\nExisting checkpoints found:")
    for cp in checkpoints:
        print(f"  - {cp}")
else:
    print("\nNo existing checkpoints found.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

‚úì Google Drive mounted
‚úì Created directories in: /content/drive/MyDrive/Colab Notebooks/V-CoT

No existing checkpoints found.


In [3]:
#@title 1.3 Setup API Keys { display-mode: "form" }
# Populates os.environ with the API keys used later in the notebook, either from
# Colab's secret store or by prompting the user.
import os

# (environment variable / secret name, is_optional). Only the OpenAI key is
# reported without the "(optional)" suffix when it is missing.
_SECRET_NAMES = [
    ('OPENAI_API_KEY', False),
    ('HF_TOKEN', True),
    ('DEEPINFRA_API_KEY', True),
    ('OPENROUTER_API_KEY', True),
]

if USE_COLAB_SECRETS:
    try:
        from google.colab import userdata
    except ImportError:
        userdata = None
        print("Not running in Colab, skipping secrets")

    if userdata is not None:
        for secret_name, is_optional in _SECRET_NAMES:
            # A missing or inaccessible secret is best-effort: report it and move on.
            # `except Exception` (rather than a bare `except:`) keeps Ctrl-C and
            # SystemExit working while the secrets are being fetched.
            try:
                secret_value = userdata.get(secret_name)
            except Exception:
                secret_value = None

            if secret_value:
                os.environ[secret_name] = secret_value
                print(f"‚úì {secret_name} loaded from Colab secrets")
            else:
                suffix = " (optional)" if is_optional else ""
                print(f"‚ö† {secret_name} not found in secrets{suffix}")
else:
    # Manual input
    from getpass import getpass

    if not os.environ.get('OPENAI_API_KEY'):
        os.environ['OPENAI_API_KEY'] = getpass('Enter OpenAI API Key: ')

    if not os.environ.get('HF_TOKEN'):
        hf_token = getpass('Enter HuggingFace Token (press Enter to skip): ')
        if hf_token:
            os.environ['HF_TOKEN'] = hf_token

‚úì OPENAI_API_KEY loaded from Colab secrets
‚úì HF_TOKEN loaded from Colab secrets
‚úì DEEPINFRA_API_KEY loaded from Colab secrets
‚úì OPENROUTER_API_KEY loaded from Colab secrets


In [4]:
#@title 1.4 Install Dependencies { display-mode: "form" }
# First, fix NumPy version (must be done before other installs)
!pip uninstall numpy -y
!pip install "numpy<2.0.0"

# Install Unsloth (optimized for Colab)
!pip install --no-deps trl peft accelerate bitsandbytes
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install other dependencies
!pip install transformers>=4.45.0 datasets openai gradio opencv-python pyyaml tqdm scipy wandb

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy<2.0.0
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m18.0/18.0 MB[0m [31m131.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytensor 2.35.1 requires numpy>=2.0, but you 

Collecting trl
  Downloading trl-0.26.1-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading trl-0.26.1-py3-none-any.whl (517 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m517.4/517.4 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.1/59.1 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl, bitsandbytes
Successfully installed bitsandbytes-0.49.0 trl-0.26.1
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai

In [4]:
# tenacity supplies the retry/backoff helpers (retry, stop_after_attempt,
# wait_exponential, ...) imported by the GPT-4o data-generation cell, which is
# currently commented out.
!pip install tenacity



In [4]:
#@title 1.5 Verify GPU & Installation { display-mode: "form" }
# Reports the PyTorch build and GPU (if any), then checks that Unsloth imports.
import torch

cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")

if not cuda_ok:
    print("\n‚ö†Ô∏è WARNING: No GPU detected! Training will be extremely slow.")
    print("Go to Runtime > Change runtime type > Select GPU")
else:
    # Device 0 is the only device inspected.
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {total_bytes / 1e9:.1f} GB")

# Verify Unsloth
try:
    from unsloth import FastVisionModel
except ImportError as import_error:
    print(f"\n‚úó Unsloth installation failed: {import_error}")
else:
    print("\n‚úì Unsloth installed successfully")

PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: NVIDIA L4
GPU Memory: 23.8 GB
ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!

‚úì Unsloth installed successfully


## 2. Data Preparation (VisCOT Dataset)

Download and convert the [VisCOT dataset](https://huggingface.co/datasets/deepcs233/Visual-CoT) which contains **438K samples with real bounding boxes**.

In [None]:
#@title 2.1 Download & Convert VisCOT Dataset { display-mode: "form" }
import os
import json
import requests
from pathlib import Path
from tqdm import tqdm
from datasets import load_dataset
from PIL import Image
from io import BytesIO
import hashlib

#@markdown ### Dataset Settings
# NOTE: this rebinds MAX_SAMPLES, which the configuration cell (1.1) also defines
# with a different value; once this cell has run, every later cell sees this value.
MAX_SAMPLES = 50000  #@param {type:"integer"}
#@markdown Maximum samples to convert (VisCOT has 438K total)

# Read as a module-level flag by format_response(): when True, a sample's 'thought'
# text is used as the response body in preference to its answer text.
INCLUDE_THOUGHT = True  #@param {type:"boolean"}
#@markdown Include chain-of-thought reasoning in responses

# Selects the box markup produced by normalize_bbox().
BBOX_FORMAT = "vcot"  #@param ["vcot", "qwen_native"]
#@markdown vcot: <ref>obj</ref><box>[x1,y1,x2,y2]</box>
#@markdown qwen_native: <ref>obj</ref><|box_start|>(y1,x1),(y2,x2)<|box_end|>

# Output paths
# Both directories live under DATA_DIR (set in the configuration cell); images_dir
# is also read as a global by convert_viscot_to_vcot().
output_dir = Path(DATA_DIR)
images_dir = output_dir / "images"
output_dir.mkdir(parents=True, exist_ok=True)
images_dir.mkdir(parents=True, exist_ok=True)

print(f"Output directory: {output_dir}")
print(f"Images directory: {images_dir}")

def normalize_bbox(bbox, width, height, format_type="vcot"):
    """Scale a pixel-space box to the 0-1000 range and render it as markup.

    Args:
        bbox: Sequence whose first four items are read as x1, y1, x2, y2 in the
            same units as ``width`` / ``height``.
        width: Image width used to scale the x coordinates; must be positive.
        height: Image height used to scale the y coordinates; must be positive.
        format_type: ``"vcot"`` renders ``<box>[x1, y1, x2, y2]</box>``; any other
            value renders ``<|box_start|>(y1,x1),(y2,x2)<|box_end|>``.

    Returns:
        The formatted box string, with every coordinate truncated to an int and
        clamped to [0, 1000].

    Raises:
        ValueError: If ``width`` or ``height`` is not positive.

    NOTE(review): the non-"vcot" branch writes (y, x) pairs; confirm this is the
    coordinate order the target model's grounding format expects.
    """
    if width <= 0 or height <= 0:
        raise ValueError(f"Image dimensions must be positive, got {width}x{height}")

    def _scale(value, extent):
        # Clamp on BOTH sides: a box that overshoots the image must not emit
        # x1/y1 > 1000 or x2/y2 < 0.
        return max(0, min(1000, int(value / extent * 1000)))

    x1 = _scale(bbox[0], width)
    y1 = _scale(bbox[1], height)
    x2 = _scale(bbox[2], width)
    y2 = _scale(bbox[3], height)

    if format_type == "vcot":
        return f"<box>[{x1}, {y1}, {x2}, {y2}]</box>"
    else:  # qwen_native
        return f"<|box_start|>({y1},{x1}),({y2},{x2})<|box_end|>"

def format_response(sample, bbox_format="vcot"):
    """Build the assistant response text for one sample, with grounded boxes.

    The response body is the sample's 'thought' when present and the module-level
    INCLUDE_THOUGHT flag is set; otherwise 'full_answer' (falling back to
    'answer'). If the sample has boxes, a sentence referencing the first box is
    appended, followed by up to three more boxes labelled "region 2".."region 4".

    Args:
        sample: Mapping read via ``.get`` for the keys 'thought', 'full_answer',
            'answer', 'bboxs', 'width' and 'height'.
        bbox_format: Passed through to ``normalize_bbox`` as its format type.

    Returns:
        The response string (all parts concatenated without a separator).
    """
    thought = sample.get('thought', '')
    full_answer = sample.get('full_answer', sample.get('answer', ''))
    bboxs = sample.get('bboxs', [])
    # Missing dimensions fall back to 1, as before.
    width = sample.get('width', 1)
    height = sample.get('height', 1)

    # Build response
    parts = []

    # Add thought/reasoning if available
    if thought and INCLUDE_THOUGHT:
        parts.append(thought)
    elif full_answer:
        parts.append(full_answer)

    # Add grounded bounding boxes
    if bboxs:
        # Get the main bbox (first one)
        bbox_str = normalize_bbox(bboxs[0], width, height, bbox_format)

        # Use the answer as the referenced object name when it is short enough.
        # A missing, None, or blank answer falls back to a generic label instead
        # of producing an empty <ref></ref> (or crashing on len(None)).
        raw_answer = sample.get('answer')
        answer = '' if raw_answer is None else str(raw_answer)
        if not answer.strip():
            obj_name = 'relevant region'
        elif len(answer) < 30:
            obj_name = answer
        else:
            obj_name = 'key region'

        grounding = f"\n\nThe <ref>{obj_name}</ref>{bbox_str} is highlighted in the image."
        parts.append(grounding)

        # Add additional boxes if present
        for i, bbox in enumerate(bboxs[1:4], 2):  # Max 4 boxes
            bbox_str = normalize_bbox(bbox, width, height, bbox_format)
            parts.append(f"<ref>region {i}</ref>{bbox_str}")

    return "".join(parts)

def download_image(image_source, images_dir, sample_id):
    """Materialise an image as ``<images_dir>/<sample_id>.jpg`` and return its path.

    Args:
        image_source: One of: an object exposing ``save`` (treated as a PIL image),
            an ``http...`` URL string, or raw image ``bytes``. Any other value is
            unsupported.
        images_dir: ``pathlib.Path`` of the directory to write into.
        sample_id: Identifier used as the file stem.

    Returns:
        The local path as a string, or ``None`` if the source type is unsupported
        or loading/saving failed. If the target file already exists it is reused
        without touching ``image_source``.
    """
    # Generate unique filename
    filename = f"{sample_id}.jpg"
    local_path = images_dir / filename

    if local_path.exists():
        return str(local_path)

    try:
        if hasattr(image_source, 'save'):
            # Already an in-memory image.
            img = image_source
        elif isinstance(image_source, str) and image_source.startswith('http'):
            response = requests.get(image_source, timeout=10)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
        elif isinstance(image_source, bytes):
            img = Image.open(BytesIO(image_source))
        else:
            return None

        # Single save path for every source type: JPEG needs RGB.
        img.convert('RGB').save(local_path, 'JPEG', quality=85)
        return str(local_path)

    except Exception as e:
        print(f"Failed to save image {sample_id}: {e}")
        # Remove any partially written file; otherwise the exists() shortcut above
        # would hand back a corrupt image on the next run.
        try:
            local_path.unlink(missing_ok=True)
        except OSError:
            pass  # best-effort cleanup only
        return None

def convert_viscot_to_vcot(dataset, max_samples, bbox_format):
    """Turn VisCOT samples into chat-style V-CoT training records.

    Only the first ``max_samples`` entries of ``dataset`` are examined; entries
    that lack boxes, a question or an image, whose image cannot be stored, or
    whose formatted response is shorter than 20 characters are counted as
    skipped. Images are written to the module-level ``images_dir``.

    Args:
        dataset: Sized iterable of mapping-like samples.
        max_samples: Number of leading dataset entries to examine.
        bbox_format: Box markup style forwarded to ``format_response``.

    Returns:
        List of records, each with a ``messages`` list (user turn with image and
        question, assistant turn with the response) and a ``metadata`` dict.
    """
    records = []
    n_skipped = 0
    progress = tqdm(dataset, total=min(len(dataset), max_samples), desc="Converting")

    for idx, sample in enumerate(progress):
        if idx >= max_samples:
            break

        # Required inputs: at least one box, a question, and an image.
        bboxs = sample.get('bboxs', [])
        if (not bboxs or len(bboxs) == 0
                or not sample.get('question') or not sample.get('image')):
            n_skipped += 1
            continue

        # Short, stable id derived from the question text and dataset position.
        id_seed = f"{sample['question']}_{idx}".encode()
        sample_id = hashlib.md5(id_seed).hexdigest()[:12]

        # Persist the image; without a file on disk the record is unusable.
        image_path = download_image(sample['image'], images_dir, sample_id)
        if not image_path:
            n_skipped += 1
            continue

        # Assistant text with grounded boxes; very short responses are dropped.
        response = format_response(sample, bbox_format)
        if len(response) < 20:
            n_skipped += 1
            continue

        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": sample['question']}
            ]
        }
        assistant_turn = {
            "role": "assistant",
            "content": response
        }
        metadata = {
            "source": "viscot",
            "dataset": sample.get('dataset', 'unknown'),
            "answer": sample.get('answer', '')
        }
        records.append({"messages": [user_turn, assistant_turn], "metadata": metadata})

    print(f"\nConverted: {len(records)} samples")
    print(f"Skipped: {n_skipped} samples (no bbox or invalid)")

    return records

# Load VisCOT dataset
print("Loading VisCOT dataset from HuggingFace...")
print("This may take a few minutes on first run...\n")

try:
    # Try loading the main dataset
    dataset = load_dataset("deepcs233/Visual-CoT", split="train")
    print(f"Loaded {len(dataset)} samples from VisCOT")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("\nTrying alternative loading method...")
    # Try loading specific file
    dataset = load_dataset("deepcs233/Visual-CoT", data_files="viscot_363k.json", split="train")
    print(f"Loaded {len(dataset)} samples")

# Show sample
print("\n--- Sample from VisCOT ---")
sample = dataset[0]
print(f"Question: {sample.get('question', 'N/A')}")
print(f"Answer: {sample.get('answer', 'N/A')}")
print(f"BBoxes: {sample.get('bboxs', 'N/A')}")
print(f"Has thought: {'thought' in sample and bool(sample['thought'])}")

# Convert dataset
print(f"\n--- Converting {MAX_SAMPLES} samples ---")
converted_data = convert_viscot_to_vcot(dataset, MAX_SAMPLES, BBOX_FORMAT)

# Split into train/val
# Shuffle a copy with a fixed seed before splitting: the converted list keeps the
# dataset's original ordering, so taking the first 10% unshuffled would make the
# validation set a non-representative slice. The seed keeps re-runs reproducible.
import random

SPLIT_SEED = 42
shuffled_data = list(converted_data)
random.Random(SPLIT_SEED).shuffle(shuffled_data)

val_size = int(len(shuffled_data) * 0.1)
val_data = shuffled_data[:val_size]
train_data = shuffled_data[val_size:]

# Save
train_path = output_dir / "train.jsonl"
val_path = output_dir / "val.jsonl"


def _write_jsonl(path, records):
    """Write one JSON document per line to ``path``, replacing any existing file."""
    with open(path, 'w') as f:
        for item in records:
            f.write(json.dumps(item) + "\n")


_write_jsonl(train_path, train_data)
_write_jsonl(val_path, val_data)

print(f"\n‚úì Saved {len(train_data)} training samples to {train_path}")
print(f"‚úì Saved {len(val_data)} validation samples to {val_path}")
print(f"‚úì Images saved to {images_dir}")

# Verify bbox format
print("\n--- Verification ---")
with open(train_path, 'r') as f:
    first_line = f.readline()

if first_line.strip():
    sample = json.loads(first_line)
    response = sample['messages'][1]['content']
    has_box = '<box>' in response or '<|box_start|>' in response
    print(f"Sample response (first 300 chars):")
    print(response[:300] + "...")
    print(f"\n‚úì Contains bounding box: {has_box}")
else:
    # Nothing was converted; report it instead of failing inside json.loads('').
    print("WARNING: train.jsonl is empty - no samples were converted; check the conversion output above.")

In [None]:
#@title 2.2 Analyze Converted Data { display-mode: "form" }
import json
import re

def analyze_converted_data(file_path):
    """Count grounding markers in a converted JSONL dataset.

    Each non-blank line is parsed as JSON and the second message's content is
    inspected for ``<ref>``, ``<box>[`` and ``<|box_start|>`` substrings.

    Args:
        file_path: Path of the JSONL file to scan.

    Returns:
        Dict with the keys 'total', 'has_vcot_box', 'has_qwen_box', 'has_ref'
        (sample counts) and 'avg_response_len' (mean response length in
        characters, or 0 when the file holds no samples).
    """
    # Counter name -> substring whose presence increments it.
    markers = (
        ('has_ref', '<ref>'),
        ('has_vcot_box', '<box>['),
        ('has_qwen_box', '<|box_start|>'),
    )

    stats = {
        'total': 0,
        'has_vcot_box': 0,
        'has_qwen_box': 0,
        'has_ref': 0,
        'avg_response_len': 0
    }
    total_chars = 0

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            if raw_line.strip():
                stats['total'] += 1
                record = json.loads(raw_line)
                reply = record['messages'][1]['content']
                total_chars += len(reply)

                for counter, needle in markers:
                    if needle in reply:
                        stats[counter] += 1

    if stats['total']:
        stats['avg_response_len'] = total_chars / stats['total']
    else:
        stats['avg_response_len'] = 0

    return stats

train_stats = analyze_converted_data(f"{DATA_DIR}/train.jsonl")
val_stats = analyze_converted_data(f"{DATA_DIR}/val.jsonl")


def _count_with_share(stats, key):
    """Render a counter as 'count (xx.x%)' relative to the split's total."""
    return f"{stats[key]} ({100*stats[key]/max(1,stats['total']):.1f}%)"


banner = "=" * 50
print(banner)
print("CONVERTED DATA ANALYSIS")
print(banner)

# Training split: every counter plus the mean response length.
print(f"\n--- Training Data ---")
print(f"Total samples:        {train_stats['total']}")
print(f"With <ref> tags:      {_count_with_share(train_stats, 'has_ref')}")
print(f"With V-CoT <box>:     {_count_with_share(train_stats, 'has_vcot_box')}")
print(f"With Qwen native box: {_count_with_share(train_stats, 'has_qwen_box')}")
print(f"Avg response length:  {train_stats['avg_response_len']:.0f} chars")

# Validation split: a shorter summary (ref tags and V-CoT boxes only).
print(f"\n--- Validation Data ---")
print(f"Total samples:        {val_stats['total']}")
print(f"With <ref> tags:      {_count_with_share(val_stats, 'has_ref')}")
print(f"With V-CoT <box>:     {_count_with_share(val_stats, 'has_vcot_box')}")

# Show example with bbox: print the first training record whose response has a box.
print("\n--- Example with Bounding Box ---")
with open(f"{DATA_DIR}/train.jsonl", 'r') as f:
    for line in f:
        sample = json.loads(line)
        response = sample['messages'][1]['content']
        contains_box = '<box>' in response or '<|box_start|>' in response
        if not contains_box:
            continue
        print(f"Q: {sample['messages'][0]['content'][1]['text']}")
        print(f"A: {response[:500]}...")
        break

print("\n‚úì Data ready for training!")

---
## 2b. Data Generation (GPT-4o Distillation) — alternative to the VisCOT path above; the cells below are currently commented out

In [15]:
# #@title 2.1 Data Generation Functions (Optimized for Peak Performance) { display-mode: "form" }
# import os
# import json
# import base64
# import asyncio
# import uuid
# import re
# from pathlib import Path
# from typing import List, Dict, Any, Optional
# from tqdm.asyncio import tqdm_asyncio
# from openai import AsyncOpenAI, RateLimitError, APIError
# from datasets import load_dataset
# from io import BytesIO
# from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

# # --- CONFIGURATION ---
# MAX_CONCURRENT = 1
# IMAGE_DETAIL = "auto" # Change to "high" for absolute max performance (costs more)

# SYSTEM_PROMPT = """
# You are an expert Visual Reasoning Assistant. Your goal is to explain the answer to a science question step-by-step.
# CRITICAL RULE: Whenever you mention a physical object in the image that supports your reasoning, you MUST immediately follow it with its bounding box in the format: <ref>object_name</ref><box>[x_min, y_min, x_max, y_max]</box>.
# - Coordinates must be normalized from 0 to 1000.
# - (0,0) is top-left, (1000,1000) is bottom-right.
# - Example: "The <ref>red gear</ref><box>[100, 200, 300, 400]</box> turns clockwise."
# """

# # --- HELPER: QWEN FORMAT CONVERTER ---
# def convert_to_qwen_format(text):
#     """
#     Converts GPT-4o's <box>[x1, y1, x2, y2]</box> to Qwen's native <|box_start|>(y1,x1),(y2,x2)<|box_end|>
#     Includes clamping to ensure [0, 1000] range.
#     """
#     pattern = r"<box>\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]</box>"

#     def replace_func(match):
#         # 1. Parse Integers
#         raw_vals = list(map(int, match.groups()))

#         # 2. CLAMP values to [0, 1000] to prevent token errors
#         x1, y1, x2, y2 = [max(0, min(1000, v)) for v in raw_vals]

#         # 3. Format: Qwen uses (y,x) ordering
#         return f"<|box_start|>({y1},{x1}),({y2},{x2})<|box_end|>"

#     return re.sub(pattern, replace_func, text)

# # --- RETRY LOGIC ---
# @retry(
#     retry=retry_if_exception_type((RateLimitError, APIError)),
#     wait=wait_exponential(multiplier=2, min=4, max=120), # <--- Slower backoff (waits longer between tries)
#     stop=stop_after_attempt(30) # <--- Increased from 6 to 30 to prevent "Permanent Failure"
# )
# async def get_gpt4o_response_safe(client, messages):
#     return await client.chat.completions.create(
#         model="gpt-4o",
#         messages=messages,
#         max_tokens=2048, # Increased to allow full reasoning chains
#         temperature=0.7,
#     )

# async def process_single_image(client, image_bytes, question, answer, semaphore, mime_type="image/png"):
#     async with semaphore:
#         b64 = base64.b64encode(image_bytes).decode('utf-8')
#         user_prompt = f"""Question: {question}\n\nThe correct answer is: {answer}\n\nPlease explain step-by-step how to arrive at this answer by carefully examining the image. Remember to annotate every object you mention with its bounding box."""

#         try:
#             response = await get_gpt4o_response_safe(
#                 client,
#                 messages=[
#                     {"role": "system", "content": SYSTEM_PROMPT},
#                     {"role": "user", "content": [
#                         {"type": "text", "text": user_prompt},
#                         {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{b64}", "detail": IMAGE_DETAIL}}
#                     ]}
#                 ]
#             )
#             return response.choices[0].message.content
#         except Exception as e:
#             print(f"Permanent Failure on item: {e}")
#             return None

# def load_scienceqa_samples(max_samples=500):
#     print("Loading ScienceQA dataset...")
#     dataset = load_dataset("derek-thomas/ScienceQA", split="train")

#     samples = []
#     for item in dataset:
#         if item.get("image") is not None:
#             choices = item.get("choices", [])
#             answer_idx = item.get("answer", 0)
#             answer_text = choices[answer_idx] if answer_idx < len(choices) else ""

#             img_buffer = BytesIO()
#             item["image"].convert("RGB").save(img_buffer, format="PNG")

#             samples.append({
#                 "image_bytes": img_buffer.getvalue(),
#                 "question": item.get("question", ""),
#                 "answer": answer_text,
#                 "subject": item.get("subject", ""),
#             })

#             if len(samples) >= max_samples:
#                 break

#     print(f"Loaded {len(samples)} samples with images")
#     return samples

# async def generate_training_data(output_dir, max_samples=500, val_split=0.1):
#     api_key = os.environ.get("OPENAI_API_KEY")
#     if not api_key:
#         raise ValueError("OPENAI_API_KEY not set!")

#     client = AsyncOpenAI(api_key=api_key)
#     semaphore = asyncio.Semaphore(MAX_CONCURRENT)

#     output_path = Path(output_dir)
#     images_dir = output_path / "images"
#     images_dir.mkdir(parents=True, exist_ok=True)

#     samples = load_scienceqa_samples(max_samples)

#     print(f"\nGenerating reasoning traces for {len(samples)} samples...")
#     print(f"Concurrency Limit: {MAX_CONCURRENT}")

#     tasks = [process_single_image(client, s["image_bytes"], s["question"], s["answer"], semaphore) for s in samples]
#     results = await tqdm_asyncio.gather(*tasks, desc="Processing")

#     processed = []

#     for sample, response in zip(samples, results):
#         if response:
#             qwen_formatted_response = convert_to_qwen_format(response)

#             image_filename = f"{uuid.uuid4()}.png"
#             image_path = images_dir / image_filename

#             with open(image_path, "wb") as f:
#                 f.write(sample["image_bytes"])

#             processed.append({
#                 "messages": [
#                     {
#                         "role": "user",
#                         "content": [
#                             {"type": "image", "image": str(image_path)},
#                             {"type": "text", "text": sample["question"]}
#                         ]
#                     },
#                     {"role": "assistant", "content": qwen_formatted_response}
#                 ],
#                 "metadata": {
#                     "source": "scienceqa",
#                     "answer": sample["answer"]
#                 }
#             })

#     val_size = int(len(processed) * val_split)
#     train_data, val_data = processed[val_size:], processed[:val_size]

#     with open(output_path / "train.jsonl", "w") as f:
#         for item in train_data:
#             f.write(json.dumps(item) + "\n")

#     with open(output_path / "val.jsonl", "w") as f:
#         for item in val_data:
#             f.write(json.dumps(item) + "\n")

#     print(f"\n‚úì Generated {len(train_data)} training samples (Qwen Native Format)")
#     print(f"‚úì Generated {len(val_data)} validation samples")
#     print(f"‚úì Images saved to {images_dir}")

#     return train_data, val_data

In [14]:
# #@title 2.2 Generate or Load Training Data { display-mode: "form" }
# import shutil

# train_file = f"{DATA_DIR}/train.jsonl"
# val_file = f"{DATA_DIR}/val.jsonl"

# if GENERATE_NEW_DATA or not os.path.exists(train_file):
#     if not os.environ.get("OPENAI_API_KEY"):
#         print("‚ö†Ô∏è OPENAI_API_KEY not set. Using dummy sample data instead.")

#         # 1. Create Data Directory
#         os.makedirs(DATA_DIR, exist_ok=True)
#         images_dir = Path(DATA_DIR) / "images"
#         images_dir.mkdir(exist_ok=True)

#         # 2. Create a Dummy Image (Black 100x100 pixel square)
#         # We need a real file on disk so the model doesn't crash trying to load "path/to/image"
#         from PIL import Image
#         dummy_image_path = images_dir / "dummy.png"
#         Image.new('RGB', (100, 100), color='black').save(dummy_image_path)

#         # 3. Create Sample Data with NATIVE QWEN FORMAT
#         # Note: We use <|box_start|>(y,x),(y,x)<|box_end|> directly here
#         sample_data = [
#             {
#                 "messages": [
#                     {
#                         "role": "user",
#                         "content": [
#                             {"type": "image", "image": str(dummy_image_path)},
#                             {"type": "text", "text": "Explain what you see."}
#                         ]
#                     },
#                     {
#                         "role": "assistant",
#                         "content": "I can see the <ref>main object</ref><|box_start|>(150,100),(350,400)<|box_end|> in the center."
#                     }
#                 ],
#                 "metadata": {"source": "dummy"}
#             },
#             {
#                 "messages": [
#                     {
#                         "role": "user",
#                         "content": [
#                             {"type": "image", "image": str(dummy_image_path)},
#                             {"type": "text", "text": "Describe this diagram."}
#                         ]
#                     },
#                     {
#                         "role": "assistant",
#                         "content": "The <ref>component A</ref><|box_start|>(100,50),(400,300)<|box_end|> connects to <ref>component B</ref><|box_start|>(150,350),(450,600)<|box_end|>."
#                     }
#                 ],
#                 "metadata": {"source": "dummy"}
#             },
#         ]

#         with open(train_file, "w") as f:
#             for item in sample_data:
#                 f.write(json.dumps(item) + "\n")

#         with open(val_file, "w") as f:
#             f.write(json.dumps(sample_data[0]) + "\n")

#         print(f"‚úì Created DUMMY data at {DATA_DIR} (Used for testing pipeline only)")
#     else:
#         print("Generating training data with GPT-4o...")
#         # Await the async function we defined in the previous cell
#         await generate_training_data(DATA_DIR, max_samples=MAX_SAMPLES)
# else:
#     # Count existing samples
#     with open(train_file) as f:
#         train_count = sum(1 for _ in f)
#     with open(val_file) as f:
#         val_count = sum(1 for _ in f)
#     print(f"‚úì Using existing data: {train_count} train, {val_count} val samples")

Generating training data with GPT-4o...
Loading ScienceQA dataset...
Loaded 6218 samples with images

Generating reasoning traces for 6218 samples...
Concurrency Limit: 3


Processing:   0%|          | 8/6218 [00:35<11:39:44,  6.76s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29497, Requested 960. Please try again in 914ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29582, Requested 974. Please try again in 1.112s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29288, Requested 979. Please try again in 534ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 

Processing:   0%|          | 9/6218 [00:43<11:59:50,  6.96s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 979. Please try again in 1.958s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 983. Please try again in 1.966s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 968. Please try again in 1.936s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type'

Processing:   0%|          | 11/6218 [00:56<11:00:09,  6.38s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 992. Please try again in 1.984s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29123, Requested 991. Please try again in 228ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29870, Requested 976. Please try again in 1.692s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type':

Processing:   0%|          | 12/6218 [01:06<12:37:13,  7.32s/it]

Permanent Failure on item: RetryError[<Future at 0x7e124774bbf0 state=finished raised RateLimitError>]
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 968. Please try again in 1.936s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 977. Please try again in 1.954s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29840, Requested 979. Pl

Processing:   0%|          | 13/6218 [01:16<14:13:11,  8.25s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29252, Requested 979. Please try again in 462ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29229, Requested 974. Please try again in 406ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29129, Requested 980. Please try again in 218ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': '

Processing:   0%|          | 14/6218 [01:23<13:38:29,  7.92s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 969. Please try again in 1.938s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29901, Requested 969. Please try again in 1.74s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 970. Please try again in 1.94s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 

Processing:   0%|          | 15/6218 [01:51<23:58:51, 13.92s/it]

Permanent Failure on item: RetryError[<Future at 0x7e12477f5f40 state=finished raised RateLimitError>]
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 960. Please try again in 1.92s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:   0%|          | 16/6218 [01:56<19:37:15, 11.39s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29042, Requested 964. Please try again in 12ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29759, Requested 968. Please try again in 1.454s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 972. Please try again in 1.944s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 

Processing:   0%|          | 17/6218 [02:06<18:26:05, 10.70s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29940, Requested 980. Please try again in 1.84s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:   0%|          | 18/6218 [02:11<15:34:55,  9.05s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 964. Please try again in 1.928s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Permanent Failure on item: RetryError[<Future at 0x7e1247973770 state=finished raised RateLimitError>]
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 982. Please try again in 1.964s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 964. Pl

Processing:   0%|          | 19/6218 [02:25<18:04:23, 10.50s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29336, Requested 959. Please try again in 590ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29265, Requested 969. Please try again in 468ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:   0%|          | 20/6218 [02:29<15:05:12,  8.76s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 978. Please try again in 1.955s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 979. Please try again in 1.958s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 959. Please try again in 1.917s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type'

Processing:   0%|          | 22/6218 [02:53<16:24:35,  9.53s/it]

Permanent Failure on item: RetryError[<Future at 0x7e1247ad6240 state=finished raised RateLimitError>]
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 961. Please try again in 1.921s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29922, Requested 961. Please try again in 1.766s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29825, Requested 961. Pl

Processing:   0%|          | 23/6218 [03:14<22:32:48, 13.10s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29776, Requested 958. Please try again in 1.468s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 968. Please try again in 1.936s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 957. Please try again in 1.913s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type'

Processing:   0%|          | 24/6218 [03:26<22:00:56, 12.80s/it]

Permanent Failure on item: RetryError[<Future at 0x7e126944deb0 state=finished raised RateLimitError>]
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29674, Requested 979. Please try again in 1.306s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29829, Requested 971. Please try again in 1.6s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:   0%|          | 25/6218 [03:31<17:55:28, 10.42s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 959. Please try again in 1.917s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 961. Please try again in 1.921s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:   0%|          | 27/6218 [03:42<12:55:09,  7.51s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 960. Please try again in 1.92s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 966. Please try again in 1.932s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 958. Please try again in 1.916s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type':

Processing:   0%|          | 28/6218 [03:53<14:31:35,  8.45s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29869, Requested 974. Please try again in 1.686s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:   0%|          | 29/6218 [04:00<14:01:35,  8.16s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 979. Please try again in 1.958s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 30000, Requested 969. Please try again in 1.938s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:   1%|          | 32/6218 [04:14<10:19:02,  6.00s/it]

Failed: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-h9ycPT2oNi86uRIKtDpdsirL on tokens per min (TPM): Limit 30000, Used 29890, Requested 969. Please try again in 1.718s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:   1%|          | 32/6218 [04:16<13:46:13,  8.01s/it]


CancelledError: 

In [7]:
# # Batch API
# #@title 2.1 Batch Submission (Uses your DATA_DIR)
# import os
# import json
# import base64
# import uuid
# from pathlib import Path
# from io import BytesIO
# from datasets import load_dataset
# from openai import OpenAI

# # --- CONFIGURATION ---
# # We use the DATA_DIR you defined in your setup cell
# # e.g., /content/drive/MyDrive/.../data
# BATCH_SIZE_LIMIT = 500
# IMAGE_DETAIL = "auto"

# SYSTEM_PROMPT = """
# You are an expert Visual Reasoning Assistant. Your goal is to explain the answer to a science question step-by-step.
# CRITICAL RULE: Whenever you mention a physical object in the image that supports your reasoning, you MUST immediately follow it with its bounding box in the format: <ref>object_name</ref><box>[x_min, y_min, x_max, y_max]</box>.
# - Coordinates must be normalized from 0 to 1000.
# - (0,0) is top-left, (1000,1000) is bottom-right.
# - Example: "The <ref>red gear</ref><box>[100, 200, 300, 400]</box> turns clockwise."
# """

# def prepare_and_submit_batches():
#     api_key = os.environ.get("OPENAI_API_KEY")
#     if not api_key:
#         raise ValueError("OPENAI_API_KEY not set!")

#     client = OpenAI(api_key=api_key)

#     # 1. Setup Directories inside your existing DATA_DIR
#     data_path = Path(DATA_DIR)
#     images_dir = data_path / "images"
#     request_dir = data_path / "requests"

#     # Create subdirectories
#     images_dir.mkdir(parents=True, exist_ok=True)
#     request_dir.mkdir(parents=True, exist_ok=True)

#     print(f"✓ Using Drive path: {data_path}")
#     print("Loading ScienceQA dataset...")
#     dataset = load_dataset("derek-thomas/ScienceQA", split="train")

#     current_batch = []
#     batch_index = 0
#     submitted_batches = []

#     print("Processing images and preparing batch files...")

#     for i, item in enumerate(dataset):
#         # Skip items without images
#         if item.get("image") is None:
#             continue

#         question = item.get("question", "")
#         choices = item.get("choices", [])
#         answer_idx = item.get("answer", 0)
#         answer_text = choices[answer_idx] if answer_idx < len(choices) else ""

#         # Save Image to Drive (DATA_DIR/images)
#         unique_id = str(uuid.uuid4())
#         image_filename = f"{unique_id}.png"
#         image_path = images_dir / image_filename

#         # Check if exists to speed up re-runs
#         if not image_path.exists():
#             img_buffer = BytesIO()
#             item["image"].convert("RGB").save(img_buffer, format="PNG")
#             image_bytes = img_buffer.getvalue()
#             with open(image_path, "wb") as f:
#                 f.write(image_bytes)
#         else:
#             with open(image_path, "rb") as f:
#                 image_bytes = f.read()

#         b64_image = base64.b64encode(image_bytes).decode('utf-8')

#         user_prompt = f"Question: {question}\n\nThe correct answer is: {answer_text}\n\nPlease explain step-by-step how to arrive at this answer by carefully examining the image. Remember to annotate every object you mention with its bounding box."

#         # Build Request
#         request_obj = {
#             "custom_id": f"{unique_id}|{question}|{answer_text}",
#             "method": "POST",
#             "url": "/v1/chat/completions",
#             "body": {
#                 "model": "gpt-4o",
#                 "messages": [
#                     {"role": "system", "content": SYSTEM_PROMPT},
#                     {"role": "user", "content": [
#                         {"type": "text", "text": user_prompt},
#                         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}", "detail": IMAGE_DETAIL}}
#                     ]}
#                 ],
#                 "max_tokens": 2048,
#                 "temperature": 0.7
#             }
#         }
#         current_batch.append(request_obj)

#         # Submit Chunk if full
#         if len(current_batch) >= BATCH_SIZE_LIMIT:
#             batch_id = submit_single_batch(client, current_batch, batch_index, request_dir)
#             submitted_batches.append(batch_id)
#             current_batch = []
#             batch_index += 1

#     # Submit remaining
#     if current_batch:
#         batch_id = submit_single_batch(client, current_batch, batch_index, request_dir)
#         submitted_batches.append(batch_id)

#     # Save Batch IDs to DATA_DIR so we can retrieve them later
#     with open(data_path / "batch_ids.json", "w") as f:
#         json.dump(submitted_batches, f)

#     print(f"\n✓ Successfully submitted {len(submitted_batches)} batches.")
#     print(f"✓ Images and Batch IDs saved to: {data_path}")
#     print("✓ Check status at: https://platform.openai.com/batches")

# def submit_single_batch(client, batch_data, index, output_dir):
#     filename = output_dir / f"batch_{index}.jsonl"
#     with open(filename, "w") as f:
#         for entry in batch_data:
#             f.write(json.dumps(entry) + "\n")

#     print(f"Uploading batch file {index} ({len(batch_data)} items)...")
#     batch_input_file = client.files.create(
#         file=open(filename, "rb"),
#         purpose="batch"
#     )

#     batch_job = client.batches.create(
#         input_file_id=batch_input_file.id,
#         endpoint="/v1/chat/completions",
#         completion_window="24h",
#         metadata={"description": f"scienceqa_training_chunk_{index}"}
#     )
#     return batch_job.id

# prepare_and_submit_batches()

✓ Using Drive path: /content/drive/MyDrive/Colab Notebooks/V-CoT/data
Loading ScienceQA dataset...
Processing images and preparing batch files...
Uploading batch file 0 (500 items)...
Uploading batch file 1 (500 items)...
Uploading batch file 2 (500 items)...
Uploading batch file 3 (500 items)...
Uploading batch file 4 (500 items)...
Uploading batch file 5 (500 items)...
Uploading batch file 6 (500 items)...
Uploading batch file 7 (500 items)...
Uploading batch file 8 (500 items)...
Uploading batch file 9 (500 items)...
Uploading batch file 10 (500 items)...
Uploading batch file 11 (500 items)...
Uploading batch file 12 (218 items)...

✓ Successfully submitted 13 batches.
✓ Images and Batch IDs saved to: /content/drive/MyDrive/Colab Notebooks/V-CoT/data
✓ Check status at: https://platform.openai.com/batches


In [None]:
# #@title 2.2 Batch Retrieval & Processing (Phase 2 - Run when batches are done)
# import json
# import re
# import os
# from pathlib import Path
# from openai import OpenAI

# # --- CONFIGURATION ---
# # Assumes DATA_DIR is already defined by your setup cell (e.g., /content/drive/MyDrive/.../data)
# OUTPUT_PATH = Path(DATA_DIR)
# FINAL_TRAIN_FILE = "train.jsonl"
# FINAL_VAL_FILE = "val.jsonl"

# def convert_to_qwen_format(text):
#     """
#     Converts GPT-4o's <box>[x1, y1, x2, y2]</box> to Qwen's native <|box_start|>(y1,x1),(y2,x2)<|box_end|>
#     Includes clamping to ensure [0, 1000] range.
#     """
#     pattern = r"<box>\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]</box>"

#     def replace_func(match):
#         vals = list(map(int, match.groups()))
#         # Clamp values to [0, 1000]
#         x1, y1, x2, y2 = [max(0, min(1000, v)) for v in vals]
#         # Swap for Qwen native format: (y,x) ordering
#         return f"<|box_start|>({y1},{x1}),({y2},{x2})<|box_end|>"

#     return re.sub(pattern, replace_func, text)

# def retrieve_and_process_batches():
#     api_key = os.environ.get("OPENAI_API_KEY")
#     if not api_key:
#         raise ValueError("OPENAI_API_KEY not set!")

#     client = OpenAI(api_key=api_key)

#     # 1. Load Batch IDs from Drive
#     batch_ids_file = OUTPUT_PATH / "batch_ids.json"

#     if not batch_ids_file.exists():
#         print(f"❌ Error: {batch_ids_file} not found.")
#         print("Did you run Phase 1 successfully? Check your Drive folder.")
#         return

#     with open(batch_ids_file, "r") as f:
#         batch_ids = json.load(f)

#     processed_data = []
#     pending_count = 0

#     print(f"Checking status for {len(batch_ids)} batches...")

#     for b_id in batch_ids:
#         try:
#             batch = client.batches.retrieve(b_id)
#         except Exception as e:
#             print(f"  Error retrieving {b_id}: {e}")
#             continue

#         if batch.status == "failed":
#             print(f"  ❌ Batch {b_id} FAILED. Check OpenAI Dashboard for errors.")
#             continue

#         if batch.status != "completed":
#             print(f"  ⏳ Batch {b_id} is {batch.status}...")
#             pending_count += 1
#             continue

#         # Download Results if completed
#         if batch.output_file_id:
#             print(f"  ✓ Downloading results for {b_id}...")
#             content = client.files.content(batch.output_file_id).content.decode('utf-8')

#             for line in content.splitlines():
#                 if not line: continue
#                 res = json.loads(line)

#                 # Extract metadata from custom_id (uuid|question|answer)
#                 custom_id = res['custom_id']
#                 try:
#                     uuid_str, question, answer = custom_id.split("|", 2)
#                 except ValueError:
#                     continue # Skip malformed IDs

#                 # Process Successful Responses
#                 if res['response']['status_code'] == 200:
#                     gpt_content = res['response']['body']['choices'][0]['message']['content']

#                     # --- CONVERT TO QWEN NATIVE FORMAT ---
#                     final_content = convert_to_qwen_format(gpt_content)

#                     # Verify Image Exists in Drive
#                     image_path = OUTPUT_PATH / "images" / f"{uuid_str}.png"

#                     if image_path.exists():
#                         processed_data.append({
#                             "messages": [
#                                 {
#                                     "role": "user",
#                                     "content": [
#                                         {"type": "image", "image": str(image_path)},
#                                         {"type": "text", "text": question}
#                                     ]
#                                 },
#                                 {"role": "assistant", "content": final_content}
#                             ],
#                             "metadata": {"source": "scienceqa", "answer": answer}
#                         })
#                 else:
#                     print(f"  ⚠️ Item failed with code {res['response']['status_code']}")

#     # 2. Save Final Datasets
#     if processed_data:
#         # Simple train/val split (90/10)
#         val_size = max(1, int(len(processed_data) * 0.1))
#         train_data = processed_data[val_size:]
#         val_data = processed_data[:val_size]

#         with open(OUTPUT_PATH / FINAL_TRAIN_FILE, "w") as f:
#             for item in train_data: f.write(json.dumps(item) + "\n")

#         with open(OUTPUT_PATH / FINAL_VAL_FILE, "w") as f:
#             for item in val_data: f.write(json.dumps(item) + "\n")

#         print(f"\n✓ SUCCESS! {len(processed_data)} total samples processed.")
#         print(f"  - Training: {len(train_data)} samples")
#         print(f"  - Validation: {len(val_data)} samples")
#         print(f"  - Saved to: {OUTPUT_PATH}/{FINAL_TRAIN_FILE}")
#     else:
#         if pending_count > 0:
#             print(f"\n⏳ No data saved yet. {pending_count} batches are still processing.")
#             print("Please run this cell again later (batches typically take 12-24 hours).")
#         else:
#             print("\n❌ No valid data found in completed batches.")

# # Run the function
# retrieve_and_process_batches()

In [7]:
#@title 2.1 Data Generation (OpenRouter / Qwen2.5-VL-72B) { display-mode: "form" }
import os
import json
import base64
import asyncio
import uuid
from pathlib import Path
from tqdm.asyncio import tqdm_asyncio
from openai import AsyncOpenAI, RateLimitError, APIError, NotFoundError
from datasets import load_dataset
from io import BytesIO
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

# --- CONFIGURATION ---
# 1. OpenRouter API key (create one at https://openrouter.ai/keys).
#    The key is read from the OPENROUTER_API_KEY environment variable; if the
#    variable is unset, the line below raises KeyError.
OPENROUTER_API_KEY = os.environ['OPENROUTER_API_KEY'] # set the env var beforehand; do not paste the key into the notebook

# 2. OpenRouter Configuration
BASE_URL = "https://openrouter.ai/api/v1"
MODEL_NAME = "qwen/qwen2.5-vl-72b-instruct" # ID carries no ":free" suffix; NOTE(review): appending ":free" presumably selects the free tier - confirm

# 3. Extra headers passed along with each chat-completion request.
#    NOTE(review): presumably used by OpenRouter for app attribution rather than
#    being strictly required - confirm against the OpenRouter docs.
EXTRA_HEADERS = {
    "HTTP-Referer": "https://colab.research.google.com",
    "X-Title": "ScienceQA-Training"
}

# Size of the asyncio.Semaphore that gates how many requests run at once
MAX_CONCURRENT = 10

# System prompt asking the model to annotate every mentioned object with a
# <ref>...</ref><|box_start|>...<|box_end|> bounding box in 0-1000 coordinates.
# NOTE(review): the (y,x) ordering requested below should be checked against the
# Qwen2.5-VL grounding documentation - TODO confirm
SYSTEM_PROMPT = """
You are an expert Visual Reasoning Assistant.
Your goal is to explain the answer to a science question step-by-step.

CRITICAL FORMATTING RULE:
When you mention a physical object in the image, you MUST ground it using Qwen's native bounding box format:
<ref>object name</ref><|box_start|>(ymin,xmin),(ymax,xmax)<|box_end|>

- Coordinates are normalized 0-1000.
- Use (y,x) ordering as per the standard.
- Example: "The <ref>red gear</ref><|box_start|>(200,100),(400,300)<|box_end|> turns clockwise."
"""

# --- RETRY LOGIC ---
def _is_retryable_api_error(retry_state):
    """Tenacity ``retry`` predicate: decide whether a failed attempt should be repeated.

    Args:
        retry_state: The tenacity call state for the attempt that just finished.

    Returns:
        True when the attempt raised ``RateLimitError`` or another ``APIError``
        that is not a ``NotFoundError``; False otherwise (including success).
    """
    outcome = retry_state.outcome
    if outcome is None or not outcome.failed:
        return False
    error = outcome.exception()
    # NotFoundError is itself an APIError, so it must be excluded explicitly:
    # a missing/unknown model cannot start working on the next attempt.
    if isinstance(error, NotFoundError):
        return False
    return isinstance(error, (RateLimitError, APIError))


@retry(
    retry=_is_retryable_api_error,
    wait=wait_exponential(multiplier=1, min=2, max=20),
    stop=stop_after_attempt(10),
    # Surface the underlying API error instead of tenacity's RetryError wrapper,
    # so callers can catch specific openai exception types.
    reraise=True,
)
async def get_model_response(client, messages):
    """Send one chat-completion request, retrying retryable API failures.

    Up to 10 attempts are made with exponential backoff (2s minimum, 20s
    maximum) between them. ``NotFoundError`` is never retried.

    Args:
        client: Async OpenAI-compatible client exposing ``chat.completions.create``.
        messages: Chat messages to send, in the OpenAI chat format.

    Returns:
        The completion object returned by ``client.chat.completions.create``.

    Raises:
        NotFoundError: Immediately, when the API reports the model as not found.
        RateLimitError, APIError: The last error seen once all attempts are used up.
    """
    return await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=2048,
        temperature=0.7,
        extra_headers=EXTRA_HEADERS
    )

async def process_single_image(client, image_bytes, question, answer, semaphore):
    """Ask the model for a grounded reasoning trace for one image/question pair.

    The whole request runs while holding ``semaphore`` and is preceded by a
    0.1s pause.

    Args:
        client: Async client forwarded to ``get_model_response``.
        image_bytes: Raw image bytes; sent base64-encoded as a PNG data URL.
        question: Question text placed in the user prompt.
        answer: Correct-answer text placed in the user prompt.
        semaphore: Async context manager held for the duration of the request.

    Returns:
        The content of the first returned choice, or None when the request
        raised (the failure is printed, not propagated).
    """
    async with semaphore:
        await asyncio.sleep(0.1)

        encoded_image = base64.b64encode(image_bytes).decode('utf-8')
        prompt_text = (
            f"Question: {question}\nCorrect Answer: {answer}\n\n"
            "Explain the reasoning step-by-step and ground objects with bounding boxes."
        )
        user_parts = [
            {"type": "text", "text": prompt_text},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}},
        ]

        trace = None
        try:
            completion = await get_model_response(
                client,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_parts},
                ],
            )
            trace = completion.choices[0].message.content
        except NotFoundError:
            # Model-specific hint; the item is reported as failed via None
            print(f"‚ùå Error: Model '{MODEL_NAME}' not found on OpenRouter. Check credits or model ID.")
        except Exception as exc:
            # Best effort: one bad item must not abort the whole gather
            print(f"‚ö†Ô∏è Failed on item: {exc}")
        return trace

def load_scienceqa_samples(max_samples=500):
    """Collect up to ``max_samples`` ScienceQA training items that include an image.

    Items without an image are skipped. Each kept image is converted to RGB
    and re-encoded as PNG bytes.

    Args:
        max_samples: Maximum number of samples to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of dicts with keys ``image_bytes`` (PNG bytes), ``question``,
        ``answer`` (text of the correct choice, or "" when the answer index
        does not address a choice) and ``subject``.
    """
    print("Loading ScienceQA dataset...")
    dataset = load_dataset("derek-thomas/ScienceQA", split="train")

    samples = []
    for item in dataset:
        # Enforce the cap before collecting, so max_samples <= 0 returns nothing
        if len(samples) >= max_samples:
            break

        image = item.get("image")
        if image is None:
            continue

        choices = item.get("choices", [])
        answer_idx = item.get("answer", 0)
        # Lower bound guards against a negative index silently selecting a
        # choice from the end of the list
        answer_text = choices[answer_idx] if 0 <= answer_idx < len(choices) else ""

        img_buffer = BytesIO()
        image.convert("RGB").save(img_buffer, format="PNG")

        samples.append({
            "image_bytes": img_buffer.getvalue(),
            "question": item.get("question", ""),
            "answer": answer_text,
            "subject": item.get("subject", ""),
        })

    print(f"Loaded {len(samples)} samples with images")
    return samples

async def generate_training_data(output_dir, max_samples=500, val_split=0.1):
    """Build train/validation JSONL files of model-generated reasoning traces.

    Loads ScienceQA samples, requests one response per sample through
    ``process_single_image``, saves the image of every answered sample under
    ``<output_dir>/images`` and writes chat-format records to ``train.jsonl``
    and ``val.jsonl`` inside ``output_dir``.

    Args:
        output_dir: Directory that receives the images and the JSONL files.
        max_samples: Forwarded to ``load_scienceqa_samples``.
        val_split: Fraction of the processed records used for validation. The
            validation records are taken from the front of the list; the data
            is not shuffled.

    Returns:
        Tuple ``(train_data, val_data)`` of record lists.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` still contains the placeholder.
    """
    if "YOUR_OPENROUTER_KEY" in OPENROUTER_API_KEY:
        raise ValueError("Please replace 'YOUR_OPENROUTER_KEY_HERE' with your actual API key.")

    client = AsyncOpenAI(api_key=OPENROUTER_API_KEY, base_url=BASE_URL)
    semaphore = asyncio.Semaphore(MAX_CONCURRENT)

    output_path = Path(output_dir)
    images_dir = output_path / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    samples = load_scienceqa_samples(max_samples)

    print(f"\nGenerating reasoning traces for {len(samples)} samples...")
    print(f"Provider: OpenRouter | Model: {MODEL_NAME}")

    # One coroutine per sample; the semaphore limits how many run at once.
    pending = []
    for s in samples:
        pending.append(
            process_single_image(client, s["image_bytes"], s["question"], s["answer"], semaphore)
        )
    replies = await tqdm_asyncio.gather(*pending, desc="Processing")

    records = []
    for sample, reply in zip(samples, replies):
        if not reply:
            # Failed or empty responses are dropped together with their image.
            continue

        image_path = images_dir / f"{uuid.uuid4()}.png"
        image_path.write_bytes(sample["image_bytes"])

        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": str(image_path)},
                {"type": "text", "text": sample["question"]},
            ],
        }
        assistant_turn = {"role": "assistant", "content": reply}
        records.append({
            "messages": [user_turn, assistant_turn],
            "metadata": {
                "source": "scienceqa",
                "answer": sample["answer"],
            },
        })

    # Leading slice becomes validation, the remainder training.
    n_val = int(len(records) * val_split)
    val_data = records[:n_val]
    train_data = records[n_val:]

    for filename, rows in (("train.jsonl", train_data), ("val.jsonl", val_data)):
        with open(output_path / filename, "w") as f:
            f.writelines(json.dumps(row) + "\n" for row in rows)

    print(f"\n‚úì Generated {len(train_data)} training samples")
    print(f"‚úì Generated {len(val_data)} validation samples")
    print(f"‚úì Images saved to {images_dir}")

    return train_data, val_data

#@title 2.2 Run Generator
if GENERATE_NEW_DATA or not os.path.exists(f"{DATA_DIR}/train.jsonl"):
    await generate_training_data(DATA_DIR, max_samples=MAX_SAMPLES)
else:
    print("Using existing data.")

Loading ScienceQA dataset...
Loaded 6218 samples with images

Generating reasoning traces for 6218 samples...
Provider: OpenRouter | Model: qwen/qwen2.5-vl-72b-instruct


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  File "<string>", line 1, in <lambda>
KeyError: '__import__'
Exception ignored in: <coroutine object process_single_image at 0x7e27fccb9800>
Traceback (most recent call last):
  File "<string>", line 1, in <lambda>
KeyError: '__import__'
Exception ignored in: <coroutine object process_single_image at 0x7e27fccb96c0>
Traceback (most recent call last):
  File "<string>", line 1, in <lambda>
KeyError: '__import__'
Exception ignored in: <coroutine object process_single_image at 0x7e27fccb9580>
Traceback (most recent call last):
  File "<string>", line 1, in <lambda>
KeyError: '__import__'
Exception ignored in: <coroutine object process_single_image at 0x7e27fccb9440>
Traceback (most recent call last):
  File "<string>", line 1, in <lambda>
KeyError: '__import__'
Exception ignored in: <coroutine object process_single_image at 0x7e27fccb9300>
Traceback (most recent call last):
  File "<string>", line 1, in <lambda>
KeyError: '


‚úì Generated 5597 training samples
‚úì Generated 621 validation samples
‚úì Images saved to /content/drive/MyDrive/Colab Notebooks/V-CoT/data/images


---
## 3. Model Training

In [5]:
#@title 3.1 Load Model with Unsloth { display-mode: "form" }
from unsloth import FastVisionModel, is_bfloat16_supported
import torch

# Model configuration
MODEL_ID = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"
MAX_SEQ_LENGTH = 2048

print(f"Loading model: {MODEL_ID}")
print("This may take a few minutes...\n")

# Load the 4-bit base model together with its tokenizer/processor object.
model, tokenizer = FastVisionModel.from_pretrained(
    MODEL_ID,
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)

# Add LoRA adapters; alpha is set equal to the rank chosen in the config cell.
lora_settings = dict(
    r=LORA_RANK,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=LORA_RANK,
    lora_dropout=0,
    bias="none",
)
model = FastVisionModel.get_peft_model(model, **lora_settings)

print("\n‚úì Model loaded with LoRA adapters")

Loading model: unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit
This may take a few minutes...

==((====))==  Unsloth 2025.12.5: Fast Qwen2_5_Vl patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.90G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]


‚úì Model loaded with LoRA adapters


In [7]:
from pathlib import Path
# Locations of the generated JSONL splits inside DATA_DIR.
train_file, val_file = (Path(DATA_DIR) / name for name in ("train.jsonl", "val.jsonl"))

In [8]:
#@title 3.2 Prepare Dataset { display-mode: "form" }
import json
from datasets import Dataset

def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path (``str`` or path-like) of the file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin UTF-8 so the result does not depend on the platform's default
    # text encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines are ignored.
        return [json.loads(line) for line in f if line.strip()]

def convert_to_text_dataset(data, tokenizer):
    """Render each sample's chat messages to a single text string.

    The ``messages`` field mixes list-valued and string-valued ``content``
    entries, which PyArrow cannot store in one column, so every sample is
    flattened to text with the tokenizer's chat template first.

    Only the rendered text is kept: each returned record has a single
    ``"text"`` key, so anything else in the sample (metadata, the structured
    message parts) is not carried over.

    Args:
        data: Iterable of sample dicts with an optional ``"messages"`` list.
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)``.

    Returns:
        List of ``{"text": str}`` dicts for the samples that rendered
        successfully, in input order. Samples whose rendering raises are
        skipped and counted; the count and the first failure are printed.
    """
    text_data = []
    skipped = 0
    first_failure = None  # description of the first error, for diagnostics

    for sample in data:
        try:
            messages = sample.get("messages", [])
            # Apply chat template to convert messages to text
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False
            )
        except Exception as e:
            # Best effort: keep going, but remember why the first one failed
            # so a systematic formatting problem is visible to the user.
            skipped += 1
            if first_failure is None:
                first_failure = f"{type(e).__name__}: {e}"
            continue
        text_data.append({"text": text})

    if skipped > 0:
        print(f"  ‚ö† Skipped {skipped} samples due to formatting issues")
        print(f"    First error: {first_failure}")

    return text_data

# Load raw data
print("Loading data...")
train_data_raw = load_jsonl(train_file)
val_data_raw = []
if os.path.exists(val_file):
    # The validation split is optional.
    val_data_raw = load_jsonl(val_file)

print(f"  Raw training samples: {len(train_data_raw)}")
print(f"  Raw validation samples: {len(val_data_raw)}")

# Convert to text format (avoids PyArrow mixed type errors)
print("\nConverting to text format...")
train_data_text = convert_to_text_dataset(train_data_raw, tokenizer)
if val_data_raw:
    val_data_text = convert_to_text_dataset(val_data_raw, tokenizer)
else:
    val_data_text = []

print(f"  Converted training samples: {len(train_data_text)}")
print(f"  Converted validation samples: {len(val_data_text)}")

# Create HuggingFace datasets
print("\nCreating datasets...")
train_dataset = Dataset.from_list(train_data_text)
# No validation dataset is built when nothing survived the conversion.
val_dataset = Dataset.from_list(val_data_text) if val_data_text else None

print("\n‚úì Datasets prepared successfully!")
print(f"  Training: {len(train_dataset)} samples")
if val_dataset:
    print(f"  Validation: {len(val_dataset)} samples")

# Show a sample
print("\nüìù Sample training text (first 500 chars):")
print("-" * 50)
print(f'{train_dataset[0]["text"][:500]}...')


Training samples: 5597
Validation samples: 621


ArrowInvalid: cannot mix list and non-list, non-null values

In [None]:
#@title 3.3 Setup Trainer with Checkpointing { display-mode: "form" }
from trl import SFTTrainer, SFTConfig
import os
import glob

# Find latest checkpoint if resuming
resume_checkpoint = None
if not (RESUME_FROM_CHECKPOINT and os.path.exists(CHECKPOINT_DIR)):
    print("Starting fresh training (no checkpoint resume).")
else:
    checkpoints = glob.glob(f"{CHECKPOINT_DIR}/checkpoint-*")
    if not checkpoints:
        print("No checkpoints found. Starting fresh.")
    else:
        # Order by the numeric step suffix ("checkpoint-<step>"); the last
        # entry is the most advanced one.
        checkpoints = sorted(checkpoints, key=lambda path: int(path.split('-')[-1]))
        resume_checkpoint = checkpoints[-1]
        print(f"üìÇ Found checkpoint: {resume_checkpoint}")
        print("   Training will resume from this checkpoint.")

# Use bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Training configuration
training_args = SFTConfig(
    output_dir=CHECKPOINT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=MAX_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=10,
    save_steps=100,  # Save checkpoint every 100 steps
    save_total_limit=3,  # Keep only last 3 checkpoints
    optim="adamw_8bit",
    weight_decay=0.01,
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=False,
    report_to="none",  # Set to "wandb" if using W&B
)

# NOTE(review): train_dataset/val_dataset hold rendered text only (built in
# cell 3.2); confirm the trainer receives image inputs as intended.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
)

print("\n‚úì Trainer configured")
print(f"  - Output: {CHECKPOINT_DIR}")
print(f"  - Max steps: {MAX_STEPS}")
print("  - Save every: 100 steps")

In [None]:
#@title 3.4 Start Training { display-mode: "form" }
banner = "=" * 60

print(banner)
print("STARTING TRAINING")
print(banner)
if resume_checkpoint:
    print(f"Resuming from: {resume_checkpoint}")
print(f"Checkpoints saved to: {CHECKPOINT_DIR}")
print(banner + "\n")

# Train (continues from resume_checkpoint when one was found, else starts fresh)
trainer_stats = trainer.train(resume_from_checkpoint=resume_checkpoint)

print("\n" + banner)
print("TRAINING COMPLETE")
print(banner)
print(f"Total steps: {trainer_stats.global_step}")
print(f"Training loss: {trainer_stats.training_loss:.4f}")

In [None]:
#@title 3.5 Save Final Model { display-mode: "form" }
FINAL_MODEL_DIR = f"{CHECKPOINT_DIR}/final"

print(f"Saving final model to: {FINAL_MODEL_DIR}")
model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

# Save training config
import yaml
config = {
    "model": MODEL_ID,
    "lora_rank": LORA_RANK,
    "max_steps": MAX_STEPS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    # Count the dataset that was actually handed to the trainer. The name
    # `train_data` used here before is never assigned at notebook scope (cell
    # 3.2 defines train_data_raw / train_dataset), so it raised NameError.
    "training_samples": len(train_dataset),
}
with open(f"{FINAL_MODEL_DIR}/training_config.yaml", "w") as f:
    yaml.dump(config, f)

print("\n‚úì Model saved successfully!")
print(f"\nTo load this model later:")
print(f"  model, tokenizer = FastVisionModel.from_pretrained('{FINAL_MODEL_DIR}')")

---
## 4. Inference & Evaluation

In [None]:
#@title 4.1 Load Trained Model for Inference { display-mode: "form" }
from unsloth import FastVisionModel
# glob/os are imported here too: this cell uses both, and previously only
# worked if the training cells (which import them) had run in this session.
import glob
import os

# Use final model or latest checkpoint
if os.path.exists(f"{CHECKPOINT_DIR}/final"):
    inference_model_path = f"{CHECKPOINT_DIR}/final"
else:
    # Find latest checkpoint (highest numeric "checkpoint-<step>" suffix)
    checkpoints = glob.glob(f"{CHECKPOINT_DIR}/checkpoint-*")
    if checkpoints:
        inference_model_path = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))[-1]
    else:
        # Nothing trained yet: fall back to the base model id from cell 3.1.
        inference_model_path = MODEL_ID
        print("No trained model found, using base model")

print(f"Loading model from: {inference_model_path}")

# NOTE(review): if the training `model` from section 3 is still loaded, this
# creates a second model in the same session — confirm GPU memory allows it.
inference_model, inference_tokenizer = FastVisionModel.from_pretrained(
    inference_model_path,
    load_in_4bit=True,
)
FastVisionModel.for_inference(inference_model)

print("‚úì Model loaded for inference")

In [None]:
#@title 4.2 Test Inference { display-mode: "form" }
from PIL import Image
import requests
import torch  # used below for no_grad(); previously only imported by cell 3.1
from io import BytesIO
from qwen_vl_utils import process_vision_info

# Use a reliable test image URL
test_url = "https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba?w=800"
test_question = "What do you see in this image? Describe step by step."

print(f"Testing with image from URL...")
print(f"Question: {test_question}\n")

# Load image with proper headers (a browser-like User-Agent is sent with the request)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
response = requests.get(test_url, headers=headers, timeout=10)
response.raise_for_status()
test_image = Image.open(BytesIO(response.content)).convert("RGB")
print(f"Image loaded: {test_image.size}")

# Prepare messages in Qwen2-VL format: one user turn with image + text parts
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": test_image},
        {"type": "text", "text": test_question}
    ]
}]

# Apply chat template to get the text prompt (not tokenized yet)
text = inference_tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Process vision info and prepare inputs: the prompt text and the extracted
# image/video inputs are encoded together and moved to the GPU
image_inputs, video_inputs = process_vision_info(messages)
inputs = inference_tokenizer(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt"
).to("cuda")

# Generate (greedy decoding, no gradients needed)
with torch.no_grad():
    outputs = inference_model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False
    )

# Decode only the new tokens: drop the prompt-length prefix of the output
generated_ids = outputs[:, inputs.input_ids.shape[1]:]
response_text = inference_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("Response:")
print("-" * 40)
print(response_text)
print("-" * 40)

In [None]:
#@title 4.3 Evaluation Metrics { display-mode: "form" }
import re

def parse_boxes(text):
    """Collect every ``<box>[a, b, c, d]</box>`` tag found in ``text``.

    Only non-negative integer coordinates are recognised; whitespace is
    allowed after the commas.

    Args:
        text: Model output to scan.

    Returns:
        A list of ``[a, b, c, d]`` integer lists, in order of appearance.
    """
    box_regex = re.compile(r"<box>\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]</box>")
    boxes = []
    for hit in box_regex.finditer(text):
        boxes.append(list(map(int, hit.groups())))
    return boxes

def calculate_iou(box1, box2):
    """Intersection-over-union of two boxes given as ``[x1, y1, x2, y2]``.

    Args:
        box1: First box; indices 0-3 are read as x1, y1, x2, y2.
        box2: Second box, same layout.

    Returns:
        ``intersection / (area1 + area2 - intersection + 1e-6)``; the small
        constant keeps the division defined when the union is zero.
    """
    # Corners of the overlapping rectangle (may be inverted if disjoint).
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # Clamp each side length at zero so disjoint boxes give zero overlap.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    inter = overlap_w * overlap_h

    first_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    second_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = first_area + second_area - inter

    return inter / (union + 1e-6)

# Example evaluation on validation set
from qwen_vl_utils import process_vision_info

# Evaluate the first few validation records loaded in cell 3.2.
# (`val_data` is not defined at notebook scope; `val_data_raw` is.)
eval_samples = val_data_raw[:5]  # Test first 5

if eval_samples:
    print("Evaluating on validation set...\n")

    num_with_boxes = 0
    total_boxes = 0

    for i, sample in enumerate(eval_samples):
        # The user turn written by generate_training_data is
        # [image part, text part]; pick each part by its "type" rather than
        # by position (content[0] is the image entry and has no "text" key).
        user_parts = sample["messages"][0]["content"]
        question = next(part["text"] for part in user_parts if part.get("type") == "text")
        image_ref = next((part["image"] for part in user_parts if part.get("type") == "image"), None)

        gold_response = sample["messages"][1]["content"]
        gold_boxes = parse_boxes(gold_response)

        # Generate prediction from the same image + question the gold
        # response was written for, using the input path of cell 4.2.
        content = []
        if image_ref is not None:
            content.append({"type": "image", "image": image_ref})
        content.append({"type": "text", "text": question})
        messages = [{"role": "user", "content": content}]

        prompt = inference_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = inference_tokenizer(
            text=[prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        ).to("cuda")
        outputs = inference_model.generate(**inputs, max_new_tokens=256)

        # Decode only the newly generated tokens so text from the prompt is
        # never mistaken for a predicted box.
        generated_ids = outputs[:, inputs.input_ids.shape[1]:]
        pred_response = inference_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        pred_boxes = parse_boxes(pred_response)

        if pred_boxes:
            num_with_boxes += 1
        total_boxes += len(pred_boxes)

        print(f"Sample {i+1}: {len(pred_boxes)} boxes predicted, {len(gold_boxes)} gold boxes")

    print(f"\nSummary:")
    # Use the real number of evaluated samples (may be fewer than 5).
    print(f"  Samples with boxes: {num_with_boxes}/{len(eval_samples)}")
    print(f"  Total boxes predicted: {total_boxes}")
else:
    print("No validation data available for evaluation.")

---
## 5. Interactive Demo

In [None]:
#@title 5.1 Launch Gradio Demo { display-mode: "form" }
import gradio as gr
import cv2
import numpy as np
import re
from PIL import Image
from transformers import TextIteratorStreamer
from threading import Thread

def stream_with_boxes(image, question):
    """Stream model response with live bounding box visualization.

    Generator used as the Gradio click handler. While the model generates,
    every ``<ref>label</ref><box>[x1, y1, x2, y2]</box>`` pair found so far in
    the text is drawn onto a copy of the input image.

    Args:
        image: Input image as a NumPy array or a PIL image; ``None`` when
            nothing was uploaded.
        question: Question text sent along with the image.

    Yields:
        ``(overlay, text)`` tuples: the image with the boxes parsed so far
        and the text generated so far. For a missing image a single
        ``(None, "Please upload an image.")`` is yielded.
    """
    if image is None:
        yield None, "Please upload an image."
        return

    # Imported here so the demo does not depend on the test-inference cell
    # (which also imports it) having been run in this session.
    from qwen_vl_utils import process_vision_info

    # Convert to PIL if needed
    if isinstance(image, np.ndarray):
        pil_image = Image.fromarray(image)
        orig_h, orig_w = image.shape[:2]
        overlay = image.copy()
    else:
        pil_image = image
        orig_w, orig_h = image.size
        overlay = np.array(image)

    # Prepare input
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": question}
        ]
    }]

    # Same input path as the test-inference cell: render the prompt text
    # first, then encode text and image together. (Calling
    # apply_chat_template without tokenize=True does not produce model
    # inputs that can be moved to the GPU.)
    prompt = inference_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = inference_tokenizer(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to("cuda")

    # Setup streamer. Decode options are passed as plain keyword arguments;
    # wrapping them in a `decode_kwargs=` dict does not apply them.
    streamer = TextIteratorStreamer(inference_tokenizer, skip_prompt=True, skip_special_tokens=False)
    gen_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=512, use_cache=True)

    # Start generation in thread so this generator can consume the streamer
    thread = Thread(target=inference_model.generate, kwargs=gen_kwargs)
    thread.start()

    generated_text = ""
    ref_pattern = re.compile(r"<ref>([^<]+)</ref><box>\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]</box>")

    for new_text in streamer:
        generated_text += new_text

        # Parse refs and boxes from everything generated so far
        matches = ref_pattern.findall(generated_text)

        # Draw boxes on a fresh copy so every update starts from the clean image
        current_overlay = overlay.copy()
        for match in matches:
            label = match[0]
            x1, y1, x2, y2 = [int(x) for x in match[1:]]

            # Denormalize: coordinates are scaled from a 0-1000 range to pixels
            ax1 = int((x1 / 1000) * orig_w)
            ay1 = int((y1 / 1000) * orig_h)
            ax2 = int((x2 / 1000) * orig_w)
            ay2 = int((y2 / 1000) * orig_h)

            # Draw; keep the label baseline inside the image for boxes that
            # touch the top edge
            cv2.rectangle(current_overlay, (ax1, ay1), (ax2, ay2), (0, 255, 0), 2)
            cv2.putText(current_overlay, label, (ax1, max(ay1 - 5, 12)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

        yield current_overlay, generated_text

    thread.join()

# Create demo: inputs in the left column, live visualization and text on the right
intro_md = "# V-CoT: Grounded Visual Reasoning\n\nUpload an image and ask a question. The model will explain its reasoning with bounding boxes."
default_question = "Explain step by step what you see in this image."

with gr.Blocks(title="V-CoT Demo") as demo:
    gr.Markdown(intro_md)

    with gr.Row():
        with gr.Column():
            img_input = gr.Image(label="Input Image", type="numpy")
            txt_input = gr.Textbox(label="Question", value=default_question)
            btn = gr.Button("Generate", variant="primary")

        with gr.Column():
            img_output = gr.Image(label="Visualization")
            txt_output = gr.Textbox(label="Response", lines=10)

    # stream_with_boxes is a generator, so both outputs update as it yields.
    btn.click(
        fn=stream_with_boxes,
        inputs=[img_input, txt_input],
        outputs=[img_output, txt_output],
    )

print("Launching demo...")
demo.launch(share=True, debug=True)

---
## 6. Utilities

In [None]:
#@title 6.1 List Saved Checkpoints { display-mode: "form" }
import os
from datetime import datetime

print(f"Checkpoint directory: {CHECKPOINT_DIR}\n")


def _checkpoint_sort_key(name):
    """Order 'checkpoint-<step>' entries by numeric step; anything else (e.g. 'final') goes last."""
    suffix = name.rsplit('-', 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), name)
    return (1, 0, name)


if os.path.exists(CHECKPOINT_DIR):
    items = os.listdir(CHECKPOINT_DIR)
    # Only directories are listed: a stray file with a matching name would
    # make os.listdir() below fail.
    checkpoints = [
        d for d in items
        if (d.startswith('checkpoint-') or d == 'final')
        and os.path.isdir(os.path.join(CHECKPOINT_DIR, d))
    ]

    if checkpoints:
        print(f"Found {len(checkpoints)} checkpoint(s):\n")
        # Numeric ordering: a plain string sort puts checkpoint-1000 before
        # checkpoint-200.
        for cp in sorted(checkpoints, key=_checkpoint_sort_key):
            cp_path = f"{CHECKPOINT_DIR}/{cp}"
            # Size counts the files directly inside the checkpoint folder
            # (sub-directories are not descended into).
            size = sum(os.path.getsize(os.path.join(cp_path, f)) for f in os.listdir(cp_path) if os.path.isfile(os.path.join(cp_path, f)))
            mtime = datetime.fromtimestamp(os.path.getmtime(cp_path))
            print(f"  üìÅ {cp}")
            print(f"     Size: {size/1e6:.1f} MB")
            print(f"     Modified: {mtime.strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print("No checkpoints found.")
else:
    print("Checkpoint directory does not exist.")

In [None]:
#@title 6.2 Download Model to Local Machine { display-mode: "form" }
from google.colab import files
import shutil

DOWNLOAD_MODEL = False  #@param {type:"boolean"}

# Zip the saved final model and hand it to the browser, but only on request.
if not DOWNLOAD_MODEL:
    print("Set DOWNLOAD_MODEL = True to download the trained model.")
else:
    model_to_download = f"{CHECKPOINT_DIR}/final"
    if not os.path.exists(model_to_download):
        print(f"Model not found at {model_to_download}")
    else:
        print("Creating zip archive...")
        shutil.make_archive("/content/vcot_model", 'zip', model_to_download)
        print("Starting download...")
        files.download("/content/vcot_model.zip")

In [None]:
#@title 6.3 Clean Up Old Checkpoints { display-mode: "form" }
# Imported here so this cell does not depend on other utility cells having run.
import os
import shutil

KEEP_LAST_N = 2  #@param {type:"integer"}
CONFIRM_DELETE = False  #@param {type:"boolean"}

if os.path.exists(CHECKPOINT_DIR):
    # A negative form value is treated as "keep none".
    keep_n = max(int(KEEP_LAST_N), 0)

    # Only "checkpoint-<number>" entries are considered; anything with a
    # non-numeric suffix is left alone instead of crashing int().
    checkpoints = sorted([d for d in os.listdir(CHECKPOINT_DIR)
                          if d.startswith('checkpoint-') and d.split('-')[-1].isdigit()],
                        key=lambda x: int(x.split('-')[-1]))

    if len(checkpoints) > keep_n:
        # Slice by explicit length: checkpoints[:-0] would be an empty list,
        # so KEEP_LAST_N = 0 used to delete nothing.
        to_delete = checkpoints[:len(checkpoints) - keep_n]
        print(f"Found {len(checkpoints)} checkpoints, keeping last {keep_n}")
        print(f"Will delete: {to_delete}")

        if CONFIRM_DELETE:
            for cp in to_delete:
                shutil.rmtree(f"{CHECKPOINT_DIR}/{cp}")
                print(f"  Deleted: {cp}")
            print("\n‚úì Cleanup complete")
        else:
            # Dry run by default: nothing is removed without confirmation.
            print("\nSet CONFIRM_DELETE = True to actually delete.")
    else:
        print(f"Only {len(checkpoints)} checkpoints found, nothing to clean.")
else:
    print("Checkpoint directory does not exist.")

---
## Quick Reference

### Resume Training
If your Colab session disconnects:
1. Run cells 1.1-1.5 (Setup)
2. Set `RESUME_FROM_CHECKPOINT = True` in cell 1.1
3. Run cells 3.1-3.4 (Training will resume from last checkpoint)

### Checkpoints Location
All checkpoints are saved to Google Drive at:
```
My Drive/Colab Notebooks/V-CoT/checkpoints/
```

### Model Loading
```python
from unsloth import FastVisionModel
model, tokenizer = FastVisionModel.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/V-CoT/checkpoints/final"
)
```