# Qwen2.5-VL-3B QLoRA Fine-tuning — OpenPack Temporal Operations

**[Kaggle Notebook Public URL]**
https://www.kaggle.com/code/thrinainiaroori/finetune

Assignment: VLM Challenge — Temporal Operation Intelligence for Logistics  
Dataset: OpenPack (U0101–U0106 train, U0107 val, U0108 test)  
Model: Qwen2.5-VL-3B-Instruct + 4-bit QLoRA

In [1]:
# ── Environment Check ─────────────────────────────────────────────────────────
import subprocess, torch

print(f'CUDA available: {torch.cuda.is_available()}')
print(f'GPU count:      {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    p = torch.cuda.get_device_properties(i)
    print(f'  GPU {i}: {p.name} | {p.total_memory / 1e9:.1f} GB')

result = subprocess.run(['nvidia-smi', '--query-gpu=memory.used,memory.free',
                         '--format=csv'], capture_output=True, text=True)
print('\n' + result.stdout)

CUDA available: False
GPU count:      0


FileNotFoundError: [WinError 2] The system cannot find the file specified

In [None]:
# ── Install Dependencies ──────────────────────────────────────────────────────
!pip install -q transformers==4.47.0 peft==0.10.0 bitsandbytes==0.45.3 \
             accelerate==0.30.1 trl==0.8.6 qwen-vl-utils==0.0.8 \
             webdataset==0.2.86 decord==0.6.0 loguru==0.7.2 wandb

In [None]:
# ╔══════════════════════════════════════════════════════════╗
# ║           VRAM BUDGET CALCULATION (REQUIRED CELL)       ║
# ╚══════════════════════════════════════════════════════════╝

model_base_4bit   = 3.0    # GB — Qwen2.5-VL-2B at 4-bit NF4
lora_adapters     = 0.3    # GB — LoRA r=16, ~10M trainable params
frames_per_clip   = 8      # Sampled frames per 5-sec clip
frame_tokens      = 256    # Visual tokens per frame (vision encoder output)
batch_size        = 2      # Per-device batch size
token_hidden_dim  = 2048   # Qwen2.5-VL-2B hidden dimension

activation_gb = (frames_per_clip * frame_tokens * batch_size * token_hidden_dim * 2) / 1e9
# With gradient checkpointing: activations recomputed during backward, not stored
activation_gc = activation_gb * 0.4   # 0.4 = fraction actually stored

# AdamW optimizer states for LoRA params only (base model frozen)
optimizer_gb = (lora_adapters * 1e9 * 2 * 4) / 1e9  # 2 states × 4 bytes

theoretical_min = model_base_4bit + lora_adapters + activation_gc + optimizer_gb
cuda_overhead   = 6.0   # KV cache + CUDA context + framework overhead
total_observed  = theoretical_min + cuda_overhead

print('═' * 55)
print('  VRAM BUDGET CALCULATION')
print('═' * 55)
print(f'  model_base_4bit   = {model_base_4bit:.3f} GB')
print(f'  lora_adapters     = {lora_adapters:.3f} GB')
print(f'  activation_raw    = {activation_gb:.4f} GB')
print(f'  activation_gc     = {activation_gc:.4f} GB  (×0.4 factor)')
print(f'  optimizer_adamw   = {optimizer_gb:.4f} GB')
print(f'  ─────────────────────────────────────')
print(f'  Theoretical min   = {theoretical_min:.3f} GB')
print(f'  + CUDA overhead   = {cuda_overhead:.1f}   GB (empirical)')
print(f'  Expected observed = {total_observed:.1f} GB')
print(f'  T4 VRAM limit     = 16.0 GB')
safe = '✓ SAFE' if total_observed < 16 else '✗ EXCEEDS LIMIT'
print(f'  Status            = {safe}')
print('═' * 55)

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # Force single GPU for stability
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# ── Setup Repo & Config ───────────────────────────────────────────────────────
import sys, os
from pathlib import Path

# Clone your GitHub repo
!git clone https://github.com/thriniiiiiiiiiiii/VLM-Temporal-Operation-Intelligence-for-Logistics.git /kaggle/working/repo
sys.path.insert(0, '/kaggle/working/repo')

CONFIG_PATH = '/kaggle/working/repo/configs/training_config.yaml'

# Update data paths for Kaggle
import yaml
with open(CONFIG_PATH) as f:
    config = yaml.safe_load(f)

config['data']['openpack_root'] = '/kaggle/input/openpack/data'
config['data']['shard_dir']     = '/kaggle/working/shards'
config['training']['output_dir'] = '/kaggle/working/checkpoints'
config['training']['report_to']  = 'none'  # Disable wandb on Kaggle

kaggle_config_path = '/kaggle/working/config_kaggle.yaml'
with open(kaggle_config_path, 'w') as f:
    yaml.dump(config, f)
print('Config updated for Kaggle paths')

In [None]:
# ── Run Data Pipeline ─────────────────────────────────────────────────────────
!python /kaggle/working/repo/data_pipeline.py \
    --config {kaggle_config_path} \
    --split train

!python /kaggle/working/repo/data_pipeline.py \
    --config {kaggle_config_path} \
    --split val

# Show shard files created
!ls -lh /kaggle/working/shards/train/ | head -20
!echo "Shards created:"
!ls /kaggle/working/shards/train/*.tar | wc -l

In [None]:
# ── Load Model + Apply QLoRA ──────────────────────────────────────────────────
import torch, yaml
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model

with open(kaggle_config_path) as f:
    config = yaml.safe_load(f)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    config['model']['base_id'],
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)

# CRITICAL: Both flags required for quantized model + gradient checkpointing
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

lc = config['lora']
lora_config = LoraConfig(
    r=lc['r'],
    lora_alpha=lc['lora_alpha'],
    target_modules=lc['target_modules'],
    lora_dropout=lc['lora_dropout'],
    bias=lc['bias'],
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Verify GPU memory after load
alloc = torch.cuda.memory_allocated() / 1e9
reserved = torch.cuda.memory_reserved() / 1e9
print(f'\nGPU memory allocated: {alloc:.2f} GB')
print(f'GPU memory reserved:  {reserved:.2f} GB')

In [None]:
# ── Build Datasets + Start Training ──────────────────────────────────────────
import sys
sys.path.insert(0, '/kaggle/working/repo')
from training.finetune import (
    OpenPackDataset, VLMCollator,
    build_training_args
)
from transformers import AutoProcessor, TrainingArguments
from trl import SFTTrainer

processor = AutoProcessor.from_pretrained(
    config['model']['base_id'], trust_remote_code=True
)
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

train_dataset = OpenPackDataset(
    shard_dir='/kaggle/working/shards/train',
    processor=processor,
    frames_per_clip=config['data']['frames_per_clip'],
)
val_dataset = OpenPackDataset(
    shard_dir='/kaggle/working/shards/val',
    processor=processor,
    frames_per_clip=config['data']['frames_per_clip'],
)

print(f'Train clips: {len(train_dataset)}')
print(f'Val   clips: {len(val_dataset)}')

training_args = build_training_args(config)
collator      = VLMCollator(processor)

trainer = SFTTrainer(
    dataset_text_field='text', max_seq_length=1024, 
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collator,
    tokenizer=processor.tokenizer,
)

# Crash-tolerant: resume from latest checkpoint if exists
from pathlib import Path
checkpoints = sorted(Path(config['training']['output_dir']).glob('checkpoint-*'))
checkpoint = str(checkpoints[-1]) if checkpoints else None
if checkpoint:
    print(f'Resuming from: {checkpoint}')

trainer.train(resume_from_checkpoint=checkpoint)

In [None]:
# ── Save Final Model ──────────────────────────────────────────────────────────
final_path = '/kaggle/working/checkpoints/final'
trainer.save_model(final_path)
processor.save_pretrained(final_path)
print(f'Final model saved to: {final_path}')

import json
with open(f'{final_path}/training_log.json', 'w') as f:
    json.dump(trainer.state.log_history, f, indent=2)

# Final GPU memory check
import torch
alloc = torch.cuda.memory_allocated() / 1e9
print(f'Peak GPU memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB')