# 🧪 Quick Test - FIXED Version

Dựa trên code đã work từ qwen3-vl-8b-caption-gen.ipynb

In [None]:
# Check GPU
!nvidia-smi

import torch
print(f'PyTorch: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

In [None]:
# Install dependencies (FIXED versions)
!pip install -q transformers==4.57.0
!pip install -q qwen-vl-utils accelerate peft bitsandbytes datasets pillow pyyaml huggingface_hub scipy

# Optional: flash-attention (cÃ³ thá»ƒ skip)
# !pip install -q flash-attn --no-build-isolation

print('âœ… Dependencies installed')

In [None]:
# Log in to the Hugging Face Hub.
from huggingface_hub import login

# Option 1: read the token from Colab secrets (secret name: HF_TOKEN).
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
except Exception as exc:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate. Any ordinary failure (not running on Colab,
    # secret missing or not readable, login with that token failing) falls through
    # to the manual path. Only the exception type is printed, never the token.
    print(f'Colab secret login unavailable ({type(exc).__name__}); falling back to manual login')
    # Option 2: manual login
    login()

print('âœ… Logged in')

In [None]:
# Load the full 'train' split; the processing cell below only reads the first
# TEST_SAMPLES rows of it.
from datasets import load_dataset
import os  # not used in this cell; later cells use it for directories and paths

dataset = load_dataset('5CD-AI/Viet-ViTextVQA-gemini-VQA', split='train')
num_rows = len(dataset)
print(f'âœ… Dataset loaded: {num_rows} samples')

In [None]:
# Turn the first TEST_SAMPLES dataset rows into flat question/answer records,
# save each row's image as a JPEG, and write the records to data/train_test.json.
import json
from tqdm.auto import tqdm
from PIL import Image


def _iter_qa_pairs(turns):
    """Yield (question, answer) tuples from a list of conversation turns.

    Each turn is a dict keyed either by 'role'/'content' or by 'from'/'value'.
    A user/human turn is remembered as the pending question (a later user turn
    overwrites an unanswered one). An assistant/gpt turn is paired with the
    pending question if there is a non-empty one, otherwise it is dropped.
    """
    pending_question = None
    for turn in turns:
        speaker = turn.get('role', turn.get('from'))
        text = turn.get('content', turn.get('value'))
        if speaker in ('user', 'human'):
            pending_question = text
        elif speaker in ('assistant', 'gpt') and pending_question:
            yield pending_question, text
            pending_question = None


os.makedirs('data', exist_ok=True)
os.makedirs('data/images', exist_ok=True)

TEST_SAMPLES = 100
processed_samples = []

row_count = min(TEST_SAMPLES, len(dataset))
for idx in tqdm(range(row_count)):
    item = dataset[idx]
    image = item['image']
    conversations = item.get('conversations', [])

    # Rows without any conversation contribute nothing (and no image is written).
    if not conversations:
        continue

    # One JPEG per dataset row, named after the row id; an existing file is reused.
    image_filename = f"image_{item['id']}.jpg"
    image_path = os.path.join('data/images', image_filename)
    if not os.path.exists(image_path):
        rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
        rgb_image.save(image_path)

    for question_text, answer_text in _iter_qa_pairs(conversations):
        # The id suffix is the running count of records collected so far.
        record_id = f"{item['id']}_{len(processed_samples)}"
        # NOTE(review): '\\n' below is a literal backslash followed by 'n', not a
        # newline. The inference cell strips exactly this literal, so the notebook
        # is self-consistent, but confirm that whatever consumes train_test.json
        # expects this prefix rather than '<image>' + a real newline.
        human_turn = {'from': 'human', 'value': f'<image>\\n{question_text}'}
        gpt_turn = {'from': 'gpt', 'value': answer_text}
        processed_samples.append({
            'id': record_id,
            'image': image_filename,
            'conversations': [human_turn, gpt_turn],
        })

print(f'âœ… Processed {len(processed_samples)} QA pairs')

# Persist the records; ensure_ascii=False keeps non-ASCII text unescaped in the file.
with open('data/train_test.json', 'w', encoding='utf-8') as out_file:
    json.dump(processed_samples, out_file, ensure_ascii=False, indent=2)

print('âœ… Saved to data/train_test.json')

In [None]:
# LOAD MODEL - FIXED VERSION
import importlib.util

from transformers import Qwen3VLForConditionalGeneration, AutoProcessor  # Qwen3 class (not Qwen2)
from qwen_vl_utils import process_vision_info  # not used here; the inference cell calls it
import torch

MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"

# flash-attn is optional (its install line in the dependency cell is commented
# out), so only request it when the package is importable. Otherwise fall back
# to PyTorch's built-in scaled-dot-product attention so that a fresh
# "Restart & Run All" does not fail at model load.
ATTN_IMPLEMENTATION = (
    "flash_attention_2"
    if importlib.util.find_spec("flash_attn") is not None
    else "sdpa"
)

print(f'Loading {MODEL_ID}...')
print(f'Attention implementation: {ATTN_IMPLEMENTATION}')

model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,  # keyword is `dtype` (not `torch_dtype`)
    attn_implementation=ATTN_IMPLEMENTATION,
    device_map="auto",
)

processor = AutoProcessor.from_pretrained(MODEL_ID)

print('âœ… Model loaded successfully!')

# Check memory. With no argument, memory_allocated() reports the current CUDA
# device only, so with device_map="auto" spread over several GPUs this is not
# the total footprint.
allocated = torch.cuda.memory_allocated() / 1e9
print(f'GPU Memory Allocated: {allocated:.2f} GB')

In [None]:
# TEST INFERENCE - FIXED VERSION
# Runs the base model on the first processed QA pair and prints its answer next
# to the ground truth. Relies on `processed_samples`, `os`, `Image`, `model`,
# `processor`, `process_vision_info` and `torch` from the earlier cells.
import matplotlib.pyplot as plt

MAX_NEW_TOKENS = 128  # generation budget for this smoke test

# Fail with a clear message instead of an IndexError when nothing was processed.
if not processed_samples:
    raise RuntimeError(
        'processed_samples is empty - run the processing cell first and check '
        'that it produced at least one QA pair.'
    )

# Test sample
test_sample = processed_samples[0]
image_path = os.path.join('data/images', test_sample['image'])
raw_question = test_sample['conversations'][0]['value']
# The processing cell prefixes questions with '<image>' + a literal backslash-n.
# Strip that form, and also the real-newline form, so the bare question is
# recovered either way.
question = raw_question.replace('<image>\\n', '').replace('<image>\n', '')
ground_truth = test_sample['conversations'][1]['value']

# Display image; the context manager closes the file handle once it is drawn.
with Image.open(image_path) as img:
    plt.figure(figsize=(6, 6))
    plt.imshow(img)
    plt.axis('off')
    plt.show()

print(f'Question: {question}')
print(f'Ground Truth: {ground_truth}')
print('\nGenerating...')

# Prepare messages (same format as the notebook this was adapted from)
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image_path},
        {"type": "text", "text": question}
    ]
}]

# Apply chat template: returns the prompt as text, with the generation prompt appended
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Process vision info: extract the image/video inputs referenced by the messages
image_inputs, video_inputs = process_vision_info(messages)

# Prepare inputs and move them to the model's device
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt"
).to(model.device)

# Generate
with torch.inference_mode():
    generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

# Decode: drop the prompt tokens so only the newly generated tokens remain
generated_ids_trimmed = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

answer = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)[0]

print(f'\nBase Model Answer: {answer}')
print('\nâœ… Inference works!')

## ✅ If Above Cell Works → Training Will Work!

Nếu cell trên chạy OK → Có thể proceed với full training