In [None]:
# Install the extra packages this notebook needs on top of the runtime's defaults.
# Only trl is pinned (0.12.0); qwen_vl_utils and bitsandbytes resolve to whatever is current.
!pip install qwen_vl_utils
!pip install -U bitsandbytes
!pip install trl==0.12.0

In [None]:
import os
import random
import numpy as np
import torch
from PIL import Image
import pandas as pd
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from datasets import Dataset as HFDataset, load_dataset
import zipfile
import shutil

In [None]:
# ----------------- Configuration -----------------
SEED = 42

# Seed every random number generator used in this notebook with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Ask cuDNN / PyTorch for deterministic kernels; with warn_only=False PyTorch raises
# instead of warning when an operation has no deterministic implementation.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True, warn_only=False)

# Additional environment variables for reproducibility.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here can
# only influence child processes; TRANSFORMERS_SEED / PYTORCH_SEED / NUMPY_SEED are kept
# as-is, but it is unclear which library (if any) reads them — confirm.
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'CUBLAS_WORKSPACE_CONFIG': ':4096:8',
    'TRANSFORMERS_SEED': str(SEED),
    'PYTORCH_SEED': str(SEED),
    'NUMPY_SEED': str(SEED),
})

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# ----------------- Download and Extract Dataset from Hugging Face -----------------
# Both steps are guarded by existence checks so re-running the cell skips work that
# has already completed.

# Creating local data directory
LOCAL_DATA_DIR = "/content/obss_data"
os.makedirs(LOCAL_DATA_DIR, exist_ok=True)

# Downloading the zip file
zip_path = os.path.join(LOCAL_DATA_DIR, "obss-intern-competition-2025.zip")
if not os.path.exists(zip_path):
    print("Downloading zip file from Hugging Face...")
    import requests
    url = "https://huggingface.co/datasets/obss/ai-intern-challenge-2025/resolve/main/obss-intern-competition-2025.zip"

    # Stream into a temporary ".part" file and rename it only after the transfer has
    # finished. An interrupted or failed download therefore never leaves a file at
    # zip_path that the os.path.exists() guard above would accept on the next run.
    partial_path = zip_path + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving the error body as the archive.
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))

        with open(partial_path, 'wb') as f:
            downloaded = 0
            # 1 MiB chunks keep the number of progress prints manageable.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    percent = (downloaded / total_size) * 100
                    print(f"\rDownloading: {percent:.1f}%", end='')

    os.replace(partial_path, zip_path)
    print(f"\nDownloaded zip file to {zip_path}")

# Extracting the zip file (skipped when a "train" entry already exists in the data dir)
if not os.path.exists(os.path.join(LOCAL_DATA_DIR, "train")):
    print("Extracting zip file...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_DATA_DIR)
    print(f"Extracted files to {LOCAL_DATA_DIR}")

    # Print a shallow overview of what was extracted: every directory, but at most
    # five file names per directory.
    print("\nExtracted contents:")
    for root, dirs, files in os.walk(LOCAL_DATA_DIR):
        level = root.replace(LOCAL_DATA_DIR, '').count(os.sep)
        indent = ' ' * 2 * level
        print(f"{indent}{os.path.basename(root)}/")
        subindent = ' ' * 2 * (level + 1)
        for file in files[:5]:  # Show first 5 files
            print(f"{subindent}{file}")
        if len(files) > 5:
            print(f"{subindent}... and {len(files) - 5} more files")

In [None]:
# Input locations, all derived from LOCAL_DATA_DIR.
TRAIN_CSV = os.path.join(LOCAL_DATA_DIR, "train.csv")
TEST_CSV = os.path.join(LOCAL_DATA_DIR, "test.csv")
IMAGE_DIR = os.path.join(LOCAL_DATA_DIR, "train", "train")
TEST_DIR = os.path.join(LOCAL_DATA_DIR, "test", "test")

# Output locations: submission file, cached HF dataset and adapter/checkpoint directory.
SUBMISSION_CSV = os.path.join(LOCAL_DATA_DIR, "submission_pelinsu_kaleli.csv")
DATASET_CACHE = "/content/train_dataset_cache"
OUTPUT_DIR = "/content/qwen25vl_lora_captioning"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Quantization settings: 4-bit NF4 weights with double quantization,
# computing in bf16 (chosen for A100 training).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# ----------------- Model & Processor -----------------
# Load the base vision-language model with the 4-bit quantization config defined above.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
)

# Enabling gradient checkpointing in order to save memory
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# LoRA target modules, grouped by role.
# Got the best training results with selecting these modules.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]  # All attention projections
mlp_projections = ["gate_proj", "up_proj", "down_proj"]           # MLP layers
embedding_layers = ["embed_tokens", "lm_head"]                    # Input/output embeddings

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections + embedding_layers,
)
model = get_peft_model(model, peft_config)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

In [None]:
# ----------------- Dataset Preparation -----------------
# Check if cached dataset exists
if os.path.exists(DATASET_CACHE):
    dataset = HFDataset.load_from_disk(DATASET_CACHE)
else:
    # Rows with any missing value are dropped before building samples.
    df = pd.read_csv(TRAIN_CSV).dropna()

    # The instruction is identical for every sample, so it is defined once.
    TRAIN_PROMPT = """Generate a single-sentence, objective, and descriptive caption for the given image. Strive to be as comprehensive and detailed as possible, keeping it between 15 to 25 words. Your caption should focus on multiple significant visible elements: identify primary subjects (e.g., people, animals) and their actions or notable characteristics; describe key objects and their attributes; include specific brand names or clearly legible text from signs/labels; and mention the immediate setting or context if prominent. The caption must be in the present tense, maintain a neutral, factual tone (like a museum or news catalog entry), and avoid subjective opinions."""

    # Helper to build one record with proper format
    def make_record(row):
        """Build one chat-style training sample from a CSV row.

        Args:
            row: a row tuple exposing ``image_id`` and ``caption`` attributes.

        Returns:
            dict with the RGB image under ``"image"`` and a two-turn conversation
            (user: image + instruction, assistant: reference caption) under
            ``"messages"``.
        """
        img_path = os.path.join(IMAGE_DIR, f"{row.image_id}.jpg")
        # convert() returns a new, fully loaded image, so the file handle can be
        # closed right away instead of lingering until garbage collection.
        with Image.open(img_path) as source_image:
            img = source_image.convert("RGB")

        return {
            "image": img,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": TRAIN_PROMPT}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": row.caption}
                    ]
                }
            ]
        }

    # Parallelize image loading.
    # Executor.map yields results in input order, so the dataset order always matches
    # the CSV order. Collecting with as_completed() would instead depend on thread
    # timing and make the seeded shuffling during training non-reproducible.
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as exe:
        records = list(
            tqdm(
                exe.map(make_record, df.itertuples(index=False)),
                total=len(df),
                desc="Preparing samples",
            )
        )

    # Create HF Dataset
    # IT TAKES AROUND 25 MINUTES AFTER PREPARING SAMPLES WITHOUT A PROGRESS BAR DO NOT WORRY !!!
    dataset = HFDataset.from_list(records)
    dataset.save_to_disk(DATASET_CACHE)

In [None]:
# Data collator for SFTTrainer
def _find_last_subsequence(sequence, pattern):
    """Return the start index of the last occurrence of ``pattern`` in ``sequence``, or -1."""
    width = len(pattern)
    if width == 0:
        return -1
    # Scan from the end: the assistant turn is the final turn of each sample.
    for start in range(len(sequence) - width, -1, -1):
        if sequence[start:start + width] == pattern:
            return start
    return -1


def collate_fn(examples):
    """Turn a list of dataset samples into a model batch with completion-only labels.

    Args:
        examples: list of samples, each with a ``"messages"`` conversation and an
            ``"image"``.

    Returns:
        The processor output for the batch with an added ``"labels"`` tensor in which
        padding and everything up to and including the assistant turn header is set
        to -100, so only the assistant's reply contributes to the loss.

    Raises:
        ValueError: if the assistant header cannot be found in a sample's tokens.
    """
    texts = [
        processor.apply_chat_template(
            example['messages'],
            tokenize=False,
            add_generation_prompt=False
        )
        for example in examples
    ]
    images = [example['image'] for example in examples]

    batch = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
    )

    # Create labels
    labels = batch['input_ids'].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # Mask the prompt part (everything before assistant's response).
    # The boundary is located in token space rather than by string search because:
    #   * str.find("assistant") returns the FIRST occurrence of the word, which can sit
    #     before the assistant turn (for example inside a system prompt);
    #   * re-tokenizing a text prefix need not give the same number of tokens as the
    #     processor's input_ids (the processor may expand image placeholders);
    #   * a prefix slice silently assumes right padding.
    # Assumes the chat template opens the assistant turn with "<|im_start|>assistant\n"
    # (Qwen ChatML format) — TODO confirm if the template is ever changed.
    header_ids = processor.tokenizer.encode("<|im_start|>assistant\n", add_special_tokens=False)
    for i, token_row in enumerate(batch['input_ids'].tolist()):
        header_start = _find_last_subsequence(token_row, header_ids)
        if header_start < 0:
            raise ValueError(
                f"Could not locate the assistant turn header in sample {i} of the batch; "
                "cannot build completion-only labels."
            )
        labels[i, :header_start + len(header_ids)] = -100

    batch['labels'] = labels

    return batch

In [None]:
# Map dataset to add text field for SFTTrainer
def _render_chat_text(example):
    """Return the sample's conversation rendered through the chat template as ``text``."""
    rendered = processor.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}


dataset = dataset.map(_render_chat_text)

In [None]:
# ----------------- Training Arguments -----------------
training_args = SFTConfig(
    # --- Output / checkpointing / logging ---
    output_dir=OUTPUT_DIR,
    save_strategy="steps",
    save_steps=100,
    logging_steps=20,
    report_to="none",
    push_to_hub=False,
    # --- Evaluation (disabled) ---
    eval_strategy="no",
    eval_steps=100,
    # --- Schedule: 1 sample per device per step, accumulated over 8 steps ---
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # --- Optimizer ---
    optim="paged_adamw_8bit",
    learning_rate=4e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    # --- Precision ---
    bf16=True,
    tf32=True,
    # --- Memory ---
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},  # More memory efficient
    dataloader_pin_memory=False,
    # --- Dataset handling: batches are built by the custom collator, so dataset
    # preparation is skipped and the extra columns are kept ---
    dataset_text_field="text",
    dataset_kwargs={"skip_prepare_dataset": True},
    remove_unused_columns=False,
    # --- Reproducibility: same seed for the trainer and for data sampling ---
    seed=SEED,
    data_seed=SEED,
)

# ----------------- Trainer -----------------
# The model handed over here was already wrapped with get_peft_model; the same
# peft_config is passed again.
# NOTE(review): presumably SFTTrainer leaves an existing PeftModel as-is — confirm for the pinned trl version.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    tokenizer=processor.tokenizer,
    train_dataset=dataset,
    data_collator=collate_fn,
    args=training_args,
)

In [None]:
# ----------------- Training -----------------
# Takes around 10 hours on an A100 40 GB.
trainer.train()

In [None]:
# Persist the trained model first, then the processor, into the same output directory.
for _artifact in (model, processor):
    _artifact.save_pretrained(OUTPUT_DIR)
print(f"Model and processor saved to {OUTPUT_DIR}")

In [None]:
# ----------------- Quick Test -----------------
# Caption one fixed row of the training CSV with the fine-tuned model and print it next
# to the reference caption.
print("\nTesting the fine-tuned model...")

test_df_for_quick_test = pd.read_csv(TRAIN_CSV)
test_row = test_df_for_quick_test.iloc[5]
test_image_path = os.path.join(IMAGE_DIR, f"{test_row['image_id']}.jpg")

# Per-image seed derived from the global seed and the image id.
test_seed = SEED + test_row['image_id']
for _reseed in (torch.manual_seed, torch.cuda.manual_seed_all):
    _reseed(test_seed)

test_img = Image.open(test_image_path).convert("RGB")

# Test prompt, same as training
test_prompt = """Generate a single-sentence, objective, and descriptive caption for the given image. Strive to be as comprehensive and detailed as possible, keeping it between 15 to 25 words. Your caption should focus on multiple significant visible elements: identify primary subjects (e.g., people, animals) and their actions or notable characteristics; describe key objects and their attributes; include specific brand names or clearly legible text from signs/labels; and mention the immediate setting or context if prominent. The caption must be in the present tense, maintain a neutral, factual tone (like a museum or news catalog entry), and avoid subjective opinions."""

# Single user turn: the image placeholder followed by the instruction.
test_user_content = [
    {"type": "image"},
    {"type": "text", "text": test_prompt},
]
test_messages = [{"role": "user", "content": test_user_content}]

# add_generation_prompt=True ends the rendered text with the opening of the assistant turn.
test_text_input = processor.apply_chat_template(
    test_messages, tokenize=False, add_generation_prompt=True
)

test_inputs = processor(
    text=[test_text_input],
    images=[test_img],
    padding=True,
    return_tensors="pt",
).to(device)

# Generate caption (greedy decoding, at most 50 new tokens)
model.eval()
with torch.no_grad():
    test_gen_ids = model.generate(
        **test_inputs,
        do_sample=False,
        temperature=None,
        max_new_tokens=50,
    )
    # Keep only the tokens produced after the prompt.
    test_prompt_length = test_inputs['input_ids'].shape[1]
    test_generated_tokens = test_gen_ids[0][test_prompt_length:]
    test_response = processor.tokenizer.decode(test_generated_tokens, skip_special_tokens=True)


for _label, _value in (
    ("Image ID", test_row['image_id']),
    ("Original caption", test_row['caption']),
    ("Generated caption", test_response),
):
    print(f"{_label}: {_value}")

In [None]:
# ----------------- INFERENCE SECTION -----------------

In [None]:
# Same installs as the training section, repeated so the inference part can run in a fresh runtime.
# Only trl is pinned (0.12.0); qwen_vl_utils and bitsandbytes resolve to whatever is current.
!pip install qwen_vl_utils
!pip install -U bitsandbytes
!pip install trl==0.12.0

In [None]:
import gc
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
import random
import numpy as np
import torch
from PIL import Image
import pandas as pd
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    BitsAndBytesConfig
)
from peft import PeftModel
from tqdm import tqdm

In [None]:
# ----------------- Configuration -----------------
SEED = 42

# Seed every random number generator used in this section with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Ask cuDNN / PyTorch for deterministic kernels; with warn_only=False PyTorch raises
# instead of warning when an operation has no deterministic implementation.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True, warn_only=False)

# Additional environment variables for reproducibility.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here can
# only influence child processes; TRANSFORMERS_SEED / PYTORCH_SEED / NUMPY_SEED are kept
# as-is, but it is unclear which library (if any) reads them — confirm.
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'CUBLAS_WORKSPACE_CONFIG': ':4096:8',
    'TRANSFORMERS_SEED': str(SEED),
    'PYTORCH_SEED': str(SEED),
    'NUMPY_SEED': str(SEED),
})

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Paths to the dataset and images (using local directories)
LOCAL_DATA_DIR = "/content/obss_data"

# Inputs, all derived from LOCAL_DATA_DIR.
TRAIN_CSV = os.path.join(LOCAL_DATA_DIR, "train.csv")
TEST_CSV = os.path.join(LOCAL_DATA_DIR, "test.csv")
IMAGE_DIR = os.path.join(LOCAL_DATA_DIR, "train", "train")
TEST_DIR = os.path.join(LOCAL_DATA_DIR, "test", "test")

# Outputs: submission file, cached HF dataset and adapter directory.
SUBMISSION_CSV = os.path.join(LOCAL_DATA_DIR, "submission_pelinsu_kaleli.csv")
DATASET_CACHE = "/content/train_dataset_cache"
OUTPUT_DIR = "/content/qwen25vl_lora_captioning"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# ----------------- Loading on T4 for Inference -----------------

# Same 4-bit NF4 + double-quantization setup as for training, but computing in fp16 for the T4.
bnb_config_t4 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Quantized base model; the LoRA adapter is attached in the next cell.
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    torch_dtype=torch.float16,  # T4 uses fp16
    quantization_config=bnb_config_t4,
    device_map="auto",
)

In [None]:
# The processor is read from the same directory the training section saved it to.
processor = AutoProcessor.from_pretrained(OUTPUT_DIR)

# Attach the LoRA weights stored in OUTPUT_DIR to the quantized base model.
model = PeftModel.from_pretrained(base_model, OUTPUT_DIR, torch_dtype=torch.float16)

In [None]:
# ----------------- Inference -----------------
# The test images are processed in chunks because inference originally failed with a
# CUDA out-of-memory error on every attempt. After many hours of debugging, the cause
# turned out to be one specific image (the 350th). Several safety measures, including
# CUDA OOM error handling, were added before the root cause was understood; they are
# kept because they make inference more reliable.
# Takes around 5-6 hours in total, roughly 9 minutes per chunk.

CHUNK_SIZE = 100  # number of test images handled per chunk
OUTPUT_PATH = SUBMISSION_CSV  # final CSV; intermediate file names are derived from it

test_df = pd.read_csv(TEST_CSV)

# Function to clean captions in a neat format
def clean_caption(caption):
    """Normalize a generated caption.

    Surrounding whitespace is removed, a trailing period is appended unless the text
    already ends with '.', '!' or '?', and the first character is upper-cased.
    An empty (or whitespace-only) caption yields an empty string.
    """
    text = caption.strip()
    if not text:
        return text

    if not text.endswith(('.', '!', '?')):
        text = text + '.'

    return text[:1].upper() + text[1:]

# Function to process a single chunk
def process_chunk(chunk_df, start_idx):
    """Generate one caption per row of ``chunk_df``.

    Args:
        chunk_df: DataFrame slice whose rows expose an ``image_id``; the image is read
            from ``TEST_DIR/<image_id>.jpg``.
        start_idx: index of the chunk's first row, used only in the progress-bar label.

    Returns:
        list of ``{'image_id': ..., 'caption': ...}`` dicts, one per row and in row
        order. If anything fails for an image, the error is printed and a generic
        fallback caption is stored, so a single bad image does not abort the run.
    """
    # Got the best result with this prompt, a little bit changed from the training one, maybe could've gotten a better score if this was used in the training as well.
    prompt = """Generate a single-sentence, objective, and descriptive caption for the given image. Strive to be as comprehensive and detailed as possible, keeping it between 15 to 30 words. Your caption should focus on multiple significant visible elements: identify primary subjects (e.g., people, animals) and their actions or notable characteristics; describe key objects with their specific attributes and spatial positions; include specific brand names, model numbers, or clearly legible text from signs/labels and mention the immediate setting, background, or foreground context when it helps understand the composition. The caption must be in the present tense, maintain a neutral, factual tone (like a museum or news catalog entry), and avoid subjective opinions."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    # The rendered prompt does not depend on the image, so build it once per chunk.
    text_input = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    chunk_results = []
    model.eval()

    for row in tqdm(chunk_df.itertuples(index=False),
                    total=len(chunk_df),
                    desc=f"Processing chunk starting at {start_idx}"):
        # Bind the id before the try block so the error message and the fallback
        # entry always refer to the current row (never an unbound or stale id).
        image_id = row.image_id
        img = inputs = gen_ids = generated_tokens = None

        try:
            img_path = os.path.join(TEST_DIR, f"{image_id}.jpg")
            # convert() returns a loaded copy, so the file handle can be closed right away.
            with Image.open(img_path) as source_image:
                img = source_image.convert("RGB")

            inputs = processor(
                text=[text_input],
                images=[img],
                return_tensors="pt",
                padding=True,
            ).to(device)

            with torch.no_grad():
                gen_ids = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    do_sample=False,
                    temperature=None,
                )
                # Keep only the tokens produced after the prompt.
                generated_tokens = gen_ids[0][inputs['input_ids'].shape[1]:]
                caption = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

            caption = clean_caption(caption)

        except Exception as e:
            print(f"\nError processing image {image_id}: {e}")
            caption = 'An image showing various objects and scenes.'

        finally:
            # Drop tensor references on both the success and the failure path before
            # clearing the CUDA cache (done after every image).
            img = inputs = gen_ids = generated_tokens = None
            torch.cuda.empty_cache()

        chunk_results.append({'image_id': image_id, 'caption': caption})

    return chunk_results

import time


def _intermediate_path(end_idx):
    """Path of the cumulative checkpoint CSV written after the chunk ending at ``end_idx``."""
    # splitext only touches the real extension; str.replace('.csv', ...) would rewrite
    # every '.csv' occurrence anywhere in the path.
    stem, ext = os.path.splitext(OUTPUT_PATH)
    return f"{stem}_intermediate_{end_idx}{ext}"


all_results = []

# Process in chunks
num_chunks = (len(test_df) + CHUNK_SIZE - 1) // CHUNK_SIZE  # ceiling division
print(f"Total images: {len(test_df)}")
print(f"Processing in {num_chunks} chunks of {CHUNK_SIZE} images each\n")

for chunk_idx in range(num_chunks):
    start_idx = chunk_idx * CHUNK_SIZE
    end_idx = min(start_idx + CHUNK_SIZE, len(test_df))

    print(f"\n{'='*50}")
    print(f"Processing Chunk {chunk_idx + 1}/{num_chunks}")
    print(f"Images {start_idx} to {end_idx - 1}")
    print(f"{'='*50}")

    chunk_df = test_df.iloc[start_idx:end_idx]

    chunk_results = process_chunk(chunk_df, start_idx)
    all_results.extend(chunk_results)

    # Checkpoint everything generated so far, so a crash does not lose finished chunks.
    intermediate_df = pd.DataFrame(all_results)
    intermediate_path = _intermediate_path(end_idx)
    intermediate_df.to_csv(intermediate_path, index=False)
    print(f"\nSaved intermediate results to {intermediate_path}")
    print(f"Total captions generated: {len(all_results)}")

    # Clear memory after chunk
    gc.collect()
    torch.cuda.empty_cache()
    print(f"GPU memory after chunk: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    if chunk_idx < num_chunks - 1:
        print("\nWaiting 5 seconds before next chunk...")
        time.sleep(5)

# Every test row must have exactly one caption. Check this before the checkpoints
# are deleted so nothing is lost if the invariant is ever violated.
if len(all_results) != len(test_df):
    raise RuntimeError(
        f"Expected {len(test_df)} captions but collected {len(all_results)}; "
        "intermediate files were kept for inspection."
    )

# Save final results
print(f"\n{'='*50}")
print("Processing complete!")
print(f"{'='*50}")

final_df = pd.DataFrame(all_results)
final_df.to_csv(OUTPUT_PATH, index=False)
print(f"\nFinal results saved to {OUTPUT_PATH}")
print(f"Total captions generated: {len(all_results)}")

print("\nCleaning up intermediate files...")
for chunk_idx in range(num_chunks):
    end_idx = min((chunk_idx + 1) * CHUNK_SIZE, len(test_df))
    intermediate_path = _intermediate_path(end_idx)
    if os.path.exists(intermediate_path):
        os.remove(intermediate_path)
        print(f"Removed {intermediate_path}")