# HW2-3 Image captioning

### Library

In [None]:
!pip install unsloth pandas pyarrow torch torchvision datasets transformer accelerate bitsandbytes

In [None]:
import pandas as pd
# PIL exposes the `Image` module (there is no `PILImage` name); alias it so it
# does not clash with the `Image` feature type imported from `datasets` below.
from PIL import Image as PILImage
from datasets import Dataset, Features, Image, Value
import torch
from torchvision import transforms
from unsloth import FastVisionModel
import json

from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

from transformers import TextStreamer
from tqdm import tqdm

# Target device for tensors: GPU when available, CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

### Dataset

In [None]:
def prepare_dataset(parquet_file, include_caption=True, image_size=(224, 224)):
    """Load a parquet split into a Hugging Face ``Dataset`` of resized images.

    Args:
        parquet_file: Path to a parquet file whose ``image`` column holds
            dicts with a ``'bytes'`` entry (plus a ``caption`` column when
            ``include_caption`` is true).
        include_caption: Keep the ``caption`` column. Pass ``False`` for
            splits that carry no captions.
        image_size: ``(width, height)`` every decoded image is resized to.

    Returns:
        A ``Dataset`` with an ``image`` column and, when requested, a
        ``caption`` column.
    """
    # Local import: the module-level name `Image` is the `datasets` feature
    # type, which has no `.Image` class to test against.
    from PIL import Image as PILImage

    df = pd.read_parquet(parquet_file)
    df['image'] = df['image'].apply(lambda x: x['bytes'])  # flatten {'bytes': ...}

    # Declare features only for the columns that are actually selected, so the
    # schema always matches the frame (a caption-less split has no 'caption').
    columns = ['image', 'caption'] if include_caption else ['image']
    feature_spec = {'image': Image(decode=True)}
    if include_caption:
        feature_spec['caption'] = Value('string')

    dataset = Dataset.from_pandas(df[columns], features=Features(feature_spec))

    def resize_image(example):
        img = example['image']
        # Anything that is not a PIL image is passed through untouched.
        if isinstance(img, PILImage.Image):
            img = img.resize(image_size)
        return {'image': img}

    return dataset.map(resize_image)

# Captioned splits go through prepare_dataset (train first, then validation).
train_ds, valid_ds = [
    prepare_dataset(split_path, include_caption=True)
    for split_path in ("train_data.parquet", "valid_data.parquet")
]

# The test split is read directly because it carries an `idx` column instead of captions.
test_df = pd.read_parquet("test_data.parquet")
test_df['image'] = test_df['image'].map(lambda cell: cell['bytes'])  # unwrap {'bytes': ...}

# Hugging Face Dataset for the test split: sample index plus decoded image.
features_test = Features({
    'idx': Value('int64'),
    'image': Image(decode=True),
})
test_ds = Dataset.from_pandas(
    test_df[['idx', 'image']],
    features=features_test,
)

### Pretrained model

In [None]:
# Reference list of pre-quantized 4-bit checkpoints published under
# https://huggingface.co/unsloth (faster to download, lower memory use).
fourbit_models = [
    # Llama 3.2 vision
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",  # fits on an 80GB card
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",
    # Pixtral (instruct fits in 16GB) and its base model
    "unsloth/Pixtral-12B-2409-bnb-4bit",
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",
    # Qwen2 VL
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",
    # Llava variants
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
]

# Load the vision-instruct checkpoint in 4-bit (set load_in_4bit=False for
# 16-bit LoRA) with Unsloth's gradient checkpointing for long contexts.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name="unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)

### Finetuning

In [None]:
# Which parts of the network receive LoRA adapters.
lora_targets = dict(
    finetune_vision_layers=False,      # vision tower is left frozen
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
)

# LoRA hyper-parameters.
lora_hparams = dict(
    r=16,                # larger rank: more capacity, higher overfitting risk
    lora_alpha=16,       # recommended to be at least r
    lora_dropout=0,
    bias="none",
    random_state=3407,
    use_rslora=False,    # rank-stabilized LoRA is available but disabled
    loftq_config=None,   # LoftQ is available but disabled
    # target_modules = "all-linear" may be supplied here if needed
)

model = FastVisionModel.get_peft_model(model, **lora_targets, **lora_hparams)

### Prompting

In [None]:
instruction = "You are an expert artist. Describe accurately what you see in this image."


def convert_to_conversation(sample):
    """Turn one captioned sample into a two-turn chat record.

    Args:
        sample: Mapping with an ``"image"`` entry and a ``"caption"`` entry.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}`` where the user turn
        holds the shared instruction text followed by the image, and the
        assistant turn holds the caption text.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": sample["caption"]},
        ],
    }
    return {"messages": [user_turn, assistant_turn]}

# Iterate the training Dataset (`train_ds`); no `train_df` is ever defined in this notebook.
converted_dataset = [convert_to_conversation(sample) for sample in train_ds]

### Training

In [None]:
FastVisionModel.for_training(model)  # Enable for training!

# The vision collator is mandatory for image + text batches.
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Use bf16 when the hardware supports it, otherwise fall back to fp16.
bf16_ok = is_bf16_supported()

sft_args = SFTConfig(
    # Batch size and schedule
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=30,  # for a full run, set num_train_epochs = 1 instead of max_steps
    # Optimisation
    learning_rate=2e-4,
    fp16=not bf16_ok,
    bf16=bf16_ok,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",  # For Weights and Biases
    # The items below are required for vision finetuning:
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    dataset_num_proc=4,
    max_seq_length=2048,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=vision_collator,
    train_dataset=converted_dataset,
    args=sft_args,
)

In [None]:
# Run fine-tuning; whatever `trainer.train()` returns is kept in `trainer_stats`.
trainer_stats = trainer.train()

### Saving

In [None]:
# Local saving: model and tokenizer are written to the same directory.
save_dir = "Llama-3.2-11B-Vision-caption-finetune-unsloth"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
# Online saving (disabled):
# model.push_to_hub("Wilbur1240/Llama-3.2-11B-Vision-caption-finetune-unsloth", token = "...")
# tokenizer.push_to_hub("Wilbur1240/Llama-3.2-11B-Vision-caption-finetune-unsloth", token = "...")

### Inference

In [None]:
FastVisionModel.for_inference(model) # Enable for inference!

# Streamer for interactive, token-by-token output; the batch loop below does not use it.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

results = []

instruction = "You are an expert artist. Describe accurately what you see in this image."

# The chat prompt is the same for every image, so render the template once.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

# Iterate the Hugging Face Dataset: each row is a dict with 'idx' and a decoded
# 'image'. (Iterating the `test_df` DataFrame would only yield column names.)
# NOTE(review): training images were resized to 224x224 in prepare_dataset but
# test images are used at their original size — confirm this is intended.
for example in tqdm(test_ds):
    image = example['image']
    idx = example['idx']

    # Prepare inputs on the device selected at the top of the notebook
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(device)

    # Generate the caption
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.5,
        min_p=0.1,
    )

    # The generated sequence starts with the prompt tokens; decode only what
    # comes after them so the prompt text does not end up in the caption.
    prompt_length = inputs["input_ids"].shape[-1]
    caption = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Collect result
    results.append({
        "idx": idx,
        "output": caption.strip()
    })

# Save results to JSON (same layout as sample_submission.json)
with open('submission.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print("✅ Inference complete. Results saved to submission.json.")