# Fine-tuning Florence-2 on VizWiz VQA

In this notebook, we will fine-tune Florence-2 by MSFT, a new vision language model capable of various tasks, on vizwiz question answering dataset.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/32933-research-project

/content/drive/MyDrive/32933-research-project


In [13]:
!pip install -q datasets flash_attn timm einops

In [14]:
from torch.utils.data import Dataset
import argparse
import json
from collections import Counter
from PIL import Image

We can load the model using `AutoModelForCausalLM` and the processor using `AutoProcessor`  classes of transformers library. Note that we need to pass `trust_remote_code` as `True` since this model is not a transformers model.

In [15]:
from transformers import AutoModelForCausalLM, AutoProcessor
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base-ft", trust_remote_code=True, revision='refs/pr/6').to(device)
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base-ft", trust_remote_code=True, revision='refs/pr/6')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Florence2LanguageForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of t

In [12]:
# Function to run the model on an example
def run_example(task_prompt, text_input, image):
    prompt = task_prompt + text_input

    # Ensure the image is in RGB mode
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
    return parsed_answer


In [8]:
import os
import json
from PIL import Image
from torch.utils.data import Dataset

class VizWizDataset(Dataset):
    def __init__(self, image_dir, annotation_file):
        with open(annotation_file, "r") as f:
            self.samples = json.load(f)
        self.image_dir = image_dir

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        # Prefix the question similar to DocVQADataset (change prefix as needed)
        question = "<VizWiz>" + sample["question"].strip()

        # Extract the first answer; if answers is empty or missing, use a fallback.
        if sample.get("answers") and len(sample["answers"]) > 0:
            # Here, we assume each answer is a dictionary with the key "answer".
            first_answer = sample["answers"][0].get("answer", "unanswerable")
        else:
            first_answer = "unanswerable"

        # Build the image path and open the image.
        image_path = os.path.join(self.image_dir, sample["image"])
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")

        return question, first_answer, image

In [9]:
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoProcessor, get_scheduler
from torch.optim import AdamW


# Assume processor and device are defined earlier, for example:
# processor = AutoProcessor.from_pretrained("your-model-id")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_fn(batch):
    questions, answers, images = zip(*batch)
    inputs = processor(
        text=list(questions),
        images=list(images),
        return_tensors="pt",
        padding=True
    ).to(device)
    return inputs, answers

# Paths specific to VizWiz
vizwiz_train_image_dir = "/content/drive/MyDrive/32933-research-project/vqa-vizwiz/data/balanced_subset2/train"  # Folder with your VizWiz images
vizwiz_val_image_dir = "/content/drive/MyDrive/32933-research-project/vqa-vizwiz/data/balanced_subset2/val"  # Folder with your VizWiz images
vizwiz_train_annotation_file = "/content/drive/MyDrive/32933-research-project/vqa-vizwiz/data/balanced_subset2/annotations/train.json"
vizwiz_val_annotation_file = "/content/drive/MyDrive/32933-research-project/vqa-vizwiz/data/balanced_subset2/annotations/val.json"

# Create datasets using the VizWizDataset class
train_dataset = VizWizDataset(vizwiz_train_image_dir, vizwiz_train_annotation_file)
val_dataset = VizWizDataset(vizwiz_val_image_dir, vizwiz_val_annotation_file)

# Create DataLoaders
batch_size = 1
num_workers = 0

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    num_workers=num_workers,
    shuffle=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    num_workers=num_workers
)


In [10]:
import transformers
print(transformers.__version__)

4.51.2


In [11]:
def train_model(train_loader, val_loader, model, processor, epochs=10, lr=1e-6):
    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        i = -1
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
            i += 1
            inputs, answers = batch

            input_ids = inputs["input_ids"]
            pixel_values = inputs["pixel_values"]
            labels = processor.tokenizer(text=answers, return_tensors="pt", padding=True, return_token_type_ids=False).input_ids.to(device)

            outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Average Training Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{epochs}"):
                inputs, answers = batch

                input_ids = inputs["input_ids"]
                pixel_values = inputs["pixel_values"]
                labels = processor.tokenizer(text=answers, return_tensors="pt", padding=True, return_token_type_ids=False).input_ids.to(device)

                outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
                loss = outputs.loss

                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Average Validation Loss: {avg_val_loss}")

        # Save model checkpoint
        output_dir = f"./model_checkpoints/epoch_{epoch+1}"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)


In [17]:
for param in model.vision_tower.parameters():
  param.is_trainable = False

In [18]:
train_model(train_loader, val_loader, model, processor, epochs=2)

Training Epoch 1/2: 100%|██████████| 2448/2448 [28:49<00:00,  1.42it/s]


Average Training Loss: 0.6106407959090742


Validation Epoch 1/2: 100%|██████████| 524/524 [02:05<00:00,  4.16it/s]


Average Validation Loss: 0.5259570672915704


Training Epoch 2/2: 100%|██████████| 2448/2448 [28:49<00:00,  1.42it/s]


Average Training Loss: 0.42579746650686157


Validation Epoch 2/2: 100%|██████████| 524/524 [02:06<00:00,  4.15it/s]


Average Validation Loss: 0.5263055362323217
