## Install and Imports

In [1]:
# Instalações
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install trl peft accelerate bitsandbytes pillow
!pip install datasets==2.21.0

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import os
import torch
from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer, AutoProcessor
from peft import get_peft_model, LoraConfig

## Load Dataset and Prepare data

In [4]:
%%time

from datasets import load_dataset
ds = load_dataset('HuggingFaceM4/VQAv2', split="train[:10%]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/352 [00:00<?, ?B/s]

The repository for HuggingFaceM4/VQAv2 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/HuggingFaceM4/VQAv2.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/7.24M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.49M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.97M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.5G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.65G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.3G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating testdev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

CPU times: user 9min 22s, sys: 1min 50s, total: 11min 12s
Wall time: 19min 11s


In [5]:
print(ds[0])
print(len(ds))

{'question_type': 'what is this', 'multiple_choice_answer': 'net', 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 458752, 'answer_type': 'other', 'question_id': 458752000, 'question': 'What is this photo taken looking through?', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x787BD29DE8C0>}
44376


In [6]:
ds.column_names

['question_type',
 'multiple_choice_answer',
 'answers',
 'image_id',
 'answer_type',
 'question_id',
 'question',
 'image']

In [7]:
ds[:3]

{'question_type': ['what is this', 'what', 'what color is the'],
 'multiple_choice_answer': ['net', 'pitcher', 'orange'],
 'answers': [[{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
   {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
  [{'answer': 'pitcher', 'answer_confidence': 'yes', 'answer_id': 1},
   {'answer': 'catcher', 'answer_confidence': 'no', 'answer_id': 2},
   {'answer': 'pitcher', 'answer_confidence': 'yes', 'ans

In [8]:
# Retirar colunas que não serão necessárias no treinamento
delete_column = ["question_type", "answers", "answer_type", "image_id", "question_id"]
dataset = ds.remove_columns(delete_column)

In [9]:
len(dataset)

44376

In [10]:
split_ds = dataset.train_test_split(test_size=0.05)
train_ds = split_ds["test"]
print(train_ds[0])

{'multiple_choice_answer': 'yes', 'question': 'Is this man dressed formally?', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=566x640 at 0x787BD16268F0>}


In [11]:
train_ds

Dataset({
    features: ['multiple_choice_answer', 'question', 'image'],
    num_rows: 2219
})

## Model and Quantização


In [15]:
model_id = "google/paligemma-3b-pt-224"
processor = PaliGemmaProcessor.from_pretrained(model_id)
device = "cuda"
image_token = processor.tokenizer.convert_tokens_to_ids("<image>")

In [16]:
# Quantised Model e Lora Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_type=torch.bfloat16
)
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0})
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Unused kwargs: ['bnb_4bit_compute_type']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

trainable params: 11,298,816 || all params: 2,934,765,296 || trainable%: 0.3850


In [17]:
# for param in model.vision_tower.parameters():
#     param.requires_grad = False
# for param in model.multi_modal_projector.parameters():
#     param.requires_grad = False

## Fine tune do Modelo

Preparação para finetuning

In [23]:
def collate_fn(examples):
    # Gerar textos com token <image> para cada exemplo
    texts = [
        "<image> <bos> answer " + example["question"]
        for example in examples
    ]
    labels = [example["multiple_choice_answer"] for example in examples]
    images = [example["image"].convert("RGB") for example in examples]  # Converte para RGB

    # Processar os tokens com o processor
    tokens = processor(
        text=texts,
        images=images,
        suffix=labels,
        return_tensors="pt",
        padding="longest",
        tokenize_newline_separately=False
    )
    tokens = tokens.to(torch.bfloat16).to(device)
    return tokens


In [31]:
args = TrainingArguments(
    num_train_epochs=10,
    remove_unused_columns=False,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    learning_rate=2e-5,
    weight_decay=1e-6,
    adam_beta2=0.999,
    logging_steps=50,
    optim="adamw_hf",
    save_strategy="steps",
    save_steps=10,
    push_to_hub=True,
    save_total_limit=1,
    output_dir="paligemma_vqav2",
    bf16=True,
    dataloader_pin_memory=False
)

In [32]:
trainer = Trainer(
    model=model,
    train_dataset=train_ds,
    data_collator=collate_fn,
    args=args
)

In [33]:
trainer.train()



Step,Training Loss
50,1.3788
100,1.3156
150,1.2103
200,1.1576
250,1.1447
300,1.0098
350,1.0161
400,0.9457
450,0.8553
500,0.8483


TrainOutput(global_step=1380, training_loss=0.7161428962928661, metrics={'train_runtime': 2845.4141, 'train_samples_per_second': 7.799, 'train_steps_per_second': 0.485, 'total_flos': 8.625670290383155e+16, 'train_loss': 0.7161428962928661, 'epoch': 9.92972972972973})

In [34]:
trainer.push_to_hub('vannynakamura/FineTune_paligemma_VQA')

CommitInfo(commit_url='https://huggingface.co/vannynakamura/paligemma_vqav2/commit/99b2023ee2368dee4671eeb6f9075bcf8c4f1fd8', commit_message='vannynakamura/FineTune_paligemma_VQA', commit_description='', oid='99b2023ee2368dee4671eeb6f9075bcf8c4f1fd8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vannynakamura/paligemma_vqav2', endpoint='https://huggingface.co', repo_type='model', repo_id='vannynakamura/paligemma_vqav2'), pr_revision=None, pr_num=None)