Fine-tune BLIP

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Set-up environment

In [3]:
!pip install git+https://github.com/huggingface/transformers.git@main

Collecting git+https://github.com/huggingface/transformers.git@main
  Cloning https://github.com/huggingface/transformers.git (to revision main) to /tmp/pip-req-build-n7hzkl8c
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-n7hzkl8c
  Resolved https://github.com/huggingface/transformers.git to commit df5c5c62ae253055336f5bb0828ca8e3e15ab6bd
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.36.0.dev0-py3-none-any.whl size=8195433 sha256=cb29fce3fd2060c9a24b9b363ba980b36f0dd47cd0c46e7044484ad513f574b3
  Stored in directory: /tmp/pip-ephem-wheel-cache-lut6udth/wheels/cf/59/82/6492402e887a68975030bf8c06532260abc16abb7c

In [4]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

## Load the image captioning dataset

Let's load the image captioning dataset, you just need few lines of code for that.

In [5]:
import os
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt

# Ruta a la carpeta principal que contiene las subcarpetas de cada rango de edad
ruta_principal = "/content/drive/MyDrive/ProyectoHF/train"

# Inicializar listas para almacenar las rutas de las imágenes y las edades
rutas_imagenes = []
edades = []

# Iterar sobre cada subcarpeta en la carpeta principal
for carpeta_edad in os.listdir(ruta_principal):
    carpeta_ruta = os.path.join(ruta_principal, carpeta_edad)

    # Verificar si es una carpeta
    if os.path.isdir(carpeta_ruta):
        # Iterar sobre los archivos en la carpeta de edad
        for archivo_imagen in os.listdir(carpeta_ruta):
            # Construir la ruta completa de la imagen
            ruta_imagen = os.path.join(carpeta_ruta, archivo_imagen)
            imagen_drive = Image.open(ruta_imagen)
            # Guardar la ruta de la imagen y la edad en las listas
            rutas_imagenes.append(imagen_drive)
            edades.append(carpeta_edad)

# Crear un DataFrame con pandas
dataset = pd.DataFrame({'imagen': rutas_imagenes, 'edad': edades})

And the corresponding image

## Create PyTorch Dataset

The lines below are entirely copied from the original notebook!

In [6]:
from torch.utils.data import Dataset, DataLoader

class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset.iloc[idx]
        encoding = self.processor(images=item["imagen"], text=item["edad"], padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k:v.squeeze() for k,v in encoding.items()}
        return encoding

## Load model and processor

In [7]:
from transformers import AutoProcessor, BlipForConditionalGeneration

processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Now that we have loaded the processor, let's load the dataset and the dataloader:

In [8]:
train_dataset = ImageCaptioningDataset(dataset, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=5)

## Train the model

Let's train the model! Run the simply the cell below for training the model

In [9]:
import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

model.train()

for epoch in range(12):
  print("Epoch:", epoch)
  for idx, batch in enumerate(train_dataloader):

    input_ids = batch.pop("input_ids").to(device)
    pixel_values = batch.pop("pixel_values").to(device)

    outputs = model(input_ids=input_ids,
                    pixel_values=pixel_values,
                    labels=input_ids)

    loss = outputs.loss

    print("Loss:", loss.item())

    loss.backward()

    optimizer.step()
    optimizer.zero_grad()

Epoch: 0


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Loss: 13.241524696350098
Loss: 10.416780471801758
Loss: 10.326852798461914
Loss: 10.283506393432617
Loss: 10.254034996032715
Loss: 10.231329917907715
Loss: 10.211921691894531
Loss: 10.196595191955566
Loss: 10.181267738342285
Loss: 10.166183471679688
Loss: 10.147100448608398
Loss: 10.108280181884766
Loss: 9.78165340423584
Loss: 8.949274063110352
Loss: 8.700672149658203
Loss: 8.49910831451416
Loss: 8.316699981689453
Loss: 8.123614311218262
Loss: 7.9779863357543945
Loss: 7.834007263183594
Loss: 7.70575475692749
Loss: 7.572732925415039
Loss: 7.439435958862305
Loss: 7.311398506164551
Loss: 7.180381774902344
Epoch: 1
Loss: 7.046161651611328
Loss: 6.922792434692383
Loss: 6.791436672210693
Loss: 6.6588287353515625
Loss: 6.52998685836792
Loss: 6.396854400634766
Loss: 6.265326499938965
Loss: 6.136353015899658
Loss: 6.000730514526367
Loss: 5.870941162109375
Loss: 5.738493919372559
Loss: 5.605088710784912
Loss: 5.473462104797363
Loss: 5.3423590660095215
Loss: 5.2106218338012695
Loss: 5.07934761047

## Inference

Let's check the results on our train dataset

In [28]:
# load image

image = Image.open("/content/drive/MyDrive/ProyectoHF/test/21-30/ivan2.jpg")

print(image)

<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=720x1280 at 0x780B75F91FF0>


In [29]:
# prepare image for the model
inputs = processor(images=image, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values

generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_caption)

21 - 30


## Load from the Hub

Once trained you can push the model and processor on the Hub to use them later.
Meanwhile you can play with the model that we have fine-tuned!

In [14]:
# Ruta a la carpeta principal que contiene las subcarpetas de cada rango de edad
ruta_principal = "/content/drive/MyDrive/ProyectoHF/test"

# Inicializar listas para almacenar las rutas de las imágenes y las edades
rutas_imagenes = []
edades = []

# Iterar sobre cada subcarpeta en la carpeta principal
for carpeta_edad in os.listdir(ruta_principal):
    carpeta_ruta = os.path.join(ruta_principal, carpeta_edad)

    # Verificar si es una carpeta
    if os.path.isdir(carpeta_ruta):
        # Iterar sobre los archivos en la carpeta de edad
        for archivo_imagen in os.listdir(carpeta_ruta):
            # Construir la ruta completa de la imagen
            ruta_imagen = os.path.join(carpeta_ruta, archivo_imagen)
            imagen_drive = Image.open(ruta_imagen)
            # Guardar la ruta de la imagen y la edad en las listas
            rutas_imagenes.append(imagen_drive)
            edades.append(carpeta_edad)

# Crear un DataFrame con pandas
dataset2 = pd.DataFrame({'imagen': rutas_imagenes, 'edad': edades})

Let's check the results on our train dataset!