In [10]:
pip install -q peft

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import AutoProcessor, BlipForConditionalGeneration
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from torchvision import transforms
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.optim as optim
from tqdm import tqdm
from PIL import Image
import time
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# Load the preprocessing pipeline and the captioning model from the same
# pretrained checkpoint so that they stay in sync.
blip_checkpoint = "Salesforce/blip-image-captioning-base"

processor = AutoProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)



In [4]:
# Read the prepared table, keep at most `num_samples` leading rows and drop
# the 'Unnamed: 0' column that came along with the CSV.
path = "/content/drive/MyDrive/Dataset tbank/ready2train10000.csv"
num_samples = 10000

preready_dataset = (
    pd.read_csv(path)
    .head(num_samples)
    .drop(columns='Unnamed: 0')
)

# Peek at one row (kept as a one-row DataFrame for rich display).
preready_dataset.iloc[[5]]

Unnamed: 0,image,caption
5,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,CELANA WANITA (BB 45-84 KG)Harem wanita (bisa...


In [5]:
# Ordered (unshuffled) 80/20 split.
# The split point is derived from the actual number of loaded rows rather than
# from `num_samples`, so the ratio stays 80/20 even if the CSV holds fewer rows.
split_idx = int(0.8 * len(preready_dataset))

# reset_index makes both frames 0-based, so positional and label-based row
# lookups agree on the validation part as well.
train_set = preready_dataset.iloc[:split_idx].reset_index(drop=True)
val_set = preready_dataset.iloc[split_idx:].reset_index(drop=True)

len(train_set), len(val_set)

(8000, 2000)

In [None]:
# Sanity check: try to open every training image so that a missing or
# unreadable file surfaces here instead of in the middle of training.
image_dir = "/content/drive/MyDrive/Dataset tbank/Images_final"
for file_name in train_set['image']:
  # The context manager closes the file right away; without it every
  # iteration would leave an open handle behind.
  with Image.open(f"{image_dir}/{file_name}") as image:
    pass

In [6]:
class BLIPTunningDataset(Dataset):
  """Map-style dataset turning (image file, caption) rows into BLIP inputs.

  Args:
    data: DataFrame with an ``image`` column (file names inside ``image_dir``)
      and a ``caption`` column holding the target text.
    processor: callable invoked as ``processor(images=..., text=..., ...)``;
      its result must expose ``.items()`` yielding (name, tensor) pairs.
      The tensors are assumed to carry a leading batch dimension of size 1.
    image_dir: directory that contains the image files.
    max_length: optional token limit forwarded to the processor. ``None``
      leaves the choice to the processor/tokenizer defaults. A smaller value
      shortens the padded sequences and therefore reduces memory use.
  """

  def __init__(self, data, processor,
               image_dir="/content/drive/MyDrive/Dataset tbank/Images_final",
               max_length=None):
    self.data = data
    self.processor = processor
    self.image_dir = image_dir
    self.max_length = max_length

  def __len__(self):
    """Number of rows (samples) in the underlying frame."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the processor output for the row at *position* ``idx``."""
    # Purely positional access: works for any index labels, e.g. a validation
    # slice whose labels do not start at 0.
    row = self.data.iloc[idx]

    # convert() returns a loaded copy, so the file can be closed immediately.
    with Image.open(f"{self.image_dir}/{row['image']}") as img:
      image = img.convert("RGB")

    # The training target is the caption text, not the image file name.
    caption = row['caption']

    # truncation=True keeps every sample at the same padded length, which the
    # default batch collation requires.
    encoding = self.processor(images=image, text=caption,
                              padding="max_length", truncation=True,
                              max_length=self.max_length,
                              return_tensors="pt")

    # Drop only the leading batch dimension; a bare squeeze() could also
    # remove other size-1 dimensions.
    return {k: v.squeeze(0) for k, v in encoding.items()}

In [8]:
# Wrap both splits in datasets and batch them; only the training batches are
# shuffled, validation keeps the row order.
batch_size = 64

train_dataset = BLIPTunningDataset(train_set, processor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = BLIPTunningDataset(val_set, processor)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Number of batches per epoch for each split.
len(train_dataloader), len(val_dataloader)

(125, 32)

# Model training

In [16]:
from peft import LoraConfig, get_peft_model

In [19]:
# LoRA settings: rank-32 adapters on every linear layer, no bias training.
lora_kwargs = {
    "r": 32,
    "lora_alpha": 32,
    "lora_dropout": 0.10,
    "bias": "none",
    "target_modules": 'all-linear',
}
config = LoraConfig(**lora_kwargs)

# Wrap the model with the adapters and report how many weights will train.
# NOTE(review): this rebinds `model`, so re-running the cell wraps it again.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 13,436,800 || all params: 260,850,876 || trainable%: 5.1511


In [20]:
# Training configuration: epoch budget, target device and optimizer.
num_epochs = 50
device = "cuda" if torch.cuda.is_available() else "cpu"

# Move the model first, then hand its parameters to AdamW.
model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [21]:
def _batch_loss(batch):
  """Run one forward pass on `batch` and return the captioning loss tensor."""
  input_ids = batch["input_ids"].to(device)
  pixel_values = batch["pixel_values"].to(device)

  # The processor output may carry an attention mask; use it when present so
  # padding is neither attended to nor scored.
  attention_mask = batch.get("attention_mask")
  if attention_mask is None:
    labels = input_ids
  else:
    attention_mask = attention_mask.to(device)
    # -100 is the index ignored by the cross-entropy loss: padded positions
    # must not contribute, otherwise they dominate the loss.
    labels = input_ids.masked_fill(attention_mask == 0, -100)

  outputs = model(input_ids=input_ids,
                  pixel_values=pixel_values,
                  attention_mask=attention_mask,
                  labels=labels)
  return outputs.loss


for epoch in range(num_epochs):
  print(f"Epoch: {epoch}")
  start_timepoint = time.time()
  train_loss = 0
  val_loss = 0

  # ---- training pass ----
  model.train()
  for batch in train_dataloader:
    loss = _batch_loss(batch)
    train_loss += loss.item() / len(train_dataloader)

    # Clear gradients right before backward so that leftovers from an
    # interrupted previous run can never leak into this step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  print(f"Train loss: {train_loss}")

  # ---- validation pass ----
  # eval() switches dropout off (LoRA dropout included) so the validation
  # loss is deterministic; the next epoch switches back via model.train().
  model.eval()
  with torch.no_grad():
    for batch in val_dataloader:
      val_loss += _batch_loss(batch).item() / len(val_dataloader)
  print("Val loss:", val_loss)

  end_timepoint = time.time()
  print(f"Time spent per epoch: {end_timepoint-start_timepoint}")

Epoch: 0


OutOfMemoryError: CUDA out of memory. Tried to allocate 976.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 953.06 MiB is free. Process 13785 has 13.81 GiB memory in use. Of the allocated memory 13.32 GiB is allocated by PyTorch, and 374.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Persist the current weights (the full state dict of the wrapped model).
# NOTE(review): the file name says "clip" although the model is BLIP.
finetuned_state = model.state_dict()
torch.save(finetuned_state, 'clip_finetuned.pth')

# Inference

In [None]:
# Load the picture to caption and preview it.
input_image = Image.open("/content/Image2fun.jpg")

fig, ax = plt.subplots()
ax.imshow(input_image)
plt.show()

In [None]:
# Inference must run in eval mode: otherwise dropout (including the LoRA
# dropout) stays active and perturbs the generated caption.
model.eval()

# Prepare the image for the model.
inputs = processor(images=input_image, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values

# No gradients are needed for generation.
with torch.no_grad():
  generated_ids = model.generate(pixel_values=pixel_values, max_length=200)

# Decode the first (only) sequence of the batch back to text.
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_caption)