## Libraries

In [None]:
# Mount Google Drive so the dataset files and checkpoint folder under
# /content/drive are reachable from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.3/297.3 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m58.9 

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image

#from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset
from transformers import AutoProcessor, Blip2ForConditionalGeneration

## Create training and testing datasets

In [None]:
# The first cell of the notebook already mounts Drive, so this cell only mounts it
# when it is run on its own (e.g. after jumping straight to this section).
import os

from google.colab import drive

if not os.path.isdir('/content/drive/MyDrive'):
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Input locations on the mounted Drive: a folder of PNG images and a CSV of captions.
DATASET_DIR = "/content/drive/MyDrive/datasets"
image_path = f"{DATASET_DIR}/pokemon_hf_png"
caption_path = f"{DATASET_DIR}/pokemon_caption_hf.csv"

In [None]:
# Build a two-column frame (image path, caption text) by joining the image files
# on disk with the caption CSV through the `pokedex` key.

# os.listdir returns entries in arbitrary order and may contain non-image entries;
# filter to PNG files and sort so the later seeded shuffle/split is reproducible.
image_files = sorted(
    name for name in os.listdir(image_path) if name.lower().endswith('.png')
)
imagename_df = pd.DataFrame({
    'filename': [image_path + "/" + name for name in image_files],
    # Strip the extension literally. `str.replace('.png', "")` is interpreted as a
    # regex on older pandas (where `.` matches any character) and replaces anywhere
    # in the name, not just at the end.
    'pokedex': [os.path.splitext(name)[0] for name in image_files],
})

caption_df = pd.read_csv(caption_path)
# The key derived from file names is a string, so the CSV key must be one too.
caption_df['pokedex'] = caption_df['pokedex'].astype(str)
# Drop the first CSV column (presumably a saved index -- confirm against the file).
caption_df = caption_df.drop(columns=caption_df.columns[0])

merged_df = imagename_df.merge(caption_df, how='left', on='pokedex')

# A left merge keeps images that have no caption (NaN text); those rows cannot be
# tokenized later, so drop them here and say so instead of failing mid-training.
missing_caption = merged_df['caption'].isna()
if missing_caption.any():
    print(f"Dropping {int(missing_caption.sum())} image(s) without a caption")

dataset_df = (
    merged_df.loc[~missing_caption, ['filename', 'caption']]
    .rename(columns={"caption": "text", "filename": "image"})
    .reset_index(drop=True)
)

In [None]:
# Import under an alias: a later cell rebinds the name `Dataset` to
# torch.utils.data.Dataset, so re-running this cell after that one would otherwise
# call `from_generator` on the wrong class.
from datasets import Dataset as HFDataset


def gen():
    """Yield one ``{"text", "image"}`` example per row of ``dataset_df``.

    ``text`` is the caption string and ``image`` is the PIL image opened from the
    path stored in the row's ``image`` column.
    """
    for _, row in dataset_df.iterrows():
        yield {"text": row["text"], "image": Image.open(row["image"])}


# Shuffle once with a fixed seed, then split 90/10 without reshuffling.
dataset = HFDataset.from_generator(gen).shuffle(seed=123)
dataset = dataset.train_test_split(test_size=0.1, shuffle=False)

Generating train split: 0 examples [00:00, ? examples/s]

## Create Dataset for fine-tuning

In [None]:
from torch.utils.data import Dataset, DataLoader

class ImageCaptioningDataset(Dataset):
    """Map-style dataset that pairs processed image tensors with raw caption text.

    Args:
        dataset: Indexable collection whose items expose ``"image"`` and ``"text"``.
        processor: Callable invoked as
            ``processor(images=..., padding="max_length", return_tensors="pt")`` that
            returns a mapping of tensors carrying a leading batch dimension.
    """

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processed tensors for item ``idx`` plus its caption under ``"text"``."""
        item = self.dataset[idx]
        encoding = self.processor(images=item["image"], padding="max_length", return_tensors="pt")
        # Remove only the leading batch dimension. A bare `squeeze()` would also
        # collapse any other size-1 axis (e.g. a single-channel image), which makes
        # item shapes inconsistent and breaks `torch.stack` in the collate function.
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        # Keep the caption as a plain string; it is tokenized per batch at collate time.
        encoding["text"] = item["text"]
        return encoding

def collate_fn(batch):
    """Merge a list of per-item dicts into one batch dict.

    Every key except ``"text"`` is stacked along a new leading dimension. The
    ``"text"`` entries are tokenized together with padding and stored as
    ``"input_ids"`` and ``"attention_mask"``.

    Note: tokenization uses the notebook-level ``processor`` variable, so that
    variable must be defined before a batch containing ``"text"`` is collated.
    """
    collated = {}
    for field in batch[0]:
        if field == "text":
            captions = [sample["text"] for sample in batch]
            tokenized = processor.tokenizer(captions, padding=True, return_tensors="pt")
            collated["input_ids"] = tokenized["input_ids"]
            collated["attention_mask"] = tokenized["attention_mask"]
            continue
        collated[field] = torch.stack([sample[field] for sample in batch])
    return collated


## Load the pretrained processor and model

In [None]:
from transformers import BitsAndBytesConfig

# One checkpoint id for both objects so the processor always matches the weights.
MODEL_ID = "Salesforce/blip2-opt-2.7b"

processor = AutoProcessor.from_pretrained(MODEL_ID)

# Passing `load_in_8bit=True` directly to `from_pretrained` is deprecated; the
# 8-bit request now goes through a BitsAndBytesConfig in `quantization_config`.
model = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/5.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings: rank-16 adapters (alpha 32, dropout 0.05, no bias terms) attached
# to every module named "q_proj" or "k_proj".
lora_settings = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": ["q_proj", "k_proj"],
}
config = LoraConfig(**lora_settings)

# NOTE: this rebinds `model`, so re-running the cell passes the already-adapted
# model back into get_peft_model; reload the base model before changing settings.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 5,242,880 || all params: 3,749,922,816 || trainable%: 0.13981301102065136


In [None]:
# Wrap the training split for PyTorch and batch it; captions are tokenized per batch
# inside `collate_fn`.
train_dataset = ImageCaptioningDataset(dataset=dataset["train"], processor=processor)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=20,
    shuffle=True,
    collate_fn=collate_fn,
)

## Model Training

In [None]:
import torch

# Hand the optimizer only the parameters that require gradients (the ones left
# trainable by get_peft_model); frozen weights never receive an update anyway.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=5e-4)

device = "cuda" if torch.cuda.is_available() else "cpu"

model.train()

for epoch in range(10):
  print("Epoch:", epoch)
  for batch in train_dataloader:
    input_ids = batch.pop("input_ids").to(device)
    attention_mask = batch.pop("attention_mask").to(device)
    pixel_values = batch.pop("pixel_values").to(device, torch.float16)

    # Captions are padded to the longest one in the batch. Set padded positions to
    # -100 (the ignore index of the cross-entropy loss) so the model is not trained
    # to predict padding, and pass the attention mask so padding is not attended to.
    labels = input_ids.masked_fill(attention_mask == 0, -100)

    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    pixel_values=pixel_values,
                    labels=labels)

    loss = outputs.loss

    print("Loss:", loss.item())

    loss.backward()

    optimizer.step()
    optimizer.zero_grad()
  # Save a checkpoint after every epoch, one folder per epoch.
  model.save_pretrained(f"/content/drive/MyDrive/Pretained Models/blip2_easy_{epoch}")

Epoch: 0
Loss: 6.19921875
Loss: 6.78515625
Loss: 4.91796875
Loss: 4.41796875
Loss: 3.84765625
Loss: 3.28515625
Loss: 3.345703125
Loss: 3.384765625
Loss: 2.603515625
Loss: 2.775390625
Loss: 2.873046875
Loss: 2.232421875
Loss: 2.23046875
Loss: 2.453125
Loss: 2.171875
Loss: 1.9833984375
Loss: 1.912109375
Loss: 2.005859375
Loss: 1.7939453125
Loss: 1.990234375
Loss: 1.55859375
Loss: 1.9013671875
Loss: 1.6875
Loss: 1.353515625
Loss: 1.7939453125
Loss: 1.5419921875
Loss: 1.55078125
Loss: 1.5966796875
Loss: 1.55859375
Loss: 1.4697265625
Loss: 1.4609375
Loss: 1.513671875
Loss: 1.2880859375
Loss: 1.28515625
Loss: 1.439453125
Loss: 1.296875
Loss: 1.154296875
Loss: 1.392578125
Epoch: 1
Loss: 1.1474609375
Loss: 1.123046875
Loss: 1.2001953125
Loss: 1.0380859375
Loss: 1.1689453125
Loss: 1.166015625
Loss: 1.109375
Loss: 0.9755859375
Loss: 1.07421875
Loss: 0.97509765625
Loss: 1.1328125
Loss: 1.009765625
Loss: 1.0927734375
Loss: 0.9697265625
Loss: 1.0654296875
Loss: 1.0361328125
Loss: 0.98876953125
Loss