<a href="https://colab.research.google.com/github/yadegarfaramarzi/Learning-Blazor-Web-Assembly/blob/master/Multimodal_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset
from PIL import Image
import requests
from io import BytesIO
from transformers import GPT2TokenizerFast, ViTFeatureExtractor
from transformers import VisionEncoderDecoderModel
from transformers import Trainer, TrainingArguments
from transformers import default_data_collator
import os
from transformers import BlipProcessor, BlipForConditionalGeneration



In [3]:
# Fetch the SBU Captions training split, then shuffle it with a fixed seed so
# the row order (and therefore any later subset) is reproducible.
data = load_dataset("sbu_captions", split="train")
data = data.shuffle(seed=42)

# Show the dataset summary (features and number of rows) as the cell output.
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/89.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Dataset({
    features: ['image_url', 'user_id', 'caption'],
    num_rows: 1000000
})

In [4]:
class ProcessDataset(Dataset):
    """Image-captioning dataset that downloads each image on demand.

    Every item is built from one row of a column-addressable table: the image
    at ``image_url`` is fetched over HTTP and run through the feature
    extractor, and ``caption`` is tokenized into a fixed-length list of ids.
    """

    # Seconds to wait for an image download before giving up, so that a dead
    # host cannot stall the data loader indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, df, tokenizer, feature_extractor, decoder_max_length=20):
        """
        Constructor for the dataset class.

        Args:
        - df: Column-addressable table with "image_url" and "caption" columns.
          Anything supporting ``df[column][idx]`` works, e.g. a pandas
          DataFrame, a Hugging Face ``Dataset`` or a plain dict of lists.
        - tokenizer: Tokenizer used to turn captions into token ids.
        - feature_extractor: Image preprocessor producing ``pixel_values``.
        - decoder_max_length (int): Captions are truncated / padded to this
          many tokens.
        """
        self.df = df
        self.tokenizer = tokenizer  # This is for the language model
        self.feature_extractor = feature_extractor  # This is for the vision model
        self.decoder_max_length = decoder_max_length  # This is for the caption output

    def __len__(self):
        """
        Returns the number of rows (image/caption pairs) in the dataset.

        The length is taken from the "image_url" column rather than from the
        container itself: for a dict of columns ``len(self.df)`` would be the
        number of columns, not the number of samples.
        """
        return len(self.df["image_url"])

    def __getitem__(self, idx):
        """
        Returns a single data item (image and its corresponding caption) at the specified index.

        Args:
        - idx (int): Index of the data item to retrieve.

        Returns:
        - encoding (dict): "pixel_values" (image tensor without the batch
          dimension) and "labels" (tensor of caption token ids).

        Raises:
        - requests.RequestException: if the download times out, fails, or the
          server answers with an HTTP error status.
        """
        # Look up the image URL and caption text for this row
        img_path = self.df["image_url"][idx]
        caption = self.df["caption"][idx]

        # Download the image; fail loudly on HTTP errors instead of handing an
        # error page to PIL.
        response = requests.get(img_path, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        # Force 3-channel RGB so grayscale / palette / RGBA images do not break
        # the per-channel preprocessing of the feature extractor.
        image = Image.open(BytesIO(response.content)).convert("RGB")
        pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values

        # Labels here refer to each token in the caption
        labels = self.tokenizer(caption,
                                truncation=True,
                                padding="max_length",
                                max_length=self.decoder_max_length).input_ids

        # Drop only the leading batch dimension added by return_tensors="pt"
        encoding = {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}
        return encoding



In [5]:
%mkdir datasets

In [6]:
# Directory used to cache the downloaded tokenizer / preprocessor files
MODEL_CACHE_DIR = "/root/datasets/" + "models"

# Number of (shuffled) samples used for training
NUM_TRAIN_SAMPLES = 2000

# Instantiate the fast GPT-2 tokenizer from the pretrained "gpt2" checkpoint
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir=MODEL_CACHE_DIR)

# GPT-2 tokenizer doesn't have a pad token by default, so we set it to the eos_token
# This ensures consistent handling of padding during tokenization
tokenizer.pad_token = tokenizer.eos_token

# Instantiate the ViT image preprocessor matching the encoder checkpoint
# "google/vit-base-patch16-224-in21k"
# NOTE(review): newer transformers releases deprecate ViTFeatureExtractor in
# favour of ViTImageProcessor - confirm against the installed version.
feature_extractor = ViTFeatureExtractor.from_pretrained(
    "google/vit-base-patch16-224-in21k", cache_dir=MODEL_CACHE_DIR
)

# Take the first NUM_TRAIN_SAMPLES rows as a real row subset.
# `data[:N]` on a Hugging Face Dataset returns a dict of column lists, so the
# wrapped object would report the number of columns as its length; `select`
# keeps a Dataset whose length is the number of rows.
train_subset = data.select(range(min(NUM_TRAIN_SAMPLES, len(data))))

train_dataset = ProcessDataset(
    df=train_subset,  # Row subset of the shuffled dataset
    tokenizer=tokenizer,  # The tokenizer object for processing captions
    feature_extractor=feature_extractor  # The feature extractor object for processing images
)



vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [7]:
# Build the captioning model: pretrained ViT encoder + pretrained GPT-2 decoder.
# Only kwargs prefixed with `encoder_` / `decoder_` are forwarded to the
# respective `from_pretrained` call, so the cache directory is passed once per
# sub-model. Encoder/decoder weight tying is not requested: ViT and GPT-2 are
# different architectures with no parameters to share.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path="google/vit-base-patch16-224-in21k",  # ViT encoder model
    decoder_pretrained_model_name_or_path="gpt2",  # GPT-2 decoder model
    encoder_cache_dir="/root/datasets/"+"models",  # Directory for caching the encoder files
    decoder_cache_dir="/root/datasets/"+"models"  # Directory for caching the decoder files
)

# Set GPT-2 specific configuration in the VisionEncoderDecoderModel

# GPT-2 only has bos/eos tokens but not decoder_start/pad tokens
# Set decoder_start_token_id, pad_token_id, and eos_token_id using tokenizer values
model.config.decoder_start_token_id = tokenizer.bos_token_id  # Set the beginning-of-sequence token
model.config.pad_token_id = tokenizer.pad_token_id  # Set the pad token for padding sequences
model.config.eos_token_id = tokenizer.eos_token_id  # Set the end-of-sequence token
model.config.vocab_size = model.config.decoder.vocab_size  # Match the vocabulary size for consistency

# Generation (decoding) settings.
# These live on the model's generation config, which `generate()` reads.
# Assigning `num_beams` / `max_length` as attributes of `model.decoder` (the
# decoder module itself) has no effect on generation.
model.generation_config.decoder_start_token_id = tokenizer.bos_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.generation_config.eos_token_id = tokenizer.eos_token_id
model.generation_config.early_stopping = True  # Stop beam search once enough finished beams exist
model.generation_config.no_repeat_ngram_size = 3  # Set the size of N-gram sequences that cannot be repeated
model.generation_config.length_penalty = 2.0  # Set the length penalty for generated sequences
model.generation_config.num_beams = 4  # Set the number of beams used in beam search during decoding
model.generation_config.max_length = 20  # Set the maximum length for generated sequences



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.8.crossattention.c_proj.bias', 'h.10.crossattention.q_attn.weight', 'h.7.crossattention.c_attn.weight', 'h.6.crossattention.q_attn.bias', 'h.5.crossattention.c_proj.bias', 'h.3.crossattention.q_attn.bias', 'h.3.crossattention.c_attn.bias', 'h.3.ln_cross_attn.bias', 'h.6.crossattention.c_proj.bias', 'h.9.crossattention.c_attn.weight', 'h.2.crossattention.q_attn.bias', 'h.4.crossattention.q_attn.weight', 'h.1.crossattention.q_attn.weight', 'h.8.ln_cross_attn.bias', 'h.9.crossattention.c_proj.bias', 'h.3.ln_cross_attn.weight', 'h.11.crossattention.q_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.7.crossattention.c_proj.weight', 'h.8.crossattention.q_attn.weight', 'h.4.crossattention.c_proj.weight', 'h.10.ln_cross_attn.weight', 'h.3.crossattention.c_proj.weight', 'h.4.crossattention.q_attn.bias', 'h.9.crossattention.c_proj.weight', 'h.6.ln_cross_attn.bias', 'h.1.ln_cro

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
# Training hyper-parameters
BATCH_SIZE = 16     # samples per device per step
TRAIN_EPOCHS = 20   # full passes over the training set

# Checkpoints and logs are written below this directory
output_directory = os.path.join("/root/working_dir/", "captioning_outputs")

# Arguments controlling the training loop
training_args = TrainingArguments(
    output_dir=output_directory,
    overwrite_output_dir=True,   # reuse the output directory if it already exists
    do_train=True,
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    no_cuda=True,                # NOTE(review): forces CPU training even when a GPU is present - confirm this is intended
    dataloader_pin_memory=False  # do not pin host memory in the data loaders
)

# Trainer that fine-tunes the captioning model on `train_dataset`
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=default_data_collator,  # stacks the per-item dicts into batches
    tokenizer=feature_extractor  # the image preprocessor (not the caption tokenizer) is what is passed here
)



ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`