# LOADING AND IMPORTING ALL LIBRARIES FOR IMAGE CAPTIONING

In [None]:
!pip install transformers rouge_score evaluate datasets

In [None]:
!pip install sentencepiece


In [None]:
import requests
import torch
from PIL import Image
from transformers import *
from tqdm import tqdm

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# FUNCTIONS FOR IMAGE CAPTIONING

In [None]:
# Load the fine-tuned captioning model plus the tokenizer and image processor
# published under the same checkpoint name.
caption_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
finetuned_model = VisionEncoderDecoderModel.from_pretrained(caption_checkpoint)
finetuned_model = finetuned_model.to(device)
finetuned_tokenizer = GPT2TokenizerFast.from_pretrained(caption_checkpoint)
finetuned_image_processor = ViTImageProcessor.from_pretrained(caption_checkpoint)

In [None]:
import urllib.parse as parse
import os

# a function to determine whether a string is a URL or not
def is_url(string):
    """Return True when ``string`` parses as a URL with a scheme, host and path.

    All three components must be non-empty, so plain filesystem paths and a
    bare host such as ``https://example.com`` (empty path) are reported as
    non-URLs.  Anything that is not ``str``/``bytes`` is reported as False.
    """
    if not isinstance(string, (str, bytes)):
        return False
    try:
        result = parse.urlparse(string)
    except ValueError:
        # urlparse rejects malformed input (e.g. an unbalanced IPv6 bracket).
        # Catching only ValueError keeps KeyboardInterrupt/SystemExit from
        # being swallowed, which the previous bare `except:` did.
        return False
    return all([result.scheme, result.netloc, result.path])

# a function to load an image
def load_image(image_path):
    """Open an image from a URL or a local file and return it in RGB mode.

    Args:
        image_path: an http(s) URL (as judged by ``is_url``) or a path to an
            existing file.

    Returns:
        A ``PIL.Image.Image`` converted to RGB, so grayscale / RGBA / palette
        sources reach the image processor with three channels.

    Raises:
        requests.HTTPError: the URL answered with an error status.
        FileNotFoundError: ``image_path`` is neither a URL nor an existing path
            (previously this case returned None and failed later, far from
            the cause).
    """
    if is_url(image_path):
        # A timeout keeps a dead host from hanging the notebook cell forever.
        response = requests.get(image_path, stream=True, timeout=30)
        response.raise_for_status()
        return Image.open(response.raw).convert("RGB")
    if os.path.exists(image_path):
        return Image.open(image_path).convert("RGB")
    raise FileNotFoundError(f"Image not found (not a URL or an existing path): {image_path!r}")


# a function to perform inference
def get_caption(model, image_processor, tokenizer, image_path):
    """Caption the image at ``image_path`` and return the caption text.

    The image is loaded with ``load_image``, preprocessed by
    ``image_processor`` into PyTorch tensors on the notebook-level ``device``,
    passed to ``model.generate`` with default generation settings, and the
    first generated sequence is decoded by ``tokenizer`` without special tokens.
    """
    pixel_inputs = image_processor(load_image(image_path), return_tensors="pt").to(device)
    generated_ids = model.generate(**pixel_inputs)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

In [None]:
# Build an image-to-text pipeline for a second captioning checkpoint, then move
# the pipeline's underlying model onto the selected device.
# NOTE(review): only `.model` is moved here; the pipeline object's own device
# setting is not changed by this cell — confirm before calling the pipeline directly.
pipeline_checkpoint = "Abdou/vit-swin-base-224-gpt2-image-captioning"
image_captioner = pipeline("image-to-text", model=pipeline_checkpoint)
image_captioner.model = image_captioner.model.to(device)

# FOR DETECTION

## yolov8

In [None]:
import os
# Remember the directory the notebook started in; later %cd commands are
# expressed relative to it.
HOME = os.path.abspath(os.curdir)
print(HOME)

In [None]:


# Start from the directory captured in HOME so the clone lands in a known place
%cd {HOME}
!git clone https://github.com/ultralytics/ultralytics
# NOTE: %cd persists across cells, so every later cell runs with
# {HOME}/ultralytics as its working directory unless it changes it again.
%cd {HOME}/ultralytics
# Editable install of the cloned package into the current environment
!pip install -e .

# Clear the install log from this cell's output
from IPython import display
display.clear_output()

# Import the freshly installed package and call its `checks()` helper
import ultralytics
ultralytics.checks()

### detection using yolov8

In [None]:
!pip install supervision

In [None]:
# load displayer
from IPython.display import display
import cv2
import numpy as np


url = "https://images.rawpixel.com/image_800/czNmcy1wcml2YXRlL3Jhd3BpeGVsX2ltYWdlcy93ZWJzaXRlX2NvbnRlbnQvZnJob3JzZV9nYWxsb3BfY2FudGVyX21hcmUtaW1hZ2Utcm01MDNfMS1sMDd0dW5iZy5qcGc.jpg?s=DyXVyF2nJuzRikNn0KXYhzn7TwTJhuaRG1WoOQqftgQ"

# Define the URL of the image
image_url = url

# Download the image. Stop on a non-200 answer: falling through would either
# raise NameError on `img` or silently reuse an image left over from an
# earlier cell.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None when the
# payload is not a decodable image)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img
#cv2.imwrite("test.png",image)

In [None]:
import supervision as sv
import cv2
from ultralytics import YOLO

# Run the large YOLOv8 checkpoint on the image prepared by the previous cell
model = YOLO('yolov8l.pt')

results = model(image)

print(results[0].boxes)
detections = sv.Detections.from_ultralytics(results[0])

#image = #cv2.imread("/home/swagroy/car_images/CAR1.PNG")

bounding_box_annotator = sv.BoundingBoxAnnotator()
label_annotator = sv.LabelAnnotator()

classes = model.names

# Build "<class name> <confidence>" labels from the named Detections fields.
# This avoids unpacking a fixed-length tuple from `for ... in detections`,
# which breaks whenever the number of fields yielded per detection differs
# from five (supervision is installed unpinned in this notebook).
labels = [
	f"{classes[class_id]} {confidence:0.2f}"
	for class_id, confidence
	in zip(detections.class_id, detections.confidence)
]

# Annotate a copy: the annotators draw onto the array they are given, and
# `image` is written to disk and captioned later, so it must stay clean.
annotated_image = bounding_box_annotator.annotate(
    scene=image.copy(), detections=detections)
annotated_image = label_annotator.annotate(
    scene=annotated_image, detections=detections, labels=labels)

sv.plot_image(annotated_image)

print(labels)

In [None]:
# load displayer
from IPython.display import display
import cv2
import numpy as np


url = "http://images.cocodataset.org/test-stuff2017/000000009384.jpg"

# Define the URL of the image
image_url = url

# Download the image. Stop on a non-200 answer: falling through would reuse
# the image left over from an earlier cell.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None when the
# payload is not a decodable image)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img
import supervision as sv
import cv2
from ultralytics import YOLO

# Run the large YOLOv8 checkpoint on the downloaded image
model = YOLO('yolov8l.pt')

results = model(image)

print(results[0].boxes)
detections = sv.Detections.from_ultralytics(results[0])

#image = #cv2.imread("/home/swagroy/car_images/CAR1.PNG")

bounding_box_annotator = sv.BoundingBoxAnnotator()
label_annotator = sv.LabelAnnotator()

classes = model.names

# Build "<class name> <confidence>" labels from the named Detections fields.
# This avoids unpacking a fixed-length tuple from `for ... in detections`,
# which breaks whenever the number of fields yielded per detection differs
# from five (supervision is installed unpinned in this notebook).
labels = [
	f"{classes[class_id]} {confidence:0.2f}"
	for class_id, confidence
	in zip(detections.class_id, detections.confidence)
]

# Annotate a copy: the annotators draw onto the array they are given, and
# `image` is written to disk and captioned by the next cell.
annotated_image = bounding_box_annotator.annotate(
    scene=image.copy(), detections=detections)
annotated_image = label_annotator.annotate(
    scene=annotated_image, detections=detections, labels=labels)

sv.plot_image(annotated_image)

print(labels)

In [None]:
# Save the current frame to disk and caption it with both captioning models
cv2.imwrite("/content/test.png",image)
img_path='/content/test.png'

print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
# Pair the pipeline's model with the image processor and tokenizer the pipeline
# loaded for that checkpoint, not the ones belonging to the nlpconnect model.
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

## yolov5

In [None]:
# Clone YOLOv5 into the current working directory and move into the clone.
# NOTE: the clone location depends on where earlier %cd commands left the kernel.
!git clone https://github.com/ultralytics/yolov5.git
%cd yolov5
# Download the yolov5s / yolov5m / yolov5l weight files from the v5.0 release
!wget https://github.com/ultralytics/yolov5/releases/download/v5.0/yolov5s.pt
!wget https://github.com/ultralytics/yolov5/releases/download/v5.0/yolov5m.pt
!wget https://github.com/ultralytics/yolov5/releases/download/v5.0/yolov5l.pt

In [None]:
import torch
from PIL import Image

# Load the three YOLOv5 variants from the ultralytics/yolov5 hub repository
model1, model2, model3 = (
    torch.hub.load('ultralytics/yolov5', variant, pretrained=True)
    for variant in ('yolov5s', 'yolov5m', 'yolov5l')
)

# implementations using yolov5

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np


url = "https://images.rawpixel.com/image_800/czNmcy1wcml2YXRlL3Jhd3BpeGVsX2ltYWdlcy93ZWJzaXRlX2NvbnRlbnQvZnJob3JzZV9nYWxsb3BfY2FudGVyX21hcmUtaW1hZ2Utcm01MDNfMS1sMDd0dW5iZy5qcGc.jpg?s=DyXVyF2nJuzRikNn0KXYhzn7TwTJhuaRG1WoOQqftgQ"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

# Caption the same image; each model is paired with its own processor/tokenizer
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np


url = "http://images.cocodataset.org/test-stuff2017/000000009384.jpg"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np

url = "https://akm-img-a-in.tosshub.com/businesstoday/images/story/202303/r_0-sixteen_nine.jpg?size=1200:675"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np



url = "https://t3.ftcdn.net/jpg/03/36/97/58/360_F_336975809_VvYkV1QZX2E8igeS3kYpcBGiMcK6zWpL.jpg"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

results3 = model3(img_path)
results3.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np


url = "https://images.hindustantimes.com/auto/img/2021/12/28/600x338/Indian_cars_1640662074513_1640662081298.jpg"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np


url = "https://cms.londonzoo.org/sites/default/files/styles/responsive/public/1024/729/1/2022-11/Asim-at-London-Zoo.jpg"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

results3 = model3(img_path)
results3.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np


url = "https://i.ytimg.com/vi/BQNRE2ScAq4/maxresdefault.jpg"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

results3 = model3(img_path)
results3.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np


url = "https://ihearthorses.com/wp-content/uploads/2020/05/Canva-dog-in-a-cowboy-hat-holding-horse-on-leash-1-scaled.jpg"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np


url = "https://img.freepik.com/premium-photo/amazing-italian-landscapes-lombardy-scenaries_526992-280.jpg?w=2000"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

results3= model3(img_path)
results3.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np


url = "https://i.pinimg.com/474x/c5/6c/cb/c56ccb04377c2905fbb8f9ec00b7ae2f.jpg"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

results3= model3(img_path)
results3.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

In [None]:
# load displayer
from IPython.display import display
import os
import cv2
import numpy as np


url = "https://nmaahc.si.edu/sites/default/files/styles/max_1300x1300/public/images/header/audience-citizen_0.jpg?itok=unjNTfkP"

# Define the URL of the image
image_url = url

# Download the image and stop on failure, so a stale `img` from an earlier
# cell is never reused silently.
response = requests.get(image_url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the image. HTTP Status Code: {response.status_code}")

# Decode the downloaded bytes with OpenCV (imdecode returns None on bad data)
image_data = np.frombuffer(response.content, np.uint8)
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if img is None:
    raise ValueError(f"Could not decode an image from {image_url}")
image=img

# Write the frame and point img_path at the file actually written. The old
# hard-coded /content/yolov5/test.png only matched one working directory.
img_path = os.path.abspath("test.png")
cv2.imwrite(img_path, image)

# Perform inference and display results with each detector
results1 = model1(img_path)
results1.show()

results2 = model2(img_path)
results2.show()

results3= model3(img_path)
results3.show()

# Caption the same image; each model is paired with its own processor/tokenizer.
# (The earlier extra get_caption call, whose result was discarded, is dropped.)
print(f"nlpconnect/vit-gpt2-image-captioning caption: {get_caption(finetuned_model, finetuned_image_processor, finetuned_tokenizer, img_path)}")
print(f"Abdou/vit-swin-base-224-gpt2-image-captioning caption: {get_caption(image_captioner.model, image_captioner.image_processor, image_captioner.tokenizer, img_path)}")

# vit custom model

# Image Captioning using ViT and GPT2

Connect to Drive

In [None]:
# Mount Google Drive at /content/drive; later cells copy the training output
# there and load a checkpoint back from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!rm -r '/content/image_caption_gen' '/content/wandb'

In [None]:
!pip install -q accelerate wandb

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel
from PIL import Image
import os
from sklearn.model_selection import train_test_split
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, default_data_collator, EarlyStoppingCallback


In [None]:
# Locations of the image folder and the captions file used for training.
# NOTE(review): absolute Colab paths — adjust when running elsewhere.
img_dir = '/content/Images'
caption_path = '/content/captions.txt'

In [None]:
# Read the captions file as CSV; later cells use its 'image' and 'caption' columns
caption_data = pd.read_csv(caption_path)
caption_data.head()

In [None]:
def text_preprocessing(data):
    """Return a copy of ``data`` with a normalised ``'caption'`` column.

    Each caption is lower-cased, stripped of every character that is not a
    letter or whitespace, split on whitespace, filtered to words longer than
    one character, and re-joined with single spaces.

    The previous version passed regex patterns to ``str.replace``, which does
    literal substring replacement, so punctuation and digits were never
    removed.  The pattern used here keeps whitespace so words are not merged.

    Args:
        data: DataFrame with a string ``'caption'`` column.  It is not modified.

    Returns:
        A new DataFrame with the cleaned ``'caption'`` column.
    """
    cleaned = data.copy()
    captions = cleaned['caption'].str.lower()
    # Drop everything except lowercase letters and whitespace
    captions = captions.str.replace(r"[^a-z\s]", "", regex=True)
    # split() with no argument collapses any run of whitespace
    cleaned['caption'] = captions.apply(
        lambda text: " ".join(word for word in text.split() if len(word) > 1)
    )
    return cleaned

In [None]:
# Clean the captions and preview the first rows of the result
caption_data_preprocessed = text_preprocessing(caption_data)
caption_data_preprocessed.head()

In [None]:
# Checkpoint names for the two halves of the captioning model
enc_model = "google/vit-base-patch16-224-in21k"
dec_model = "gpt2"

# Image preprocessing comes from the encoder checkpoint, text tokenisation
# from the decoder checkpoint
fe_extractor = AutoFeatureExtractor.from_pretrained(enc_model)
tokenizer = AutoTokenizer.from_pretrained(dec_model)

In [None]:
# Reuse the end-of-sequence token as the padding token so captions can be
# padded to a fixed length with padding='max_length'
tokenizer.pad_token = tokenizer.eos_token

Sample one image transformation

In [None]:
# Fixed token length for every caption (also read by ImageCaptionDataset)
max_length = 128

# Take the first row as a worked example
sample = caption_data_preprocessed.iloc[0]
caption = sample['caption']

# Open the matching image file and force three colour channels
image = Image.open(os.path.join(img_dir,sample['image'])).convert('RGB')

# Image -> pixel tensors
inputs = fe_extractor(image, return_tensors='pt')

# Caption -> token ids, truncated / padded to exactly max_length
outputs = tokenizer(
    caption,
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors = 'pt',
)

In [None]:
# Display the shape of the tensor the feature extractor produced for the sample image
inputs['pixel_values'].shape

## Create the Dataset class

In [None]:
class ImageCaptionDataset(Dataset):
    """Dataset yielding one ``{'pixel_values', 'labels'}`` dict per caption row.

    Uses the notebook-level ``img_dir``, ``fe_extractor``, ``tokenizer`` and
    ``max_length`` objects when an item is fetched.
    """

    def __init__(self, data):
        # `data` must provide 'image' (file name) and 'caption' (text) columns
        self.images = data['image'].values
        self.captions = data['caption'].values

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        # Image: open from img_dir, force RGB, run the feature extractor
        image_file = os.path.join(img_dir, str(self.images[idx]))
        rgb_image = Image.open(image_file).convert('RGB')
        pixel_values = fe_extractor(rgb_image, return_tensors = 'pt')['pixel_values']

        # Caption: token ids truncated / padded to max_length; [0] drops the batch axis
        token_ids = tokenizer(
            self.captions[idx],
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors = 'pt',
        )['input_ids'][0]

        # NOTE(review): padded positions are kept in the labels as ordinary token
        # ids; presumably the loss then also covers padding — confirm whether
        # they should be masked out instead.
        return {
            'pixel_values': pixel_values.squeeze(),
            'labels': token_ids,
        }

Split the dataset

In [None]:

# Split by image, not by caption row: if an image has several caption rows
# (presumably the case for this captions file — confirm), a row-level split
# puts the same image in both sets and the evaluation loss looks better than
# it should.  With one caption per image this is equivalent to a row split.
image_names = caption_data_preprocessed['image'].unique()
train_images, test_images = train_test_split(image_names, test_size=0.2,
                                             shuffle=True, random_state=42)
in_test = caption_data_preprocessed['image'].isin(test_images)

# X / y keep their original meaning: the train and test caption frames
X = caption_data_preprocessed[~in_test]
y = caption_data_preprocessed[in_test]
train_dataset = ImageCaptionDataset(X)
test_dataset = ImageCaptionDataset(y)

## Model Building

In [None]:
# Assemble the captioning model from the pretrained encoder and decoder checkpoints.
# NOTE: this rebinds `model`, which earlier cells used for the YOLO detector.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=enc_model,
    decoder_pretrained_model_name_or_path=dec_model,
)

Set the decoder_start_token_id and the pad_token_id

In [None]:
# Tell the combined model which token starts decoding and which id marks
# padding, taking both from the decoder's tokenizer
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

## Setting up the training configuration

In [None]:
# early_stopping_callback = EarlyStoppingCallback(
#     early_stopping_patience= 1,
#     early_stopping_threshold= 1e-3
# )


# Seq2SeqTrainer expects Seq2SeqTrainingArguments: it reads generation-related
# settings from its args that plain TrainingArguments does not define.  The
# class is already imported at the top of this section.
# NOTE(review): learning_rate=3e-2 is unusually large for fine-tuning
# pretrained transformers — confirm it is intended.
training_args = Seq2SeqTrainingArguments(
    run_name = 'img_cap_ViT_gpt2_run_5',
    output_dir = 'image_caption_gen',
    evaluation_strategy = 'epoch',
    logging_strategy='steps',
    logging_steps=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-2,
    lr_scheduler_type="cosine",
    warmup_steps=10,
    weight_decay=0.05,
    report_to='wandb',
    num_train_epochs = 3,
    # save and evaluation strategies both use 'epoch', as load_best_model_at_end needs
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# The feature extractor is passed in the `tokenizer` slot so it is stored
# alongside each checkpoint
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer = fe_extractor,
    data_collator=default_data_collator,
    train_dataset=train_dataset,
    eval_dataset = test_dataset,
    args = training_args,
)

In [None]:
# Run fine-tuning with the configuration defined above
trainer.train()

In [None]:
cp -r /content/image_caption_gen /content/drive/MyDrive/visual_multimodal_modelling

## Inference

In [None]:
# Reload the fine-tuned weights from the checkpoint copied to Drive.
# NOTE(review): the checkpoint step number is hard-coded — update it to match
# the directory the training run actually produced.
checkpoint_dir = '/content/drive/MyDrive/visual_multimodal_modelling/image_caption_gen/checkpoint-6069'
model = VisionEncoderDecoderModel.from_pretrained(checkpoint_dir)

## Set the beam search parameters for inference

In [None]:
# Number of beams for beam search; stored on the model config and also
# passed explicitly to generate() below
num_beams = 4
model.config.num_beams = num_beams

In [None]:
# Take the preprocessed pixel tensor of one held-out sample (index 65)
inputs = test_dataset[65]['pixel_values']

In [None]:
import torch

model.eval()

# Send the batch to whichever device the model's weights are on. Moving the
# inputs to 'cuda' unconditionally fails on a GPU runtime, because the model
# loaded from the checkpoint above was never moved off the CPU.
pixel_values = inputs.unsqueeze(0).to(model.device)  # unsqueeze adds the batch axis

with torch.no_grad():
    # uncomment the below line if feature extractor is not applied to the image already
    # inputs = fe_extractor(images=inputs, return_tensors='pt').pixel_values

    # generate caption for the image
    out = model.generate(
        pixel_values,
        num_beams=num_beams,
        )

# convert token ids to string format
decoded_out = tokenizer.decode(out[0], skip_special_tokens=True)

print(decoded_out)