# Installs

In [None]:
!pip install transformers
!pip install salesforce-lavis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

# Imports

In [None]:
# Colab helper module used to attach Google Drive to this runtime.
from google.colab import drive

# Mount Drive at /content/gdrive; the image folder read below and the CSV
# written at the end both live under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import time

import re
from PIL import Image
import cv2
import torch
import transformers
from lavis.models import load_model_and_preprocess

# Captioning

In [None]:
# Load the "blip_caption" model ("large_coco" variant) together with its
# image and text preprocessors, placing the model on the GPU.
# is_eval=True switches the model to evaluation mode. The loader's default is
# is_eval=False, which leaves the model in training mode so dropout would stay
# active while captions are generated.
caption_model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip_caption",
    model_type="large_coco",
    is_eval=True,
    device="cuda",
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

100%|██████████| 1.66G/1.66G [00:57<00:00, 31.1MB/s]


In [None]:
# Folder on the mounted Drive that holds the images to caption.
images_path = '/content/gdrive/MyDrive/flickr8k_images'

# Sorted so the processing order (and the row order of the saved table) is
# deterministic. Sub-directories are skipped because they cannot be opened as
# images and would otherwise abort the whole run.
image_lst = sorted(
    name for name in os.listdir(images_path)
    if os.path.isfile(os.path.join(images_path, name))
)

# Print a timing line only every LOG_EVERY images (and for the last one) so
# thousands of progress lines do not flood the cell output.
LOG_EVERY = 100

# Send inputs to whichever device the model's weights live on instead of
# hard-coding the device name a second time.
model_device = next(caption_model.parameters()).device

# One entry per image, in the same order as image_lst. Each entry is the
# generated caption split on whitespace into a list of tokens.
generated_captions = []

# Inference only: disabling autograd avoids building graphs for every image.
with torch.no_grad():
    for i, f in enumerate(image_lst, start=1):
        start = time.perf_counter()

        # The context manager closes the underlying file as soon as the pixels
        # have been copied into the converted RGB image.
        with Image.open(os.path.join(images_path, f)) as img:
            raw_image = img.convert("RGB")

        # Preprocess, add a batch dimension of size 1, and move to the model's device.
        image = vis_processors["eval"](raw_image).unsqueeze(0).to(model_device)
        samples = {"image": image}
        caption = caption_model.generate(samples, num_beams=3, max_length=50, min_length=10, num_captions=1)
        generated_captions.append(caption[0].split())

        end = time.perf_counter()
        if i % LOG_EVERY == 0 or i == len(image_lst):
            print(f"Image {i}: {round(end - start, 2)} sec")

Streaming output truncated to the last 5000 lines.
Image 3092: 0.63 sec
Image 3093: 0.65 sec
Image 3094: 0.54 sec
Image 3095: 0.52 sec
Image 3096: 0.54 sec
Image 3097: 0.49 sec
Image 3098: 0.46 sec
Image 3099: 0.59 sec
Image 3100: 0.49 sec
Image 3101: 0.55 sec
Image 3102: 0.46 sec
Image 3103: 0.49 sec
Image 3104: 0.62 sec
Image 3105: 0.59 sec
Image 3106: 0.46 sec
Image 3107: 0.55 sec
Image 3108: 0.49 sec
Image 3109: 0.53 sec
Image 3110: 0.52 sec
Image 3111: 0.49 sec
Image 3112: 0.58 sec
Image 3113: 0.52 sec
Image 3114: 0.56 sec
Image 3115: 0.62 sec
Image 3116: 0.7 sec
Image 3117: 0.52 sec
Image 3118: 0.5 sec
Image 3119: 0.64 sec
Image 3120: 0.59 sec
Image 3121: 0.74 sec
Image 3122: 0.61 sec
Image 3123: 0.59 sec
Image 3124: 0.64 sec
Image 3125: 0.61 sec
Image 3126: 0.57 sec
Image 3127: 0.59 sec
Image 3128: 0.73 sec
Image 3129: 0.56 sec
Image 3130: 0.55 sec
Image 3131: 0.67 sec
Image 3132: 0.61 sec
Image 3133: 0.64 sec
Image 3134: 0.61 sec
Image 3135: 0.61 sec
Image 3136: 0

In [None]:
# Pair every image file name with its generated caption. zip stops at the
# shorter of the two lists, so only images that actually received a caption
# end up in the table.
caption_rows = list(zip(image_lst, generated_captions))

# Two-column table keyed by image file name.
data = pd.DataFrame(caption_rows, columns=['image', 'caption'])
data = data.set_index('image')

# Persist the table to Drive. Each caption is a list of tokens, so the CSV
# stores the string form of that list.
output_csv = '/content/gdrive/MyDrive/flickr8k_generatedcaptions.csv'
data.to_csv(output_csv)