<a href="https://colab.research.google.com/github/vlordier/colabs/blob/main/BLIP_Captions_with_2xCLIP_ranking_open_clip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

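Image captioning with BLIP and CLIP ranking: BLIP generates several candidate captions per image from augmented views, and CLIP ranks the candidates by image–text similarity.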

In [None]:
#@title Install dependencies { vertical-output: true }
# Installs the packages used by the captioning cells and, when running on
# Colab, clones the BLIP repository and makes it the working directory.
import torch
!pip install grammar-check #https://pypi.org/project/grammar-check/
!pip install clip-anytorch open_clip_torch
# Disabled downloads (gdown + unzip) of two Google Drive archives, people1.zip
# and image-photo.zip -- presumably sample images; uncomment to fetch them.
#!gdown https://drive.google.com/uc?id=1DysE570DY-THu1puJnVpj0GaoeAs45vE
#!unzip people1.zip

#!gdown https://drive.google.com/uc?id=1Iqe0Sy3PF0UW_YDWu2zcNIpmWqGY0_2B
#!unzip image-photo.zip
import sys
# Colab-only setup: the branch runs only when the google.colab module is loaded.
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    # Pinned versions -- presumably the ones the BLIP code expects (TODO confirm).
    !pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4
    !git clone https://github.com/christophschuhmann/BLIP/
    # Later cells import `models.blip`, so the cloned repo becomes the working
    # directory for the rest of the session.
    %cd BLIP


In [None]:
# Upgrade albumentations, then replace opencv-python-headless 4.5.5.62 with
# 4.5.2.52 -- presumably to work around an incompatibility between that OpenCV
# build and albumentations (TODO confirm).
# NOTE(review): `-U` leaves albumentations unpinned, while the A.RandomResizedCrop
# and A.CoarseDropout calls in later cells use fixed argument names -- confirm
# they match the version that actually gets installed.
!pip install -U albumentations
!pip uninstall -y opencv-python-headless==4.5.5.62
!pip install opencv-python-headless==4.5.2.52


In [None]:
from albumentations.augmentations.crops.transforms import RandomResizedCrop
from torchvision import transforms
import albumentations as A
from torchvision.transforms.functional import InterpolationMode

from models.blip import blip_decoder
import glob

#@title Captioning Images of various Types { vertical-output: true }
#hide
from PIL import Image
import numpy as np
import torch
#import clip
import open_clip
import random

# Repetition penalty passed to every model.generate() call in this cell.
rep_pen = 1.8

# Side length (pixels) of the square crops fed to BLIP and produced by transform_A.
image_size = 384
# Iterations of the per-image generation loop; each iteration produces one
# sampled caption and one beam-search caption.
number_of_caption_per_image = 10


device = "cuda" if torch.cuda.is_available() else "cpu"

# open_clip architecture name and pretrained-weights tag used for ranking.
clip_model='ViT-B-32-quickgelu'
model_checkpoint='laion400m_e32'

# NOTE(review): the URL below points at the laion400m_avg weights, whereas
# model_checkpoint above selects 'laion400m_e32'.
#https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_avg-8a00ab3c.pt
model_clip, _, preprocess_clip = open_clip.create_model_and_transforms(clip_model, pretrained=model_checkpoint, device=device)

# BLIP large captioning checkpoint, loaded by blip_decoder from this URL.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth'
    
model = blip_decoder(pretrained=model_url, image_size=image_size, vit='large')
model.eval()
model = model.to(device)



# Random torchvision augmentation applied to the PIL image before each BLIP
# generate() call: random crop resized to image_size, colour jitter, tensor
# conversion and normalization.
# NOTE(review): the mean/std values look like the CLIP preprocessing statistics
# -- confirm they are what the BLIP checkpoint expects.
transform = transforms.Compose([
    transforms.RandomResizedCrop(size=(image_size,image_size), scale=(0.8, 1.0), interpolation=InterpolationMode.BICUBIC),
    transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.05),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ]) 

# Albumentations pipeline that clip_rank() applies to the image (as a NumPy
# array) before CLIP scoring: crop, occasional blur, colour jitter, noise and
# one to three dropped-out rectangles of up to image_size/4 per side.
transform_A = A.Compose([
    A.RandomResizedCrop(image_size, image_size, scale=(0.8, 1.0)),
    A.Blur(blur_limit=3, p=0.1),
    A.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.05, always_apply=False, p=1.0),
    A.GaussNoise(p=0.5),
    A.CoarseDropout(min_holes = 1, max_holes = 3, max_height = int(image_size/4), max_width= int(image_size/4), p=1.0),
    ])

def cos_sim_2d(x, y):
    """Pairwise cosine similarity between the rows of two 2-D arrays.

    Args:
        x: array of shape (n, d).
        y: array of shape (m, d).

    Returns:
        Array of shape (n, m) whose entry [i, j] is the cosine of the angle
        between row i of ``x`` and row j of ``y``.
    """
    def _unit_rows(mat):
        # keepdims leaves the norms as a column so they broadcast over each row.
        row_lengths = np.linalg.norm(mat, axis=1, keepdims=True)
        return mat / row_lengths

    x_hat, y_hat = _unit_rows(x), _unit_rows(y)
    return x_hat @ y_hat.T

def clip_rank(image_pil, text_list):
    """Score captions against a randomly augmented copy of an image with CLIP.

    Uses the module-level ``transform_A`` (augmentation), ``preprocess_clip``
    and ``model_clip`` (open_clip model) and ``device``.

    Args:
        image_pil: PIL image the captions describe.
        text_list: iterable of captions; each item is passed to
            ``open_clip.tokenize`` on its own.

    Returns:
        list[float]: cosine similarity between the image embedding and each
        caption embedding, in the order of ``text_list``.
    """
    # The image is augmented once per call, so every caption is compared
    # against the same augmented view.
    augmented = transform_A(image=np.array(image_pil))["image"]
    image_pil = Image.fromarray(np.uint8(augmented)).convert('RGB')

    similarities = []
    # Inference only: disable autograd here so the function does not depend on
    # the caller having opened a no_grad() context.
    with torch.no_grad():
        image = preprocess_clip(image_pil).unsqueeze(0).to(device)
        image_features = model_clip.encode_image(image).cpu().numpy()

        for txt in text_list:
            text = open_clip.tokenize(txt).to(device)
            text_features = model_clip.encode_text(text).cpu().numpy()

            # cos_sim_2d returns a (1, 1) matrix; index the scalar explicitly,
            # because float() on a 1-element 1-D array is deprecated in NumPy.
            similarities.append(float(cos_sim_2d(text_features, image_features)[0, 0]))
    return similarities


def top_caption_indices(sims, top_k):
    """Return the indices of the ``top_k`` highest similarities, best first.

    Args:
        sims: sequence of similarity scores, one per caption.
        top_k: maximum number of indices to return.

    Returns:
        list[int]: indices into ``sims`` ordered from highest to lowest score;
        on ties the earlier index comes first. Empty if ``sims`` is empty.
    """
    scores = np.asarray(sims, dtype=float)
    # argsort is ascending, so sort the negated scores; a stable sort keeps the
    # original order among equal scores.
    return np.argsort(-scores, kind="stable")[:top_k].tolist()


# Caption up to 50 JPEG files from /content: generate candidates with BLIP,
# score them with CLIP and print the best-scoring ones.
files = glob.glob("/content/*.jpeg")

with torch.no_grad():

  for f in files[:50]:

    raw_image = Image.open(f).convert('RGB')
    w, h = raw_image.size

    # Show a 300-pixel-wide preview that keeps the aspect ratio.
    display(raw_image.resize((300, int(300 * h / w))))
    # Label built from the configuration so it names the checkpoint actually loaded.
    print(f"BEST CAPTION AFTER RANKING WITH CLIP {clip_model}-{model_checkpoint}")

    captions = []

    for n in range(number_of_caption_per_image):

      # Sampled caption (sample=True with a random top_p) from a freshly
      # augmented view, with randomly chosen length bounds.
      image = transform(raw_image).unsqueeze(0).to(device)

      max_length = random.choice([30, 40, 50])
      min_length = random.choice([15, 20, 25])

      topP = random.choice([0.2, 0.3, 0.4, 0.5, 0.6, 0.7])

      caption = model.generate(image,
                               sample=True,
                               max_length=max_length,
                               min_length=min_length,
                               top_p=topP,
                               repetition_penalty=rep_pen)
      captions.append(caption[0])

      # Beam-search caption from another augmented view, with a random beam
      # width and random length bounds.
      image = transform(raw_image).unsqueeze(0).to(device)

      max_length = random.randint(25, 60)
      min_length = random.randint(10, max_length)

      beam_n = random.choice([2,3,4,5,6,7])
      caption = model.generate(image,
                               sample=False,
                               num_beams=beam_n,
                               max_length=max_length,
                               min_length=min_length,
                               repetition_penalty=rep_pen)
      captions.append(caption[0])

    sims = clip_rank(raw_image, captions)

    # Two captions are generated per iteration, so this keeps the better-scoring
    # half of the candidates, highest similarity first.
    for a in top_caption_indices(sims, number_of_caption_per_image):
      print(captions[a])




In [None]:
from albumentations.augmentations.crops.transforms import RandomResizedCrop
from torchvision import transforms
import albumentations as A
from torchvision.transforms.functional import InterpolationMode

from models.blip import blip_decoder
import glob

#@title Captioning Images of various Types { vertical-output: true }
#hide
from PIL import Image
import numpy as np
import torch
import clip
#import open_clip
import random

# Repetition penalty passed to every model.generate() call in this cell.
rep_pen = 1.8

# Side length (pixels) of the square crops fed to BLIP and produced by transform_A.
image_size = 384
# Iterations of the per-image generation loop; each iteration produces one
# sampled caption and one beam-search caption.
number_of_caption_per_image = 10


device = "cuda" if torch.cuda.is_available() else "cpu"

# Model name handed to clip.load() below; the open_clip variant is left disabled.
clip_model='ViT-L/14'
#model_checkpoint='laion400m_avg'
#model_clip, _, preprocess_clip = open_clip.create_model_and_transforms(clip_model, pretrained=model_checkpoint, device=device)

model_clip, preprocess_clip = clip.load(clip_model, device=device)

# BLIP large captioning checkpoint, loaded by blip_decoder from this URL.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth'
    
model = blip_decoder(pretrained=model_url, image_size=image_size, vit='large')
model.eval()
model = model.to(device)



# Random torchvision augmentation applied to the PIL image before each BLIP
# generate() call: random crop resized to image_size, colour jitter, tensor
# conversion and normalization.
# NOTE(review): the mean/std values look like the CLIP preprocessing statistics
# -- confirm they are what the BLIP checkpoint expects.
transform = transforms.Compose([
    transforms.RandomResizedCrop(size=(image_size,image_size), scale=(0.8, 1.0), interpolation=InterpolationMode.BICUBIC),
    transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.05),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ]) 

# Albumentations pipeline that clip_rank() applies to the image (as a NumPy
# array) before CLIP scoring: crop, occasional blur, colour jitter, noise and
# one to three dropped-out rectangles of up to image_size/4 per side.
transform_A = A.Compose([
    A.RandomResizedCrop(image_size, image_size, scale=(0.8, 1.0)),
    A.Blur(blur_limit=3, p=0.1),
    A.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.05, always_apply=False, p=1.0),
    A.GaussNoise(p=0.5),
    A.CoarseDropout(min_holes = 1, max_holes = 3, max_height = int(image_size/4), max_width= int(image_size/4), p=1.0),
    ])

def cos_sim_2d(x, y):
    """Pairwise cosine similarity between the rows of two 2-D arrays.

    Args:
        x: array of shape (n, d).
        y: array of shape (m, d).

    Returns:
        Array of shape (n, m) whose entry [i, j] is the cosine of the angle
        between row i of ``x`` and row j of ``y``.
    """
    def _unit_rows(mat):
        # keepdims leaves the norms as a column so they broadcast over each row.
        row_lengths = np.linalg.norm(mat, axis=1, keepdims=True)
        return mat / row_lengths

    x_hat, y_hat = _unit_rows(x), _unit_rows(y)
    return x_hat @ y_hat.T

def clip_rank(image_pil, text_list):
    """Score captions against a randomly augmented copy of an image with CLIP.

    Uses the module-level ``transform_A`` (augmentation), ``preprocess_clip``
    and ``model_clip`` (loaded with ``clip.load``) and ``device``.

    Args:
        image_pil: PIL image the captions describe.
        text_list: iterable of captions; each item is passed to
            ``clip.tokenize`` on its own.

    Returns:
        list[float]: cosine similarity between the image embedding and each
        caption embedding, in the order of ``text_list``.
    """
    # The image is augmented once per call, so every caption is compared
    # against the same augmented view.
    augmented = transform_A(image=np.array(image_pil))["image"]
    image_pil = Image.fromarray(np.uint8(augmented)).convert('RGB')

    similarities = []
    # Inference only: disable autograd here so the function does not depend on
    # the caller having opened a no_grad() context.
    with torch.no_grad():
        image = preprocess_clip(image_pil).unsqueeze(0).to(device)
        image_features = model_clip.encode_image(image).cpu().numpy()

        for txt in text_list:
            # This cell imports `clip`, not `open_clip`, so tokenize with the
            # library the model came from. truncate=True clips captions longer
            # than the context window instead of raising.
            text = clip.tokenize(txt, truncate=True).to(device)
            text_features = model_clip.encode_text(text).cpu().numpy()

            # cos_sim_2d returns a (1, 1) matrix; index the scalar explicitly,
            # because float() on a 1-element 1-D array is deprecated in NumPy.
            similarities.append(float(cos_sim_2d(text_features, image_features)[0, 0]))
    return similarities


def top_caption_indices(sims, top_k):
    """Return the indices of the ``top_k`` highest similarities, best first.

    Args:
        sims: sequence of similarity scores, one per caption.
        top_k: maximum number of indices to return.

    Returns:
        list[int]: indices into ``sims`` ordered from highest to lowest score;
        on ties the earlier index comes first. Empty if ``sims`` is empty.
    """
    scores = np.asarray(sims, dtype=float)
    # argsort is ascending, so sort the negated scores; a stable sort keeps the
    # original order among equal scores.
    return np.argsort(-scores, kind="stable")[:top_k].tolist()


# Caption up to 50 JPG files from /content: generate candidates with BLIP,
# score them with CLIP and print the best-scoring ones.
files = glob.glob("/content/*.jpg")

with torch.no_grad():

  for f in files[:50]:

    raw_image = Image.open(f).convert('RGB')
    w, h = raw_image.size

    # Show a 300-pixel-wide preview that keeps the aspect ratio.
    display(raw_image.resize((300, int(300 * h / w))))
    print("BEST CAPTION AFTER RANKING WITH CLIP ViT-L/14")

    captions = []

    for n in range(number_of_caption_per_image):

      # Sampled caption (sample=True with a random top_p) from a freshly
      # augmented view, with randomly chosen length bounds.
      image = transform(raw_image).unsqueeze(0).to(device)

      max_length = random.choice([30, 40, 50])
      min_length = random.choice([15, 20, 25])

      topP = random.choice([0.2, 0.3, 0.4, 0.5, 0.6, 0.7])

      caption = model.generate(image,
                               sample=True,
                               max_length=max_length,
                               min_length=min_length,
                               top_p=topP,
                               repetition_penalty=rep_pen)
      captions.append(caption[0])

      # Beam-search caption from another augmented view, with a random beam
      # width and random length bounds.
      image = transform(raw_image).unsqueeze(0).to(device)

      max_length = random.randint(25, 60)
      min_length = random.randint(10, max_length)

      beam_n = random.choice([2,3,4,5,6,7])
      caption = model.generate(image,
                               sample=False,
                               num_beams=beam_n,
                               max_length=max_length,
                               min_length=min_length,
                               repetition_penalty=rep_pen)
      captions.append(caption[0])

    sims = clip_rank(raw_image, captions)

    # Two captions are generated per iteration, so this keeps the better-scoring
    # half of the candidates, highest similarity first.
    for a in top_caption_indices(sims, number_of_caption_per_image):
      print(captions[a])




In [None]:
#@title Captioning Images that show People { vertical-output: true }
#hide
from PIL import Image
import numpy as np
import torch
import clip
# Repetition penalty passed to every model.generate() call in this cell.
rep_pen=1.4

def cos_sim_2d(x, y):
    """Pairwise cosine similarity between the rows of two 2-D arrays.

    Args:
        x: array of shape (n, d).
        y: array of shape (m, d).

    Returns:
        Array of shape (n, m) whose entry [i, j] is the cosine of the angle
        between row i of ``x`` and row j of ``y``.
    """
    def _unit_rows(mat):
        # keepdims leaves the norms as a column so they broadcast over each row.
        row_lengths = np.linalg.norm(mat, axis=1, keepdims=True)
        return mat / row_lengths

    x_hat, y_hat = _unit_rows(x), _unit_rows(y)
    return x_hat @ y_hat.T


# Loaded CLIP models keyed by (model name, device). clip_rank() is called twice
# per image with two different models, so caching avoids reloading the weights
# on every call. Trade-off: each cached model stays resident in memory.
_CLIP_MODEL_CACHE = {}


def clip_rank(image_pil,text_list, clip_model="ViT-L/14"):
    """Score captions against an image with an OpenAI CLIP model.

    Args:
        image_pil: PIL image the captions describe.
        text_list: iterable of captions; each item (a string or a list of
            strings) is passed to ``clip.tokenize`` on its own, and the
            similarity of its first entry is recorded.
        clip_model: model name understood by ``clip.load``.

    Returns:
        list[float]: cosine similarity between the image embedding and each
        caption embedding, in the order of ``text_list``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    cache_key = (clip_model, device)
    if cache_key not in _CLIP_MODEL_CACHE:
        _CLIP_MODEL_CACHE[cache_key] = clip.load(clip_model, device=device)
    model, preprocess = _CLIP_MODEL_CACHE[cache_key]

    similarities = []
    with torch.no_grad():
        image = preprocess(image_pil).unsqueeze(0).to(device)
        image_features = model.encode_image(image).cpu().numpy()

        for txt in text_list:
            # truncate=True clips captions longer than the context window
            # instead of raising.
            text = clip.tokenize(txt, truncate=True).to(device)
            text_features = model.encode_text(text).cpu().numpy()

            # Index the scalar explicitly: float() on a 1-element 1-D array is
            # deprecated in NumPy.
            similarities.append(float(cos_sim_2d(text_features, image_features)[0, 0]))
    return similarities





import sys
# Colab-only: switch into the BLIP checkout so `models.blip` can be imported.
# The install and clone commands are left disabled here.
# NOTE(review): if an earlier cell already ran `%cd BLIP`, this second `%cd BLIP`
# is resolved relative to that directory -- confirm it is harmless.
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    #!pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4
    #!git clone https://github.com/salesforce/BLIP
    %cd BLIP
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


from models.blip import blip_decoder
import glob
# Side length (pixels) of the square input fed to BLIP.
image_size = 384
# Deterministic preprocessing for this cell (no random augmentation): resize to
# image_size x image_size, convert to a tensor and normalize.
# NOTE(review): the mean/std values look like the CLIP preprocessing statistics
# -- confirm they are what the BLIP checkpoint expects.
transform = transforms.Compose([
    transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ]) 




# BLIP large captioning checkpoint, loaded by blip_decoder from this URL.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth'
    
# image_size is repeated as the literal 384 here; keep it in sync with image_size above.
model = blip_decoder(pretrained=model_url, image_size=384, vit='large')
model.eval()
model = model.to(device)


def top_caption_indices(sims, top_k):
    """Return the indices of the ``top_k`` highest similarities, best first.

    Args:
        sims: sequence of similarity scores, one per caption.
        top_k: maximum number of indices to return.

    Returns:
        list[int]: indices into ``sims`` ordered from highest to lowest score;
        on ties the earlier index comes first. Empty if ``sims`` is empty.
    """
    scores = np.asarray(sims, dtype=float)
    # argsort is ascending, so sort the negated scores; a stable sort keeps the
    # original order among equal scores.
    return np.argsort(-scores, kind="stable")[:top_k].tolist()


# Caption up to 40 JPG files from /content. For each image: generate candidate
# captions with BLIP, keep the four best according to the default CLIP model of
# clip_rank(), then let CLIP RN50x64 pick the winner among those four.
files = glob.glob("/content/*.jpg")
for f in files[:40]:

  raw_image = Image.open(f).convert('RGB')
  w, h = raw_image.size

  # Show a 100-pixel-wide preview that keeps the aspect ratio.
  display(raw_image.resize((100, int(100 * h / w))))
  image = transform(raw_image).unsqueeze(0).to(device)

  captions = []

  with torch.no_grad():
    # Two passes with different (min_length, max_length) bounds. A third
    # (45, 60) pass, with top_p up to 0.8 and beam widths 1-6, was tried
    # earlier and is left out.
    for min_len, max_len in [(10, 30), (30, 45)]:

      # Sampled captions, one per top_p value.
      for topP in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]:
        caption = model.generate(image, sample=True, max_length=max_len, min_length=min_len, top_p=topP, repetition_penalty=rep_pen)
        # generate() returns a list; keep the caption string itself.
        captions.append(caption[0])

      # Beam-search captions, one per beam width.
      for beam_n in [1, 2, 3, 4, 5, 6, 7, 8]:
        caption = model.generate(image, sample=False, num_beams=beam_n, max_length=max_len, min_length=min_len, repetition_penalty=rep_pen)
        captions.append(caption[0])

  # Stage 1: shortlist the four highest-scoring captions (best first) without
  # mutating `captions` or `sims`.
  sims = clip_rank(raw_image, captions)
  best_cannidates = [captions[i] for i in top_caption_indices(sims, 4)]

  # Stage 2: re-score the shortlist with a second CLIP model and print the winner.
  sims = clip_rank(raw_image, best_cannidates, clip_model="RN50x64")

  argmax_ = np.argmax(np.asarray(sims))
  print("BEST CAPTION AFTER RANKING WITH CLIP ViT L 14  & RESNET50x64:")
  print (best_cannidates[argmax_])


