In [None]:
! pip install git+https://github.com/openai/CLIP.git
! pip install --pre timm
! pip install transformers
! pip install open_clip_torch

In [None]:
# Import libraries
import torch
import clip
import open_clip
import transformers
import random

import pickle as pkl
import numpy as np

from google.colab import drive
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image
from os import path, listdir
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive, the root under which the OASIS path constants below live.
# NOTE(review): `drive` is already imported in the imports cell and the same mount call is repeated
# in a later cell, so one of the two is redundant.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def embed_image(image, model, preprocess, device):
  """
  Embeds a single image file using an OpenAI CLIP model.

  Args:
    image: Path (or file object) accepted by PIL's Image.open.
    model: Model exposing encode_image.
    preprocess: Callable turning the opened image into a tensor; the batch
      dimension is added here with unsqueeze(0).
    device: Device the preprocessed tensor is moved to before encoding.

  Returns:
    The result of model.encode_image, moved to the CPU and cast to torch.float64.
  """

  # Open inside a context manager so the file handle is released deterministically
  # once the pixel data has been preprocessed, instead of whenever PIL/GC gets to it.
  with Image.open(image) as img:
    image_ = preprocess(img).unsqueeze(0).to(device)

  # Inference only: no gradients needed
  with torch.no_grad():
    image_features = model.encode_image(image_)

  return image_features.to('cpu', dtype=torch.float64)

def embed_text(text_list, model, device):
  """
  Embeds one or more prompts using an OpenAI CLIP model.

  The prompts are tokenized with clip.tokenize, moved to `device`, and encoded
  with model.encode_text without tracking gradients. The features are returned
  on the CPU as torch.float64.
  """

  tokens = clip.tokenize(text_list).to(device)

  with torch.no_grad():
    features = model.encode_text(tokens)

  return features.to('cpu', dtype=torch.float64)

def load_model(model_name, device):
  """
  Loads an OpenAI CLIP model by name.

  Thin wrapper around clip.load; returns the (model, preprocess) pair it produces.
  """

  loaded_model, image_preprocess = clip.load(model_name, device)
  return loaded_model, image_preprocess

def openclip_embed_image(image, model, preprocess, device):
  """
  Embeds a single image file using an OpenCLIP model.

  Args:
    image: Path (or file object) accepted by PIL's Image.open.
    model: Model exposing encode_image.
    preprocess: Callable turning the opened image into a tensor; the batch
      dimension is added here with unsqueeze(0).
    device: Device the preprocessed tensor is moved to before encoding.

  Returns:
    The result of model.encode_image, moved to the CPU and cast to torch.float64.
  """

  # Open inside a context manager so the file handle is released deterministically
  # once the pixel data has been preprocessed.
  with Image.open(image) as img:
    image_ = preprocess(img).unsqueeze(0).to(device)

  # NOTE(review): the image encoder runs under CUDA autocast while
  # openclip_embed_text does not use autocast -- confirm the mismatch is intended.
  with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image_)

  return image_features.to('cpu', dtype=torch.float64)

def openclip_embed_text(text_list, model, tokenizer, device):
  """
  Embeds one or more prompts using an OpenCLIP model.

  The prompts are tokenized with the supplied tokenizer, moved to `device`, and
  encoded with model.encode_text without tracking gradients. The features are
  returned on the CPU as torch.float64.
  """

  tokens = tokenizer(text_list).to(device)

  with torch.no_grad():
    features = model.encode_text(tokens)

  return features.to('cpu', dtype=torch.float64)

def openclip_load_model(model_name, pretrained, device):
  """
  Loads an OpenCLIP model, its image preprocessor and its tokenizer for inference.

  Args:
    model_name: Architecture name passed to open_clip.
    pretrained: Pretrained weights identifier passed through to open_clip.
    device: Device the model is created on.

  Returns:
    (model, preprocess, tokenizer). The second transform returned by
    create_model_and_transforms is discarded; the third one is used as preprocess.
  """

  model, _, preprocess = open_clip.create_model_and_transforms(model_name, device=device, pretrained=pretrained)

  # open_clip hands back models in training mode; switch to eval so layers such as
  # dropout / stochastic depth / batch norm behave deterministically when embedding.
  model.eval()

  tokenizer = open_clip.get_tokenizer(model_name)

  return model, preprocess, tokenizer

def hf_embed_image(image, model, preprocess, device):
  """
  Embeds a single image file using a HuggingFace model.

  Args:
    image: Path (or file object) accepted by PIL's Image.open.
    model: Model exposing get_image_features.
    preprocess: Processor called as preprocess(images=..., return_tensors="pt");
      its result must support .to(device) and ** unpacking.
    device: Device the processed inputs are moved to before encoding.

  Returns:
    The result of model.get_image_features, moved to the CPU and cast to torch.float64.
  """

  # Open inside a context manager so the file handle is released deterministically
  # once the pixel data has been processed.
  with Image.open(image) as img:
    image_ = preprocess(images=img, return_tensors="pt").to(device)

  # Inference only: no gradients needed
  with torch.no_grad():
    image_features = model.get_image_features(**image_)

  return image_features.to('cpu', dtype=torch.float64)

def hf_embed_text(text_list, model, tokenizer, device):
  """
  Embeds one or more prompts using a HuggingFace model.

  The tokenizer is called with padding and truncation enabled and PyTorch
  tensors requested; its output is moved to `device` and unpacked into
  model.get_text_features without tracking gradients. The features are
  returned on the CPU as torch.float64.
  """

  encoded = tokenizer(text_list, padding=True, truncation=True, return_tensors="pt")
  encoded = encoded.to(device)

  with torch.no_grad():
    features = model.get_text_features(**encoded)

  return features.to('cpu', dtype=torch.float64)

def hf_load_model(model_name, device):
  """
  Loads a HuggingFace CLIP model together with its processor and tokenizer.

  All three are created with from_pretrained(model_name); only the model is
  moved to `device`. Returns (model, processor, tokenizer).
  """

  clip_model = transformers.CLIPModel.from_pretrained(model_name)
  clip_model = clip_model.to(device)

  clip_processor = transformers.CLIPProcessor.from_pretrained(model_name)
  clip_tokenizer = transformers.CLIPTokenizer.from_pretrained(model_name)

  return clip_model, clip_processor, clip_tokenizer

def compute_cosine_similarity(tensor_1: torch.Tensor, tensor_2: torch.Tensor) -> float:
  """
  Computes the cosine similarity between two tensors.

  Both inputs are squeezed before the dot product, so they must reduce to 1-D
  vectors of equal length (e.g. shape (1, d) or (d,)).

  Returns:
    The cosine similarity as a Python float. Previously `.item()` was applied to
    the denominator only, so a 0-dim tensor was returned despite the annotation.
  """

  vector_1, vector_2 = tensor_1.squeeze(), tensor_2.squeeze()
  similarity = torch.dot(vector_1, vector_2) / (torch.norm(tensor_1) * torch.norm(tensor_2))

  # Convert the whole quotient (not just the denominator) to a Python scalar
  return similarity.item()

In [None]:
# Mount Google Drive at /content/drive, the root used by every path constant defined below
drive.mount('/content/drive')

In [None]:
# Fix the seed of every random number generator in use (Python, NumPy, PyTorch) to 42
# so repeated runs are reproducible
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# Root of the OASIS project folder on the mounted Google Drive
OASIS_ROOT = '/content/drive/My Drive/OASIS'

# Input: directory containing the OASIS images
IMG_PATH = OASIS_ROOT + '/images'

# Outputs: where the pickled embeddings and text-image similarities are written
IMAGE_EMBEDDING_SAVE_PATH = OASIS_ROOT + '/image_embeddings'
TEXT_EMBEDDING_SAVE_PATH = OASIS_ROOT + '/text_embeddings'
SIMILARITY_SAVE_PATH = OASIS_ROOT + '/text_image_similarities'

# Input: text file holding the attribute prompts, one per line
PROMPT_PATH = OASIS_ROOT + '/stimuli/attributes.txt'

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

In [None]:
# Paths to the 1,004 images of the OASIS dataset (entries containing 'DS_', e.g. .DS_Store, are skipped).
# listdir returns entries in arbitrary order, so sort to keep the processing order -- and the key
# order of the pickled dictionaries -- identical from run to run.
first_impression_images = sorted(path.join(IMG_PATH, i) for i in listdir(IMG_PATH) if 'DS_' not in i)

In [None]:
# Sanity check: show how many image paths were found plus a short preview,
# rather than dumping the entire list into the cell output
len(first_impression_images), first_impression_images[:5]

In [None]:
# Read in prompts: one per line, empty lines skipped.
# The context manager closes the file as soon as it has been read.
with open(PROMPT_PATH) as prompt_file:
  text_attributes = [line for line in prompt_file.read().split('\n') if line]

In [None]:
# Embed the prompts and images with every model available in the OpenAI CLIP repository,
# pickling text embeddings, image embeddings and text-image cosine similarities per model
for model_name in clip.available_models():

  # Model names may contain '/', which would be read as a directory separator in the output file names
  model_name_dash = '-'.join(model_name.split('/'))

  # Load the model with its image preprocessor and make sure it sits on the target device
  model, preprocess = load_model(model_name, device)
  model.to(device)

  # Fresh containers for this model
  image_embedding_dict = {}
  text_embedding_dict = {}
  text_image_similarity_dict = {}

  # Embed each attribute prompt on its own (batch of one)
  for attribute in tqdm(text_attributes):
    text_embedding_dict[attribute] = embed_text([attribute], model, device)

  # Persist the text embeddings
  text_pkl_path = path.join(TEXT_EMBEDDING_SAVE_PATH, f'openai_{model_name_dash}_oasis_text.pkl')
  with open(text_pkl_path, 'wb') as pkl_writer:
    pkl.dump(text_embedding_dict, pkl_writer, protocol=-1)

  # Embed each image, keyed by its file path
  for image in tqdm(first_impression_images):
    image_embedding_dict[image] = embed_image(image, model, preprocess, device)

  # Persist the image embeddings
  image_pkl_path = path.join(IMAGE_EMBEDDING_SAVE_PATH, f'openai_{model_name_dash}_oasis_images.pkl')
  with open(image_pkl_path, 'wb') as pkl_writer:
    pkl.dump(image_embedding_dict, pkl_writer, protocol=-1)

  # For every attribute, compare its embedding against every image embedding
  for attribute in text_attributes:
    text_embedding = text_embedding_dict[attribute]

    similarities_for_attribute = {}
    for img_, img_embedding in image_embedding_dict.items():
      similarities_for_attribute[img_] = compute_cosine_similarity(img_embedding, text_embedding)

    text_image_similarity_dict[attribute] = similarities_for_attribute

  # Persist the similarities (attribute -> image path -> similarity)
  similarity_pkl_path = path.join(SIMILARITY_SAVE_PATH, f'openai_{model_name_dash}_oasis_similarities.pkl')
  with open(similarity_pkl_path, 'wb') as pkl_writer:
    pkl.dump(text_image_similarity_dict, pkl_writer, protocol=-1)

In [None]:
# Fetch the LAION scaling-laws OpenCLIP repository, install its requirements and run its
# model download script.
# `%cd` changes the notebook's working directory to the clone for all subsequent cells; the later
# cells that load checkpoints by bare file name (no directory) rely on that.
# NOTE(review): re-running this cell clones again from inside the clone -- presumably not
# intended; run it once per session.
! git clone https://github.com/LAION-AI/scaling-laws-openclip
%cd scaling-laws-openclip
! pip install -r requirements.txt
! python download_models.py

In [None]:
# Collect the downloaded scaling-laws checkpoints: every '.pt' file in the clone,
# leaving out those whose name contains 'H-14' or 'g-14'
checkpoint_dir = '/content/scaling-laws-openclip'
excluded_tags = ('H-14', 'g-14')

models = [
    file_name for file_name in listdir(checkpoint_dir)
    if file_name.endswith('.pt') and not any(tag in file_name for tag in excluded_tags)
]

# Derive the model type from each file name: take the first underscore-separated field,
# keep what follows its last 'l', and prefix it with 'ViT'
# (e.g. a name starting with 'Model-B-32_' gives 'ViT-B-32' -- presumably the repo's naming scheme; verify)
model_types = ['ViT' + file_name.split('_')[0].split('l')[-1] for file_name in models]

# Pair each model type with its checkpoint file name
scaling_model_tuples = list(zip(model_types, models))

In [None]:
# Embed the prompts and images with every scaling-laws OpenCLIP checkpoint, pickling text
# embeddings, image embeddings and text-image cosine similarities per checkpoint.

# Directory the checkpoint list was built from; used to resolve checkpoint files explicitly
# instead of relying on the notebook's current working directory
SCALING_MODEL_DIR = '/content/scaling-laws-openclip'

for model_name, pretraining in scaling_model_tuples:

  # Replace forward slash with dash to prevent issues with path when saving
  model_name_dash = model_name.replace('/', '-')

  # Checkpoint name without its extension, for the output file names. splitext only strips the
  # final extension, so a name containing other dots is not truncated (which could make two
  # checkpoints write to the same output files).
  write_pretraining = path.splitext(pretraining)[0]

  print(f'Embedding {model_name} with {pretraining} pretraining')

  # Load model, preprocessor and tokenizer through the shared helper, from an absolute checkpoint path
  pretraining_path = path.join(SCALING_MODEL_DIR, pretraining)
  model, preprocess, tokenizer = openclip_load_model(model_name, pretraining_path, device)
  model.to(device)

  # open_clip returns models in training mode; switch to eval for deterministic inference
  model.eval()

  # Create dictionaries to store embeddings and similarities
  image_embedding_dict, text_embedding_dict, text_image_similarity_dict = {}, {}, {}

  # Embed text attributes, one prompt at a time
  for attribute in tqdm(text_attributes):
    text_embedding_dict[attribute] = openclip_embed_text([attribute], model, tokenizer, device)

  # Dump text embeddings to pickle file
  with open(path.join(TEXT_EMBEDDING_SAVE_PATH, f'scaling_{model_name_dash}_{write_pretraining}_oasis_text.pkl'), 'wb') as pkl_writer:
    pkl.dump(text_embedding_dict, pkl_writer, protocol=-1)

  # Embed images, keyed by file path
  for image in tqdm(first_impression_images):
    image_embedding_dict[image] = openclip_embed_image(image, model, preprocess, device)

  # Dump image embeddings to pickle file
  with open(path.join(IMAGE_EMBEDDING_SAVE_PATH, f'scaling_{model_name_dash}_{write_pretraining}_oasis_images.pkl'), 'wb') as pkl_writer:
    pkl.dump(image_embedding_dict, pkl_writer, protocol=-1)

  # Compute cosine similarities between text attributes and images
  for attribute in text_attributes:

    text_embedding = text_embedding_dict[attribute]
    text_image_similarity_dict[attribute] = {
        img_: compute_cosine_similarity(img_embedding, text_embedding)
        for img_, img_embedding in image_embedding_dict.items()
    }

  # Dump similarities to pickle file (attribute -> image path -> similarity)
  with open(path.join(SIMILARITY_SAVE_PATH, f'scaling_{model_name_dash}_{write_pretraining}_oasis_similarities.pkl'), 'wb') as pkl_writer:
    pkl.dump(text_image_similarity_dict, pkl_writer, protocol=-1)

In [None]:
# Download the FaRL ep16 and ep64 pretrained checkpoints. wget is given no output path, so the
# files land in the current working directory, which is where the later cell that loads them
# by bare file name expects them.
! wget https://github.com/FacePerceiver/FaRL/releases/download/pretrained_weights/FaRL-Base-Patch16-LAIONFace20M-ep16.pth
! wget https://github.com/FacePerceiver/FaRL/releases/download/pretrained_weights/FaRL-Base-Patch16-LAIONFace20M-ep64.pth

In [None]:
# Base architecture for the FaRL (FaceCLIP) checkpoints: an OpenAI CLIP ViT-B/16 with its
# image preprocessor, placed on the target device. The FaRL weights are loaded into it later.
model, preprocess = clip.load(name="ViT-B/16", device=device)
model = model.to(device=device)

In [None]:
# (model name, weights file) pairs for the FaRL models; each weights file is the model name plus '.pth'
faceclip_tuples = [
    (farl_name, farl_name + '.pth')
    for farl_name in ('FaRL-Base-Patch16-LAIONFace20M-ep16',
                      'FaRL-Base-Patch16-LAIONFace20M-ep64')
]

In [None]:
# Iterate through the FaRL (FaceCLIP) checkpoints: load each set of weights into the shared
# ViT-B/16 `model` created in the previous cell, then pickle text embeddings, image embeddings
# and text-image cosine similarities.
for model_name, model_path in faceclip_tuples:

  # Replace forward slash with dash to prevent issues with path when saving
  model_name_dash = model_name.replace('/', '-')

  print(f'Embedding {model_name}')

  # The FaRL checkpoints wrap the weights in a dict under "state_dict" (see the FaRL README).
  # Passing the whole checkpoint to load_state_dict(strict=False) matches no parameter names and
  # silently leaves the original CLIP weights in place, so unwrap it first.
  checkpoint = torch.load(model_path, map_location=device)
  if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
    farl_state = checkpoint['state_dict']
  else:
    farl_state = checkpoint

  load_result = model.load_state_dict(farl_state, strict=False)

  # strict=False hides key mismatches; fail loudly if not a single model weight was found
  if len(load_result.missing_keys) == len(model.state_dict()):
    raise RuntimeError(f'No weights from {model_path} matched the model; refusing to embed with the base CLIP weights')

  # Create dictionaries to store embeddings and similarities
  image_embedding_dict, text_embedding_dict, text_image_similarity_dict = {}, {}, {}

  # Embed text attributes, one prompt at a time (passed as a one-element list, like the other loops)
  for attribute in tqdm(text_attributes):
    text_embedding_dict[attribute] = embed_text([attribute], model, device)

  # Dump text embeddings to pickle file
  with open(path.join(TEXT_EMBEDDING_SAVE_PATH, f'faceclip_{model_name_dash}_oasis_text.pkl'), 'wb') as pkl_writer:
    pkl.dump(text_embedding_dict, pkl_writer, protocol=-1)

  # Embed images, keyed by file path
  for image in tqdm(first_impression_images):
    image_embedding_dict[image] = embed_image(image, model, preprocess, device)

  # Dump image embeddings to pickle file
  with open(path.join(IMAGE_EMBEDDING_SAVE_PATH, f'faceclip_{model_name_dash}_oasis_images.pkl'), 'wb') as pkl_writer:
    pkl.dump(image_embedding_dict, pkl_writer, protocol=-1)

  # Compute cosine similarities between text attributes and images
  for attribute in text_attributes:

    text_embedding = text_embedding_dict[attribute]
    text_image_similarity_dict[attribute] = {
        img_: compute_cosine_similarity(img_embedding, text_embedding)
        for img_, img_embedding in image_embedding_dict.items()
    }

  # Dump similarities to pickle file (attribute -> image path -> similarity)
  with open(path.join(SIMILARITY_SAVE_PATH, f'faceclip_{model_name_dash}_oasis_similarities.pkl'), 'wb') as pkl_writer:
    pkl.dump(text_image_similarity_dict, pkl_writer, protocol=-1)

In [None]:
# (HuggingFace repository id, short model name) pairs for the TenCent FaceCLIP models;
# every repository id is the short name under the 'P01son/' namespace
hf_faceclip_tuples = [
    ('P01son/' + short_name, short_name)
    for short_name in ('FaceCLIP-base-32', 'FaceCLIP-base-16', 'FaceCLIP-large-14')
]

In [None]:
# Embed the prompts and images with each FaceCLIP model hosted on HuggingFace, pickling text
# embeddings, image embeddings and text-image cosine similarities per model
for hf_tuple in hf_faceclip_tuples:

  # First element is the repository id to load from, second the short name used in file names
  model_path, model_name = hf_tuple

  # Guard against '/' in the name ending up as a directory separator in the output file names
  model_name_dash = '-'.join(model_name.split('/'))

  # Load the model, its processor and tokenizer, and make sure the model sits on the target device
  model, preprocess, tokenizer = hf_load_model(model_path, device)
  model.to(device)

  # Fresh containers for this model
  image_embedding_dict = {}
  text_embedding_dict = {}
  text_image_similarity_dict = {}

  # Embed each attribute prompt on its own
  for attribute in tqdm(text_attributes):
    text_embedding_dict[attribute] = hf_embed_text(attribute, model, tokenizer, device)

  # Persist the text embeddings
  text_pkl_path = path.join(TEXT_EMBEDDING_SAVE_PATH, f'faceclip_{model_name_dash}_oasis_text.pkl')
  with open(text_pkl_path, 'wb') as pkl_writer:
    pkl.dump(text_embedding_dict, pkl_writer, protocol=-1)

  # Embed each image, keyed by its file path
  for image in tqdm(first_impression_images):
    image_embedding_dict[image] = hf_embed_image(image, model, preprocess, device)

  # Persist the image embeddings
  image_pkl_path = path.join(IMAGE_EMBEDDING_SAVE_PATH, f'faceclip_{model_name_dash}_oasis_images.pkl')
  with open(image_pkl_path, 'wb') as pkl_writer:
    pkl.dump(image_embedding_dict, pkl_writer, protocol=-1)

  # For every attribute, compare its embedding against every image embedding
  for attribute in text_attributes:
    text_embedding = text_embedding_dict[attribute]

    similarities_for_attribute = {}
    for img_, img_embedding in image_embedding_dict.items():
      similarities_for_attribute[img_] = compute_cosine_similarity(img_embedding, text_embedding)

    text_image_similarity_dict[attribute] = similarities_for_attribute

  # Persist the similarities (attribute -> image path -> similarity)
  similarity_pkl_path = path.join(SIMILARITY_SAVE_PATH, f'faceclip_{model_name_dash}_oasis_similarities.pkl')
  with open(similarity_pkl_path, 'wb') as pkl_writer:
    pkl.dump(text_image_similarity_dict, pkl_writer, protocol=-1)