In [None]:
# Install the notebook's dependencies: OpenAI CLIP (straight from GitHub),
# a pre-release build of timm, HuggingFace transformers, and OpenCLIP.
# NOTE(review): no versions are pinned, so a re-run may pull newer releases.
! pip install git+https://github.com/openai/CLIP.git
! pip install --pre timm
! pip install transformers
! pip install open_clip_torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-ce98k01r
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-ce98k01r
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import libraries
import torch
import clip
import open_clip
import transformers
import random

import pickle as pkl
import numpy as np

from google.colab import drive
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image
from os import path, listdir
from tqdm import tqdm

In [None]:
def embed_image(image, model, preprocess, device):
  """
  Encode a single image file with an OpenAI CLIP model.

  `image` is handed to PIL's Image.open. The opened image is run through
  `preprocess`, given a leading dimension of size one, and moved to
  `device` before `model.encode_image` is called with gradients disabled.

  Returns the image features on the CPU as a float64 tensor.
  """

  pil_image = Image.open(image)
  batch = preprocess(pil_image).unsqueeze(0)
  batch = batch.to(device)

  with torch.no_grad():
    features = model.encode_image(batch)

  return features.to('cpu', dtype=torch.float64)

def embed_text(text_list, model, device):
  """
  Encode text with an OpenAI CLIP model.

  `text_list` is tokenized with clip.tokenize, the tokens are moved to
  `device`, and `model.encode_text` is called with gradients disabled.

  Returns the text features on the CPU as a float64 tensor.
  """

  tokens = clip.tokenize(text_list)
  tokens = tokens.to(device)

  with torch.no_grad():
    features = model.encode_text(tokens)

  return features.to('cpu', dtype=torch.float64)

def load_model(model_name, device):
  """
  Load an OpenAI CLIP model by name through clip.load.

  Returns the pair (model, preprocess) that clip.load produces.
  """

  clip_model, clip_preprocess = clip.load(model_name, device)
  return clip_model, clip_preprocess

def openclip_embed_image(image, model, preprocess, device):
  """
  Encode a single image file with an OpenCLIP model.

  `image` is handed to PIL's Image.open. The opened image is run through
  `preprocess`, given a leading dimension of size one, and moved to
  `device`. `model.encode_image` is then called with gradients disabled
  and inside a torch.cuda.amp.autocast() context.

  Returns the image features on the CPU as a float64 tensor.
  """

  pil_image = Image.open(image)
  batch = preprocess(pil_image).unsqueeze(0)
  batch = batch.to(device)

  with torch.no_grad():
    with torch.cuda.amp.autocast():
      features = model.encode_image(batch)

  return features.to('cpu', dtype=torch.float64)

def openclip_embed_text(text_list, model, tokenizer, device):
  """
  Encode text with an OpenCLIP model.

  `tokenizer` is called on `text_list`, the result is moved to `device`,
  and `model.encode_text` is called with gradients disabled.

  Returns the text features on the CPU as a float64 tensor.
  """

  with torch.no_grad():
    tokens = tokenizer(text_list)
    tokens = tokens.to(device)
    features = model.encode_text(tokens)

  return features.to('cpu', dtype=torch.float64)

def openclip_load_model(model_name, pretrained, device):
  """
  Loads an OpenCLIP model, its image preprocessor, and its tokenizer.

  `model_name` and `pretrained` are forwarded to
  open_clip.create_model_and_transforms, which also places the model on
  `device`. The second element it returns is discarded and the third is
  used as the preprocessor. The tokenizer is looked up by `model_name`.

  Returns a (model, preprocess, tokenizer) triple.
  """

  model, _, preprocess = open_clip.create_model_and_transforms(model_name, device=device, pretrained=pretrained)

  # OpenCLIP hands the model back in training mode; this loader feeds
  # inference-only embedding helpers, so switch to evaluation mode
  model.eval()

  tokenizer = open_clip.get_tokenizer(model_name)

  return model, preprocess, tokenizer

def hf_embed_image(image, model, preprocess, device):
  """
  Encode a single image file with a HuggingFace model.

  `image` is handed to PIL's Image.open. `preprocess` is called with the
  opened image and return_tensors="pt", and its result is moved to
  `device`. `model.get_image_features` then receives that result as
  keyword arguments, with gradients disabled.

  Returns the image features on the CPU as a float64 tensor.
  """

  pil_image = Image.open(image)
  inputs = preprocess(images=pil_image, return_tensors="pt")
  inputs = inputs.to(device)

  with torch.no_grad():
    features = model.get_image_features(**inputs)

  return features.to('cpu', dtype=torch.float64)

def hf_embed_text(text_list, model, tokenizer, device):
  """
  Encode text with a HuggingFace model.

  `tokenizer` is called on `text_list` with padding and truncation
  enabled and return_tensors="pt"; its result is moved to `device` and
  passed to `model.get_text_features` as keyword arguments, with
  gradients disabled.

  Returns the text features on the CPU as a float64 tensor.
  """

  with torch.no_grad():
    encoded = tokenizer(text_list, padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(device)
    features = model.get_text_features(**encoded)

  return features.to('cpu', dtype=torch.float64)

def hf_load_model(model_name, device):
  """
  Load a HuggingFace CLIP model together with its processor and tokenizer.

  All three are created with from_pretrained(model_name); only the model
  is moved to `device`.

  Returns a (model, processor, tokenizer) triple.
  """

  hf_model = transformers.CLIPModel.from_pretrained(model_name)
  hf_model = hf_model.to(device)

  hf_processor = transformers.CLIPProcessor.from_pretrained(model_name)
  hf_tokenizer = transformers.CLIPTokenizer.from_pretrained(model_name)

  return hf_model, hf_processor, hf_tokenizer

def compute_cosine_similarity(tensor_1: torch.Tensor, tensor_2: torch.Tensor) -> float:
  """
  Computes the cosine similarity between two tensors.

  Each tensor is squeezed first, so both must reduce to one-dimensional
  vectors of equal length (for example shape (1, D) or (D,)).

  Returns the similarity as a Python float. If either vector has zero
  norm the division yields nan.
  """

  vector_1 = tensor_1.squeeze()
  vector_2 = tensor_2.squeeze()

  similarity = torch.dot(vector_1, vector_2) / (torch.norm(vector_1) * torch.norm(vector_2))

  # Convert the final quotient, not just the denominator, so callers get
  # the float promised by the signature instead of a 0-dim tensor
  return similarity.item()

In [None]:
# Mount Google Drive at /content/drive; the dataset, prompt, and output
# paths defined in this notebook all live under '/content/drive/My Drive'
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Seed PyTorch, NumPy, and Python's random module with the same value (42) for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Project folder on the mounted Google Drive; every path constant below is inside it
_PROJECT_DIR = '/content/drive/My Drive/CLIP_Facial_Impressions'

# Directory holding the image files
IMG_PATH = _PROJECT_DIR + '/omi/images'

# Output directories for the pickled embeddings and similarities
IMAGE_EMBEDDING_SAVE_PATH = _PROJECT_DIR + '/image_embeddings'
TEXT_EMBEDDING_SAVE_PATH = _PROJECT_DIR + '/text_embeddings'
SIMILARITY_SAVE_PATH = _PROJECT_DIR + '/text_image_similarities'

# Text files with the attribute prompts and with their opposites
PROMPT_PATH = _PROJECT_DIR + '/stimuli/attributes.txt'
OPPOSITE_PATH = _PROJECT_DIR + '/stimuli/attributes_opposites.txt'

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

In [None]:
# Paths to the 1,004 images of the first impression dataset, named 1.jpg through 1004.jpg
first_impression_images = [
    path.join(IMG_PATH, str(image_number) + '.jpg')
    for image_number in range(1, 1005)
]

In [None]:
def _read_prompt_file(file_path):
  """
  Reads the prompts stored in a text file, one per non-empty line.

  Only the text after the last ':' on a line is kept; a line without a
  ':' is kept whole.

  Returns the prompts as a list of strings in file order.
  """

  # The context manager closes the file handle once the content is read
  with open(file_path) as prompt_file:
    lines = prompt_file.read().split('\n')

  return [line.split(':')[-1] for line in lines if line]

# Read in prompts corresponding to text attributes and to their opposites
prompt_attributes = _read_prompt_file(PROMPT_PATH)
opposite_prompts = _read_prompt_file(OPPOSITE_PATH)

# Keep only unique prompts; sorting makes the order independent of string
# hashing, so the attributes are processed in the same order on every run
text_attributes = sorted(set(prompt_attributes + opposite_prompts))

In [None]:
# Embed the text attributes and the images with every model offered by the OpenAI CLIP package
for model_name in clip.available_models():

  # Model names may contain '/', which must not end up inside a file name
  model_name_dash = '-'.join(model_name.split('/'))

  # Load the model with its preprocessor and move the model to the device
  model, preprocess = load_model(model_name, device)
  model.to(device)

  # Fresh containers for this model's embeddings and similarities
  image_embedding_dict = {}
  text_embedding_dict = {}
  text_image_similarity_dict = {}

  # Text embeddings, keyed by the attribute string
  for attribute in tqdm(text_attributes):
    text_embedding_dict[attribute] = embed_text([attribute], model, device)

  # Save the text embeddings as a pickle file
  text_file = path.join(TEXT_EMBEDDING_SAVE_PATH, f'openai_{model_name_dash}_text.pkl')
  with open(text_file, 'wb') as pkl_writer:
    pkl.dump(text_embedding_dict, pkl_writer, protocol=-1)

  # Image embeddings, keyed by the image path
  for image in tqdm(first_impression_images):
    image_embedding = embed_image(image, model, preprocess, device)
    image_embedding_dict[image] = image_embedding

  # Save the image embeddings as a pickle file
  image_file = path.join(IMAGE_EMBEDDING_SAVE_PATH, f'openai_{model_name_dash}_first_impression_images.pkl')
  with open(image_file, 'wb') as pkl_writer:
    pkl.dump(image_embedding_dict, pkl_writer, protocol=-1)

  # For each attribute, the cosine similarity between its embedding and every image embedding
  for attribute in text_attributes:
    text_embedding = text_embedding_dict[attribute]
    text_image_similarity_dict[attribute] = {
        img_: compute_cosine_similarity(img_embedding_, text_embedding)
        for img_, img_embedding_ in image_embedding_dict.items()
    }

  # Save the similarities as a pickle file
  similarity_file = path.join(SIMILARITY_SAVE_PATH, f'openai_{model_name_dash}_first_impression_similarities.pkl')
  with open(similarity_file, 'wb') as pkl_writer:
    pkl.dump(text_image_similarity_dict, pkl_writer, protocol=-1)

In [None]:
# Fetch the LAION scaling-laws-openclip repository, change into it, install
# its requirements, and run its model download script.
# The %cd changes the notebook's working directory for every cell that runs
# after this one; later cells refer to downloaded files by bare file name.
! git clone https://github.com/LAION-AI/scaling-laws-openclip
%cd scaling-laws-openclip
! pip install -r requirements.txt
! python download_models.py

Cloning into 'scaling-laws-openclip'...
remote: Enumerating objects: 160, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 160 (delta 0), reused 0 (delta 0), pack-reused 134[K
Receiving objects: 100% (160/160), 1.47 MiB | 4.58 MiB/s, done.
Resolving deltas: 100% (90/90), done.
/content/scaling-laws-openclip
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading (…)3B_lr-5e-4_bs-32k.pt: 100% 1.82G/1.82G [00:37<00:00, 48.4MB/s]
'Model-B-32_Data-80M_Samples-3B_lr-5e-4_bs-32k.pt' downloaded.
Downloading (…)3B_lr-1e-3_bs-88k.pt: 100% 1.82G/1.82G [00:32<00:00, 56.7MB/s]
'Model-B-32_Data-400M_Samples-3B_lr-1e-3_bs-88k.pt' downloaded.
Downloading (…)3B_lr-1e-3_bs-88k.pt: 100% 1.82G/1.82G [00:30<00:00, 59.9MB/s]
'Model-B-32_Data-2B_Samples-3B_lr-1e-3_bs-88k.pt' downloaded.
Downloading (…)3B_lr-5e-4_bs-32k.pt: 100% 1.82G/1.82G [00:30<00:00, 59.3MB/s]
'Model-B-32_Dat

In [None]:
# Get the checkpoint files of all scaling CLIP models. os.listdir returns
# names in arbitrary order, so sort them to process the models in the same
# order on every run.
models = sorted(f for f in listdir('/content/scaling-laws-openclip') if f.endswith('.pt'))

# Derive the model type from each checkpoint file name, e.g.
# 'Model-L-14_Data-400M_...' -> 'ViT-L-14'. A comprehension is used so the
# notebook-level name `model` is not overwritten with a file name.
model_types = ['ViT' + checkpoint.split('_')[0].split('l')[-1] for checkpoint in models]

# Create list of tuples containing model type and pretrained model
scaling_model_tuples = list(zip(model_types, models))

In [None]:
# Embed the text attributes and the images with every scaling-laws OpenCLIP checkpoint
for model_tuple in scaling_model_tuples:

  # Get model type and pretrained model
  model_name, pretraining = model_tuple[0], model_tuple[1]

  # Replace forward slash with dash to prevent issues with path when saving
  model_name_dash = model_name.replace('/', '-')

  # Get name of pretrained model for saving (file name up to its first '.')
  write_pretraining = pretraining.split('.')[0]

  print(f'Embedding {model_name} with {pretraining} pretraining')

  # Load model, preprocessor and tokenizer through the shared helper, which
  # also places the model on the device. `pretraining` is a bare file name, so
  # this relies on the working directory being the checkpoint directory.
  model, preprocess, tokenizer = openclip_load_model(model_name, pretraining, device)

  # OpenCLIP hands the model back in training mode; embeddings must be
  # computed in evaluation mode
  model.eval()

  # Create dictionaries to store embeddings and similarities
  image_embedding_dict, text_embedding_dict, text_image_similarity_dict = {}, {}, {}

  # Embed text attributes
  for attribute in tqdm(text_attributes):
    text_embedding_dict[attribute] = openclip_embed_text([attribute], model, tokenizer, device)

  # Dump text embeddings to pickle file
  with open(path.join(TEXT_EMBEDDING_SAVE_PATH, f'scaling_{model_name_dash}_{write_pretraining}_text.pkl'), 'wb') as pkl_writer:
    pkl.dump(text_embedding_dict, pkl_writer, protocol=-1)

  # Embed images
  for image in tqdm(first_impression_images):

    image_embedding = openclip_embed_image(image, model, preprocess, device)
    image_embedding_dict[image] = image_embedding

  # Dump image embeddings to pickle file
  with open(path.join(IMAGE_EMBEDDING_SAVE_PATH, f'scaling_{model_name_dash}_{write_pretraining}_first_impression_images.pkl'), 'wb') as pkl_writer:
    pkl.dump(image_embedding_dict, pkl_writer, protocol=-1)

  # Compute cosine similarities between text attributes and images
  for attribute in text_attributes:

    text_embedding = text_embedding_dict[attribute]
    text_image_similarity_dict[attribute] = {img_: compute_cosine_similarity(image_embedding_dict[img_], text_embedding) for img_ in image_embedding_dict.keys()}

  # Dump similarities to pickle file
  with open(path.join(SIMILARITY_SAVE_PATH, f'scaling_{model_name_dash}_{write_pretraining}_first_impression_similarities.pkl'), 'wb') as pkl_writer:
    pkl.dump(text_image_similarity_dict, pkl_writer, protocol=-1)

ViT-L-14
Model-L-14_Data-400M_Samples-13B_lr-1e-3_bs-86k.pt


100%|██████████| 1315/1315 [00:13<00:00, 97.92it/s]


text attributes


100%|██████████| 1004/1004 [00:47<00:00, 20.99it/s]


first impressions


100%|██████████| 685/685 [01:30<00:00,  7.57it/s]


cfd


100%|██████████| 142/142 [00:19<00:00,  7.15it/s]


cfd_i
ViT-L-14
Model-L-14_Data-2B_Samples-3B_lr-1e-3_bs-88k.pt


100%|██████████| 1315/1315 [00:13<00:00, 98.01it/s]


text attributes


100%|██████████| 1004/1004 [00:47<00:00, 21.02it/s]


first impressions


100%|██████████| 685/685 [01:30<00:00,  7.56it/s]


cfd


100%|██████████| 142/142 [00:19<00:00,  7.15it/s]


cfd_i
ViT-L-14
Model-L-14_Data-2B_Samples-13B_lr-1e-3_bs-86k.pt


100%|██████████| 1315/1315 [00:13<00:00, 98.82it/s]


text attributes


100%|██████████| 1004/1004 [00:47<00:00, 21.12it/s]


first impressions


100%|██████████| 685/685 [01:30<00:00,  7.56it/s]


cfd


100%|██████████| 142/142 [00:19<00:00,  7.14it/s]


cfd_i
ViT-H-14
Model-H-14_Data-2B_Samples-34B_lr-5e-4_bs-79k.pt


100%|██████████| 1315/1315 [00:26<00:00, 50.44it/s]


text attributes


100%|██████████| 1004/1004 [00:56<00:00, 17.75it/s]


first impressions


100%|██████████| 685/685 [01:35<00:00,  7.14it/s]


cfd


100%|██████████| 142/142 [00:20<00:00,  6.77it/s]


cfd_i
ViT-L-14
Model-L-14_Data-400M_Samples-34B_lr-1e-3_bs-86k.pt


100%|██████████| 1315/1315 [00:13<00:00, 96.52it/s]


text attributes


100%|██████████| 1004/1004 [00:48<00:00, 20.85it/s]


first impressions


100%|██████████| 685/685 [01:30<00:00,  7.55it/s]


cfd


100%|██████████| 142/142 [00:20<00:00,  7.09it/s]


cfd_i
ViT-L-14
Model-L-14_Data-80M_Samples-34B_lr-1e-3_bs-88k.pt


100%|██████████| 1315/1315 [00:13<00:00, 96.25it/s]


text attributes


100%|██████████| 1004/1004 [00:48<00:00, 20.74it/s]


first impressions


100%|██████████| 685/685 [01:30<00:00,  7.54it/s]


cfd


100%|██████████| 142/142 [00:19<00:00,  7.11it/s]


cfd_i
ViT-g-14
Model-g-14_Data-2B_Samples-13B_lr-5e-4_bs-64k.pt


100%|██████████| 1315/1315 [00:26<00:00, 50.20it/s]


text attributes


100%|██████████| 1004/1004 [01:03<00:00, 15.81it/s]


first impressions


100%|██████████| 685/685 [01:40<00:00,  6.78it/s]


cfd


100%|██████████| 142/142 [00:22<00:00,  6.43it/s]


cfd_i
ViT-L-14
Model-L-14_Data-80M_Samples-13B_lr-1e-3_bs-88k.pt


100%|██████████| 1315/1315 [00:13<00:00, 97.19it/s]


text attributes


100%|██████████| 1004/1004 [00:48<00:00, 20.91it/s]


first impressions


100%|██████████| 685/685 [01:30<00:00,  7.56it/s]


cfd


100%|██████████| 142/142 [00:19<00:00,  7.15it/s]


cfd_i
ViT-L-14
Model-L-14_Data-2B_Samples-34B_lr-1e-3_bs-86k.pt


100%|██████████| 1315/1315 [00:13<00:00, 96.14it/s]


text attributes


100%|██████████| 1004/1004 [00:47<00:00, 21.13it/s]


first impressions


100%|██████████| 685/685 [01:30<00:00,  7.56it/s]


cfd


100%|██████████| 142/142 [00:19<00:00,  7.15it/s]

cfd_i





In [None]:
# Download the two FaRL checkpoints (ep16 and ep64) from the FaRL GitHub
# releases page; wget saves them into the current working directory
! wget https://github.com/FacePerceiver/FaRL/releases/download/pretrained_weights/FaRL-Base-Patch16-LAIONFace20M-ep16.pth
! wget https://github.com/FacePerceiver/FaRL/releases/download/pretrained_weights/FaRL-Base-Patch16-LAIONFace20M-ep64.pth

In [None]:
# Load the OpenAI CLIP ViT-B/16 base model and its preprocessor on the device.
# The FaceCLIP (FaRL) weights are loaded into this `model` object by a later cell.
model, preprocess = clip.load("ViT-B/16", device=device)
# clip.load was already given the device, so this keeps the model where it is
model = model.to(device)

In [None]:
# Pair each FaRL model name with its weights file, which is the name plus '.pth'
faceclip_tuples = [
    (farl_name, farl_name + '.pth')
    for farl_name in ('FaRL-Base-Patch16-LAIONFace20M-ep16',
                      'FaRL-Base-Patch16-LAIONFace20M-ep64')
]

In [None]:
# Iterate through FaceCLIP (FaRL) models
for model_tuple in faceclip_tuples:

  # Get model name and path to its weights file
  model_name, model_path = model_tuple[0], model_tuple[1]

  # Replace forward slash with dash to prevent issues with path when saving
  model_name_dash = model_name.replace('/', '-')

  print(f'Embedding {model_name}')

  # Read the checkpoint onto the current device. FaRL checkpoints keep the
  # weights under the 'state_dict' key; fall back to the object itself for a
  # file that is already a bare state dict.
  # NOTE(review): torch.load unpickles the file - only use trusted checkpoints.
  checkpoint = torch.load(model_path, map_location=device)
  state_dict = checkpoint['state_dict'] if 'state_dict' in checkpoint else checkpoint

  # Load the weights into the base CLIP model that is already in memory.
  # strict=False tolerates key mismatches, so make sure it did not skip
  # everything and leave the base weights in place.
  load_result = model.load_state_dict(state_dict, strict=False)
  if len(load_result.missing_keys) == len(model.state_dict()):
    raise RuntimeError(f'No weights from {model_path} matched the base model')

  # Create dictionaries to store embeddings and similarities
  image_embedding_dict, text_embedding_dict, text_image_similarity_dict = {}, {}, {}

  # Embed text attributes (one-element list, as in the other OpenAI CLIP loop)
  for attribute in tqdm(text_attributes):
    text_embedding_dict[attribute] = embed_text([attribute], model, device)

  # Dump text embeddings to pickle file
  with open(path.join(TEXT_EMBEDDING_SAVE_PATH, f'faceclip_{model_name_dash}_text.pkl'), 'wb') as pkl_writer:
    pkl.dump(text_embedding_dict, pkl_writer, protocol=-1)

  # Embed images
  for image in tqdm(first_impression_images):

    image_embedding = embed_image(image, model, preprocess, device)
    image_embedding_dict[image] = image_embedding

  # Dump image embeddings to pickle file
  with open(path.join(IMAGE_EMBEDDING_SAVE_PATH, f'faceclip_{model_name_dash}_first_impression_images.pkl'), 'wb') as pkl_writer:
    pkl.dump(image_embedding_dict, pkl_writer, protocol=-1)

  # Compute cosine similarities between text attributes and images
  for attribute in text_attributes:

    text_embedding = text_embedding_dict[attribute]
    text_image_similarity_dict[attribute] = {img_: compute_cosine_similarity(image_embedding_dict[img_], text_embedding) for img_ in image_embedding_dict.keys()}

  # Dump similarities to pickle file
  with open(path.join(SIMILARITY_SAVE_PATH, f'faceclip_{model_name_dash}_first_impression_similarities.pkl'), 'wb') as pkl_writer:
    pkl.dump(text_image_similarity_dict, pkl_writer, protocol=-1)

In [None]:
# (HuggingFace repository id, short model name) pairs for the FaceCLIP models
# hosted on HuggingFace; the short name is used when naming the output files
hf_faceclip_tuples = [
    ('P01son/' + short_name, short_name)
    for short_name in ('FaceCLIP-base-32', 'FaceCLIP-base-16', 'FaceCLIP-large-14')
]

In [None]:
# Embed the text attributes and the images with every FaceCLIP model hosted on HuggingFace
for hf_tuple in hf_faceclip_tuples:

  # First element: HuggingFace repository id; second element: short name for output files
  model_path, model_name = hf_tuple[0], hf_tuple[1]

  # Make sure no '/' ends up inside a file name
  model_name_dash = '-'.join(model_name.split('/'))

  # Load the model with its processor and tokenizer and move the model to the device
  model, preprocess, tokenizer = hf_load_model(model_path, device)
  model.to(device)

  # Fresh containers for this model's embeddings and similarities
  image_embedding_dict = {}
  text_embedding_dict = {}
  text_image_similarity_dict = {}

  # Text embeddings, keyed by the attribute string
  for attribute in tqdm(text_attributes):
    text_embedding_dict[attribute] = hf_embed_text(attribute, model, tokenizer, device)

  # Save the text embeddings as a pickle file
  text_file = path.join(TEXT_EMBEDDING_SAVE_PATH, f'faceclip_{model_name_dash}_text.pkl')
  with open(text_file, 'wb') as pkl_writer:
    pkl.dump(text_embedding_dict, pkl_writer, protocol=-1)

  # Image embeddings, keyed by the image path
  for image in tqdm(first_impression_images):
    image_embedding = hf_embed_image(image, model, preprocess, device)
    image_embedding_dict[image] = image_embedding

  # Save the image embeddings as a pickle file
  image_file = path.join(IMAGE_EMBEDDING_SAVE_PATH, f'faceclip_{model_name_dash}_first_impression_images.pkl')
  with open(image_file, 'wb') as pkl_writer:
    pkl.dump(image_embedding_dict, pkl_writer, protocol=-1)

  # For each attribute, the cosine similarity between its embedding and every image embedding
  for attribute in text_attributes:
    text_embedding = text_embedding_dict[attribute]
    text_image_similarity_dict[attribute] = {
        img_: compute_cosine_similarity(img_embedding_, text_embedding)
        for img_, img_embedding_ in image_embedding_dict.items()
    }

  # Save the similarities as a pickle file
  similarity_file = path.join(SIMILARITY_SAVE_PATH, f'faceclip_{model_name_dash}_first_impression_similarities.pkl')
  with open(similarity_file, 'wb') as pkl_writer:
    pkl.dump(text_image_similarity_dict, pkl_writer, protocol=-1)