In [1]:
import os
import torch

# Hugging Face checkpoint name. init_lang_model() loads BOTH the text
# encoder and the tokenizer from this name; the commented-out lines are
# alternative checkpoints.
# TOKENIZER = "microsoft/Multilingual-MiniLM-L12-H384"
TOKENIZER = "openai/clip-vit-large-patch14"
# TOKENIZER = "openai/clip-vit-base-patch32"

# maximum number of whitespace-separated words in a language goal
# (get_lang_emb() also passes this value to the tokenizer as max_length)
LANG_MAX_WORD_LEN = 25

# language embedding obs key name
LANG_OBS_KEY = "lang_embed"

# (HACK) enable language-vision multiplication, post spatial softmax layer
# LANG_VIS_MULT_ENABLED = True
LANG_VIS_MULT_ENABLED = False

# these global variables will be populated automatically
# (presumably by code outside this section; nothing visible here writes them)

# whether language conditioning is enabled
LANG_COND_ENABLED = False

# these global variables will be populated lazily by init_lang_model():
# LANG_EMB_MODEL holds the text encoder, TZ holds the tokenizer
LANG_EMB_MODEL = None
TZ = None

def init_lang_model():
    """Load the CLIP text encoder and its tokenizer into module globals.

    Populates ``LANG_EMB_MODEL`` with a ``CLIPTextModelWithProjection`` put
    in eval mode and ``TZ`` with the matching tokenizer. Both are loaded from
    the checkpoint named by ``TOKENIZER``. ``get_lang_emb`` calls this lazily
    on first use.

    Side effects:
        Sets the ``TOKENIZERS_PARALLELISM`` environment variable to "true".
    """
    # Local import: transformers is only imported when this function runs.
    from transformers import AutoTokenizer, CLIPTextModelWithProjection

    global LANG_EMB_MODEL
    global TZ

    # needed to suppress warning about potential deadlock
    os.environ["TOKENIZERS_PARALLELISM"] = "true"

    # CLIP
    LANG_EMB_MODEL = CLIPTextModelWithProjection.from_pretrained(
        TOKENIZER,
        cache_dir=os.path.expanduser("~/tmp/clip"),
    ).eval()

    # Tokenizer parallelism is controlled by the environment variable set
    # above; it is not a from_pretrained() option, so no such kwarg is passed.
    TZ = AutoTokenizer.from_pretrained(TOKENIZER)

    # MiniLM alternative (https://github.com/microsoft/unilm/tree/master/minilm).
    # Needs `from transformers import AutoModel` and
    # `pip install --no-cache-dir transformers sentencepiece`:
    # LANG_EMB_MODEL = AutoModel.from_pretrained(TOKENIZER, cache_dir=os.path.expanduser("~/tmp/minilm")).eval()
    # TZ = AutoTokenizer.from_pretrained(TOKENIZER, use_fast=False)


def get_lang_emb(lang):
    """Embed a language goal with the lazily loaded text encoder.

    Args:
        lang: the sentence to embed, or None.

    Returns:
        None when ``lang`` is None. Otherwise a NumPy array holding the
        ``text_embeds`` output of ``LANG_EMB_MODEL`` for this one sentence.

    Raises:
        ValueError: if ``lang`` contains more than ``LANG_MAX_WORD_LEN``
            whitespace-separated words.
    """
    if lang is None:
        return None

    # Validate first so an over-long sentence is rejected without triggering
    # the (expensive) lazy model load.
    num_words = len(lang.split())
    if num_words > LANG_MAX_WORD_LEN:
        raise ValueError(
            "Number of words {} in sentence {} exceeded max length {}".format(
                num_words, lang, LANG_MAX_WORD_LEN
            )
        )

    # Lazy initialisation: load tokenizer and model on first real use.
    if TZ is None or LANG_EMB_MODEL is None:
        init_lang_model()

    with torch.no_grad():
        # NOTE(review): max_length is a *token* budget but reuses the *word*
        # limit, and no truncation is requested -- confirm how sentences that
        # tokenize to more than LANG_MAX_WORD_LEN tokens should be handled.
        tokens = TZ(
            text=lang,                      # the sentence to be encoded
            add_special_tokens=True,        # add the tokenizer's special tokens
            max_length=LANG_MAX_WORD_LEN,   # length to pad to
            padding="max_length",
            return_attention_mask=True,     # generate the attention mask
            return_tensors="pt",            # return PyTorch tensors
        )
        # Alternative output key used with the MiniLM encoder:
        # lang_emb = LANG_EMB_MODEL(**tokens)["pooler_output"][0]
        # [0] selects the single sentence of the batch; no .detach() is
        # needed because gradients are disabled in this context.
        lang_emb = LANG_EMB_MODEL(**tokens)["text_embeds"][0]

    return lang_emb.cpu().numpy()

In [6]:
import numpy as np
from scipy.spatial.distance import cosine
# Sanity check: how far apart are the embeddings of two unrelated words?
# NOTE(review): the names red_embed / green_embed are shared with the color
# comparison cells; here they hold "firetruck" and "rainforest".
red_embed = get_lang_emb("firetruck")
green_embed = get_lang_emb("rainforest")
# Prints the Euclidean (L2) distance, then scipy's cosine *distance*
# (1 - cosine similarity), between the two embeddings.
print(np.linalg.norm(red_embed - green_embed), cosine(red_embed, green_embed))

13.398815 0.31040501594543457


In [7]:
import numpy as np
# Compare the embeddings of two single color words.
red_embed = get_lang_emb("red")
green_embed = get_lang_emb("green")
# Prints the Euclidean (L2) distance, then scipy's cosine *distance*.
# `cosine` is not imported in this cell; it relies on the
# `from scipy.spatial.distance import cosine` of an earlier cell.
print(np.linalg.norm(red_embed - green_embed), cosine(red_embed, green_embed))

11.576969 0.28047096729278564


In [8]:
import numpy as np
# Compare two full instructions that differ only in the color word.
red_embed = get_lang_emb("pick up the red cube")
green_embed = get_lang_emb("pick up the green cube")
# Prints the Euclidean (L2) distance, then scipy's cosine *distance*.
# `cosine` is not imported in this cell; it relies on the
# `from scipy.spatial.distance import cosine` of an earlier cell.
print(np.linalg.norm(red_embed - green_embed), cosine(red_embed, green_embed))

8.284871 0.1322607398033142


In [7]:
import numpy as np
# Raw (un-normalized) dot product of the two embeddings. red_embed and
# green_embed are whatever the most recently executed comparison cell left
# in the kernel, so the value depends on execution order.
print(np.dot(red_embed, green_embed))

224.97511


In [10]:
# L1 (sum of absolute differences) distance between the two embeddings left
# in the kernel by the most recently executed comparison cell.
np.sum(np.abs(red_embed - green_embed))

246.56877

In [7]:
from transformers import CLIPImageProcessor, CLIPVisionModel
# Checkpoint for the CLIP image processor and vision encoder.
# NOTE(review): this is the "-336" checkpoint, which differs from the
# TOKENIZER checkpoint used for the text encoder -- confirm whether text and
# image features are meant to come from the same checkpoint.
clip_ckpt = "openai/clip-vit-large-patch14-336"
image_processor = CLIPImageProcessor.from_pretrained(clip_ckpt)
vision_tower = CLIPVisionModel.from_pretrained(clip_ckpt)
# Disable gradients for all vision-tower parameters. As the last expression
# of the cell, the returned module is also displayed as the cell output.
vision_tower.requires_grad_(False)

CLIPVisionModel(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
      (position_embedding): Embedding(577, 1024)
    )
    (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0): CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
  

In [11]:
from PIL import Image
import torchvision.transforms as transforms
import requests
import torch

# Sample image from the COCO val2017 set, streamed over HTTP and opened
# directly from the raw response.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
pil_image = Image.open(requests.get(url, stream=True).raw)

# Transform that converts a PIL image to a PyTorch tensor
transform = transforms.ToTensor()

# Convert the downloaded image; the result is channel-first (see the shape
# printed below)
torch_tensor = transform(pil_image)

print(torch_tensor.shape)

torch.Size([3, 480, 640])


In [48]:
# torch_tensor was produced by transforms.ToTensor(), which already returns
# floats in [0, 1]. Dividing by 255 again would feed a nearly black image to
# the encoder, so the tensor is passed as-is; do_rescale=False keeps the
# processor from rescaling it a second time.
# NOTE(review): do_normalize=False skips the processor's mean/std
# normalization -- confirm this is intentional for this experiment.
inputs = image_processor(images=torch_tensor, return_tensors="pt", do_rescale=False, do_normalize=False)

# Hidden-state index to select; only referenced by the commented-out
# `.hidden_states[...]` suffix below.
mm_vision_select_layer = -2

# Get CLIP features
with torch.no_grad():
    features = vision_tower(**inputs, output_hidden_states=True)#.hidden_states[mm_vision_select_layer]

# Shape of the pooled image feature (displayed as the cell output)
features.pooler_output.shape

torch.Size([1, 1024])

In [32]:
# select_feature = "patch"
# if select_feature == 'patch':
#     feats = feats[:, 1:]
# elif select_feature == 'cls_patch':
#     feats 