In [1]:
from transformers import CLIPTokenizer, CLIPTokenizerFast, CLIPTextModel, CLIPModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, json, os, re, random


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier shared by the tokenizer and the text encoder.
clip = "openai/clip-vit-base-patch32"
tokenizer = AutoTokenizer.from_pretrained(clip)
model = CLIPTextModel.from_pretrained(clip)

# Prefer the GPU when one is present, but fall back to the CPU so this cell
# does not raise on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Module.to returns the module itself, so the cell still displays the model repr.
model.to(device)

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [7]:
# Two captions of different token length; padding=True pads the shorter one
# so both rows fit in a single rectangular tensor.
texts = [
    "a cat is on top of a car",
    "a cat is at the bottom of a car",
]
encoded_text = tokenizer(texts, return_tensors="pt", padding=True)

# Show the resulting input ids and attention mask.
encoded_text

{'input_ids': tensor([[49406,   320,  2368,   533,   525,  1253,   539,   320,  1615, 49407,
         49407],
        [49406,   320,  2368,   533,   536,   518,  5931,   539,   320,  1615,
         49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
# Map the first caption's ids back to token strings. The ids are read from the
# tokenizer output instead of a hand-pasted copy, so this cell stays correct
# if `texts` changes.
tokenizer.convert_ids_to_tokens(encoded_text["input_ids"][0].tolist())

['<|startoftext|>',
 'a</w>',
 'cat</w>',
 'is</w>',
 'on</w>',
 'top</w>',
 'of</w>',
 'a</w>',
 'car</w>',
 '<|endoftext|>',
 '<|endoftext|>']

In [12]:
# Inference only: disable gradient tracking so no autograd graph is kept alive.
with torch.no_grad():
    # Send the inputs to whichever device the model lives on rather than a
    # hard-coded GPU index, then keep the per-token hidden states.
    outputs = model(**encoded_text.to(model.device)).last_hidden_state

In [13]:
# Sanity-check the dimensions of the hidden states; presumably
# (captions, padded tokens, hidden width) -- confirm against the model config.
outputs.size()

torch.Size([2, 11, 512])

In [11]:
# Call the tokenizer directly: `batch_encode_plus` is the legacy entry point and
# `__call__` is its supported replacement. truncation=True clips any caption
# that exceeds the tokenizer's maximum length.
encoded = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [10]:
# Display only the token-id tensor from the batch encoding.
encoded.input_ids

tensor([[49406,   320,  2368,   533,   525,  1253,   539,   320,  1615, 49407,
         49407],
        [49406,   320,  2368,   533,   536,   518,  5931,   539,   320,  1615,
         49407]])

In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from transformers import CLIPTextConfig
# Built with no arguments, so this holds whatever defaults the class defines
# rather than values read from the checkpoint loaded earlier in the notebook.
CLIP_CONFIGS = CLIPTextConfig()


In [11]:
# Display every field of the config object as a plain dictionary.
# NOTE(review): the output is long; consider showing only the fields of interest.
CLIP_CONFIGS.to_dict()

{'return_dict': True,
 'output_hidden_states': False,
 'output_attentions': False,
 'torchscript': False,
 'torch_dtype': None,
 'use_bfloat16': False,
 'tf_legacy_loss': False,
 'pruned_heads': {},
 'tie_word_embeddings': True,
 'is_encoder_decoder': False,
 'is_decoder': False,
 'cross_attention_hidden_size': None,
 'add_cross_attention': False,
 'tie_encoder_decoder': False,
 'max_length': 20,
 'min_length': 0,
 'do_sample': False,
 'early_stopping': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'diversity_penalty': 0.0,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'typical_p': 1.0,
 'repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_size': 0,
 'encoder_no_repeat_ngram_size': 0,
 'bad_words_ids': None,
 'num_return_sequences': 1,
 'chunk_size_feed_forward': 0,
 'output_scores': False,
 'return_dict_in_generate': False,
 'forced_bos_token_id': None,
 'forced_eos_token_id': None,
 'remove_invalid_values': False,
 'exponential_decay_length_penalty': None,
 'su