# CLIP Tokenizer



In [3]:
#@title 1. Install Requirements

print("This will take a couple of minutes...")
!pip install -q --upgrade diffusers[torch]
#!pip install -q xformers==0.0.17
!pip install -q transformers
#!pip install -q triton



This will take a couple of minutes...


In [20]:
#@title 2. Setup Pipeline

import os
import torch
from transformers import CLIPTokenizer
from transformers import file_utils
from diffusers import StableDiffusionPipeline, DDIMScheduler
# =======================================
# setup pipeline
MODEL_PATH = "runwayml/stable-diffusion-v1-5"

# Use the GPU with half precision when one is available; otherwise fall back
# to CPU with float32 instead of crashing on `.to("cuda")`.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    DEVICE, TORCH_DTYPE = "cuda", torch.float16
else:
    DEVICE, TORCH_DTYPE = "cpu", torch.float32

# NOTE(review): newer diffusers releases appear to prefer `variant="fp16"` over
# `revision="fp16"` — confirm against the installed version before switching.
pipe = StableDiffusionPipeline.from_pretrained(
    MODEL_PATH,
    revision="fp16",
    torch_dtype=TORCH_DTYPE,
    safety_checker=None,
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(DEVICE)


# Vocab_File_Path = os.path.join(MODEL_PATH, 'tokenizer', 'vocab.json')
tokenizer = CLIPTokenizer.from_pretrained(MODEL_PATH, subfolder="tokenizer", revision="fp16")

# Get the transformers cache directory
cache_dir = file_utils.default_cache_path


def _find_vocab_file(model_path, search_root):
    """Return the path of `tokenizer/vocab.json` belonging to `model_path`.

    A local model directory is checked first. Otherwise `search_root` is
    walked and only a `vocab.json` that sits in a `tokenizer` folder underneath
    this model's own cache directory is accepted, so a vocab file of some other
    cached model is never picked up. The search stops at the first match.

    Raises:
        FileNotFoundError: if no matching vocab file exists.
    """
    if os.path.isdir(model_path):
        local_candidate = os.path.join(model_path, "tokenizer", "vocab.json")
        if os.path.isfile(local_candidate):
            return local_candidate

    # tokenizer/vocab.json files are normally saved in cache e.g.
    # /root/.cache/huggingface/hub/models--runwayml--stable-diffusion-v1-5/snapshots/xxxxx/tokenizer/vocab.json
    repo_dir_name = "models--" + model_path.replace("/", "--")
    for root, _dirs, files in os.walk(search_root):
        if "vocab.json" not in files:
            continue
        if os.path.basename(root) != "tokenizer":
            continue
        if repo_dir_name not in root.split(os.sep):
            continue
        return os.path.join(root, "vocab.json")

    raise FileNotFoundError(
        f"No tokenizer/vocab.json for {model_path!r} found under {search_root!r}"
    )


# find the vocab file
vocab_file_path = _find_vocab_file(MODEL_PATH, cache_dir)
print(f"Found vocab file at: {vocab_file_path}")



text_encoder/model.safetensors not found


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


Found vocab file at: /root/.cache/huggingface/hub/models--runwayml--stable-diffusion-v1-5/snapshots/ded79e214aa69e42c24d3f5ac14b76d568679cc2/tokenizer/vocab.json


In [151]:
#@title 3. Setup Tokenize() Function

import json

# =======================================
# for coloring text
class bcolors:
    """Named ANSI escape sequences for styling printed text.

    Wrap a piece of text as f"{bcolors.RED}text{bcolors.ENDC}" to color it;
    ENDC switches the styling off again.
    """

    # reset
    ENDC = '\033[0m'

    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # foreground colors (SGR codes 91-96)
    RED = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'

# ===========================================================================
# ===========================================================================
def _format_token_rows(cells, limit, row_length):
    """Lay out pre-formatted `cells` as comma-separated rows of `row_length`.

    A cell whose position in `cells` is below `limit` is wrapped in green,
    every other cell in red. Each row ends with a newline; every row except
    the last also gets a trailing comma before it.
    """
    lines = []
    for start in range(0, len(cells), row_length):
        colored = []
        for offset, cell in enumerate(cells[start:start + row_length]):
            # The color depends on the global position, not the position in the row.
            color = bcolors.OKGREEN if start + offset < limit else bcolors.RED
            colored.append(f"{color}{cell}{bcolors.ENDC}")
        trailing_comma = ',' if start + row_length < len(cells) else ''
        lines.append(", ".join(colored) + trailing_comma + '\n')
    return "".join(lines)


def tokenize(prompt_or_caption, row_length=10):
    """Print how the global `tokenizer` splits a prompt into tokens.

    Prints the tokenizer's vocab size and max length, its BOS/EOS special
    tokens with their ids, a color legend, and then two tables: the token ids
    and the matching (sub)word strings, `row_length` entries per row. Entries
    at positions below `tokenizer.model_max_length` are green, the rest red.

    Args:
        prompt_or_caption: text to tokenize (formatted with `str()` semantics).
        row_length: number of tokens printed per row; must be >= 1.

    Returns:
        None. Everything is written to stdout.

    Raises:
        ValueError: if `row_length` is smaller than 1.
    """
    if row_length < 1:
        raise ValueError(f"row_length must be >= 1, got {row_length}")

    # Take the limit from the tokenizer instead of hard-coding 77.
    max_length = tokenizer.model_max_length

    print(f'vocab size: {tokenizer.vocab_size}')
    print(f'max length: {max_length}')

    bos_encoded_input = tokenizer.encode(f'{tokenizer.bos_token}', add_special_tokens=False)
    eos_encoded_input = tokenizer.encode(f'{tokenizer.eos_token}', add_special_tokens=False)
    print(f'{bcolors.OKCYAN}special token (bos): {tokenizer.bos_token}: {bos_encoded_input}{bcolors.ENDC}')
    print(f'{bcolors.OKCYAN}special token (eos): {tokenizer.eos_token}: {eos_encoded_input}{bcolors.ENDC}')

    print(f'{bcolors.OKGREEN}GREEN = Within the {max_length}-Token Limit{bcolors.ENDC}')
    print(f'{bcolors.RED}RED = Outside {max_length}-Token Limit{bcolors.ENDC}\n')

    token_ids = tokenizer(f'{prompt_or_caption}', padding=True)["input_ids"]

    # Ask the tokenizer for the id -> (sub)word mapping rather than re-reading
    # and inverting the vocab JSON file on every call.
    words = tokenizer.convert_ids_to_tokens(token_ids)

    # Pad every word to the longest one so the columns line up.
    max_word_length = max(len(word) for word in words)

    id_cells = [f"{token_id:5d}" for token_id in token_ids]
    word_cells = [word.ljust(max_word_length) for word in words]

    print('\n')

    print(_format_token_rows(id_cells, max_length, row_length))

    print('\n')

    print(_format_token_rows(word_cells, max_length, row_length))




In [152]:
#@title 4. Run

# Prompt/caption to inspect; adjacent string literals are joined into one string.
PROMPT_CAPTION = (
    "A charming photo of a small bunny. He is wearing a suit. "
    "masterpiece, ultra-quality, hyperrealistic, RAW photo, highly detailed, 4k, "
    "medium shot, cinematic photography, natural texture, action shot, "
    "XF IQ4, 150MP, 50mm, ISO 1000, 1/250s, natural light"
)

# Print the token ids and sub-words for the prompt above.
tokenize(PROMPT_CAPTION)



vocab size: 49408
max length: 77
[96mspecial token (bos): <|startoftext|>: [49406][0m
[96mspecial token (eos): <|endoftext|>: [49407][0m
[92mGREEN = Within the 77-Token Limit[0m
[91mRED = Outside 77-Token Limit[0m



[92m49406[0m, [92m  320[0m, [92m12177[0m, [92m 1125[0m, [92m  539[0m, [92m  320[0m, [92m 2442[0m, [92m 9258[0m, [92m  269[0m, [92m  797[0m,
[92m  533[0m, [92m 3309[0m, [92m  320[0m, [92m 3940[0m, [92m  269[0m, [92m12066[0m, [92m  267[0m, [92m 8118[0m, [92m  268[0m, [92m 3027[0m,
[92m  267[0m, [92m 7997[0m, [92m16157[0m, [92m  267[0m, [92m 6323[0m, [92m 1125[0m, [92m  267[0m, [92m 5302[0m, [92m12609[0m, [92m  267[0m,
[92m  275[0m, [92m  330[0m, [92m  267[0m, [92m 8675[0m, [92m 2000[0m, [92m  267[0m, [92m25602[0m, [92m 2108[0m, [92m  267[0m, [92m 3288[0m,
[92m16505[0m, [92m  267[0m, [92m 1816[0m, [92m 2000[0m, [92m  267[0m, [92m35274[0m, [92m19996[0m, [92m  275[0m, [92m  