# Image Tokenizer
Notebook to get token embeddings from different types of Vision Transformers

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
from PIL import Image

In [37]:
import torch

# Check if a GPU is available and if not use the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Print the device
print('Using device:', device)

Using device: cuda


In [38]:
import cv2
from matplotlib import pyplot as plt

# Load the image
img = cv2.imread('/home/apoorv/Documents/CMSC848K/ObitoNet/Dataset/Barn_data/Barn_images/000001.jpg')

# Resize the image
img = cv2.resize(img, (256, 256))

if img is None:
    print('Could not open or find the image')
else:
    # Show the image with cv2
    cv2.imshow('image', img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()


## Vision Transformer (ViT)
Model used: https://huggingface.co/docs/transformers/en/model_doc/vit

In [39]:
from transformers import AutoImageProcessor, ViTModel
import torch
from datasets import load_dataset

# Standard data
# dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
# image = dataset["test"]["image"][0]

# Custom data
image = img

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k", use_fast=True)
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

[1, 197, 768]

Extract image tokens and CLS token

In [40]:
# CLS token 
cls_token = last_hidden_states[:, 0, :]

# Print the shape of the CLS token
print(cls_token.shape)

# Image tokens
img_tokens = last_hidden_states[:, 1:, :]

# Print the shape of the image tokens
print(img_tokens.shape)

torch.Size([1, 768])
torch.Size([1, 196, 768])


In [41]:
i = 3

# Print the ith image token
# print(img_tokens[0][i])

# Print size of the image token
print(img_tokens[0][i].shape)

torch.Size([768])


## ViT MAE

Model used: https://huggingface.co/docs/transformers/en/model_doc/vit_mae

In [42]:
from transformers import AutoImageProcessor, ViTMAEModel
from PIL import Image
import requests

# Standard data
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)

# Custom data
image = img

image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base", use_fast=True)
model = ViTMAEModel.from_pretrained("facebook/vit-mae-base")

inputs = image_processor(images=image, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

[1, 50, 768]

Extract image tokens and CLS token

In [43]:
# CLS token 
cls_token = last_hidden_states[:, 0, :]

# Print the shape of the CLS token
print(cls_token.shape)

# Image tokens
img_tokens = last_hidden_states[:, 1:, :]

# Print the shape of the image tokens
print(img_tokens.shape)

torch.Size([1, 768])
torch.Size([1, 49, 768])


In [44]:
i = 3

# Print the ith image token
print(img_tokens[0][i])

# Print size of the image token
print(img_tokens[0][i].shape)

tensor([-8.9910e-02, -5.9313e-02,  3.4205e-01,  1.9082e-01, -1.2683e+00,
        -4.9870e-01, -1.9346e-01,  1.3152e-01, -6.9172e-02,  3.1076e-01,
         7.0628e-02, -6.1973e-02, -6.9237e-01,  3.8554e-02,  1.0813e-01,
         2.1347e-04,  1.3486e-01, -1.0883e-01, -1.7690e-01,  1.7412e-01,
         1.0534e-01, -1.2633e-01, -2.5232e-02, -3.7265e-01, -1.8562e-01,
         3.6886e-01, -1.6058e-01,  3.0628e-01, -1.4111e-01,  7.5429e-02,
         5.1949e-02,  3.2603e-01, -8.3353e-01, -6.2744e-02, -2.2416e-01,
         2.7130e-01, -6.7815e-02,  4.9448e-02, -1.4388e-01, -3.2875e-02,
        -2.5440e-02, -3.9405e-01, -4.8925e-01, -1.4608e-01,  4.5412e-01,
        -6.3926e-01, -5.6038e-02, -9.1604e-02,  7.5471e-03, -5.6831e-03,
        -1.7545e-01,  1.5103e-01, -5.7806e-02,  7.1675e-03,  3.1573e-01,
        -4.9434e-02,  1.7498e-01, -4.2646e-02,  2.3797e-01,  3.4052e-01,
        -3.2057e-02,  1.5457e-01,  4.2712e-02,  5.5651e-01, -2.5422e-01,
         1.3632e-01, -8.5133e-02,  3.8885e-02, -4.1

## UperNet

Model used: https://huggingface.co/docs/transformers/en/model_doc/upernet#upernet

In [45]:
from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
from PIL import Image
from huggingface_hub import hf_hub_download

image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-tiny")
model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-tiny", output_hidden_states=True)

filepath = hf_hub_download(
    repo_id="hf-internal-testing/fixtures_ade20k", filename="ADE_val_00000001.jpg", repo_type="dataset"
)
image = Image.open(filepath).convert("RGB")

inputs = image_processor(images=image, return_tensors="pt")

outputs = model(**inputs)

hidden_states = outputs.hidden_states

hidden_states = hidden_states[0]

# Print the shape of the hidden states
print(list(hidden_states.shape))

[1, 96, 128, 128]


In [46]:
logits = outputs.logits  # shape (batch_size, num_labels, height, width)
list(logits.shape)

[1, 150, 512, 512]