<a href="https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/main/examples/readme_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Restart the Colab session after installation
Restart the session (runtime) and re-run the cells if something doesn't work.

In [1]:
%%capture
!pip install git+https://github.com/Lednik7/CLIP-ONNX.git
!pip install git+https://github.com/openai/CLIP.git
!pip install onnxruntime-gpu

In [2]:
%%capture
!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true

In [3]:
# Query the NVIDIA driver. In the recorded run no driver is reachable, which is
# why the rest of the notebook uses the CPU execution provider.
!nvidia-smi # CPU Provider

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [4]:
import onnxruntime

# Ask onnxruntime which device it reports as its priority device and echo it.
priority_device = onnxruntime.get_device()
print(priority_device)

CPU


## CPU inference mode

In [7]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning)



In [5]:
# OpenAI CLIP: export both towers to ONNX, then score one image against text labels.
import clip
import numpy as np
from PIL import Image
from clip_onnx import clip_onnx

# NOTE(review): absolute, machine-specific path; point it at a local image to re-run.
IMAGE_PATH = '/mnt/eds_share/share/Spine2D/GlobusSrgMapData_crop_square/test/images/anon_d4730414.dcm_lateral_full_lumbar_thoracic.jpg'
LABELS = ["MRI", "Xray", "CT"]

# onnx cannot export with cuda, so the model is loaded on the CPU
model, preprocess = clip.load("ViT-B/32", device="cpu", jit=False)

# image input, batch dimension first
pil_image = Image.open(IMAGE_PATH)
image = preprocess(pil_image).unsqueeze(0).cpu()  # [1, 3, 224, 224]
image_onnx = image.detach().cpu().numpy().astype(np.float32)

# text input, batch dimension first; handed to the ONNX model as int32
text = clip.tokenize(LABELS).cpu()  # [3, 77]
text_onnx = text.detach().cpu().numpy().astype(np.int32)

# files the exported visual / textual graphs are written to
visual_path = "clip_visual.onnx"
textual_path = "clip_textual.onnx"

onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path)
onnx_model.convert2onnx(image, text, verbose=True)
# provider names: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
onnx_model.start_sessions(providers=["CPUExecutionProvider"])  # cpu mode

image_features = onnx_model.encode_image(image_onnx)
text_features = onnx_model.encode_text(text_onnx)

logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)
image_probs = logits_per_image.softmax(dim=-1)
probs = image_probs.detach().cpu().numpy()

# recorded run printed: [[0.08915787 0.90815187 0.00269029]]
print("Label probs:", probs)

[CLIP ONNX] Start convert visual model
[CLIP ONNX] Start check visual model
[CLIP ONNX] Start convert textual model
[CLIP ONNX] Start check textual model
[CLIP ONNX] Models converts successfully
Label probs: [[0.08915787 0.90815187 0.00269029]]


In [14]:
# OpenCLIP: export both towers to ONNX, then score one image against text labels.
import numpy as np
import open_clip
from PIL import Image
from clip_onnx import clip_onnx

# NOTE(review): absolute, machine-specific path; point it at a local image to re-run.
IMAGE_PATH = '/mnt/eds_share/share/Spine2D/GlobusSrgMapData_crop_square/test/images/anon_d4730414.dcm_lateral_full_lumbar_thoracic.jpg'
LABELS = ["MRI", "Xray", "CT"]

# onnx cannot export with cuda, so the model is created on the CPU
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k',
    device="cpu",
    jit=False,
)

# image input, batch dimension first
pil_image = Image.open(IMAGE_PATH)
image = preprocess(pil_image).unsqueeze(0).cpu()  # [1, 3, 224, 224]
image_onnx = image.detach().cpu().numpy().astype(np.float32)

# text input, batch dimension first; handed to the ONNX model as int64
text = open_clip.tokenize(LABELS).cpu()  # [3, 77]
text_onnx = text.detach().cpu().numpy().astype(np.int64)

# files the exported visual / textual graphs are written to
visual_path = "clip_visual.onnx"
textual_path = "clip_textual.onnx"

onnx_model = clip_onnx(
    model,
    visual_path=visual_path,
    textual_path=textual_path,
    openclip="openclip",
)
onnx_model.convert2onnx(image, text, verbose=True)
# provider names: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
onnx_model.start_sessions(providers=["CPUExecutionProvider"])  # cpu mode

image_features = onnx_model.encode_image(image_onnx)
text_features = onnx_model.encode_text(text_onnx)

logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)
image_probs = logits_per_image.softmax(dim=-1)
probs = image_probs.detach().cpu().numpy()

# recorded run printed: [[0.01968996 0.93325    0.04706011]]
print("Label probs:", probs)

[CLIP ONNX] Start convert visual model
[CLIP ONNX] Start check visual model
[CLIP ONNX] Start convert textual model




[CLIP ONNX] Start check textual model
[CLIP ONNX] Models converts successfully
Label probs: [[0.01968996 0.93325    0.04706011]]


In [4]:
# OpenCLIP: run inference from ONNX files already on disk (no export in this cell).
import numpy as np
import open_clip
from PIL import Image
from clip_onnx import clip_onnx

# NOTE(review): absolute, machine-specific path; point it at a local image to re-run.
IMAGE_PATH = '/mnt/eds_share/share/Spine2D/GlobusSrgMapData_crop_square/test/images/anon_d4730414.dcm_lateral_full_lumbar_thoracic.jpg'
LABELS = ["MRI", "Xray", "CT"]

# Within this cell only `preprocess` is used from this call; the ONNX graphs
# themselves are read from disk below.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k',
    device="cpu",
    jit=False,
)

# image input, batch dimension first
pil_image = Image.open(IMAGE_PATH)
image = preprocess(pil_image).unsqueeze(0).cpu()  # [1, 3, 224, 224]
image_onnx = image.detach().cpu().numpy().astype(np.float32)

# text input, batch dimension first; handed to the ONNX model as int64
text = open_clip.tokenize(LABELS).cpu()  # [3, 77]
text_onnx = text.detach().cpu().numpy().astype(np.int64)

# previously exported visual / textual graphs
visual_path = "clip_visual.onnx"
textual_path = "clip_textual.onnx"

# No torch model is handed over; the wrapper is filled from the ONNX files.
onnx_model = clip_onnx(None)
onnx_model.load_onnx(
    visual_path=visual_path,
    textual_path=textual_path,
    logit_scale=100.0,  # author's note: model.logit_scale.exp()
)
# provider names: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
onnx_model.start_sessions(providers=["CPUExecutionProvider"])  # cpu mode

image_features = onnx_model.encode_image(image_onnx)
text_features = onnx_model.encode_text(text_onnx)

logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)
image_probs = logits_per_image.softmax(dim=-1)
probs = image_probs.detach().cpu().numpy()

# recorded run printed: [[0.01968999 0.9332499  0.04706019]]
print("Label probs:", probs)

[CLIP ONNX] Load mode
Label probs: [[0.01968999 0.9332499  0.04706019]]


In [17]:
# Display the transform pipeline currently bound to `preprocess`.
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(224, 224))
    <function _convert_to_rgb at 0x7f593c1cb910>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [8]:
# openclip (Coca) models
# NOTE(review): in the recorded run this cell does not finish. It stops inside
# convert2onnx, after "Start convert textual model", with
# AttributeError: 'CoCa' object has no attribute 'transformer'.
# Every statement after convert2onnx was therefore not executed in that run.
import open_clip
from PIL import Image
import numpy as np
from clip_onnx import clip_onnx

# onnx cannot export with cuda
model, _, preprocess = open_clip.create_model_and_transforms('coca_ViT-L-14', pretrained='mscoco_finetuned_laion2b_s13b_b90k', device="cpu", jit=False)

# batch first
# NOTE(review): absolute, machine-specific image path; replace it to re-run elsewhere.
image = preprocess(Image.open('/mnt/eds_share/share/Spine2D/GlobusSrgMapData_crop_square/test/images/anon_d4730414.dcm_lateral_full_lumbar_thoracic.jpg')).unsqueeze(0).cpu() # [1, 3, 224, 224]
image_onnx = image.detach().cpu().numpy().astype(np.float32)

# batch first
text = open_clip.tokenize(["MRI", "Xray", "CT"]).cpu() # [3, 77]
text_onnx = text.detach().cpu().numpy().astype(np.int64)

# files the exported visual / textual graphs are written to
visual_path = "clip_visual.onnx"
textual_path = "clip_textual.onnx"

onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path, openclip="openclip")
# The recorded AttributeError is raised from this call.
onnx_model.convert2onnx(image, text, verbose=True)
# provider names: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
onnx_model.start_sessions(providers=["CPUExecutionProvider"]) # cpu mode

image_features = onnx_model.encode_image(image_onnx)
text_features = onnx_model.encode_text(text_onnx)

logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)
probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()

print("Label probs:", probs)  # NOTE(review): not reached in the recorded run, so no probabilities were printed

[CLIP ONNX] Start convert visual model
[CLIP ONNX] Start check visual model
[CLIP ONNX] Start convert textual model


AttributeError: 'CoCa' object has no attribute 'transformer'

In [11]:
# Display the module tree of `model`. In the recorded output this is a CoCa
# instance whose text transformer sits under `model.text`.
model

CoCa(
  (text): TextTransformer(
    (token_embedding): Embedding(49408, 768)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (visual): VisionTransformer(
    (patchnorm_pre_ln): Identity()
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(1