# Scripts for Exporting PyTorch Models to ONNX and CoreML

In [None]:
!pip install --upgrade "uform[torch]" coremltools

In [1]:
import uform
from PIL import Image

# Load the model and its processor for the named checkpoint, plus one example
# caption and one example image to drive the encoders.
model, processor = uform.get_model('unum-cloud/uform-vl-english-small')
text = 'a small red panda in a zoo'
image = Image.open('../../assets/unum.png')

# Turn the raw inputs into what the encoders consume. Later cells read
# `image_data.shape` and `text_data["input_ids"]` / `text_data["attention_mask"]`.
image_data = processor.preprocess_image(image)
text_data = processor.preprocess_text(text)

# With return_features=True each encode call is unpacked into a
# (features, embedding) pair.
image_features, image_embedding = model.encode_image(image_data, return_features=True)
text_features, text_embedding = model.encode_text(text_data, return_features=True)

# Sanity check: display the four tensor shapes as the cell output.
image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape

  Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so
  Expected in:     <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib
  warn(f"Failed to load image Python extension: {e}")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

(torch.Size([1, 197, 384]),
 torch.Size([1, 64, 768]),
 torch.Size([1, 256]),
 torch.Size([1, 256]))

In [2]:
# Display the text encoder's repr to inspect its configuration.
model.text_encoder

TextEncoder(model_type='bert', dim=768, context_dim=384, vocab_size=30522, padding_idx=0, num_layers=4, num_heads=12, embedding_dim=256, multimodal_layers_ids=[2, 3], head_one_neuron=False, pooling='cls', max_position_embeddings=64, dropout_prob=0.1)

In [3]:
# Display the image encoder's repr to inspect its configuration.
model.image_encoder

VisualEncoder(dim=384, patch_size=16, image_size=224, num_layers=12, num_heads=6, embedding_dim=256, pooling='cls', num_reg_tokens=0)

In [4]:
# Report the name of the first child that `named_children()` yields for each
# encoder. `model` must already be loaded. Nothing is printed (and `name` /
# `module` are left untouched) for an encoder that has no children.
first_image_child = next(iter(model.image_encoder.named_children()), None)
if first_image_child is not None:
    name, module = first_image_child
    print(f"First layer of image_encoder: {name}")

first_text_child = next(iter(model.text_encoder.named_children()), None)
if first_text_child is not None:
    name, module = first_text_child
    print(f"First layer of text_encoder: {name}")

First layer of image_encoder: patch_embed
First layer of text_encoder: word_embeddings


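## ONNX

> **TODO:** ONNX export is not implemented in this notebook yet; only the CoreML conversion below is provided.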

## CoreML

In [5]:
import coremltools as ct
import torch

scikit-learn version 1.2.1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.
Torch version 2.1.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.1.0 is the most recent version that has been tested.


In [11]:
# Input specs for the Core ML converter. Each shape is copied from the
# corresponding preprocessed example tensor, so the specs are fixed to the
# example's shape.
image_input = ct.TensorType(name="input", shape=image_data.shape)
text_input = ct.TensorType(name="input_ids", shape=text_data["input_ids"].shape)
text_attention_input = ct.TensorType(name="attention_mask", shape=text_data["attention_mask"].shape)
# Output specs: only the output names are given here.
# NOTE(review): `text_features` and `image_features` were bound to encoder
# output tensors in the first cell; the assignments below rebind them to
# TensorType specs. Re-run this cell after re-running the first one, before
# any conversion cell.
text_features = ct.TensorType(name="features")
text_embeddings = ct.TensorType(name="embeddings")
image_features = ct.TensorType(name="features")
image_embeddings = ct.TensorType(name="embeddings")

In [12]:
# Prepare the image encoder for tracing: eval mode, and `return_features`
# switched on (presumably so that forward returns both features and the
# embedding; the conversion cell declares two outputs).
module = model.image_encoder
module.eval()
module.return_features = True

# Trace with the preprocessed example image. `traced_script_module` is also the
# name used by the text-encoder trace cell, so run the matching conversion cell
# right after this one.
traced_script_module = torch.jit.trace(
    module,
    example_inputs=(image_data,),
)
traced_script_module

VisualEncoder(
  original_name=VisualEncoder
  (patch_embed): Conv2d(original_name=Conv2d)
  (blocks): Sequential(
    original_name=Sequential
    (0): VisualEncoderBlock(
      original_name=VisualEncoderBlock
      (norm1): LayerNorm(original_name=LayerNorm)
      (attn): Attention(
        original_name=Attention
        (query): Linear(original_name=Linear)
        (key): Linear(original_name=Linear)
        (value): Linear(original_name=Linear)
        (out): Linear(original_name=Linear)
      )
      (ls1): LayerScale(original_name=LayerScale)
      (norm2): LayerNorm(original_name=LayerNorm)
      (mlp): MLP(
        original_name=MLP
        (hidden_layer): Linear(original_name=Linear)
        (output_layer): Linear(original_name=Linear)
      )
      (ls2): LayerScale(original_name=LayerScale)
    )
    (1): VisualEncoderBlock(
      original_name=VisualEncoderBlock
      (norm1): LayerNorm(original_name=LayerNorm)
      (attn): Attention(
        original_name=Attention
    

In [13]:
# Convert the traced image encoder to a Core ML "mlprogram" kept at FLOAT32
# precision. `traced_script_module` must currently hold the image-encoder
# trace, and `image_features` / `image_embeddings` must be the TensorType specs.
coreml_model = ct.convert(
    traced_script_module,
    source="pytorch",
    inputs=[image_input],
    outputs=[image_features, image_embeddings],
    convert_to='mlprogram',
    compute_precision=ct.precision.FLOAT32,
)

# Attach package metadata, then write the model next to the notebook's parent directory.
coreml_model.author = 'Unum Cloud'
coreml_model.license = 'Apache 2.0'
coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'
coreml_model.save("../uform-vl-english-small-image.mlpackage")

Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]
Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]


In [9]:
# Prepare the text encoder for tracing: eval mode, and `return_features`
# switched on (presumably so that forward returns both features and the
# embedding; the conversion cell declares two outputs).
module = model.text_encoder
module.eval()
module.return_features = True

# Trace with the example token ids and attention mask, passed positionally in
# that order. `traced_script_module` is also the name used by the image-encoder
# trace cell, so run the matching conversion cell right after this one.
traced_script_module = torch.jit.trace(
    module,
    example_inputs=(text_data['input_ids'], text_data['attention_mask']),
)
traced_script_module

TextEncoder(
  original_name=TextEncoder
  (word_embeddings): Embedding(original_name=Embedding)
  (position_embeddings): Embedding(original_name=Embedding)
  (layer_norm): LayerNorm(original_name=LayerNorm)
  (dropout): Dropout(original_name=Dropout)
  (blocks): ModuleList(
    original_name=ModuleList
    (0): TextEncoderBlock(
      original_name=TextEncoderBlock
      (norm_attn): LayerNorm(original_name=LayerNorm)
      (attention): Attention(
        original_name=Attention
        (query): Linear(original_name=Linear)
        (key): Linear(original_name=Linear)
        (value): Linear(original_name=Linear)
        (out): Linear(original_name=Linear)
      )
      (norm_mlp): LayerNorm(original_name=LayerNorm)
      (mlp): MLP(
        original_name=MLP
        (hidden_layer): Linear(original_name=Linear)
        (output_layer): Linear(original_name=Linear)
      )
      (dropout): Dropout(original_name=Dropout)
    )
    (1): TextEncoderBlock(
      original_name=TextEncoderBloc

In [10]:
# Convert the traced text encoder to a Core ML "mlprogram" kept at FLOAT32
# precision. `traced_script_module` must currently hold the text-encoder
# trace, and `text_features` / `text_embeddings` must be the TensorType specs.
coreml_model = ct.convert(
    traced_script_module,
    source="pytorch",
    inputs=[text_input, text_attention_input],
    outputs=[text_features, text_embeddings],
    convert_to='mlprogram',
    compute_precision=ct.precision.FLOAT32,
)

# Attach package metadata, then write the model next to the notebook's parent directory.
coreml_model.author = 'Unum Cloud'
coreml_model.license = 'Apache 2.0'
coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'
coreml_model.save("../uform-vl-english-small-text.mlpackage")

Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/157 [00:00<?, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.
Converting PyTorch Frontend ==> MIL Ops:  99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]
Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]
