# Scripts for Exporting PyTorch Models to ONNX and CoreML

In [None]:
!pip install --upgrade "uform[torch]" coremltools

In [None]:
import uform
from PIL import Image

# Load the pre-trained checkpoint together with its matching pre-processor.
model, processor = uform.get_model('unum-cloud/uform-vl-english-small')

# One sample caption and one sample picture; the cells below reuse them as example inputs.
text = 'a small red panda in a zoo'
image = Image.open('../../assets/unum.png')

# Turn the raw inputs into what the encoders consume.
image_data = processor.preprocess_image(image)
text_data = processor.preprocess_text(text)

# Run both encoders with `return_features=True`, which yields a pair per modality.
image_features, image_embedding = model.encode_image(
    image_data,
    return_features=True,
)
text_features, text_embedding = model.encode_text(
    text_data,
    return_features=True,
)

# Show the four output shapes as a quick sanity check.
tuple(
    output.shape
    for output in (image_features, text_features, image_embedding, text_embedding)
)

In [None]:
# Display the text encoder's repr to inspect its structure.
model.text_encoder

In [None]:
# Display the image encoder's repr to inspect its structure.
model.image_encoder

In [None]:
# Report the name of the first direct child of each encoder.
# `model` is expected to be the loaded model exposing `image_encoder` and `text_encoder`.
first_image_child = next(iter(model.image_encoder.named_children()), None)
if first_image_child is not None:  # an encoder without children prints nothing
    name, module = first_image_child
    print(f"First layer of image_encoder: {name}")

first_text_child = next(iter(model.text_encoder.named_children()), None)
if first_text_child is not None:  # an encoder without children prints nothing
    name, module = first_text_child
    print(f"First layer of text_encoder: {name}")

## CoreML

In [None]:
import coremltools as ct
import torch

In [None]:
# Core ML input descriptors: shapes are copied from the example tensors prepared earlier.
image_input = ct.TensorType(
    name="input",
    shape=image_data.shape,
)
text_input = ct.TensorType(
    name="input_ids",
    shape=text_data["input_ids"].shape,
)
text_attention_input = ct.TensorType(
    name="attention_mask",
    shape=text_data["attention_mask"].shape,
)

# Core ML output descriptors: only the output names are specified.
# NOTE: `image_features` and `text_features` held encoder outputs from an earlier cell;
# this cell rebinds both names to output descriptors.
image_features = ct.TensorType(name="features")
image_embeddings = ct.TensorType(name="embeddings")
text_features = ct.TensorType(name="features")
text_embeddings = ct.TensorType(name="embeddings")

In [None]:
# Prepare the image encoder for tracing: set its `return_features` flag and switch it
# to evaluation mode, then record a TorchScript trace on the example image tensor.
module = model.image_encoder
module.return_features = True
module.eval()

traced_script_module = torch.jit.trace(module, image_data)
traced_script_module

In [None]:
# Convert the traced image encoder into a float32 Core ML "mlprogram".
coreml_model = ct.convert(
    traced_script_module,
    source="pytorch",
    inputs=[image_input],
    outputs=[image_features, image_embeddings],
    convert_to='mlprogram',
    compute_precision=ct.precision.FLOAT32,
)

# Attach descriptive metadata, then write the package into the parent directory.
coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'
coreml_model.license = 'Apache 2.0'
coreml_model.author = 'Unum Cloud'
coreml_model.save("../uform-vl-english-small-image.mlpackage")

In [None]:
# Prepare the text encoder for tracing: switch it to evaluation mode and set its
# `return_features` flag, then record a TorchScript trace on the example inputs.
module = model.text_encoder
module.eval()
module.return_features = True

# `torch.jit.trace` documents `example_inputs` as a tuple (or a single tensor), so the
# token ids and the attention mask are passed as a tuple rather than a list. This also
# matches how the same pair is handed to the ONNX exporter later in the notebook.
traced_script_module = torch.jit.trace(
    module,
    example_inputs=(text_data['input_ids'], text_data['attention_mask']),
)
traced_script_module

In [None]:
# Convert the traced text encoder into a float32 Core ML "mlprogram".
coreml_model = ct.convert(
    traced_script_module,
    source="pytorch",
    inputs=[text_input, text_attention_input],
    outputs=[text_features, text_embeddings],
    convert_to='mlprogram',
    compute_precision=ct.precision.FLOAT32,
)

# Attach descriptive metadata, then write the package into the parent directory.
coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'
coreml_model.license = 'Apache 2.0'
coreml_model.author = 'Unum Cloud'
coreml_model.save("../uform-vl-english-small-text.mlpackage")

## PyTorch

Let's ensure that the input layers and the model itself work fine in `bf16` (bfloat16) half-precision, so that the model is lighter and easier to download.

In [None]:
import torch
from safetensors import safe_open
from safetensors.torch import save_file

In [None]:
# Switch the image encoder to evaluation mode, then cast it to bfloat16.
# The value returned by `.to(...)` is the last expression, so it is shown as the cell output.
model.image_encoder.eval()
model.image_encoder.to(dtype=torch.bfloat16)

In [None]:
# Serialize the image encoder's state dict to `image.pt` in the working directory.
torch.save(model.image_encoder.state_dict(), 'image.pt')

In [None]:
# Write the same image-encoder state dict in the safetensors format.
save_file(model.image_encoder.state_dict(), "image.safetensors")

In [None]:
# Switch the text encoder to evaluation mode, then cast it to bfloat16.
# The value returned by `.to(...)` is the last expression, so it is shown as the cell output.
model.text_encoder.eval()
model.text_encoder.to(dtype=torch.bfloat16)

In [None]:
# Serialize the text encoder's state dict to `text.pt` in the working directory.
torch.save(model.text_encoder.state_dict(), 'text.pt')

In [None]:
# Write the same text-encoder state dict in the safetensors format.
save_file(model.text_encoder.state_dict(), "text.safetensors")

In [None]:
# Re-run both encoders after the bfloat16 cast as a smoke test.
# The image tensor is cast to bfloat16 for the call; `text_data` is passed through unchanged.
image_features, image_embedding = model.encode_image(
    image_data.to(dtype=torch.bfloat16),
    return_features=True,
)
text_features, text_embedding = model.encode_text(
    text_data,
    return_features=True,
)

# Show the four output shapes.
tuple(
    output.shape
    for output in (image_features, text_features, image_embedding, text_embedding)
)

In [None]:
# Upload both safetensors files to the `unum-cloud/uform2-vl-english-small` repository,
# keeping the local file names as the paths inside the repository.
!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors
!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors

In [None]:
# Upload both `.pt` state-dict files to the `unum-cloud/uform2-vl-english-small` repository,
# keeping the local file names as the paths inside the repository.
!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt
!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt

## ONNX

In [None]:
!pip install onnx onnxconverter-common

In [None]:
from torch.onnx import export as onnx_export

We can't export directly to `bfloat16`, as it's not supported by ONNX, and we can't export to `float16` either, as the forward pass (that will be traced) would fail. So let's export to a `float32` ONNX file first.

In [None]:
# Prepare the text encoder for export: evaluation mode, `return_features` flag set,
# and parameters cast to float32 for the export below.
# NOTE(review): the encoder was cast to bfloat16 in an earlier cell, so casting back to
# float32 presumably does not restore the precision lost there -- reload the checkpoint
# before exporting if that matters.
module = model.text_encoder
module.return_features = True
module.eval()
module.to(dtype=torch.float32)

# Export to `text.onnx`, using the example token ids and attention mask as inputs.
onnx_export(
    module,
    (text_data["input_ids"], text_data["attention_mask"]),
    "text.onnx",
    export_params=True,
    opset_version=15,
    do_constant_folding=True,
    input_names=['input_ids', 'attention_mask'],
    output_names=['features', 'embeddings'],
    # Axis 0 of every input and output is declared dynamic under the name 'batch_size'.
    dynamic_axes={
        tensor_name: {0: 'batch_size'}
        for tensor_name in ('input_ids', 'attention_mask', 'features', 'embeddings')
    },
)

Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision.

In [None]:
import onnx
from onnxconverter_common import float16

# Load the float32 export, convert it to float16, and overwrite `text.onnx` with the result.
# The loaded ONNX model gets its own name so that `module` is not rebound from the
# PyTorch encoder to a different kind of object.
onnx_model = onnx.load("text.onnx")
module_fp16 = float16.convert_float_to_float16(onnx_model)
onnx.save(module_fp16, "text.onnx")

Now repeat the same for images.

In [None]:
# Prepare the image encoder for export: evaluation mode, `return_features` flag set,
# and parameters cast to float32 for the export below.
# NOTE(review): the encoder was cast to bfloat16 in an earlier cell, so casting back to
# float32 presumably does not restore the precision lost there -- reload the checkpoint
# before exporting if that matters.
module = model.image_encoder
module.eval()
module.return_features = True
module.to(dtype=torch.float32)

# Export to `image.onnx` through the `onnx_export` alias imported earlier in the notebook,
# so that both encoders are exported through the same entry point.
onnx_export(
    module,
    image_data,
    "image.onnx",
    export_params=True,
    opset_version=15,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['features', 'embeddings'],
    # Axis 0 of the input and of both outputs is declared dynamic under the name 'batch_size'.
    dynamic_axes={
        'input': {0: 'batch_size'},
        'features': {0: 'batch_size'},
        'embeddings': {0: 'batch_size'},
    },
)

In [None]:
import onnx
from onnxconverter_common import float16

# Load the float32 export, convert it to float16, and overwrite `image.onnx` with the result.
# The loaded ONNX model gets its own name so that `module` is not rebound from the
# PyTorch encoder to a different kind of object.
onnx_model = onnx.load("image.onnx")
module_fp16 = float16.convert_float_to_float16(onnx_model)
onnx.save(module_fp16, "image.onnx")

In [None]:
# Upload both ONNX files to the `unum-cloud/uform2-vl-english-small` repository,
# keeping the local file names as the paths inside the repository.
!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx
!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx