# Scripts for Exporting PyTorch Models to ONNX and CoreML

In [None]:
!pip install --upgrade "uform[torch]" coremltools

In [None]:
import os
# Directory the exported `.mlpackage` bundles are written to.
output_directory = "../../"

# Checkpoint to export: prefixed with 'unum-cloud/' when the model is loaded and
# reused as the stem of the exported file names.
model_name = "uform-vl-english-small"

In [None]:
import uform
from PIL import Image

# Fetch the model and its matching processor by repository name.
model, processor = uform.get_model(f'unum-cloud/{model_name}')

# One sample per modality; the pre-processed tensors are reused later as
# example inputs for tracing.
image = Image.open('../../assets/unum.png')
text = 'a small red panda in a zoo'

image_data = processor.preprocess_image(image)
text_data = processor.preprocess_text(text)

# With `return_features=True` each call yields a (features, embedding) pair.
image_features, image_embedding = model.encode_image(image_data, return_features=True)
text_features, text_embedding = model.encode_text(text_data, return_features=True)

# Display the shapes of all four outputs.
tuple(
    output.shape
    for output in (image_features, text_features, image_embedding, text_embedding)
)

In [None]:
# Display the text encoder so its structure can be inspected.
model.text_encoder

In [None]:
# Display the image encoder so its structure can be inspected.
model.image_encoder

In [None]:
# Print the name of the first direct child of each encoder. The inner loop is
# left after one iteration, so an encoder without children prints nothing.
for encoder_label, encoder in (
    ("image_encoder", model.image_encoder),
    ("text_encoder", model.text_encoder),
):
    for child_name, _ in encoder.named_children():
        print(f"First layer of {encoder_label}: {child_name}")
        break

# PyTorch

Let's ensure:

- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.
- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.
- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download.

In [None]:
# Names of all parameters registered on the text encoder.
[parameter_name for parameter_name, _ in model.text_encoder.named_parameters()]

In [None]:
import copy
import inspect

# `named_parameters()` lists weights and `named_modules()` lists sub-modules, so
# neither can tell what `forward` accepts or returns. The closest thing a
# PyTorch module has to "input names" are the argument names of `forward`.
text_encoder_input_names = list(inspect.signature(model.text_encoder.forward).parameters)
image_encoder_input_names = list(inspect.signature(model.image_encoder.forward).parameters)

# Module outputs are positional and carry no names of their own; these are the
# names attached to the exported tensors via `ct.TensorType(name=...)`.
text_encoder_output_names = ["features", "embeddings"]
image_encoder_output_names = ["features", "embeddings"]

# Report the comparison instead of asserting: a differently named `forward`
# argument is not fatal, because the exported input names are assigned
# explicitly at conversion time.
input_name_checks = {
    "text_encoder": (text_encoder_input_names, ["input_ids", "attention_mask"]),
    "image_encoder": (image_encoder_input_names, ["input"]),
}
for encoder_name, (actual_names, wanted_names) in input_name_checks.items():
    missing_names = [wanted for wanted in wanted_names if wanted not in actual_names]
    if missing_names:
        print(
            f"{encoder_name}.forward takes {actual_names}; "
            f"{missing_names} must be assigned at export time."
        )
    else:
        print(f"{encoder_name}.forward takes the expected inputs: {wanted_names}")

# Ensure the model can be converted to f16 half-precision. `half()` converts a
# module in place, so it is applied to a deep copy: the tracing cells further
# down reuse `model` with the example tensors produced by `processor`
# (presumably float32 - confirm), and must not receive an f16 model.
try:
    half_precision_model = copy.deepcopy(model).half()
    print("Model successfully converted to half precision (f16).")
except Exception as e:
    print(f"An error occurred while converting the model to half precision: {e}")
else:
    del half_precision_model  # release the duplicated weights

## ONNX

## CoreML

In [None]:
import coremltools as ct
import torch

In [None]:
# Compute precision handed to `ct.convert(..., compute_precision=precision)` in the conversion cells.
precision = ct.precision.FLOAT32

CoreML Tools can convert traced PyTorch models to CoreML models, and this section demonstrates that conversion for both encoders. For that, we need to describe every input tensor; the tensor shapes are taken from the example inputs.

```python
        image_input = ct.TensorType(name="input", shape=image_data.shape)
        text_input = ct.TensorType(name="input_ids", shape=text_data["input_ids"].shape)
        text_attention_input = ct.TensorType(name="attention_mask", shape=text_data["attention_mask"].shape)
```

That, however, will only work for batch-size one. To support larger batches, we need to override the input shapes, replacing the fixed first (batch) dimension with a flexible `ct.RangeDim`, for example:

```python
        ct.RangeDim(lower_bound=25, upper_bound=100, default=45)
```

In [None]:
def generalize_first_dimensions(input_shape, upper_bound=64):
    """Make the leading dimension of `input_shape` flexible.

    Args:
        input_shape: Sequence of dimension sizes; only the first entry is replaced.
        upper_bound: Largest allowed size of the first dimension. A value of 1
            leaves the shape untouched.

    Returns:
        `input_shape` itself when `upper_bound` is 1; otherwise a new shape whose
        first entry is a `ct.RangeDim` from 1 to `upper_bound` (default 1),
        followed by the remaining original dimensions.
    """
    if upper_bound != 1:
        flexible_first = ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1)
        return (flexible_first,) + input_shape[1:]
    return input_shape

# Preview the flexible shapes for the image tensor and both text tensors.
tuple(
    generalize_first_dimensions(example.shape)
    for example in (image_data, text_data["input_ids"], text_data["attention_mask"])
)

In [None]:
# Input descriptors: the exported names are attached here and the shapes come
# from the example tensors. An upper bound of 1 makes
# `generalize_first_dimensions` return the example shape unchanged.
image_input = ct.TensorType(
    name="input",
    shape=generalize_first_dimensions(image_data.shape, 1),
)
text_input = ct.TensorType(
    name="input_ids",
    shape=generalize_first_dimensions(text_data["input_ids"].shape, 1),
)
text_attention_input = ct.TensorType(
    name="attention_mask",
    shape=generalize_first_dimensions(text_data["attention_mask"].shape, 1),
)

# Output descriptors carry only a name; both encoders use the same two names.
image_features = ct.TensorType(name="features")
image_embeddings = ct.TensorType(name="embeddings")
text_features = ct.TensorType(name="features")
text_embeddings = ct.TensorType(name="embeddings")

In [None]:
# Put the image encoder into evaluation mode and set its `return_features` flag.
module = model.image_encoder.eval()
module.return_features = True

# Trace with the pre-processed example image as the only positional input.
traced_script_module = torch.jit.trace(module, example_inputs=(image_data,))
traced_script_module

In [None]:
# Convert the traced image encoder into an ML Program.
coreml_model = ct.convert(
    traced_script_module,
    source="pytorch",
    inputs=[image_input],
    outputs=[image_features, image_embeddings],
    convert_to='mlprogram',
    compute_precision=precision,
)

# Attach the package metadata before writing the model to disk.
coreml_model.license = 'Apache 2.0'
coreml_model.author = 'Unum Cloud'
coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'

image_package_path = os.path.join(output_directory, f"{model_name}-image.mlpackage")
coreml_model.save(image_package_path)

In [None]:
# Put the text encoder into evaluation mode and set its `return_features` flag.
module = model.text_encoder.eval()
module.return_features = True

# Trace with the tokenized example; the two tensors are passed positionally.
text_example_inputs = (text_data['input_ids'], text_data['attention_mask'])
traced_script_module = torch.jit.trace(module, example_inputs=text_example_inputs)
traced_script_module

In [None]:
# Convert the traced text encoder into an ML Program.
coreml_model = ct.convert(
    traced_script_module,
    source="pytorch",
    inputs=[text_input, text_attention_input],
    outputs=[text_features, text_embeddings],
    convert_to='mlprogram',
    compute_precision=precision,
)

# Attach the package metadata before writing the model to disk.
coreml_model.license = 'Apache 2.0'
coreml_model.author = 'Unum Cloud'
coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'

text_package_path = os.path.join(output_directory, f"{model_name}-text.mlpackage")
coreml_model.save(text_package_path)