In [1]:
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from peft import PeftModel

# Constants
MODEL_PATH = "florence2-lora-20250209T125717Z-001_MODEL-3_15-EPOCHS/florence2-lora"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Processor & Model
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)

base_model_id = "microsoft/Florence-2-base-ft"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id, trust_remote_code=True, revision="refs/pr/6"
).to(DEVICE)

peft_model = PeftModel.from_pretrained(base_model, MODEL_PATH).to(DEVICE)


Florence2LanguageForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


## This saves as a 410MB quantized pt model file

In [2]:
from torch.ao.quantization import quantize_dynamic
quantized_model = quantize_dynamic(peft_model, {torch.nn.Linear}, dtype=torch.qint8)
torch.save(quantized_model.state_dict(), "florence2_lora_quantized.pt")

In [3]:
peft_model.eval()
peft_model.to("cuda" if torch.cuda.is_available() else "cpu")

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Florence2ForConditionalGeneration(
      (vision_tower): DaViT(
        (convs): ModuleList(
          (0): ConvEmbed(
            (proj): Conv2d(3, 128, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
            (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          )
          (1): ConvEmbed(
            (proj): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          )
          (2): ConvEmbed(
            (proj): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          )
          (3): ConvEmbed(
            (proj): Conv2d(512, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          )
        )
        (blocks): ModuleList(
      

## This prunes the model, but still same size 410MB, not great

In [4]:
import torch
import torch.nn.utils.prune as prune

checkpoint_path = "florence2_lora_quantized.pt"

# Load model
model = torch.load(checkpoint_path, map_location="cpu")

# Apply pruning (removing 50% of weights in linear layers)
for name, module in model.items():
    if isinstance(module, torch.nn.Linear):  # Apply pruning to linear layers
        prune.l1_unstructured(module, name="weight", amount=0.5)

# Save pruned model
pruned_checkpoint_path = "florence2_lora_quantized_pruned.pt"
torch.save(model, pruned_checkpoint_path)
print(f"Pruned model saved as {pruned_checkpoint_path}")



  model = torch.load(checkpoint_path, map_location="cpu")
  device=storage.device,


Pruned model saved as florence2_lora_quantized_pruned.pt


##  INT8 or FP16 Quantization: This reduces size to 344MB

In [5]:
import torch

checkpoint_path = "florence2_lora_quantized_pruned.pt"
quantized_checkpoint_path = "florence2_lora_quantized_pruned_int8.pt"  # Change to _fp16.pt if using float16

# Load checkpoint
checkpoint = torch.load(checkpoint_path, map_location="cpu")

# Convert all tensor values in the state_dict
for key in checkpoint:
    if isinstance(checkpoint[key], torch.Tensor):  # Only convert tensors
        checkpoint[key] = checkpoint[key].to(torch.int8)  # Change to .to(torch.float16) for FP16

# Save quantized checkpoint
torch.save(checkpoint, quantized_checkpoint_path)
print(f"Quantized checkpoint saved as {quantized_checkpoint_path}")


  checkpoint = torch.load(checkpoint_path, map_location="cpu")


Quantized checkpoint saved as florence2_lora_quantized_pruned_int8.pt


## This saves as a 338MB onnx model file using VIT Base Model

In [7]:
import torch
import timm
import onnx

# Load the ViT model
model_name = "vit_base_patch16_224"
model = timm.create_model(model_name, pretrained=False)  # No need to load default weights
model.eval()  # Set to evaluation mode

# Load pretrained Florence v2 LoRA quantized weights
checkpoint_path = "florence2_lora_quantized_pruned_int8.pt"
checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

# Load weights into the model
model.load_state_dict(checkpoint, strict=False)  # Use strict=False in case of mismatched keys

print("Pretrained Florence v2 LoRA quantized weights loaded successfully!")

# Dummy input (batch_size=1, 3 color channels, 224x224 image size)
dummy_input = torch.randn(1, 3, 224, 224)

# Export to ONNX
onnx_path = "vit_base_patch16_224_florence2.onnx"
torch.onnx.export(
    model, 
    dummy_input, 
    onnx_path, 
    input_names=["input"], 
    output_names=["output"], 
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, 
    opset_version=14  # Use a compatible ONNX opset version
)

print(f"Model exported successfully to {onnx_path}")


  checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))


Pretrained Florence v2 LoRA quantized weights loaded successfully!


  assert condition, message


Model exported successfully to vit_base_patch16_224_florence2.onnx


## This saves as a 86MB onnx model file using VIT Small Model

In [11]:
import torch
import timm
import onnx

# Use a smaller ViT model
model_name = "vit_small_patch16_224"  # Use "vit_tiny_patch16_224" for an even smaller model
model = timm.create_model(model_name, pretrained=False)  
model.eval()  # Set to evaluation mode

# Load pretrained Florence v2 LoRA quantized weights
checkpoint_path = "florence2_lora_quantized_pruned_int8.pt"
checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))

# Load weights into the model
model.load_state_dict(checkpoint, strict=False)  # Allow mismatched keys

print("Pretrained Florence v2 LoRA quantized weights loaded successfully!")

# Dummy input for ONNX export (batch_size=1, 3 color channels, 224x224 image size)
dummy_input = torch.randn(1, 3, 224, 224)

# Export to ONNX
onnx_path = "vit_small_patch16_224_florence2.onnx"  # Change to vit_tiny_patch16_224_florence2.onnx for a smaller model
torch.onnx.export(
    model, 
    dummy_input, 
    onnx_path, 
    input_names=["input"], 
    output_names=["output"], 
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, 
    opset_version=14  # Compatible ONNX opset version
)

print(f"Model exported successfully to {onnx_path}")


  checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))


Pretrained Florence v2 LoRA quantized weights loaded successfully!
Model exported successfully to vit_small_patch16_224_florence2.onnx


In [24]:
import torch
import torchvision
import timm
import onnx
from torchvision.models.detection import fasterrcnn_resnet50_fpn
model = fasterrcnn_resnet50_fpn(pretrained=True)


# Load DINOv2 model
# model_name = "vit_small_patch14_dinov2.lvd142m"  # Adjust as needed
# model = timm.create_model(model_name, pretrained=False)
# model.eval()

# Load a pretrained DETR model (End-to-End Object Detector)
# model = torchvision.models.detection.detr_resnet50(pretrained=True)
model.eval()

# Load Florence v2 quantized weights
checkpoint_path = "florence2_lora_quantized_pruned_int8.pt"
checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))
model.load_state_dict(checkpoint, strict=False)

print("Pretrained Florence v2 LoRA quantized weights loaded successfully!")

# Dummy input (assuming image size is 518x518 as per DINOv2)
dummy_input = torch.randn(1, 3, 518, 518)

# Export to ONNX with object detection outputs
onnx_path = "dinov2_florence2_od.onnx"
torch.onnx.export(
    model,
    dummy_input,
    onnx_path,
    input_names=["input"],
    output_names=["bboxes", "labels"],  # Include outputs needed for OD
    dynamic_axes={"input": {0: "batch_size"}, "bboxes": {0: "batch_size"}, "labels": {0: "batch_size"}},
    opset_version=14
)

print(f"Model exported successfully to {onnx_path}")


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to C:\Users\cynth/.cache\torch\hub\checkpoints\fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [03:07<00:00, 891kB/s] 
  checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))


Pretrained Florence v2 LoRA quantized weights loaded successfully!


  * torch.tensor(scale_factors[i], dtype=torch.float32)
  boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
  boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
  assert condition, message
  torch.tensor(s, dtype=torch.float32, device=boxes.device)
  / torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)


Model exported successfully to dinov2_florence2_od.onnx


In [9]:
# Run a forward pass to inspect the output format
with torch.no_grad():
    sample_output = model(dummy_input)
    print(f"Model Output Shape: {sample_output.shape}")  # Debug output


Model Output Shape: torch.Size([1, 1000])


## This normally saves the model = 1GB

In [None]:

# Save the model as a .pt file
SAVE_PATH = "florence2_lora.pt"
torch.save(peft_model.state_dict(), SAVE_PATH)
print(f"Model saved to {SAVE_PATH}")


In [2]:
import numpy
print(numpy.__version__)

2.0.2


In [None]:
import torch
from transformers import AutoModel

model_name = "microsoft/Florence-2-base-ft"  # Ensure this is the correct model ID
pytorch_model = AutoModel.from_pretrained(model_name)

dummy_input = torch.randn(1, 3, 224, 224)  # Adjust based on the input shape
torch.onnx.export(pytorch_model, dummy_input, "florencev2.onnx")


OSError: microsoft/florence-v2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`