In [2]:
import os
import torch
from pathlib import Path
from rfdetr import RFDETRNano

In [5]:
# Recall saved Model checkpoint for 10 epochs
checkpoint_path = "models/trained_models/rfdetr_best_run_20250727_123823/checkpoint.pth"
saved_model_path = str(Path.cwd().parent / checkpoint_path)

# Load the RFDETR model with the pre-trained weights from the checkpoint
model = RFDETRNano(pretrain_weights=saved_model_path)

Using a different number of positional encodings than DINOv2, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.
Using patch size 16 instead of 14, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.
Loading pretrain weights


num_classes mismatch: pretrain weights has 9 classes, but your model has 90 classes
reinitializing detection head with 9 classes


# Model Base Size Optimization

In [9]:
def get_file_size_mb(file_path):
    """Get model size in MB"""
    size_bytes = os.path.getsize(file_path)
    size_mb = size_bytes / (1000 * 1000)
    return size_mb

# Before loading
original_size = get_file_size_mb(saved_model_path)
print(f"Original file size: {original_size:.2f} MB")

Original file size: 483.66 MB


In [6]:
checkpoint = torch.load(saved_model_path, weights_only=False)
checkpoint

{'model': OrderedDict([('transformer.decoder.layers.0.self_attn.in_proj_weight',
               tensor([[ 0.0529,  0.0423, -0.0011,  ...,  0.0051,  0.0480,  0.0495],
                       [-0.1254,  0.1441, -0.0490,  ..., -0.0093,  0.0924,  0.0932],
                       [ 0.0320,  0.0045, -0.0337,  ...,  0.0093, -0.0397,  0.0387],
                       ...,
                       [ 0.1894, -0.1131,  0.3422,  ..., -0.1349,  0.2828, -0.0270],
                       [ 0.0300,  0.0070, -0.1224,  ..., -0.0033,  0.0076,  0.1306],
                       [-0.0684,  0.1403, -0.3135,  ...,  0.1712, -0.1475,  0.0092]],
                      device='cuda:0')),
              ('transformer.decoder.layers.0.self_attn.in_proj_bias',
               tensor([-1.9461e-02,  1.2657e-02,  1.3592e-01, -5.9212e-02,  2.6691e-02,
                       -1.3242e-01, -2.9438e-02,  1.1290e-01, -1.9184e-02,  1.3823e-01,
                       -4.4255e-02, -6.3642e-02,  1.0147e-01, -1.6277e-01,  3.5565e-02,
     

In [7]:
checkpoint.keys()

dict_keys(['model', 'optimizer', 'lr_scheduler', 'epoch', 'args', 'ema_model'])

In [8]:
checkpoint['args']

Namespace(num_classes=10, grad_accum_steps=1, amp=True, lr=0.0001, lr_encoder=0.00015, batch_size=16, weight_decay=0.0001, epochs=30, lr_drop=100, clip_max_norm=0.1, lr_vit_layer_decay=0.8, lr_component_decay=0.7, do_benchmark=False, dropout=0, drop_path=0.0, drop_mode='standard', drop_schedule='constant', cutoff_epoch=0, pretrained_encoder=None, pretrain_weights='rf-detr-nano.pth', pretrain_exclude_keys=None, pretrain_keys_modify_to_load=None, pretrained_distiller=None, encoder='dinov2_windowed_small', vit_encoder_num_layers=12, window_block_indexes=None, position_embedding='sine', out_feature_indexes=[3, 6, 9, 12], freeze_encoder=False, layer_norm=True, rms_norm=False, backbone_lora=False, force_no_pretrain=False, dec_layers=2, dim_feedforward=2048, hidden_dim=256, sa_nheads=8, ca_nheads=16, num_queries=300, group_detr=13, two_stage=True, projector_scale=['P4'], lite_refpoint_refine=True, num_select=300, dec_n_points=2, decoder_norm='LN', bbox_reparam=True, freeze_batch_norm=False, s

In [11]:
def fp16_quantize_checkpoint(checkpoint):
   state_dict = checkpoint['model']
   quantized_state_dict = {}
   
   for name, tensor in state_dict.items():
       if tensor.dtype == torch.float32:
           quantized_state_dict[name] = tensor.half()
       else:
           quantized_state_dict[name] = tensor
   
   new_checkpoint = checkpoint.copy()
   new_checkpoint['model'] = quantized_state_dict
   return new_checkpoint

# Usage
checkpoint = torch.load(saved_model_path, weights_only=False)
quantized = fp16_quantize_checkpoint(checkpoint)
torch.save(quantized, 'quantized_model_fp16.pth')

In [12]:
get_file_size_mb('quantized_model_fp16.pth')

423.308029

In [16]:
print(f"{((get_file_size_mb(saved_model_path) - get_file_size_mb('quantized_model_fp16.pth')) / get_file_size_mb(saved_model_path)) * 100}% reduction in size")

12.478525158838183% reduction in size


In [17]:
def clean_checkpoint_keep_args(checkpoint):
    # Keep only model weights and args
    return {
        'model': checkpoint['model'],
        'args': checkpoint['args']
    }

# Clean + keep args, then quantize
cleaned = clean_checkpoint_keep_args(checkpoint)
quantized = fp16_quantize_checkpoint(cleaned)
torch.save(quantized, 'model_clean_args_fp16.pth')

In [18]:
get_file_size_mb('model_clean_args_fp16.pth')

60.521183

In [19]:
print(f"{((get_file_size_mb(saved_model_path) - get_file_size_mb('model_clean_args_fp16.pth')) / get_file_size_mb(saved_model_path)) * 100}% reduction in size")

87.48688228804691% reduction in size


# Other Inference Based Optimization

In [20]:
import cv2

In [None]:
def resize_inference_test(image_path, model, scale_factor=0.75):
    """Test with smaller input resolution""" # did not work well with 0.5x
    
    import time
    original_image = cv2.imread(str(image_path))
    
    # Resize image down
    h, w = original_image.shape[:2]
    new_h, new_w = int(h * scale_factor), int(w * scale_factor)
    resized_image = cv2.resize(original_image, (new_w, new_h))
    
    # Time inference
    start = time.time()
    def callback(image):
        return model.predict(image, threshold=0.8)
    
    slicer = sv.InferenceSlicer(callback=callback)
    detections = slicer(resized_image)
    inference_time = time.time() - start
    
    print(f"Resized inference ({scale_factor}x): {inference_time:.3f}s")
    print(f"Resolution: {w}x{h} -> {new_w}x{new_h}")
    
    return detections, resized_image, inference_time




In [None]:
# Test 1: Smaller input resolution # this worked better for increasing tile size
image_path = str(Path.cwd().parent / 'dataset/inference/1038.tif')


rediced_model_path = "model_clean_args_fp16.pth"
reduced_model = RFDETRNano(pretrain_weights=rediced_model_path)

detections_75, img_75, time_75 = resize_inference_test(image_path, reduced_model, 0.75)
detections_50, img_50, time_50 = resize_inference_test(image_path, reduced_model, 0.50)

# Test 2: Optimized SAHI
detections_opt, img_opt, time_opt = optimized_sahi_inference(image_path, reduced_model)

print(f"\nSpeed comparison:")
print(f"75% resolution: {time_75:.3f}s")
print(f"50% resolution: {time_50:.3f}s") 
print(f"Optimized SAHI: {time_opt:.3f}s")

Using a different number of positional encodings than DINOv2, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.
Using patch size 16 instead of 14, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.


num_classes mismatch: pretrain weights has 9 classes, but your model has 90 classes
reinitializing detection head with 9 classes


Loading pretrain weights


Model is not optimized for inference. Latency may be higher than expected. You can optimize the model for inference by calling model.optimize_for_inference().


Resized inference (0.75x): 1.905s
Resolution: 3195x3215 -> 2396x2411




Resized inference (0.5x): 0.522s
Resolution: 3195x3215 -> 1597x1607




Optimized SAHI: 1.686s

Speed comparison:
75% resolution: 1.905s
50% resolution: 0.522s
Optimized SAHI: 1.686s


In [27]:
detections_75

Detections(xyxy=array([[ 2.56696802e+02, -1.30135536e+00,  5.74488342e+02,
         3.18610107e+02],
       [ 7.66763847e+02, -1.30437851e+00,  1.08854950e+03,
         3.18316193e+02],
       [ 1.02213090e+03, -4.21466827e-01,  1.34332614e+03,
         3.17532104e+02],
       [-2.48260498e-01,  2.54728886e+02,  3.20560120e+02,
         5.74989532e+02],
       [ 5.10467331e+02,  2.54323660e+02,  8.31976776e+02,
         5.74174622e+02],
       [ 7.67178095e+02,  2.54643990e+02,  1.08743036e+03,
         5.74667755e+02],
       [ 2.04698049e+03,  2.56724726e+02,  2.36857404e+03,
         5.70536926e+02],
       [ 2.30406795e+03,  2.56284452e+02,  2.39547473e+03,
         5.73842896e+02],
       [ 5.12116348e+02,  5.10054266e+02,  8.31627991e+02,
         8.30904297e+02],
       [ 7.67020138e+02,  5.10462620e+02,  1.08736945e+03,
         8.31887909e+02],
       [ 1.28510387e+03,  5.08939936e+02,  1.58115488e+03,
         8.30013885e+02],
       [ 1.80011992e+03,  5.10931093e+02,  2.1071

In [28]:
detections_75.xyxy

array([[ 2.56696802e+02, -1.30135536e+00,  5.74488342e+02,
         3.18610107e+02],
       [ 7.66763847e+02, -1.30437851e+00,  1.08854950e+03,
         3.18316193e+02],
       [ 1.02213090e+03, -4.21466827e-01,  1.34332614e+03,
         3.17532104e+02],
       [-2.48260498e-01,  2.54728886e+02,  3.20560120e+02,
         5.74989532e+02],
       [ 5.10467331e+02,  2.54323660e+02,  8.31976776e+02,
         5.74174622e+02],
       [ 7.67178095e+02,  2.54643990e+02,  1.08743036e+03,
         5.74667755e+02],
       [ 2.04698049e+03,  2.56724726e+02,  2.36857404e+03,
         5.70536926e+02],
       [ 2.30406795e+03,  2.56284452e+02,  2.39547473e+03,
         5.73842896e+02],
       [ 5.12116348e+02,  5.10054266e+02,  8.31627991e+02,
         8.30904297e+02],
       [ 7.67020138e+02,  5.10462620e+02,  1.08736945e+03,
         8.31887909e+02],
       [ 1.28510387e+03,  5.08939936e+02,  1.58115488e+03,
         8.30013885e+02],
       [ 1.80011992e+03,  5.10931093e+02,  2.10712424e+03,
      