In [1]:
require 'onnxruntime'
require 'rmagick'
require 'torch-rb'
require 'numo/narray'

def preprocess_image(image)
  # Resize to 224x224 using RMagick
  image = image.resize_to_fill(224, 224)
  
  # Convert to RGB array and normalize
  rgb_data = image.export_pixels(0, 0, image.columns, image.rows, 'RGB')
  rgb_array = Numo::DFloat.cast(rgb_data).reshape(3, 224, 224) / 255.0
  
  # Apply CLIP normalization
  means = Numo::DFloat[0.48145466, 0.4578275, 0.40821073]
  stds = Numo::DFloat[0.26862954, 0.26130258, 0.27577711]
  
  3.times do |c|
    rgb_array[c, true, true] -= means[c]
    rgb_array[c, true, true] /= stds[c]
  end
  
  # Add batch dimension
  rgb_array.reshape(1, 3, 224, 224)
end

def simple_tokenize(text, context_length=77)
  # Simple tokenizer that creates zero array
  Numo::Int64.zeros(1, context_length)
end

# def get_image_embeddings(model, image_paths, batch_size: 32)
#   embeddings_list = []
  
#   (0...image_paths.size).step(batch_size) do |start_idx|
#     batch_paths = image_paths[start_idx, batch_size]
    
#     # Process batch of images
#     batch_tensors = batch_paths.map do |path|
#       image = Magick::Image.read(path).first
#       preprocess_image(image)
#     end
    
#     # Combine batch
#     batch_tensor = Numo::DFloat.zeros(batch_paths.size, 3, 224, 224)
#     batch_tensors.each_with_index do |tensor, i|
#       batch_tensor[i, true, true, true] = tensor[0, true, true, true]
#     end
    
#     # Create dummy text tokens
#     dummy_texts = Numo::Int64.zeros(batch_paths.size, 77)
    
#     # Run inference
#     outputs = model.predict({
#       "image_input" => batch_tensor,
#       "text_input" => dummy_texts
#     })
    
#     # Get embeddings - ensure it's a Numo::NArray
#     batch_embeddings = Numo::NArray.cast(outputs["image_features"])
    
#     # Add debugging
#     puts "Batch embeddings type: #{batch_embeddings.class}"
#     puts "Batch embeddings shape: #{batch_embeddings.shape}"
    
#     # Convert to torch tensor
#     begin
#       batch_embeddings_torch = Torch.tensor(batch_embeddings.to_a)
#       norms = batch_embeddings_torch.norm(2, 1, keepdim: true)
#       normalized_embeddings = batch_embeddings_torch / norms
      
#       embeddings_list << normalized_embeddings
#     rescue => e
#       puts "Error in conversion: #{e.message}"
#       puts "Shape before conversion: #{batch_embeddings.shape}"
#       next
#     end
#   end
  
#   # Combine all batches
#   Torch.cat(embeddings_list, dim: 0)
# end




# Usage
# begin
#   # Load ONNX model
#   model = OnnxRuntime::Model.new('clip_image_text_encoder.onnx')
  
#   # Process directory of images
#   image_dir = 'house_data_png'
#   image_paths = Dir[File.join(image_dir, '*')]
#   puts "Found #{image_paths.length} images"
  
#   # Get embeddings
#   embeddings = get_image_embeddings(model, image_paths)
#   puts "Final embeddings shape: #{embeddings.shape}"
  
# rescue => e
#   puts "Error: #{e.message}"
#   puts e.backtrace
# end


def get_image_embeddings(model, image_paths, batch_size: 32, limit: nil)
 # Apply limit if specified
 image_paths = image_paths[0...limit] if limit
 
 puts "Processing #{image_paths.length} images..."
 
 embeddings_list = []
 
 (0...image_paths.size).step(batch_size) do |start_idx|
   batch_paths = image_paths[start_idx, batch_size]
   
   puts "Processing batch #{start_idx/batch_size + 1} of #{(image_paths.size.to_f/batch_size).ceil}"
   
   # Process batch of images
   batch_tensors = batch_paths.map do |path|
     image = Magick::Image.read(path).first
     preprocess_image(image)
   end
   
   # Combine batch
   batch_tensor = Numo::DFloat.zeros(batch_paths.size, 3, 224, 224)
   batch_tensors.each_with_index do |tensor, i|
     batch_tensor[i, true, true, true] = tensor[0, true, true, true]
   end
   
   # Create dummy text tokens
   dummy_texts = Numo::Int64.zeros(batch_paths.size, 77)
   
   # Run inference
   outputs = model.predict({
     "image_input" => batch_tensor,
     "text_input" => dummy_texts
   })
   
   # Get embeddings - ensure it's a Numo::NArray
   batch_embeddings = Numo::NArray.cast(outputs["image_features"])
   
   # Add debugging
   puts "Batch embeddings type: #{batch_embeddings.class}"
   puts "Batch embeddings shape: #{batch_embeddings.shape}"
   
   # Convert to torch tensor
   begin
     batch_embeddings_torch = Torch.tensor(batch_embeddings.to_a)
     norms = batch_embeddings_torch.norm(2, 1, keepdim: true)
     normalized_embeddings = batch_embeddings_torch / norms
     
     embeddings_list << normalized_embeddings
   rescue => e
     puts "Error in conversion: #{e.message}"
     puts "Shape before conversion: #{batch_embeddings.shape}"
     next
   end
 end
 
 # Combine all batches
 Torch.cat(embeddings_list, dim: 0)
end

# Usage
begin
 # Load ONNX model
 model = OnnxRuntime::Model.new('clip_image_text_encoder.onnx')
 
 # Process directory of images
 image_dir = 'house_data_png'
 image_paths = Dir[File.join(image_dir, '*')]
 puts "Found #{image_paths.length} images"
 
 # Get embeddings for first 100 images
 embeddings = get_image_embeddings(model, image_paths, limit: 100)
 puts "Final embeddings shape: #{embeddings.shape}"
 
rescue => e
 puts "Error: #{e.message}"
 puts e.backtrace
end

Found 5249 images
Processing 100 images...
Processing batch 1 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 2 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 3 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 4 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [4, 512]
Final embeddings shape: [100, 512]


In [2]:
embeddings

tensor([[ 0.0086, -0.0172,  0.0142,  ...,  0.0412, -0.0064,  0.0027],
        [ 0.0140,  0.0025,  0.0138,  ...,  0.0485,  0.0043,  0.0073],
        [-0.0073, -0.0085,  0.0317,  ...,  0.0320, -0.0004,  0.0363],
        ...,
        [ 0.0066, -0.0006,  0.0121,  ...,  0.0584, -0.0078, -0.0052],
        [ 0.0128, -0.0024,  0.0054,  ...,  0.0549, -0.0059, -0.0072],
        [-0.0009, -0.0002,  0.0173,  ...,  0.0421,  0.0040,  0.0087]])