In [1]:
require 'onnxruntime'
require 'rmagick'
require 'torch-rb'
require 'numo/narray'

def preprocess_image(image)
  # Resize to 224x224 using RMagick
  image = image.resize_to_fill(224, 224)
  
  # Convert to RGB array and normalize
  rgb_data = image.export_pixels(0, 0, image.columns, image.rows, 'RGB')
  rgb_array = Numo::DFloat.cast(rgb_data).reshape(3, 224, 224) / 255.0
  
  # Apply CLIP normalization
  means = Numo::DFloat[0.48145466, 0.4578275, 0.40821073]
  stds = Numo::DFloat[0.26862954, 0.26130258, 0.27577711]
  
  3.times do |c|
    rgb_array[c, true, true] -= means[c]
    rgb_array[c, true, true] /= stds[c]
  end
  
  # Add batch dimension
  rgb_array.reshape(1, 3, 224, 224)
end

def simple_tokenize(text, context_length=77)
  # Simple tokenizer that creates zero array
  Numo::Int64.zeros(1, context_length)
end

# def get_image_embeddings(model, image_paths, batch_size: 32)
#   embeddings_list = []
  
#   (0...image_paths.size).step(batch_size) do |start_idx|
#     batch_paths = image_paths[start_idx, batch_size]
    
#     # Process batch of images
#     batch_tensors = batch_paths.map do |path|
#       image = Magick::Image.read(path).first
#       preprocess_image(image)
#     end
    
#     # Combine batch
#     batch_tensor = Numo::DFloat.zeros(batch_paths.size, 3, 224, 224)
#     batch_tensors.each_with_index do |tensor, i|
#       batch_tensor[i, true, true, true] = tensor[0, true, true, true]
#     end
    
#     # Create dummy text tokens
#     dummy_texts = Numo::Int64.zeros(batch_paths.size, 77)
    
#     # Run inference
#     outputs = model.predict({
#       "image_input" => batch_tensor,
#       "text_input" => dummy_texts
#     })
    
#     # Get embeddings - ensure it's a Numo::NArray
#     batch_embeddings = Numo::NArray.cast(outputs["image_features"])
    
#     # Add debugging
#     puts "Batch embeddings type: #{batch_embeddings.class}"
#     puts "Batch embeddings shape: #{batch_embeddings.shape}"
    
#     # Convert to torch tensor
#     begin
#       batch_embeddings_torch = Torch.tensor(batch_embeddings.to_a)
#       norms = batch_embeddings_torch.norm(2, 1, keepdim: true)
#       normalized_embeddings = batch_embeddings_torch / norms
      
#       embeddings_list << normalized_embeddings
#     rescue => e
#       puts "Error in conversion: #{e.message}"
#       puts "Shape before conversion: #{batch_embeddings.shape}"
#       next
#     end
#   end
  
#   # Combine all batches
#   Torch.cat(embeddings_list, dim: 0)
# end




# Usage
# begin
#   # Load ONNX model
#   model = OnnxRuntime::Model.new('clip_image_text_encoder.onnx')
  
#   # Process directory of images
#   image_dir = 'house_data_png'
#   image_paths = Dir[File.join(image_dir, '*')]
#   puts "Found #{image_paths.length} images"
  
#   # Get embeddings
#   embeddings = get_image_embeddings(model, image_paths)
#   puts "Final embeddings shape: #{embeddings.shape}"
  
# rescue => e
#   puts "Error: #{e.message}"
#   puts e.backtrace
# end


def get_image_embeddings(model, image_paths, batch_size: 32, limit: nil)
 # Apply limit if specified
 image_paths = image_paths[0...limit] if limit
 
 puts "Processing #{image_paths.length} images..."
 
 embeddings_list = []
 
 (0...image_paths.size).step(batch_size) do |start_idx|
   batch_paths = image_paths[start_idx, batch_size]
   
   puts "Processing batch #{start_idx/batch_size + 1} of #{(image_paths.size.to_f/batch_size).ceil}"
   
   # Process batch of images
   batch_tensors = batch_paths.map do |path|
     image = Magick::Image.read(path).first
     preprocess_image(image)
   end
   
   # Combine batch
   batch_tensor = Numo::DFloat.zeros(batch_paths.size, 3, 224, 224)
   batch_tensors.each_with_index do |tensor, i|
     batch_tensor[i, true, true, true] = tensor[0, true, true, true]
   end
   
   # Create dummy text tokens
   dummy_texts = Numo::Int64.zeros(batch_paths.size, 77)
   
   # Run inference
   outputs = model.predict({
     "image_input" => batch_tensor,
     "text_input" => dummy_texts
   })
   
   # Get embeddings - ensure it's a Numo::NArray
   batch_embeddings = Numo::NArray.cast(outputs["image_features"])
   
   # Add debugging
   puts "Batch embeddings type: #{batch_embeddings.class}"
   puts "Batch embeddings shape: #{batch_embeddings.shape}"
   
   # Convert to torch tensor
   begin
     batch_embeddings_torch = Torch.tensor(batch_embeddings.to_a)
     norms = batch_embeddings_torch.norm(2, 1, keepdim: true)
     normalized_embeddings = batch_embeddings_torch / norms
     
     embeddings_list << normalized_embeddings
   rescue => e
     puts "Error in conversion: #{e.message}"
     puts "Shape before conversion: #{batch_embeddings.shape}"
     next
   end
 end
 
 # Combine all batches
 Torch.cat(embeddings_list, dim: 0)
end

# Usage
begin
 # Load ONNX model
 model = OnnxRuntime::Model.new('clip_image_text_encoder.onnx')
 
 # Process directory of images
 image_dir = 'house_data_png'
 image_paths = Dir[File.join(image_dir, '*')]
 puts "Found #{image_paths.length} images"
 
 # Get embeddings for first 100 images
 embeddings = get_image_embeddings(model, image_paths, limit: 100)
 puts "Final embeddings shape: #{embeddings.shape}"
 
rescue => e
 puts "Error: #{e.message}"
 puts e.backtrace
end

Found 5249 images
Processing 100 images...
Processing batch 1 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 2 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 3 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 4 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [4, 512]
Final embeddings shape: [100, 512]


In [2]:
embeddings

tensor([[ 0.0086, -0.0172,  0.0142,  ...,  0.0412, -0.0064,  0.0027],
        [ 0.0140,  0.0025,  0.0138,  ...,  0.0485,  0.0043,  0.0073],
        [-0.0073, -0.0085,  0.0317,  ...,  0.0320, -0.0004,  0.0363],
        ...,
        [ 0.0066, -0.0006,  0.0121,  ...,  0.0584, -0.0078, -0.0052],
        [ 0.0128, -0.0024,  0.0054,  ...,  0.0549, -0.0059, -0.0072],
        [-0.0009, -0.0002,  0.0173,  ...,  0.0421,  0.0040,  0.0087]])

In [3]:
# Function to find nearest neighbors
def find_nearest_neighbors(embeddings, query_index, k=5)
  # Get the query embedding
  query_embedding = embeddings[query_index]
  
  # Calculate similarities with all embeddings
  query_embedding = query_embedding.reshape(1, -1)  # Make it 2D: [1, embed_dim]
  
  # Calculate cosine similarity
  similarities = Torch.matmul(embeddings, query_embedding.t())
  
  # Get top k (excluding the first which is the query itself)
  values, indices = similarities.flatten.topk(k + 1)
  
  # Convert to Ruby arrays and remove the self-similarity
  similarity_scores = values.to_a[1..-1]  # Skip first one (self)
  neighbor_indices = indices.to_a[1..-1]  # Skip first one (self)
  
  return neighbor_indices, similarity_scores
end

# Save embeddings to file
def save_embeddings(embeddings, filename)
  File.open(filename, 'wb') do |file|
    Marshal.dump(embeddings.to_a, file)
  end
end

# Load embeddings from file
def load_embeddings(filename)
  embeddings_array = Marshal.load(File.read(filename))
  Torch.tensor(embeddings_array)
end

# Encode text to the same vector space as images
def embed_text(model, text)
  # Create simple tokenizer (77 is CLIP's context length)
  dummy_tokens = Numo::Int64.zeros(1, 77)
  
  # Get text embeddings
  outputs = model.predict({
    "image_input" => Numo::DFloat.zeros(1, 3, 224, 224),  # dummy image
    "text_input" => dummy_tokens
  })
  
  # Get and normalize text embeddings
  text_embedding = Torch.tensor(outputs["text_features"].to_a)
  text_embedding /= text_embedding.norm(2, -1, keepdim: true)
  text_embedding
end

# Search by text query
def search_by_text(model, embeddings, image_paths, query, k=5)
  # Get text embedding
  text_embedding = embed_text(model, query)
  
  # Calculate similarities
  similarities = Torch.matmul(embeddings, text_embedding.t())
  
  # Get top k results
  values, indices = similarities.flatten.topk(k)
  
  # Convert to Ruby arrays
  similarity_scores = values.to_a
  result_indices = indices.to_a
  
  # Get matching image paths
  result_paths = result_indices.map { |idx| image_paths[idx] }
  
  puts "\nTop #{k} matches for query: '#{query}'"
  result_paths.zip(similarity_scores).each_with_index do |(path, score), i|
    puts "#{i+1}. #{File.basename(path)} (similarity: #{score.round(3)})"
  end
  
  return result_paths, similarity_scores
end

# Search by image index
def search_by_image(embeddings, image_paths, index, k=5)
  neighbor_indices, similarity_scores = find_nearest_neighbors(embeddings, index, k)
  
  result_paths = neighbor_indices.map { |idx| image_paths[idx] }
  
  puts "\nTop #{k} similar images to #{File.basename(image_paths[index])}:"
  result_paths.zip(similarity_scores).each_with_index do |(path, score), i|
    puts "#{i+1}. #{File.basename(path)} (similarity: #{score.round(3)})"
  end
  
  return result_paths, similarity_scores
end

# Usage example:
begin
  # Load model and embeddings
  model = OnnxRuntime::Model.new('clip_image_text_encoder.onnx')
  
  # Either generate embeddings:
  image_paths = Dir[File.join('house_data_png', '*')]
  embeddings = get_image_embeddings(model, image_paths, limit: 100)
  
  # Or load pre-saved embeddings:
  # embeddings = load_embeddings('house_data_png.marshal')
  
  # Example searches
  search_by_text(model, embeddings, image_paths, "large kitchen island colonial")
  search_by_text(model, embeddings, image_paths, "white marble shower stall")
  
  # Search by image example
  search_by_image(embeddings, image_paths, 5)  # Changed from 505 to 5 since we limited to 100
  
rescue => e
  puts "Error: #{e.message}"
  puts e.backtrace
end

Processing 100 images...
Processing batch 1 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 2 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 3 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 4 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [4, 512]

Top 5 matches for query: 'large kitchen island colonial'
1. bath_114.png (similarity: 0.241)
2. bath_1150.png (similarity: 0.24)
3. bath_1013.png (similarity: 0.239)
4. bath_1169.png (similarity: 0.238)
5. bath_1059.png (similarity: 0.237)

Top 5 matches for query: 'white marble shower stall'
1. bath_114.png (similarity: 0.241)
2. bath_1150.png (similarity: 0.24)
3. bath_1013.png (similarity: 0.239)
4. bath_1169.png (similarity: 0.238)
5. bath_1059.png (similarity: 0.237)

Top 5 similar images to bath_1006.png:
1. bath_1111.png (similarity: 0.987)
2. bath_1149.png (similarity: 0.985)
3. bath_

[["house_data_png/bath_1111.png", "house_data_png/bath_1149.png", "house_data_png/bath_1141.png", "house_data_png/bath_1136.png", "house_data_png/bath_1163.png"], [0.9867925643920898, 0.9853135347366333, 0.9835642576217651, 0.9810572862625122, 0.980038046836853]]

In [4]:
require 'rmagick'
require 'base64'

# Function to display image in Jupyter notebook
def display_image(image_path)
  # Read image using RMagick
  img = Magick::Image.read(image_path).first
  
  # Resize if too large (optional)
  if img.columns > 500 || img.rows > 500
    img = img.resize_to_fit(500, 500)
  end
  
  # Convert to blob and base64
  blob = img.to_blob { self.format = 'JPEG' }
  base64_img = Base64.encode64(blob)
  
  # Display using IRuby
  IRuby.display IRuby.html "<img src='data:image/jpeg;base64,#{base64_img}'>"
end

# Modified search functions to display images
def search_by_text(model, embeddings, image_paths, query, k=5)
  text_embedding = embed_text(model, query)
  similarities = Torch.matmul(embeddings, text_embedding.t())
  values, indices = similarities.flatten.topk(k)
  
  similarity_scores = values.to_a
  result_indices = indices.to_a
  result_paths = result_indices.map { |idx| image_paths[idx] }
  
  puts "\nTop #{k} matches for query: '#{query}'"
  result_paths.zip(similarity_scores).each_with_index do |(path, score), i|
    puts "#{i+1}. #{File.basename(path)} (similarity: #{score.round(3)})"
    display_image(path)
  end
  
  return result_paths, similarity_scores
end

def search_by_image(embeddings, image_paths, index, k=5)
  puts "\nQuery image:"
  display_image(image_paths[index])
  
  neighbor_indices, similarity_scores = find_nearest_neighbors(embeddings, index, k)
  result_paths = neighbor_indices.map { |idx| image_paths[idx] }
  
  puts "\nTop #{k} similar images to #{File.basename(image_paths[index])}:"
  result_paths.zip(similarity_scores).each_with_index do |(path, score), i|
    puts "#{i+1}. #{File.basename(path)} (similarity: #{score.round(3)})"
    display_image(path)
  end
  
  return result_paths, similarity_scores
end

# Usage example:
begin
  # Load model and embeddings
  model = OnnxRuntime::Model.new('clip_image_text_encoder.onnx')
  
  # Generate or load embeddings
  image_paths = Dir[File.join('house_data_png', '*')]
  embeddings = get_image_embeddings(model, image_paths, limit: 100)
  
  # Example searches with image display
  puts "Searching for: 'large kitchen island colonial'"
  search_by_text(model, embeddings, image_paths, "large kitchen island colonial")
  
  puts "\nSearching for: 'white marble shower stall'"
  search_by_text(model, embeddings, image_paths, "white marble shower stall")
  
  puts "\nSearching by image index 5:"
  search_by_image(embeddings, image_paths, 5)
  
rescue => e
  puts "Error: #{e.message}"
  puts e.backtrace
end

Processing 100 images...
Processing batch 1 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 2 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 3 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [32, 512]
Processing batch 4 of 4
Batch embeddings type: Numo::DFloat
Batch embeddings shape: [4, 512]
Searching for: 'large kitchen island colonial'

Top 5 matches for query: 'large kitchen island colonial'
1. bath_114.png (similarity: 0.241)
Error: undefined method `format=' for #<Object:0x00007ffffaf83f98>
Did you mean?  format
(irb):14:in `block in display_image'
(irb):14:in `to_blob'
(irb):14:in `display_image'
(irb):34:in `block in search_by_text'
(irb):32:in `each'
(irb):32:in `each_with_index'
(irb):32:in `search_by_text'
(irb):67:in `<top (required)>'
/home/jovyan/.local/share/gem/ruby/3.1.0/gems/irb-1.6.2/lib/irb/workspace.rb:119:in `eval'
/home/jovyan/.local/share/gem/ruby/3.1.0/gems