# Lab 7 - Multimodal search with CLIP

In [None]:
require 'fileutils'

require 'httparty'
require 'zip'
require 'mini_magick'
require 'torch-rb'
require 'open_clip'
require 'nearest_neighbors'
require 'tqdm'

## Get the image dataset (interiors of houses)

- Source: https://www.kaggle.com/datasets/mikhailma/house-rooms-streets-image-dataset/data
- Cached: https://max.io/house_data_png.zip (resized to 256x256 and converted to PNG)
- License: CC-0 Public Domain

In [None]:
# Downloads a zip archive over HTTP and unpacks every entry beneath +extract_to+.
#
# The response body is handed to Zip::File.open_buffer as a whole, so the
# archive is read from memory rather than from a temporary file.
#
# @param url [String] location of the zip archive
# @param extract_to [String] directory that receives the extracted entries
# @raise [RuntimeError] if the HTTP request does not succeed, or if an entry
#   name would resolve to a path outside +extract_to+
def download_and_extract_zip(url, extract_to: '.')
    puts "Downloading and extracting #{url}"
    response = HTTParty.get(url)
    # Fail early with a clear message instead of feeding an error page to the zip reader.
    raise "Download of #{url} failed with HTTP status #{response.code}" unless response.success?

    root = File.expand_path(extract_to)
    root_prefix = root.end_with?(File::SEPARATOR) ? root : root + File::SEPARATOR

    Zip::File.open_buffer(response.body) do |zip_file|
        zip_file.each do |entry|
            destination = File.expand_path(entry.name, root)
            # Entry names come from the archive; refuse names such as "../x" that escape the extraction root.
            unless destination == root || destination.start_with?(root_prefix)
                raise "Refusing to extract #{entry.name}: it resolves outside #{extract_to}"
            end

            # The archive may list a file before (or without) its directory entry.
            FileUtils.mkdir_p(File.dirname(destination))
            entry.extract(destination)
        end
    end
end

# Fetch the cached dataset archive, unpack it, and collect the path of
# everything found directly inside the extracted image directory.
url = "https://max.io/house_data_png.zip"
image_dir = 'house_data_png'
download_and_extract_zip(url)
image_paths = Dir.glob(File.join(image_dir, '*'))
puts "Extracted #{image_paths.size} images"

In [None]:
# Create the 'ViT-B-32' CLIP model (pretrained: 'openai') together with the
# two transforms returned alongside it
clip_parts = OpenClip.create_model_and_transforms('ViT-B-32', pretrained: 'openai')
model, transform, preprocess = clip_parts

# Call eval on the model, then print its architecture
model.eval
puts model

In [None]:
# Embeds images with the model's image encoder, one batch at a time.
#
# Defined with define_singleton_method rather than `def`: a `def` body starts a
# fresh local scope and cannot see the top-level `model` and `transform`
# variables, so calling the `def` version raised NameError. The block closes
# over them, and callers still write get_image_embeddings(paths, batch_size: n).
#
# @param image_paths [Array<String>] paths of the images to embed
# @param batch_size [Integer] number of images passed to the model per call
# @return the per-batch embeddings combined with Torch.vstack, in the order of +image_paths+
define_singleton_method(:get_image_embeddings) do |image_paths, batch_size: 32|
    # NOTE(review): torch-rb documents this check as Torch::CUDA.available? — confirm `Torch.cuda` exists here.
    device = Torch.cuda.available? ? 'cuda' : 'cpu'

    embeddings = []
    image_paths.each_slice(batch_size) do |batch_paths|
        batch_images = batch_paths.map do |path|
            img = MiniMagick::Image.open(path)
            # Add a leading dimension so the per-image tensors can be stacked into one batch
            transform.call(img).unsqueeze(0)
        end

        # Stack the batch and move it to the selected device
        batch_images_tensor = Torch.vstack(batch_images).to(device)

        # Run the encoder inside Torch.no_grad
        Torch.no_grad do
            embeddings.push(model.encode_image(batch_images_tensor))
        end
    end

    # Concatenate all batch results
    Torch.vstack(embeddings)
end

In [None]:
# Embed every path in image_paths, 32 images per batch
image_embeddings = get_image_embeddings(image_paths, batch_size: 32)

In [None]:
# Normalization is required: divide each embedding by its norm along the last dimension
embedding_norms = image_embeddings.norm(dim: -1, keepdim: true)
image_embeddings = image_embeddings / embedding_norms

In [None]:
# Serialize the embeddings with Marshal and write them to disk in binary mode
serialized_embeddings = Marshal.dump(image_embeddings.cpu.numpy)
File.binwrite('house_data_png.marshal', serialized_embeddings)

In [None]:
# Print the number of embeddings followed by the shape of the first one
embedding_count = image_embeddings.length
embedding_shape = image_embeddings[0].shape
puts "#{embedding_count} #{embedding_shape}"

In [None]:
# Encodes the text to the same vector space as the images.
#
# Defined with define_singleton_method rather than `def`: a `def` body starts a
# fresh local scope and cannot see the top-level `model` variable, so calling
# the `def` version raised NameError. The block closes over it, and callers
# still write embed_text(text).
#
# @param text [String] the query to embed
# @return the encoded text features divided by their norm along the last dimension
define_singleton_method(:embed_text) do |text|
    tokens = OpenClip::Tokenizer.tokenize([text])
    Torch.no_grad do
        text_features = model.encode_text(tokens)
        # Normalization is required!
        text_features / text_features.norm(dim: -1, keepdim: true)
    end
end

In [None]:
# Shows each image inline, followed by a line with its distance.
#
# @param image_paths [Array<String>] images to show, in display order
# @param distances [#[]] distance for each image, looked up by the image's position
def display_images(image_paths, distances)
    image_paths.each.with_index do |image_path, position|
        rendered = IRuby.image(image_path)
        IRuby.display(rendered)
        distance = distances[position]
        puts "👆 #{distance}"
    end
end

In [None]:
# Build the nearest-neighbour index used for text queries:
# 10 neighbours per query, cosine metric, fitted on the image embeddings
embedding_matrix = image_embeddings.cpu.numpy
nbrs = NearestNeighbors.new(n_neighbors: 10, metric: 'cosine')
nbrs.fit(embedding_matrix)

# Finds the images nearest to a text query and displays them with their distances.
#
# Defined with define_singleton_method rather than `def`: a `def` body starts a
# fresh local scope and cannot see the top-level `nbrs` and `image_paths`
# variables, so calling the `def` version raised NameError. The block closes
# over them, and callers still write search('some text').
#
# @param text [String] the query text
# @return whatever display_images returns
define_singleton_method(:search) do |text|
    text_embedding = embed_text(text)
    distances, indices = nbrs.kneighbors(text_embedding.cpu.numpy)
    # Only the first row of each result is used (one query text is embedded)
    nearest_images = indices[0].map { |i| image_paths[i] }
    display_images(nearest_images, distances[0])
end

In [None]:
# Example query describing a kitchen
search('large kitchen island colonial')

In [None]:
# Example query describing a bathroom
search('white marble shower stall')

In [None]:
# NOTE(review): presumably an out-of-domain query for a house-interior dataset — confirm intent
search('red ferrari')

In [None]:
# NOTE(review): presumably another out-of-domain query — confirm intent
search('nuclear reactor')