# Lab 2 - Basic sentence transformer inference and similarity

In [None]:
# Reset the embeddings cache: remove any existing cache file, then run the helper
# script (presumably it recreates embeddings.cache — confirm in prepare_embeddings_cache.sh).
# NOTE(review): neither return value is checked, so a failing command does not stop the notebook here.
system("rm embeddings.cache")
system("sh prepare_embeddings_cache.sh")

In [None]:
require 'transformers-rb'
model = Transformers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Example from https://sbert.net
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.'
]

sentence_embeddings = model.(sentences)

In [None]:
# Calculate and print the cosine similarity between each pair of the three example sentence embeddings.
# dim: 1 selects the axis the similarity is computed along; eps is the small constant
# PyTorch-style cosine similarity uses to avoid dividing by zero.
cos = Torch::NN::CosineSimilarity.new(dim: 1, eps: 1e-6)

# Convert each embedding to a tensor and add a leading axis with unsqueeze(0)
# (assumes each embedding is a flat array of floats, giving a 1 x n tensor — TODO confirm).
tensor0 = Torch.tensor(sentence_embeddings[0]).unsqueeze(0)
tensor1 = Torch.tensor(sentence_embeddings[1]).unsqueeze(0)
tensor2 = Torch.tensor(sentence_embeddings[2]).unsqueeze(0)

# Calculate cosine similarity for every pair: (0,1), (0,2), (1,2)
similarity01 = cos.call(tensor0, tensor1)
similarity02 = cos.call(tensor0, tensor2)
similarity12 = cos.call(tensor1, tensor2)

# Print each score next to the two sentences it compares.
puts "#{similarity01} : '#{sentences[0]}' :: '#{sentences[1]}'"
puts "#{similarity02} : '#{sentences[0]}' :: '#{sentences[2]}'"
puts "#{similarity12} : '#{sentences[1]}' :: '#{sentences[2]}'"

## Inference of a small dataset

In [None]:
puts `free -h`

In [None]:
# See the model card here: https://huggingface.co/intfloat/e5-small-v2
model = Transformers.pipeline("embedding", "intfloat/e5-small-v2")

In [None]:
# Should result in about 100MB less RAM available
puts `free -h`

In [None]:
# Embed texts one at a time, printing a progress line after each one.
#
# @param model [#call] callable invoked once per prefixed text
# @param texts [Array, Object] the texts to embed; a non-Array value is treated as a single text
# @param prefix [String] string prepended to every text before it is handed to the model
# @return [Array] one model result per text, in input order
def get_embeddings(model, texts, prefix: "passage: ")
  # Echo the raw input before anything else happens.
  puts texts
  inputs = texts.is_a?(Array) ? texts : [texts]
  count = inputs.length

  vectors = inputs.each_with_index.map do |item, idx|
    vector = model.("#{prefix}#{item}")

    # "\r" rewinds to the start of the line so the progress text overwrites itself.
    done = idx + 1
    pct = (done.to_f / count * 100).to_i
    print "\rProcessing embeddings: #{pct}% (#{done}/#{count})"

    vector
  end

  print "\nDone!\n"
  vectors
end

In [None]:
# Smoke-test get_embeddings on a single input.
test_e5 = get_embeddings(model,["Hello world"])

# Stand-in for test_e5.shape: the result is a plain Ruby array, so print the length of
# each nesting level instead. Alternatively, use nmatrix or some other library.
puts "First dimension: #{test_e5.length}"
puts "Second dimension: #{test_e5[0].length}" if test_e5[0].is_a?(Array)

# Dump the raw embedding values.
puts test_e5

### We use part of the CC_News dataset from Hugging Face

In [None]:
# Load 50000 examples of the 'cc_news' dataset from Hugging Face
require 'polars-df'
df = Polars.read_parquet('hf://datasets/vblagoje/cc_news/plain_text/train-*.parquet',n_rows:50_000)

In [None]:
puts df['title']
title_array = df['title'].to_a

In [None]:
# Append a single embedding to the cache file as one Marshal record.
#
# @param embedding [Object] the value to serialize
# @param file [String] path of the cache file; opened in binary append mode
def save_embedding(embedding, file)
  File.open(file, 'ab') { |io| Marshal.dump(embedding, io) }
end

# Yield every Marshal record stored in a cache file, in the order it was written.
# Without a block, returns an Enumerator over the same records.
#
# NOTE: Marshal.load must only be used on trusted data — do not point this at a
# cache file from an untrusted source.
#
# @param file [String] path of the cache file to read
# @yield [embedding] each deserialized record
def each_embedding(file)
  return to_enum(:each_embedding, file) unless block_given?

  File.open(file, 'rb') do |io|
    yield Marshal.load(io) until io.eof?
  rescue EOFError
    # Running off the end of the file simply ends the iteration.
    nil
  end
end

# Read every embedding from a cache file into memory.
#
# @param file [String] path of the cache file
# @return [Array] all records yielded by each_embedding, in file order
def load_embeddings(file)
  each_embedding(file).to_a
end

# Embed texts in batches and append every embedding to a cache file, resuming
# after however many embeddings the cache already contains.
#
# @param model [#call] callable that receives an Array of prefixed texts and
#   returns one embedding per text
# @param texts [Array, Object] texts to embed; a non-Array value is treated as a single text
# @param prefix [String] string prepended to every text before it is handed to the model
# @param cache_file [String] path of the cache file written via save_embedding
# @param batch_size [Integer] number of texts passed to the model per call
# @return [nil]
# @raise whatever each_embedding raises for an unreadable cache, other than a missing file
def save_embeddings(model, texts, prefix: "passage: ", cache_file: "embeddings.cache", batch_size: 100)
  texts = [texts] unless texts.is_a?(Array)

  # Figure out how many embeddings are already in the cache. Only a missing
  # cache file means "start from scratch"; any other failure (e.g. a corrupt
  # cache) is raised instead of silently restarting at 0 and appending duplicates.
  start_idx = begin
    each_embedding(cache_file).count
  rescue Errno::ENOENT
    0
  end

  total = texts.length

  # drop yields [] when the cache already holds at least as many entries as
  # texts, whereas texts[start_idx..] would be nil once start_idx > total.
  texts.drop(start_idx).each_slice(batch_size).with_index do |batch, batch_idx|
    # Apply the prefix to each text in the batch
    prefixed_batch = batch.map { |t| "#{prefix}#{t}" }

    # Get embeddings for the entire batch in one model call
    embeddings = model.(prefixed_batch)

    # Persist right away so an interrupted run can resume from the cache
    embeddings.each do |embedding|
      save_embedding(embedding, cache_file)
    end

    # Progress indicator ("\r" makes the line overwrite itself)
    current = start_idx + batch_idx * batch_size + batch.size
    percent = (current.to_f / total * 100).to_i
    print "\rProcessing embeddings: #{percent}% (#{current}/#{total})"
  end

  print "\nDone!\n"
end

In [None]:
# This will take about 24 hours on Mac M1 due to docker/virtualization issues!  Takes 5 minutes when not in Docker
#save_embeddings(model,title_array)

In [None]:
# Embeddings for 50_000 titles take up about 500MB when cached - make sure Docker has enough RAM.
# Stored in a global because the methods defined in this notebook read it, and methods cannot see top-level locals.
$title_embeddings = load_embeddings("embeddings.cache")
# Show just the first embedding rather than the whole array.
$title_embeddings.first(1)

In [None]:
require 'objspace'

# NOTE(review): ObjectSpace.memsize_of reports the size of the outer Array object only;
# it does not follow references into the nested embedding arrays, so this number will be
# far smaller than the total memory the embeddings occupy — confirm whether a deep size was intended.
puts ObjectSpace.memsize_of($title_embeddings)

### Brute-force nearest neighbor calculation

In [None]:
$dataset = df

In [None]:
# Convert all cached title embeddings into a single tensor once; cosine_similarity reads this global.
$title_tensor = Torch.tensor($title_embeddings)
# Score a query embedding against the title embeddings held in $title_tensor.
#
# @param query_embedding [Array] query embedding as numeric array(s), converted with Torch.tensor
#   (presumably 1 x d as returned by get_embeddings for a single query — TODO confirm)
# @return the result of Torch::NN::CosineSimilarity (dim: 1, eps: 1e-6) applied to the
#   query tensor and $title_tensor
def cosine_similarity(query_embedding)
  scorer = Torch::NN::CosineSimilarity.new(dim: 1, eps: 1e-6)
  scorer.call(Torch.tensor(query_embedding), $title_tensor)
end

In [None]:
# Print the k titles whose embeddings are most similar to a query, along with
# their scores and how long the scoring and ranking took.
#
# Reads the globals $title_embeddings and $dataset directly, and relies on
# get_embeddings and cosine_similarity.
#
# @param query [String] search text; embedded via get_embeddings with the "query: " prefix
# @param model [#call] embedding model passed through to get_embeddings
# @param k [Integer] number of nearest neighbors to print
# @return [nil] results are only written to stdout
def knn(query, model, k: 5)
  query_embedding = get_embeddings(model, query, prefix: "query: ")
  puts "query_embedding 1st number: #{query_embedding[0][0]}"
  puts "title_embeddings 1st number: #{$title_embeddings[0][0]}"

  # Score the relation between the query embedding and all title embeddings.
  # Durations are taken from the monotonic clock so that wall-clock adjustments
  # cannot distort (or even negate) the measured time.
  started_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
  cosine_scores = cosine_similarity(query_embedding).flatten.to_a
  scores_series = Polars::Series.new("scores", cosine_scores)

  # Reverse (descending) sort puts the nearest neighbors on top
  top_k_indices = scores_series.arg_sort(reverse: true).to_a.first(k)

  # How long did it take?
  elapsed_time = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond) - started_ms

  # Print out the top K titles and the scores; fetch the title column once
  # instead of once per result.
  titles = $dataset['title']
  top_k_indices.each do |i|
    puts "#{cosine_scores[i]} | #{titles[i]}"
  end
  puts "Took: #{elapsed_time.round(2)} ms"
  nil
end

In [None]:
knn("housing market",model)

In [None]:
knn("property market",model)

In [None]:
knn("ballet dancing changes",model)

In [None]:
knn("climate change",model)

In [None]:
knn("global warming in the united states",model)

In [None]:
knn("taylor swift",model)