# Lab 2 - Basic sentence transformer inference and similarity

In [3]:
require 'transformers-rb'
model = Transformers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
nil

In [4]:
# Example from https://sbert.net
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.'
]

sentence_embeddings = model.(sentences)
nil

In [39]:
#Calculate and print the similarity between all three example sentence embeddings
cos = Torch::NN::CosineSimilarity.new(dim: 1, eps: 1e-6)

# Convert arrays to tensors and reshape them
tensor0 = Torch.tensor(sentence_embeddings[0]).unsqueeze(0)
tensor1 = Torch.tensor(sentence_embeddings[1]).unsqueeze(0)
tensor2 = Torch.tensor(sentence_embeddings[2]).unsqueeze(0)

# Calculate cosine similarity
similarity01 = cos.call(tensor0, tensor1)
similarity02 = cos.call(tensor0, tensor2)
similarity12 = cos.call(tensor1, tensor2)

puts "#{similarity01} : '#{sentences[0]}' :: '#{sentences[1]}'"
puts "#{similarity02} : '#{sentences[0]}' :: '#{sentences[2]}'"
puts "#{similarity12} : '#{sentences[1]}' :: '#{sentences[2]}'"

tensor([0.5381]) : 'This framework generates embeddings for each input sentence' :: 'Sentences are passed as a list of string.'
tensor([0.1181]) : 'This framework generates embeddings for each input sentence' :: 'The quick brown fox jumps over the lazy dog.'
tensor([0.1036]) : 'Sentences are passed as a list of string.' :: 'The quick brown fox jumps over the lazy dog.'


## Inference of a small dataset

In [9]:
puts `free -h`

              total        used        free      shared  buff/cache   available
Mem:          3.8Gi       2.5Gi       677Mi        72Mi       688Mi       1.0Gi
Swap:         1.0Gi       1.0Gi       0.0Ki


In [10]:
# See the model card here: https://huggingface.co/intfloat/e5-small-v2
# model = SentenceTransformer.new('intfloat/e5-small-v2')
model = Transformers.pipeline("embedding", "intfloat/e5-small-v2")
nil

In [11]:
# Should result in about 100MB less RAM available
puts `free -h`

              total        used        free      shared  buff/cache   available
Mem:          3.8Gi       2.7Gi       362Mi        72Mi       817Mi       814Mi
Swap:         1.0Gi       1.0Gi       0.0Ki


In [12]:
def get_embeddings(model, texts, prefix: "passage: ")
  puts texts
  texts = [texts] unless texts.is_a?(Array)
  total = texts.length
  embeddings = []
  
  texts.each_with_index do |text, i|
    prefixed_text = "#{prefix}#{text}"
    embedding = model.(prefixed_text)
    embeddings << embedding
    
    percent = ((i + 1).to_f / total * 100).to_i
    print "\rProcessing embeddings: #{percent}% (#{i + 1}/#{total})"
  end
  
  print "\nDone!\n"
  embeddings
end

:get_embeddings

In [13]:
test_e5 = get_embeddings(model,["Hello world"])

#stand in for test_e5.shape. Alternatively use nmatrix or some other library
puts "First dimension: #{test_e5.length}"
puts "Second dimension: #{test_e5[0].length}" if test_e5[0].is_a?(Array)

puts test_e5

Hello world
Processing embeddings: 100% (1/1)
Done!
First dimension: 1
Second dimension: 384
-0.01651151478290558
0.053172264248132706
0.022188758477568626
0.014594843611121178
-0.03386980667710304
0.01365770772099495
0.06086576730012894
-0.06649580597877502
0.025309480726718903
0.04438522085547447
0.045850373804569244
0.011839757673442364
-0.027528109028935432
0.05420251935720444
0.0230848491191864
-0.03905387222766876
0.022867225110530853
0.04260660335421562
-0.11509678512811661
0.0016917287139222026
0.0842922106385231
-0.05009948089718819
-0.020574478432536125
-0.039296966046094894
-0.05684807896614075
-0.028063228353857994
0.023119978606700897
0.02836981788277626
-0.03971661254763603
-0.11180952936410904
-0.056230854243040085
-0.011437352746725082
0.0563444122672081
-0.014556125737726688
0.06406951695680618
-0.06155061349272728
0.02405361644923687
0.04834815114736557
0.0012838798575103283
0.07727725803852081
-0.011544615961611271
-0.046061914414167404
0.07619121670722961
-0.0493257

### We use part of the CC_News dataset from Huggingface

In [14]:
# Load 50000 examples of the 'cc_news' dataset from Hugging Face
require 'polars-df'
df = Polars.read_parquet('hf://datasets/vblagoje/cc_news/plain_text/train-*.parquet',n_rows:50_000)

shape: (50_000, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ title        ┆ text        ┆ domain      ┆ date        ┆ description ┆ url         ┆ image_url   │
│ ---          ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
│ str          ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ Daughter Duo ┆ There's a   ┆ www.pointem ┆ 2017-12-11  ┆ There's a   ┆ http://www. ┆ https://poi │
│ is Dancing   ┆ surprising  ┆ agazine.com ┆ 20:19:05    ┆ surprising  ┆ pointemagaz ┆ nte-img.rbl │
│ in The…      ┆ twist to …  ┆             ┆             ┆ twist to …  ┆ ine.com/…   ┆ .ms/sima…   │
│ New York     ┆ The New     ┆ www.pointem ┆ 2017-12-11  ┆ NYCB has    ┆ http://www. ┆ https://poi │
│ City Ballet  ┆ York City   ┆ agazine.com ┆ 17:02:55    ┆ announced   ┆

In [15]:
puts df['title']
title_array = df['title'].to_a
nil

shape: (50_000,)
Series: 'title' [str]
[
	"Daughter Duo is Dancing in The…
	"New York City Ballet Announces…
	"Watch Pennsylvania Ballet & Bo…
	"dance shoes"
	"Rebecca Krohn on Her Retiremen…
	…
	"New York City Expands Plan to …
	"1 Killed, 2 Sickened by Diseas…
	"Sleeping Man Stabbed on Subway…
	"Long Island Teen Stabbed Mothe…
	"Markets Close Down as Facebook…
]


In [16]:
def save_embedding(embedding, file)
  File.open(file, 'ab') do |f| 
    Marshal.dump(embedding, f)
  end
end

def each_embedding(file)
  return enum_for(:each_embedding, file) unless block_given?
  
  File.open(file, 'rb') do |f|
    begin
      while !f.eof?
        yield Marshal.load(f)
      end
    rescue EOFError
      # Break out if we hit end of file
    end
  end
end

def load_embeddings(file)
  embeddings = []
  each_embedding(file) do |embedding|
    embeddings << embedding
  end
  embeddings
end

def save_embeddings(model, texts, prefix: "passage: ", cache_file: "embeddings.cache", batch_size: 100)
  texts = [texts] unless texts.is_a?(Array)
  
  # Figure out how many embeddings are already in the cache
  start_idx = each_embedding(cache_file).count rescue 0
  
  # Slice the texts in sets of N (e.g., 100)
  total = texts.length
  texts[start_idx..].each_slice(batch_size).with_index do |batch, batch_idx|
    # Apply the prefix to each text in the batch
    prefixed_batch = batch.map { |t| "#{prefix}#{t}" }
    
    # Get embeddings for the entire batch at once (if your model supports batch calls)
    embeddings = model.(prefixed_batch)
    
    # Save each embedding
    embeddings.each do |embedding|
      save_embedding(embedding, cache_file)
    end
    
    # Progress indicator
    current = start_idx + batch_idx * batch_size + batch.size
    percent = (current.to_f / total * 100).to_i
    print "\rProcessing embeddings: #{percent}% (#{current}/#{total})"
  end
  
  print "\nDone!\n"
end

:save_embeddings

In [17]:
# This will take about 24 hours!  We're actively researching how to speed this up
# For comparison, Python transformers will take about 5 minutes on the same machine
#save_embeddings(model,title_array)

In [18]:
#Embeddings for 50_000 titles is about 500MB cached - make sure you have enough docker RAM
$title_embeddings = load_embeddings("embeddings.cache")
$title_embeddings.first(1)

[[0.007776172831654549, 0.030636534094810486, 0.006429621949791908, 0.0032369415275752544, -0.006653294432908297, 0.08624327927827835, 0.0658121407032013, -0.06442847102880478, 0.010094518773257732, 0.038785673677921295, 0.08897559344768524, 0.0016825764905661345, -0.0065928008407354355, 0.0024186638183891773, 0.0528688058257103, -0.04490164667367935, -0.04365790635347366, 0.0727725476026535, -0.08364787697792053, 0.02291102148592472, 0.012090000323951244, -0.061756979674100876, -0.004014022182673216, -0.036743611097335815, 0.016573714092373848, -0.03042406216263771, 0.007106706965714693, 0.03227509185671806, -0.027671728283166885, -0.08136255294084549, -0.05101291090250015, 0.029846908524632454, -0.03366858884692192, 0.004739896859973669, 0.031140614300966263, -0.014164036139845848, -0.009502699598670006, 0.07173682004213333, -0.026918208226561546, 0.03813191130757332, -0.03274918720126152, -0.05567197874188423, 0.038645725697278976, -0.07432640343904495, -0.01664847694337368, 0.00750

In [19]:
require 'objspace'
# ObjectSpace.memsize_of_all(Array) # Size of all arrays
# ObjectSpace.memsize_of_all(Float) # Size of all floats

puts ObjectSpace.memsize_of($title_embeddings)

# 50000 embeddings at 384 dims each is how much in RAM?

454056


### brute-force nearest neighbor calculation

In [20]:
$dataset = df

shape: (50_000, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ title        ┆ text        ┆ domain      ┆ date        ┆ description ┆ url         ┆ image_url   │
│ ---          ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
│ str          ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ Daughter Duo ┆ There's a   ┆ www.pointem ┆ 2017-12-11  ┆ There's a   ┆ http://www. ┆ https://poi │
│ is Dancing   ┆ surprising  ┆ agazine.com ┆ 20:19:05    ┆ surprising  ┆ pointemagaz ┆ nte-img.rbl │
│ in The…      ┆ twist to …  ┆             ┆             ┆ twist to …  ┆ ine.com/…   ┆ .ms/sima…   │
│ New York     ┆ The New     ┆ www.pointem ┆ 2017-12-11  ┆ NYCB has    ┆ http://www. ┆ https://poi │
│ City Ballet  ┆ York City   ┆ agazine.com ┆ 17:02:55    ┆ announced   ┆

In [21]:
$title_tensor = Torch.tensor($title_embeddings)
def cosine_similarity(query_embedding)
    cos = Torch::NN::CosineSimilarity.new(dim: 1, eps: 1e-6)

    # Convert arrays to tensors and reshape them
    query_tensor = Torch.tensor(query_embedding)

    # Calculate cosine similarity
    similarity = cos.call(query_tensor, $title_tensor)
end

:cosine_similarity

In [31]:
def knn(query,model, k: 5)
    query_embedding = get_embeddings(model, query, prefix: "query: ")
    puts "query_embedding 1st number: #{query_embedding[0][0]}"
    puts "title_embeddings 1st number: #{$title_embeddings[0][0]}"
    
    #Score the relation between the query embedding and all title embeddings
    start_time = Time.now
    cosine_scores = cosine_similarity(query_embedding).flatten.to_a
    scores_series = Polars::Series.new("scores", cosine_scores)
    
    # Reverse sort (descending will yield the nearest neighbors on top:
    sorted_indices = scores_series.arg_sort(reverse: true).to_a
    top_k_indices = sorted_indices.first(k)  # Using k instead of hardcoded 5
    
    #How long did it take?
    end_time = Time.now
    elapsed_time = (end_time - start_time) * 1000.0
    
    #Print out the top K titles and the scores
    most_similar = top_k_indices.map { |i| "#{cosine_scores[i]} | #{$dataset['title'][i]}" }
    most_similar.each do |string|
      puts string
    end
    puts "Took: #{elapsed_time.round(2)} ms" 
    nil
end

:knn

In [32]:
knn("housing market",model)

housing market
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.032320182770490646
title_embeddings 1st number: 0.007776172831654549
0.870964527130127 | Feb. home prices soar 6.3 pct in a fierce competition to buy - WAFB 9 News Baton Rouge, Louisiana News, Weather, Sports
0.8708667755126953 | Given strong Tampa Bay housing market, is it better to rent or buy?
0.8671011924743652 | Prices and homes listed up, sales down in Metro Vancouver real estate market
0.8559203147888184 | US sales of new homes shot up 4 percent in March - WAFB 9 News Baton Rouge, Louisiana News, Weather, Sports
0.8558682799339294 | After the bubble burst: How homebuying fared in Huntington Beach, Fountain Valley, Garden Grove, Westminster
Took: 1417.0 ms


In [33]:
knn("property market",model)

property market
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.04586941376328468
title_embeddings 1st number: 0.007776172831654549
0.8427945971488953 | Prices and homes listed up, sales down in Metro Vancouver real estate market
0.8416168689727783 | Given strong Tampa Bay housing market, is it better to rent or buy?
0.837308406829834 | Feb. home prices soar 6.3 pct in a fierce competition to buy - WAFB 9 News Baton Rouge, Louisiana News, Weather, Sports
0.8327614068984985 | What’s On Listings 31 May-6 Jun
0.8317294716835022 | The most ‘ingenious’ real estate deals in New York City
Took: 1386.99 ms


In [34]:
knn("ballet dancing changes",model)

ballet dancing changes
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.057412199676036835
title_embeddings 1st number: 0.007776172831654549
0.8435789942741394 | Ballet Performances This Week
0.8329176902770996 | Your weekly London dance guide
0.8302448391914368 | Dance: Performance and participatory
0.8302448391914368 | Dance: Performance and participatory
0.8275549411773682 | american ballet theatre
Took: 1405.54 ms


In [35]:
knn("climate change",model)

climate change
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.06961619853973389
title_embeddings 1st number: 0.007776172831654549
0.8812720775604248 | Cosmic evidence for climate change?
0.8690364360809326 | EPA moves to rescind Obama plan to slow global warming - wave3.com-Louisville News, Weather & Sports
0.8656370043754578 | A century on and climate 'still changing like the weather'
0.8636171817779541 | The Latest: Climate activists protest at intl climate summit
0.8615657091140747 | Green cash, carbon tax: What to expect at Paris climate meet
Took: 1414.34 ms


In [36]:
knn("global warming in the united states",model)

global warming in the united states
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.042729850858449936
title_embeddings 1st number: 0.007776172831654549
0.8607496619224548 | What U.S. cities have the most solar power? New report lays it out
0.8567557334899902 | EPA moves to rescind Obama plan to slow global warming - wave3.com-Louisville News, Weather & Sports
0.8529138565063477 | What Does “Climate-Smart Agriculture” Really Mean? New Tool Breaks It Down
0.8508168458938599 | Sea level rise could send U.S. 'climate migrants' fleeing to Austin, Atlanta
0.8501375317573547 | Global Forecast-Fahrenheit
Took: 1410.7 ms


In [37]:
knn("taylor swift",model)

taylor swift
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.032431963831186295
title_embeddings 1st number: 0.007776172831654549
0.8544883131980896 | The Latest: Taylor Swift says she hopes to help others - wave3.com-Louisville News, Weather & Sports
0.8531742691993713 | Taylor Swift Cries During Closing Arguments in Alleged Groping Trial as Her Lawyer Implores 'No Means No'
0.8523499965667725 | Taylor Swift to be awarded symbolic $1 as jury sides with her in groping trial
0.849186897277832 | Meet the Taylor Swift lookalike fooling even the biggest fans
0.8463172912597656 | Jurors weighing case over Taylor Swift's groping allegation - wave3.com-Louisville News, Weather & Sports
Took: 1382.88 ms
