# Lab 2 - Basic sentence transformer inference and similarity

In [1]:
require 'transformers-rb'
model = Transformers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
nil

In [2]:
# Example from https://sbert.net
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.'
]

sentence_embeddings = model.(sentences)
nil

In [3]:
sentence_embeddings[1]
nil

In [4]:
Torch.tensor(sentence_embeddings[1])
nil

In [5]:
Torch.tensor(sentence_embeddings[1]).unsqueeze(0)
nil

In [6]:
cos = Torch::NN::CosineSimilarity.new(dim: 1, eps: 1e-6)

# Convert arrays to tensors and reshape them
tensor0 = Torch.tensor(sentence_embeddings[0]).unsqueeze(0)
tensor1 = Torch.tensor(sentence_embeddings[1]).unsqueeze(0)

# Calculate cosine similarity
similarity = cos.call(tensor0, tensor1)

tensor([0.5381])

## Inference of a small dataset

In [7]:
puts `free -h`

              total        used        free      shared  buff/cache   available
Mem:          3.8Gi       2.4Gi        80Mi       100Mi       1.3Gi       989Mi
Swap:         1.0Gi       603Mi       420Mi


In [8]:
# See the model card here: https://huggingface.co/intfloat/e5-small-v2
# model = SentenceTransformer.new('intfloat/e5-small-v2')
model = Transformers.pipeline("embedding", "intfloat/e5-small-v2")
nil

In [9]:
# Should result in about 100MB less RAM available
puts `free -h`

              total        used        free      shared  buff/cache   available
Mem:          3.8Gi       2.6Gi        95Mi        99Mi       1.1Gi       737Mi
Swap:         1.0Gi       608Mi       415Mi


In [10]:
def get_embeddings(model, texts, prefix: "passage: ")
  puts texts
  texts = [texts] unless texts.is_a?(Array)
  total = texts.length
  embeddings = []
  
  texts.each_with_index do |text, i|
    prefixed_text = "#{prefix}#{text}"
    embedding = model.(prefixed_text)
    embeddings << embedding
    
    percent = ((i + 1).to_f / total * 100).to_i
    print "\rProcessing embeddings: #{percent}% (#{i + 1}/#{total})"
  end
  
  print "\nDone!\n"
  embeddings
end

:get_embeddings

In [11]:
test_e5 = get_embeddings(model,["Hello world"])

#stand in for test_e5.shape. Alternatively use nmatrix or some other library
puts "First dimension: #{test_e5.length}"
puts "Second dimension: #{test_e5[0].length}" if test_e5[0].is_a?(Array)

puts test_e5

Hello world
Processing embeddings: 100% (1/1)
Done!
First dimension: 1
Second dimension: 384
-0.01651151478290558
0.053172264248132706
0.022188758477568626
0.014594843611121178
-0.03386980667710304
0.01365770772099495
0.06086576730012894
-0.06649580597877502
0.025309480726718903
0.04438522085547447
0.045850373804569244
0.011839757673442364
-0.027528109028935432
0.05420251935720444
0.0230848491191864
-0.03905387222766876
0.022867225110530853
0.04260660335421562
-0.11509678512811661
0.0016917287139222026
0.0842922106385231
-0.05009948089718819
-0.020574478432536125
-0.039296966046094894
-0.05684807896614075
-0.028063228353857994
0.023119978606700897
0.02836981788277626
-0.03971661254763603
-0.11180952936410904
-0.056230854243040085
-0.011437352746725082
0.0563444122672081
-0.014556125737726688
0.06406951695680618
-0.06155061349272728
0.02405361644923687
0.04834815114736557
0.0012838798575103283
0.07727725803852081
-0.011544615961611271
-0.046061914414167404
0.07619121670722961
-0.0493257

### We use part of the CC_News dataset from Huggingface

In [12]:
require 'httparty'
require 'fileutils'

download_dir = "cc_news_data"

def download_file(url, destination_dir, filename=nil)
  FileUtils.mkdir_p(destination_dir)
  filename ||= File.basename(url)
  destination_path = File.join(destination_dir, filename)

  response = HTTParty.get(url, follow_redirects: true)  
  File.open(destination_path, 'wb') { |file| file.write(response.body) }

  puts "Downloaded file to #{destination_path}"
end

file_urls = [
  "https://huggingface.co/datasets/vblagoje/cc_news/resolve/main/plain_text/train-00000-of-00005.parquet"
]

file_urls.each do |url|
  download_file(url, download_dir)
end

Downloaded file to cc_news_data/train-00000-of-00005.parquet


["https://huggingface.co/datasets/vblagoje/cc_news/resolve/main/plain_text/train-00000-of-00005.parquet"]

In [13]:
require 'polars-df'
df = Polars.read_parquet('./cc_news_data/train-00000-of-00005.parquet',n_rows: 50_000)
nil

In [14]:
df

shape: (50_000, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ title        ┆ text        ┆ domain      ┆ date        ┆ description ┆ url         ┆ image_url   │
│ ---          ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
│ str          ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ Daughter Duo ┆ There's a   ┆ www.pointem ┆ 2017-12-11  ┆ There's a   ┆ http://www. ┆ https://poi │
│ is Dancing   ┆ surprising  ┆ agazine.com ┆ 20:19:05    ┆ surprising  ┆ pointemagaz ┆ nte-img.rbl │
│ in The…      ┆ twist to …  ┆             ┆             ┆ twist to …  ┆ ine.com/…   ┆ .ms/sima…   │
│ New York     ┆ The New     ┆ www.pointem ┆ 2017-12-11  ┆ NYCB has    ┆ http://www. ┆ https://poi │
│ City Ballet  ┆ York City   ┆ agazine.com ┆ 17:02:55    ┆ announced   ┆

In [35]:
# Load 50000 examples of the 'cc_news' dataset from Hugging Face
df = Polars.read_parquet('hf://datasets/vblagoje/cc_news/plain_text/train-*.parquet',n_rows:50_000)

shape: (50_000, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ title        ┆ text        ┆ domain      ┆ date        ┆ description ┆ url         ┆ image_url   │
│ ---          ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
│ str          ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ Daughter Duo ┆ There's a   ┆ www.pointem ┆ 2017-12-11  ┆ There's a   ┆ http://www. ┆ https://poi │
│ is Dancing   ┆ surprising  ┆ agazine.com ┆ 20:19:05    ┆ surprising  ┆ pointemagaz ┆ nte-img.rbl │
│ in The…      ┆ twist to …  ┆             ┆             ┆ twist to …  ┆ ine.com/…   ┆ .ms/sima…   │
│ New York     ┆ The New     ┆ www.pointem ┆ 2017-12-11  ┆ NYCB has    ┆ http://www. ┆ https://poi │
│ City Ballet  ┆ York City   ┆ agazine.com ┆ 17:02:55    ┆ announced   ┆

In [16]:

nil

shape: (50_000, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ title        ┆ text        ┆ domain      ┆ date        ┆ description ┆ url         ┆ image_url   │
│ ---          ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
│ str          ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ Daughter Duo ┆ There's a   ┆ www.pointem ┆ 2017-12-11  ┆ There's a   ┆ http://www. ┆ https://poi │
│ is Dancing   ┆ surprising  ┆ agazine.com ┆ 20:19:05    ┆ surprising  ┆ pointemagaz ┆ nte-img.rbl │
│ in The…      ┆ twist to …  ┆             ┆             ┆ twist to …  ┆ ine.com/…   ┆ .ms/sima…   │
│ New York     ┆ The New     ┆ www.pointem ┆ 2017-12-11  ┆ NYCB has    ┆ http://www. ┆ https://poi │
│ City Ballet  ┆ York City   ┆ agazine.com ┆ 17:02:55    ┆ announced   ┆

In [36]:
puts df['title']
title_array = df['title'].to_a
title_array2 = df2['title'].to_a
identical = title_array == title_array2
puts identical

shape: (50_000,)
Series: 'title' [str]
[
	"Daughter Duo is Dancing in The…
	"New York City Ballet Announces…
	"Watch Pennsylvania Ballet & Bo…
	"dance shoes"
	"Rebecca Krohn on Her Retiremen…
	…
	"New York City Expands Plan to …
	"1 Killed, 2 Sickened by Diseas…
	"Sleeping Man Stabbed on Subway…
	"Long Island Teen Stabbed Mothe…
	"Markets Close Down as Facebook…
]
true


In [33]:
def save_embedding(embedding, file)
  File.open(file, 'ab') do |f| 
    Marshal.dump(embedding, f)
  end
end

def each_embedding(file)
  return enum_for(:each_embedding, file) unless block_given?
  
  File.open(file, 'rb') do |f|
    begin
      while !f.eof?
        yield Marshal.load(f)
      end
    rescue EOFError
      # Break out if we hit end of file
    end
  end
end

def load_embeddings(file)
  embeddings = []
  each_embedding(file) do |embedding|
    embeddings << embedding
  end
  embeddings
end

def save_embeddings(model, texts, prefix: "passage: ", cache_file: "embeddings.cache", batch_size: 100)
  texts = [texts] unless texts.is_a?(Array)
  
  # Figure out how many embeddings are already in the cache
  start_idx = each_embedding(cache_file).count rescue 0
  
  # Slice the texts in sets of N (e.g., 100)
  total = texts.length
  texts[start_idx..].each_slice(batch_size).with_index do |batch, batch_idx|
    # Apply the prefix to each text in the batch
    prefixed_batch = batch.map { |t| "#{prefix}#{t}" }
    
    # Get embeddings for the entire batch at once (if your model supports batch calls)
    embeddings = model.(prefixed_batch)
    
    # Save each embedding
    embeddings.each do |embedding|
      save_embedding(embedding, cache_file)
    end
    
    # Progress indicator
    current = start_idx + batch_idx * batch_size + batch.size
    percent = (current.to_f / total * 100).to_i
    print "\rProcessing embeddings: #{percent}% (#{current}/#{total})"
  end
  
  print "\nDone!\n"
end

:save_embeddings

In [34]:
save_embeddings(model,title_array)

Processing embeddings: 100% (50000/50000)
Done!


In [20]:
title_embeddings = load_embeddings("embeddings.cache")
title_embeddings.first(1)

[[0.007776172831654549, 0.030636534094810486, 0.006429621949791908, 0.0032369415275752544, -0.006653294432908297, 0.08624327927827835, 0.0658121407032013, -0.06442847102880478, 0.010094518773257732, 0.038785673677921295, 0.08897559344768524, 0.0016825764905661345, -0.0065928008407354355, 0.0024186638183891773, 0.0528688058257103, -0.04490164667367935, -0.04365790635347366, 0.0727725476026535, -0.08364787697792053, 0.02291102148592472, 0.012090000323951244, -0.061756979674100876, -0.004014022182673216, -0.036743611097335815, 0.016573714092373848, -0.03042406216263771, 0.007106706965714693, 0.03227509185671806, -0.027671728283166885, -0.08136255294084549, -0.05101291090250015, 0.029846908524632454, -0.03366858884692192, 0.004739896859973669, 0.031140614300966263, -0.014164036139845848, -0.009502699598670006, 0.07173682004213333, -0.026918208226561546, 0.03813191130757332, -0.03274918720126152, -0.05567197874188423, 0.038645725697278976, -0.07432640343904495, -0.01664847694337368, 0.00750

In [21]:
require 'objspace'
# ObjectSpace.memsize_of_all(Array) # Size of all arrays
# ObjectSpace.memsize_of_all(Float) # Size of all floats

puts ObjectSpace.memsize_of(title_embeddings)

# 50000 embeddings at 384 dims each is how much in RAM?

11840


In [22]:
# File.open('cc_news_title_embeddings_50000.marshal', 'wb') do |file|
#     Marshal.dump(title_embeddings, file)
# end

In [23]:
# # NOTE that 50k embeddings of 384 dims each uses about 74MB pickled disk space
# system('ls -lah cc_news_title_embeddings_50000.marshal')

### brute-force nearest neighbor calculation

In [24]:
# def knn(query,model, k: 5)
#     query_embedding = get_embeddings(model, query, prefix: "query: ")
#     cosine_scores = STutil.cosine_similarity(query_embedding, title_embeddings)
#     sorted_indices = cosine_scores.sort_index(reverse: true)
#     top_k_indices = sorted_indices.first(k)
    
#     top_k_indices.map { |i| dataset['title'][i] }
# end

In [25]:
$dataset = df

def cosine_similarity(query_embedding,corpus_embedding)
    cos = Torch::NN::CosineSimilarity.new(dim: 1, eps: 1e-6)

    # Convert arrays to tensors and reshape them
    tensor0 = Torch.tensor(query_embedding)
    # puts 'after tensor 0'
    tensor1 = Torch.tensor(corpus_embedding)


    # Calculate cosine similarity
    similarity = cos.call(tensor0, tensor1)
end

def knn(query,model, k: 5)
    puts query
    query_embedding = get_embeddings(model, query, prefix: "query: ")
    # puts load_embeddings("embeddings.cache")[0][0]
    puts "query_embedding 1st number: #{query_embedding[0][0]}"
    puts "title_embeddings 1st number: #{load_embeddings("embeddings.cache")[0][0]}"
    cosine_scores = cosine_similarity(query_embedding, load_embeddings("embeddings.cache"))
    #sorted_indices = cosine_scores.sort_index(reverse: true)
    # Polars.read_parquet('hf://datasets/vblagoje/cc_news/plain_text/train-*.parquet').head(1000)

    # puts scores_series = Polars::Series.new("scores", cosine_scores.flatten.to_a)
    
    scores_series = Polars::Series.new("scores", cosine_scores.flatten.to_a)
    sorted_indices = scores_series.arg_sort(reverse: true).to_a  # Added reverse: true here
    top_k_indices = sorted_indices.first(k)  # Using k instead of hardcoded 5
    
    most_similar = top_k_indices.map { |i| $dataset['title'][i] }
end

:knn

In [26]:
knn("housing market",model)

housing market
housing market
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.032320182770490646
title_embeddings 1st number: 0.007776172831654549


["ভ্যালেন্টাইনের হাত ধরে গভীর রাতে জুহু বিচে শাহরুখ", "জেনের সঙ্গে আলাপ‚প্রেম থেকে বিয়ে – অকপট প্রীতি জিন্টা", "Real time energy financing and trading news", "Real time energy financing and trading news", "Real time energy financing and trading news"]

In [27]:
knn("property market",model)

property market
property market
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.04586941376328468
title_embeddings 1st number: 0.007776172831654549


["জেনের সঙ্গে আলাপ‚প্রেম থেকে বিয়ে – অকপট প্রীতি জিন্টা", "ভ্যালেন্টাইনের হাত ধরে গভীর রাতে জুহু বিচে শাহরুখ", "প্রাক্তন প্রেমিকা প্রিয়াঙ্কাকে মেরে ফেলতে চান শাহিদ!!", "Global GIS Market in Telecom Sector - Use of GIS and Big Data is an Emerging Trend in the Market", "Business Highlights"]

In [28]:
knn("ballet dancing changes",model)

ballet dancing changes
ballet dancing changes
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.057412199676036835
title_embeddings 1st number: 0.007776172831654549


["Ballet Performances This Week", "Roy Kaiser to Become Nevada Ballet Theatre's New Artistic Director", "Watch Pennsylvania Ballet & Boston Ballet Face Off for the Super Bowl", "Broadway's \"Carousel\" Stars Some Familiar Ballet Faces", "Rebecca Krohn on Her Retirement from New York City Ballet"]

In [29]:
knn("climate change",model)

climate change
climate change
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.06961619853973389
title_embeddings 1st number: 0.007776172831654549


["Global Forecast-Celsius", "In The Rockies, Climate Change Spells Trouble For Cutthroat Trout", "ভ্যালেন্টাইনের হাত ধরে গভীর রাতে জুহু বিচে শাহরুখ", "সোশ্যাল মিডিয়ায় ছবি দিয়ে কি বয়ফ্রেন্ডের কথাই সবাইকে জানাতে চাইছেন সোনম?", "জেনের সঙ্গে আলাপ‚প্রেম থেকে বিয়ে – অকপট প্রীতি জিন্টা"]

In [30]:
knn("global warming in the united states",model)

global warming in the united states
global warming in the united states
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.042729850858449936
title_embeddings 1st number: 0.007776172831654549


["Global Forecast-Celsius", "How Google Fiber turned 2017 into its comeback year", "জেনের সঙ্গে আলাপ‚প্রেম থেকে বিয়ে – অকপট প্রীতি জিন্টা", "Some of the deadliest mass shootings in modern US history", "সোশ্যাল মিডিয়ায় ছবি দিয়ে কি বয়ফ্রেন্ডের কথাই সবাইকে জানাতে চাইছেন সোনম?"]

In [31]:
knn("taylor swift",model)

taylor swift
taylor swift
Processing embeddings: 100% (1/1)
Done!
query_embedding 1st number: -0.032431963831186295
title_embeddings 1st number: 0.007776172831654549


["Taylor Swift Wins Sexual Assault Lawsuit Against Former Radio Host", "ভ্যালেন্টাইনের হাত ধরে গভীর রাতে জুহু বিচে শাহরুখ", "The top 5 iPhone X gestures every user should know", "Taylor Swift groper hired as a DJ at Mississippi station", "জেনের সঙ্গে আলাপ‚প্রেম থেকে বিয়ে – অকপট প্রীতি জিন্টা"]