# Lab 2 - Basic sentence transformer inference and similarity

In [3]:
require 'transformers-rb'
model = Transformers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
nil

In [4]:
# Sample sentences taken from https://sbert.net
sentences = [
  'This framework generates embeddings for each input sentence',
  'Sentences are passed as a list of string.',
  'The quick brown fox jumps over the lazy dog.'
]

# Run all sentences through the embedding pipeline in a single call.
sentence_embeddings = model.call(sentences)
# The trailing nil makes this cell evaluate to nil rather than to the embeddings.
nil

In [8]:
sentence_embeddings[1]
nil

In [9]:
Torch.tensor(sentence_embeddings[1])
nil

In [11]:
Torch.tensor(sentence_embeddings[1]).unsqueeze(0)
nil

In [13]:
# Turn the first two embeddings into tensors, each with a leading dimension
# added by unsqueeze(0), so the comparison below runs along dim 1.
tensor0, tensor1 = [0, 1].map do |index|
  Torch.tensor(sentence_embeddings[index]).unsqueeze(0)
end

# Cosine similarity module comparing along dim 1.
cos = Torch::NN::CosineSimilarity.new(dim: 1, eps: 1e-6)

# Score the pair; as the last expression, the result is the cell's value.
similarity = cos.call(tensor0, tensor1)

tensor([0.5381])

## Inference on a small dataset

In [18]:
require 'tqdm'
require 'datasets'

LoadError: cannot load such file -- tqdm

In [None]:
system('free -h')

In [None]:
# See the model card here: https://huggingface.co/intfloat/e5-small-v2
model = SentenceTransformer.new('intfloat/e5-small-v2')

In [None]:
# Should result in about 100MB less RAM available
system('free -h')

In [None]:
# Encodes one text or an array of texts with `model`, prepending the given
# prefix to every text first.
#
# The E5 models expect either a 'query: ' or a 'passage: ' prefix.
#
# NOTE(review): `model` is not a local inside this method; Ruby resolves it as
# a method call on self, so a top-level `model = ...` local is not visible
# here. Confirm how `model` is meant to be provided.
#
# @param texts [Array, Object] an array of texts, or a single text (which is
#   wrapped into a one-element array)
# @param prefix [String] prefix placed in front of every text
# @return whatever `model.encode` returns for the prefixed texts
def get_embeddings(texts, prefix: "passage: ")
  batch = texts.is_a?(Array) ? texts : [texts]
  inputs = batch.map do |entry|
    "#{prefix}#{entry}"
  end
  model.encode(inputs, show_progress_bar: true)
end

In [None]:
test_e5 = get_embeddings(["Hello world"])
puts test_e5.shape
puts test_e5

### We use part of the CC_News dataset

In [None]:
# Load 50000 examples of the 'cc_news' dataset from Hugging Face
dataset = Dataset.load_dataset("cc_news", split: 'train[0:50000]')

In [None]:
puts dataset

In [None]:
puts dataset['title']

In [None]:
title_embeddings = get_embeddings(dataset['title'])

In [None]:
# ObjectSpace.memsize_of is only defined once the objspace extension is loaded.
require 'objspace'

# Print the size in bytes that Ruby reports for the embeddings object.
# NOTE(review): per the Ruby docs the reported size is only a hint and may be
# incomplete, so treat the printed number as a lower bound.
puts ObjectSpace.memsize_of(title_embeddings)
# 50000 embeddings at 384 dims each is how much in RAM?
# If the values are stored as 32-bit floats (assumption; confirm the dtype):
# 50_000 * 384 * 4 bytes = 76_800_000 bytes, i.e. about 73 MiB.

In [None]:
# Serialize the title embeddings with Marshal into a file opened in binary write mode.
File.open('cc_news_title_embeddings_50000.marshal', 'wb') { |io| Marshal.dump(title_embeddings, io) }

In [None]:
# NOTE: 50k embeddings of 384 dims each take about 74MB of disk space when serialized with Marshal
system('ls -lah cc_news_title_embeddings_50000.marshal')

### Brute-force nearest-neighbor calculation

In [None]:
# Brute-force nearest-neighbor search over the precomputed title embeddings.
#
# Embeds the query with the "query: " prefix, scores it against
# `title_embeddings`, and returns the titles at the k best-ranked indices.
#
# NOTE(review): `title_embeddings` and `dataset` are not locals of this method;
# Ruby resolves them as method calls on self, so top-level locals of the same
# name are not visible here. Confirm how they are meant to be provided.
# NOTE(review): assumes `sort_index(reverse: true)` yields indices ordered from
# highest to lowest score; verify against the STutil implementation.
#
# @param query [Object] query text (or array of texts) passed to get_embeddings
# @param k [Integer] number of titles to return
# @return [Array] the titles found at the top-k indices
def knn(query, k: 5)
  query_embedding = get_embeddings(query, prefix: "query: ")
  cosine_scores = STutil.cosine_similarity(query_embedding, title_embeddings)
  top_k_indices = cosine_scores.sort_index(reverse: true).first(k)

  # Fetch the title column once instead of once per returned result.
  titles = dataset['title']
  top_k_indices.map { |i| titles[i] }
end

In [None]:
puts knn("housing market")

In [None]:
puts knn("property market")

In [None]:
puts knn("ballet dancing changes")

In [None]:
puts knn("climate change")

In [None]:
puts knn("global warming in the united states")

In [None]:
puts knn("taylor swift")