# Lab 3 - Indexing and Searching Embeddings in OpenSearch

In [None]:
require 'cgi'
require 'date'
require 'json'

require 'opensearch'
require 'transformers-rb'
require 'tqdm'
require 'polars-df'

In [None]:
# Streams the Marshal-serialized objects stored back-to-back in +file+.
#
# Yields each object in file order. Without a block, returns an Enumerator
# over the same sequence. An EOFError raised while reading ends the iteration
# quietly instead of propagating.
#
# NOTE: Marshal.load can instantiate arbitrary objects, so only use this on
# cache files you produced yourself.
def each_embedding(file)
  return to_enum(:each_embedding, file) unless block_given?

  File.open(file, 'rb') do |io|
    yield Marshal.load(io) until io.eof?
  rescue EOFError
    # Ran out of data mid-read; stop iterating.
  end
end

# Reads every object stored in +file+ into memory and returns them as an
# Array, in the order each_embedding yields them.
def load_embeddings(file)
  each_embedding(file).to_a
end

## Helper methods & setup

In [None]:
# https://github.com/opensearch-project/opensearch-ruby
# Connect to the OpenSearch node. The client lives in the global $client so the
# helper methods defined in later cells (create_index, index_one, index_bulk,
# search) can reuse it.
host = 'rubyai-opensearch-node' 
port = 9200
$client = OpenSearch::Client.new(hosts: [{ host: host, port: port }])
# Ask the node for its info and print the distribution name and version number.
info = $client.info
puts "Welcome to #{info['version']['distribution']} #{info['version']['number']}!"

## First, look at the schema

Right-click the `schema.json` file in the file tree and open it with the Editor.

## Create the OpenSearch Index

In [None]:
# Creates an OpenSearch index from a JSON definition stored on disk.
#
# name     - name of the index to create (default "ai-search")
# filename - path of the file whose contents are sent as the create body
# delete   - when true, delete an existing index of the same name first
#
# Prints the create response. During the optional delete only a "not found"
# error is ignored (the index simply did not exist yet). Any other failure,
# such as an unreachable node, is raised to the caller rather than hidden.
def create_index(name: "ai-search", filename: "schema.json", delete: false)
    schema = File.read(filename)

    if delete
        begin
            $client.indices.delete(index: name)
        rescue OpenSearch::Transport::Transport::Errors::NotFound
            # Nothing to delete: the index does not exist yet.
        end
    end

    response = $client.indices.create(index: name, body: schema)
    puts response
end

In [None]:
# Create the index with the default name and schema file, deleting any existing index first.
create_index(delete: true)

## Use the same model and method to get the query embedding, with some defaults changed
Remember, the model is `intfloat/e5-small-v2` and we need to prefix any query with 'query:'

In [None]:
# The E5 models expect 'query:' and 'passage:' prefixes
# The embedding pipeline is kept in the global $model, which get_embeddings calls once per text.
$model = Transformers.pipeline("embedding", 'intfloat/e5-small-v2')


# Embeds one text or an Array of texts with the global $model.
#
# texts  - a single value or an Array; a non-Array is wrapped into a
#          one-element Array
# prefix - string prepended to every text before it is passed to the model
#          (default "query: ")
#
# Prints a progress line after each text and returns one model result per
# input, in input order.
def get_embeddings(texts, prefix: "query: ")
  batch = texts.is_a?(Array) ? texts : [texts]
  total = batch.length

  vectors = batch.each.with_index(1).map do |text, position|
    vector = $model.("#{prefix}#{text}")

    percent = (position.to_f / total * 100).to_i
    print "\rProcessing embeddings: #{percent}% (#{position}/#{total})"

    vector
  end

  print "\nDone!\n"
  vectors
end

## Get our dataset and title_embeddings

In [None]:
# Load 50k records of the 'cc_news' dataset from Hugging Face
dataset = Polars.read_parquet('hf://datasets/vblagoje/cc_news/plain_text/train-*.parquet',n_rows:50_000)

In [None]:
# Check the size of the loaded dataset.
dataset.size

In [None]:
# Load the title_embeddings we generated in 02-sentence-transformers
title_embeddings = load_embeddings("embeddings.cache")
# Show the first entry as a quick check of what was loaded.
title_embeddings.first(1)

In [None]:
require 'polars'

# Create DataFrame from title embeddings
$title_embeddings_dataset = Polars::DataFrame.new({ "title_embedding" => title_embeddings })

# Combine with original DataFrame horizontally
# NOTE(review): presumably hstack needs both frames to have the same number of
# rows — confirm the cache holds exactly one embedding per dataset row.
$records_dataset = dataset.hstack($title_embeddings_dataset)

In [None]:
# Spot check: print the title and URL of the record at index 2.
puts $records_dataset["title"][2]
puts $records_dataset["url"][2]

## Index the data in bulk into our OpenSearch index

In [None]:
# Converts a 'YYYY-MM-DD HH:MM:SS' timestamp into 'YYYY-MM-DDTHH:MM:SS'.
#
# date_string - the timestamp text to convert
#
# Returns the reformatted String, or nil when the input is nil, is not a
# String, or does not match the expected layout. Bad or missing dates yield
# nil rather than raising, so one such record does not abort indexing.
def format_date(date_string)
    return nil if date_string.nil?

    DateTime.strptime(date_string, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%dT%H:%M:%S')
rescue Date::Error, TypeError
    # Date::Error: text does not match the layout; TypeError: input is not a String.
    nil
end

# Builds the document Hash for row +idx+ of +records+.
#
# For each field, the first element of the row's column is taken. The 'date'
# value is additionally passed through format_date. Keys appear in the order
# listed below.
def get_document(idx, records)
    row = records[idx]
    first_value = ->(column) { row[column].to_a[0] }

    %w[title text domain date description url image_url title_embedding].to_h do |column|
        value = first_value.(column)
        [column, column == 'date' ? format_date(value) : value]
    end
end

In [None]:
# Build the document for row index 1 and print it as JSON to inspect it.
# NOTE(review): if row indexing is zero-based this is the second row, despite
# the name first_doc — confirm whether index 0 was intended.
first_doc = get_document(1, $records_dataset)
puts first_doc.to_json

In [None]:
# Indexes a single document through the global $client, using the document's
# 'url' value as its id.
#
# document   - Hash to index; must provide a 'url' entry
# index_name - target index (default "ai-search", the same default as
#              create_index)
#
# Returns whatever the client's index call returns.
def index_one(document, index_name: "ai-search")
    $client.index(index: index_name, id: document['url'], body: document)
end

In [None]:
# Index the single sample document built above.
index_one(first_doc)

In [None]:
# Display the title column of the combined dataset.
$records_dataset['title']

In [None]:
# Counts the items of a bulk response that carry an "error" entry.
# Returns 0 when the response does not flag any errors.
def bulk_failure_count(response)
  return 0 unless response.respond_to?(:[]) && response['errors']

  Array(response['items']).count { |item| item.values.first['error'] }
end

# Indexes every row of +records_dataset+ through the bulk API of the global
# $client, using each document's 'url' as its id.
#
# records_dataset - dataset responding to #height; rows are read with
#                   get_document
# batch_size      - number of documents sent per bulk request (default 100)
# index_name      - target index (default "ai-search")
#
# Prints a progress line after each batch and a final total. The bulk API
# reports per-document rejections inside its response instead of raising, so
# each response is inspected. Rejected documents are left out of the final
# total and reported on stderr.
def index_bulk(records_dataset, batch_size: 100, index_name: "ai-search")
  count = records_dataset.height
  failed = 0

  (0...count).each_slice(batch_size) do |row_ids|
    # Bulk bodies are newline-delimited JSON: an action line, then the document.
    lines = row_ids.flat_map do |idx|
      document = get_document(idx, records_dataset)
      action = { index: { _index: index_name, _id: document['url'] } }
      [action.to_json, document.to_json]
    end
    body = lines.map { |line| "#{line}\n" }.join

    response = $client.bulk(body: body, refresh: true)
    failed += bulk_failure_count(response)

    # Report rows actually processed, so a partial last batch never exceeds 100%.
    done = row_ids.last + 1
    percent = (done.to_f / count * 100).to_i
    print "\rProcessing embeddings: #{percent}% (#{done}/#{count})"
  end

  warn "\n#{failed} of #{count} documents were rejected by the bulk API" if failed.positive?
  puts "\nTotal documents indexed: #{count - failed}"
end

In [None]:
# Bulk-index every row of the combined dataset with the default batch size.
index_bulk($records_dataset)

# Time to search!

In [None]:
# Builds the search request body for a k-NN query against "title_embedding".
#
# querystring - text to embed; passed to get_embeddings with the "query: "
#               prefix
# k           - value sent as the knn clause's "k" (default 20)
#
# Returns a Hash with a bool/should query holding one knn clause, plus a
# "_source" filter that leaves "title_embedding" out of the returned hits.
def get_knn_body(querystring, k: 20)
    query_vector = get_embeddings(querystring, prefix: "query: ").first
    {
        "query" => {
            "bool" => {
                "should" => [
                    {
                        "knn" => {
                            "title_embedding" => {
                                "vector" => query_vector,
                                "k" => k
                            }
                        }
                    }
                ]
            }
        },
        # NOTE(review): OpenSearch documents this filter key as "excludes";
        # confirm the singular "exclude" is accepted by the target version.
        "_source" => { "exclude" => ["title_embedding"] }
    }
end

In [None]:
# Renders a search response as an HTML result list.
#
# querystring - the query text shown in the heading
# resp        - search response; reads "took", "hits"/"total"/"value" and
#               "hits"/"hits"
# k           - maximum number of hits to render (default 5)
# show        - when true, display the HTML through IRuby; when false, return
#               the HTML String
#
# Each hit shows its title, score, a snippet and a link. The snippet is the
# description when present, otherwise the first 140 characters of the text.
# The query, title, URL and snippet all come from outside the notebook, so
# they are HTML-escaped before being placed in the markup.
def serps(querystring, resp, k: 5, show: true)
    escape = ->(value) { CGI.escapeHTML(value.to_s) }

    took = resp["took"]
    count = resp["hits"]["total"]["value"]
    results = resp["hits"]["hits"]

    html_str = +"<h4>Showing #{count} Results for <strong>\"<em>#{escape.(querystring)}</em>\"</strong> (took #{took}ms)</h4><ol>"

    results[0...k].each do |result|
        source = result["_source"]
        score = result["_score"]
        title = escape.(source["title"] || "No title")
        url = escape.(source["url"] || "No URL")
        text = source["text"] || ""
        # Truncate first, then escape, so an entity is never cut in half.
        snippet = escape.(source["description"] || "#{text[0...140]}...")

        html_str << "<li><b>#{title}</b>(#{score})<br>#{snippet}<br>"
        html_str << "<span style=\"font-size:0.8em\"><a href=\"#{url}\">#{url}</a></span></li>"
    end

    html_str << "</ol>"

    if show
        IRuby.display(IRuby.html(html_str))
    else
        html_str
    end
end

In [None]:
# Runs a k-NN title search for +querystring+ and renders the results.
#
# querystring - text to search for
# index_name  - index to query (default "ai-search", the same default as
#               create_index)
#
# Builds the request with get_knn_body, sends it through the global $client
# and passes the response to serps. Returns whatever serps returns.
def search(querystring, index_name: "ai-search")
    body = get_knn_body(querystring)
    resp = $client.search(body: body, index: index_name)
    serps(querystring, resp)
end

In [None]:
# Try a sample query against the indexed titles.
search("Ballet event")

In [None]:
# Try a second sample query.
search("housing market")