# Lab 3 - Indexing and Searching Embeddings in OpenSearch

In [1]:
# NOTE(review): the cell output below shows this require raising LoadError; the
# opensearch-ruby gem is normally loaded with `require 'opensearch'` — confirm.
require 'opensearch-ruby'
require 'sentence_transformers'
require 'tqdm'
require 'datasets'
require 'cgi'
require 'date'
require 'json'

LoadError: cannot load such file -- opensearch-ruby

In [None]:
# https://github.com/opensearch-project/opensearch-ruby
# Connect to the OpenSearch node and print the distribution name and version
# number found under the 'version' key of the client's info response.
# NOTE(review): `client` is a top-level local variable; Ruby `def` bodies do not
# see top-level locals, so the helper methods in later cells that call `client`
# need it exposed some other way — confirm how this notebook shares it.
host = 'ai-search-opensearch-node'
port = 9200
client = OpenSearch::Client.new(hosts: [{ host: host, port: port }])
info = client.info
puts "Welcome to #{info['version']['distribution']} #{info['version']['number']}!"

## First, look at the schema

Right-click the 'schema.json' file in the file tree and open it with the Editor.

## Create the OpenSearch Index

In [None]:
# Creates an OpenSearch index from a schema file.
#
# @param name [String] name of the index to create
# @param filename [String] path of the schema file; its contents are sent
#   unchanged (as the string read from disk) as the index-creation body
# @param delete [Boolean] when true, first try to delete an index with the
#   same name
# @return [nil] the create response is printed rather than returned
#
# NOTE(review): `client` must be callable from inside this method; a top-level
# local variable named `client` is not visible in a `def` body.
def create_index(name: "ai-search", filename: "schema.json", delete: false)
    schema = File.read(filename)

    if delete
        begin
            client.indices.delete(index: name)
        rescue StandardError => e
            # Best effort: the index might not exist yet. Report the reason
            # instead of hiding it, so that other failures of the delete call
            # are not silently ignored before the create call below.
            warn "Could not delete index '#{name}': #{e.message}"
        end
    end

    response = client.indices.create(index: name, body: schema)
    puts response
end

In [None]:
# Recreate the index: try to delete any existing "ai-search" index, then create it
# from schema.json (the method's defaults).
create_index(delete: true)
# Variant that skips the delete step:
#create_index()

## Use the same model and method to get the query embedding, with some defaults changed
Remember, the model is `intfloat/e5-small-v2` and we need to prefix any query with 'query:'

In [None]:
# The E5 models expect 'query:' and 'passage:' prefixes
# Load the 'intfloat/e5-small-v2' model that get_embeddings encodes with.
# NOTE(review): `model` is a top-level local variable and is not visible inside
# `def get_embeddings` — confirm how this notebook shares it.
model = SentenceTransformer.new('intfloat/e5-small-v2')
# Encodes one text or an array of texts with `model`, prepending `prefix` to
# each one first. The E5 models expect either a 'query:' or a 'passage:' prefix;
# the default here is the query form.
#
# A non-Array argument is treated as a single text. The return value is whatever
# `model.encode` returns for the list of prefixed texts.
#
# NOTE(review): `model` must be callable from inside this method; a top-level
# local variable named `model` is not visible in a `def` body.
def get_embeddings(texts, prefix: "query: ")
    batch = texts.is_a?(Array) ? texts : [texts]
    model.encode(batch.map { |entry| "#{prefix}#{entry}" }, show_progress_bar: false)
end

## Get our dataset and title_embeddings

In [None]:
# Load 50k records of the 'cc_news' dataset from Hugging Face
# (the 'train[0:50000]' slice of the train split)
dataset = Dataset.load_dataset("cc_news", split: 'train[0:50000]')

In [None]:
# Load the title_embeddings we generated in 02-sentence-transformers.
# Marshal data is binary, so the file is read in binary mode; a text-mode read
# can corrupt it on platforms that translate line endings.
# NOTE: Marshal.load can instantiate arbitrary objects — only load files that
# you produced yourself.
title_embeddings = Marshal.load(File.binread('cc_news_title_embeddings_50000.marshal'))

In [None]:
# Add the title embeddings as a new column in our dataset:
# wrap them in a one-column dataset, then concatenate it with the original along axis 1.
# NOTE(review): assumes title_embeddings has one entry per dataset row, in the
# same order — confirm.
title_embeddings_dataset = Dataset.from_dict({ "title_embedding" => title_embeddings })
records_dataset = Dataset.concatenate([dataset, title_embeddings_dataset], axis: 1)

In [None]:
# Spot-check record 2: print its title, then its title embedding.
puts records_dataset.select([2])['title']
puts records_dataset.select([2])['title_embedding'][0]

In [None]:
# Converts a 'YYYY-MM-DD HH:MM:SS' timestamp into 'YYYY-MM-DDTHH:MM:SS'.
#
# @param date_string [String, nil] timestamp to convert
# @return [String, nil] the reformatted timestamp, or nil when the input is
#   missing, not a String, or does not match the expected format
def format_date(date_string)
    # A nil (or other non-String) value makes strptime raise TypeError, which
    # the rescue below would not catch, so screen those out first.
    return nil unless date_string.is_a?(String)

    DateTime.strptime(date_string, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%dT%H:%M:%S')
rescue ArgumentError
    # Date::Error is a subclass of ArgumentError, so this covers unparsable
    # input; Ruby versions that predate Date::Error raise ArgumentError directly.
    nil
end

# Builds the document hash for the record at position `idx`.
#
# The record is fetched with `records.select([idx])`; for each field the first
# element of the corresponding column is copied into the result. The 'date'
# value is passed through format_date; every other field is copied as-is.
#
# @param idx [Integer] position of the record
# @param records [#select] dataset to read from
# @return [Hash] document with string keys, in the field order listed below
def get_document(idx, records)
    row = records.select([idx])
    fields = %w[title text domain date description url image_url title_embedding]

    fields.each_with_object({}) do |field, doc|
        value = row[field][0]
        doc[field] = field == 'date' ? format_date(value) : value
    end
end

In [None]:
# Build the document for record 0 and print it as JSON to inspect its shape.
first_doc = get_document(0, records_dataset)
puts first_doc.to_json

In [None]:
# Indexes a single document into the "ai-search" index, using the document's
# 'url' value as its id. Returns whatever `client.index` returns.
#
# NOTE(review): `client` must be callable from inside this method; a top-level
# local variable named `client` is not visible in a `def` body.
def index_one(document)
    request = { index: "ai-search", id: document['url'], body: document }
    client.index(**request)
end

In [None]:
# Send the first document to the index on its own before bulk indexing.
index_one(first_doc)

In [None]:
# Indexes every record of `records_dataset` into the "ai-search" index, sending
# one bulk request per batch of `batch_size` records.
#
# Each record is converted with get_document and wrapped in an `index` action
# whose id is the document's 'url'. The action metadata is kept separate from
# the document (under `data`), which is the shape the Ruby client's bulk helper
# expects; the document itself is not modified.
#
# @param records_dataset [#num_rows] dataset to index
# @param batch_size [Integer] number of documents per bulk request
#
# NOTE(review): `client` must be callable from inside this method; a top-level
# local variable named `client` is not visible in a `def` body.
def index_bulk(records_dataset, batch_size: 100)
    index = "ai-search"
    count = records_dataset.num_rows

    (0...count).step(batch_size).each do |left|
        # `right` is exclusive and is clamped so the final batch may be shorter.
        right = [left + batch_size, count].min
        actions = (left...right).map do |idx|
            document = get_document(idx, records_dataset)
            { index: { _index: index, _id: document['url'], data: document } }
        end

        response = client.bulk(body: actions)
        # A bulk request can succeed as a whole while individual items fail;
        # the response's 'errors' flag is the only signal of that.
        warn "Bulk request for batch #{left}-#{right} reported errors" if response['errors']
        puts "Indexed batch #{left}-#{right}"
    end
end

In [None]:
# Bulk-index the whole dataset using the default batch size of 100.
index_bulk(records_dataset)

# Time to search!

In [None]:
# Builds the search request body for a k-NN query on the 'title_embedding' field.
#
# The query string is embedded with get_embeddings (default 'query: ' prefix)
# and the first returned embedding is used as the query vector. The
# 'title_embedding' field is excluded from the returned _source.
#
# @param querystring [String] text to search for
# @param k [Integer] value sent as the knn clause's "k" (default 20, the
#   value that was previously hard-coded)
# @return [Hash] request body with string keys
def get_knn_body(querystring, k: 20)
    query_vector = get_embeddings(querystring)[0]
    knn_clause = {
        "knn" => {
            "title_embedding" => {
                "vector" => query_vector,
                "k" => k
            }
        }
    }

    {
        "query" => { "bool" => { "should" => [knn_clause] } },
        "_source" => { "exclude" => ["title_embedding"] }
    }
end

In [None]:
# Renders a search response as an HTML results list.
#
# The heading shows the total hit count from the response and the query; up to
# `k` hits are then listed with title, score, a snippet and a link. The snippet
# is the hit's description or, when that is missing or empty, the first 140
# characters of its text followed by "...".
#
# Title, snippet, URL and the query string are HTML-escaped before being
# interpolated, because they originate from indexed content / user input and
# would otherwise be rendered as markup.
# NOTE(review): escaping does not validate the URL scheme used in the href.
#
# @param querystring [String] the query that produced `resp`
# @param resp [Hash] search response (reads hits.total.value and hits.hits)
# @param k [Integer] maximum number of hits to render
# @param show [Boolean] when true, display the HTML through IRuby; otherwise
#   return it
# @return [String, Object] the HTML string when `show` is false, otherwise the
#   result of IRuby.display
def serps(querystring, resp, k: 5, show: true)
    count = resp["hits"]["total"]["value"]
    results = resp["hits"]["hits"]

    items = results[0...k].map do |result|
        source = result["_source"]
        score = result["_score"]
        title = CGI.escapeHTML((source["title"] || "No title").to_s)
        url = CGI.escapeHTML((source["url"] || "No URL").to_s)
        description = source["description"].to_s
        text = source["text"] || ""
        # An empty string is truthy in Ruby, so check emptiness explicitly to
        # fall back to the text excerpt.
        snippet = CGI.escapeHTML(description.empty? ? "#{text[0...140]}..." : description)

        "<li><b>#{title}</b>(#{score})<br>#{snippet}<br>" \
            "<span style=\"font-size:0.8em\"><a href=\"#{url}\">#{url}</a></span></li>"
    end

    heading = "<h4>Showing #{count} Results for <em>#{CGI.escapeHTML(querystring.to_s)}</em></h4>"
    html_str = "#{heading}<ol>#{items.join}</ol>"

    if show
        IRuby.display(IRuby.html(html_str))
    else
        html_str
    end
end

In [None]:
# Runs a k-NN search for `querystring` against the "ai-search" index and hands
# the response to serps (with its default options) for rendering.
#
# NOTE(review): `client` must be callable from inside this method; a top-level
# local variable named `client` is not visible in a `def` body.
def search(querystring)
    resp = client.search(body: get_knn_body(querystring), index: "ai-search")
    serps(querystring, resp)
end

In [None]:
# Example query: search the indexed titles for "Ballet event" and display the results.
search("Ballet event")

In [None]:
# Example query: search the indexed titles for "housing market" and display the results.
search("housing market")