# Lab 3 - Indexing and Searching Embeddings in OpenSearch

In [1]:
require 'date'
require 'erb'
require 'json'

require 'opensearch'
require 'polars-df'
require 'tqdm'
require 'transformers-rb'

true

In [2]:
# Appends one embedding to the cache file as a single Marshal record.
#
# The file is opened in binary append mode, so repeated calls leave a sequence
# of back-to-back Marshal dumps that each_embedding can read back in order.
#
# @param embedding [Object] any Marshal-serializable value
# @param file [String] path of the cache file
def save_embedding(embedding, file)
  File.open(file, 'ab') { |io| Marshal.dump(embedding, io) }
end

# Yields every embedding stored in a cache file, in the order it was written.
#
# The file is read as a sequence of Marshal records until end of file.
# Called without a block, it returns an Enumerator over the same records.
#
# NOTE: Marshal.load can instantiate arbitrary objects; only point this at
# cache files you produced yourself.
#
# @param file [String] path of the cache file
# @yieldparam embedding [Object] the next deserialized record
def each_embedding(file)
  return to_enum(:each_embedding, file) unless block_given?

  File.open(file, 'rb') do |io|
    yield Marshal.load(io) until io.eof?
  rescue EOFError
    # Ran off the end of the file mid-read; stop iterating quietly.
  end
end

# Reads every embedding from a cache file into memory.
#
# @param file [String] path of the cache file
# @return [Array] all records yielded by each_embedding, in file order
def load_embeddings(file)
  each_embedding(file).to_a
end

# Embeds texts one at a time and appends each result to a resumable cache file.
#
# The number of records already present in +cache_file+ determines where to
# resume: that many leading texts are skipped, and the remaining ones are
# embedded and appended via save_embedding. Progress is printed to stdout.
#
# @param model [#call] callable that maps a prefixed string to an embedding
# @param texts [Array<String>, String] texts to embed (a single value is wrapped)
# @param prefix [String] text prepended to every input before embedding
# @param cache_file [String] path of the Marshal cache to append to
# @return [nil]
def save_embeddings(model, texts, prefix: "passage: ", cache_file: "embeddings.cache")
  texts = [texts] unless texts.is_a?(Array)
  total = texts.length

  # Only a missing cache file means "start from scratch". Any other failure
  # (e.g. a corrupt cache) is surfaced instead of silently re-embedding
  # everything and appending to a broken file.
  start_idx =
    begin
      each_embedding(cache_file).count
    rescue Errno::ENOENT
      0
    end

  # drop returns [] when the cache already holds more records than texts,
  # whereas texts[start_idx..] would be nil in that case and crash.
  texts.drop(start_idx).each.with_index(start_idx + 1) do |text, current|
    embedding = model.("#{prefix}#{text}")
    save_embedding(embedding, cache_file)

    percent = (current.to_f / total * 100).to_i
    print "\rProcessing embeddings: #{percent}% (#{current}/#{total})"
  end
  print "\nDone!\n"
end

:save_embeddings

## Helper methods & setup

In [3]:
# Connection settings for the OpenSearch node used by this notebook.
host = 'opensearch-node'
port = 9200

# Shared client, kept in a global so the helper methods defined in later cells can use it.
$client = OpenSearch::Client.new(hosts: [{ host: host, port: port }])
# Fetch the cluster info and print its distribution name and version number.
info = $client.info
puts "Welcome to #{info['version']['distribution']} #{info['version']['number']}!"

Welcome to opensearch 2.11.0!


## First, look at the schema

Right-click the 'schema.json' file in the file tree and open it with the Editor.

## Create the OpenSearch Index

In [4]:
# Creates an OpenSearch index from a JSON schema file and prints the response.
#
# @param name [String] name of the index to create
# @param filename [String] path of the file whose contents are sent as the index body
# @param delete [Boolean] when true, delete the index first; a missing index is
#   not an error, but any other delete failure is raised
# @return [nil]
def create_index(name: "ai-search", filename: "schema.json", delete: false)
  schema = File.read(filename)

  # ignore: 404 tolerates "index does not exist" only, instead of swallowing
  # every possible failure (connection errors, auth errors, ...).
  $client.indices.delete(index: name, ignore: 404) if delete

  response = $client.indices.create(index: name, body: schema)
  puts response
end

:create_index

In [5]:
# Create the index with the default name and schema file, deleting any existing index first.
create_index(delete: true)

{"acknowledged"=>true, "shards_acknowledged"=>true, "index"=>"ai-search"}


## Use the same model and method to get the query embedding, with some defaults changed
Remember, the model is `intfloat/e5-small-v2` and we need to prefix any query with 'query:'

In [1]:
# The E5 models expect 'query:' and 'passage:' prefixes
# The pipeline is stored in a global so get_embeddings (defined below) can call it.
$model = Transformers.pipeline("embedding", 'intfloat/e5-small-v2')


# Embeds one or more texts with the global $model, printing progress as it goes.
#
# @param texts [Array<String>, String] texts to embed (a single value is wrapped)
# @param prefix [String] text prepended to every input before embedding
# @return [Array] one embedding per input, in input order
def get_embeddings(texts, prefix: "passage: ")
  inputs = texts.is_a?(Array) ? texts : [texts]
  total = inputs.length

  vectors = inputs.each.with_index(1).map do |text, position|
    vector = $model.("#{prefix}#{text}")

    percent = (position.to_f / total * 100).to_i
    print "\rProcessing embeddings: #{percent}% (#{position}/#{total})"
    vector
  end

  print "\nDone!\n"
  vectors
end

NameError: uninitialized constant Transformers

## Get our dataset and title_embeddings

In [7]:
# Load the 'cc_news' dataset from Hugging Face and keep only the first 1,000 rows.
# Alternative that loads 50k records, kept for reference:
# dataset = Dataset.load_dataset("cc_news", split: 'train[0:50000]')


dataset = Polars.read_parquet('hf://datasets/vblagoje/cc_news/plain_text/train-*.parquet').head(1000)

shape: (1_000, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ title        ┆ text        ┆ domain      ┆ date        ┆ description ┆ url         ┆ image_url   │
│ ---          ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
│ str          ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         ┆ str         │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ Daughter Duo ┆ There's a   ┆ www.pointem ┆ 2017-12-11  ┆ There's a   ┆ http://www. ┆ https://poi │
│ is Dancing   ┆ surprising  ┆ agazine.com ┆ 20:19:05    ┆ surprising  ┆ pointemagaz ┆ nte-img.rbl │
│ in The…      ┆ twist to …  ┆             ┆             ┆ twist to …  ┆ ine.com/…   ┆ .ms/sima…   │
│ New York     ┆ The New     ┆ www.pointem ┆ 2017-12-11  ┆ NYCB has    ┆ http://www. ┆ https://poi │
│ City Ballet  ┆ York City   ┆ agazine.com ┆ 17:02:55    ┆ announced   ┆ 

In [9]:
# Inspect the size of the loaded dataset.
dataset.size

1000

In [12]:
# Load the title_embeddings we generated in 02-sentence-transformers

# TODO: rename the cache file to the original file name (see the commented-out line below).
# title_embeddings = Marshal.load(File.read('cc_news_title_embeddings_50000.marshal'))
# Read every record from the incremental cache ("embeddings.cache" is the default cache_file of save_embeddings).
title_embeddings = load_embeddings("embeddings.cache")
nil

In [13]:
require 'polars'

# Create DataFrame from title embeddings
$title_embeddings_dataset = Polars::DataFrame.new({ "title_embedding" => title_embeddings })

# Combine with original DataFrame horizontally
# NOTE(review): assumes the cache holds exactly one embedding per dataset row, in the same order — confirm.
$records_dataset = dataset.hstack($title_embeddings_dataset)

shape: (1_000, 8)
┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ title      ┆ text       ┆ domain     ┆ date      ┆ descripti ┆ url       ┆ image_url ┆ title_emb │
│ ---        ┆ ---        ┆ ---        ┆ ---       ┆ on        ┆ ---       ┆ ---       ┆ edding    │
│ str        ┆ str        ┆ str        ┆ str       ┆ ---       ┆ str       ┆ str       ┆ ---       │
│            ┆            ┆            ┆           ┆ str       ┆           ┆           ┆ list[f64] │
╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ Daughter   ┆ There's a  ┆ www.pointe ┆ 2017-12-1 ┆ There's a ┆ http://ww ┆ https://p ┆ [0.007776 │
│ Duo is     ┆ surprising ┆ magazine.c ┆ 1         ┆ surprisin ┆ w.pointem ┆ ointe-img ┆ ,         │
│ Dancing in ┆ twist to … ┆ om         ┆ 20:19:05  ┆ g twist   ┆ agazine.c ┆ .rbl.ms/s ┆ 0.030637, │
│ The…       ┆            ┆            ┆           ┆ to …      ┆ om/…    

In [354]:
# Print the title and URL of the record at index 2, selecting the column first and then the row.
puts $records_dataset["title"][2]
puts $records_dataset["url"][2]
nil

Watch Pennsylvania Ballet & Boston Ballet Face Off for the Super Bowl
http://www.pointemagazine.com/watch-pennsylvania-ballet-boston-ballet-face-off-for-the-super-bowl-2530816257.html


## Index the data in bulk into our OpenSearch index

In [283]:
# Converts a 'YYYY-MM-DD HH:MM:SS' timestamp into 'YYYY-MM-DDTHH:MM:SS'.
#
# @param date_string [String, nil] timestamp in the '%Y-%m-%d %H:%M:%S' layout
# @return [String, nil] the reformatted timestamp, or nil when the input is
#   missing, not a String, or does not match the expected layout
def format_date(date_string)
  DateTime.strptime(date_string, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%dT%H:%M:%S')
rescue Date::Error, TypeError
  # Date::Error: the text does not match the layout.
  # TypeError: nil or another non-String value (e.g. a null date in the dataset).
  nil
end

# Builds the document hash for a single record of the dataset.
#
# For each column, the first element of `records[idx][column].to_a` is used
# as the field value; the 'date' value is additionally passed through
# format_date.
#
# @param idx [Integer] index of the record to convert
# @param records [#[]] indexable collection whose entries expose the columns below
# @return [Hash{String => Object}] document keyed by column name
def get_document(idx, records)
  row = records[idx]
  columns = %w[title text domain date description url image_url title_embedding]

  columns.each_with_object({}) do |column, doc|
    value = row[column].to_a[0]
    doc[column] = column == 'date' ? format_date(value) : value
  end
end

:get_document

In [284]:
# Build the document for the record at index 1 and print it as JSON.
first_doc = get_document(1, $records_dataset)
puts first_doc.to_json


{"title":"New York City Ballet Announces Interim Leadership Team","text":"The New York City Ballet Board of Directors announced on Saturday the interim team that has been appointed to run the artistic side of the company during ballet master in chief Peter Martins' leave of absence. Martins requested a temporary leave from both NYCB and the School of American Ballet last Thursday while the company undergoes an internal investigation into the sexual harassment accusations aimed at him.\nThe four-person group is made up of members of the company's current artistic staff, led by ballet master and former principal dancer Jonathan Stafford. Joining Stafford are NYCB resident choreographer and soloist Justin Peck and ballet masters Craig Hall and Rebecca Krohn, both former dancers with the company. While the members of this group haven't had much leadership experience, their close familiarity with the company (Krohn left the stage for her new role just two months ago) should help to ease the

In [285]:
# Indexes a single document, using its 'url' field as the document id.
#
# @param document [Hash] document to index; must contain a 'url' key
# @param index_name [String] target index (defaults to the notebook's "ai-search" index)
# @return [Object] whatever $client.index returns
def index_one(document, index_name: "ai-search")
  $client.index(index: index_name, id: document['url'], body: document)
end

:index_one

In [286]:
# Index the single document built above.
index_one(first_doc)

{"_index"=>"ai-search", "_id"=>"http://www.pointemagazine.com/nycb-interim-leadership-team-2516618703.html", "_version"=>39, "result"=>"updated", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>31314, "_primary_term"=>1}

In [350]:
# Inspect the 'title' column of the combined dataset.
$records_dataset['title']


shape: (1_000,)
Series: 'title' [str]
[
	"Daughter Duo is Dancing in The…
	"New York City Ballet Announces…
	"Watch Pennsylvania Ballet & Bo…
	"dance shoes"
	"Rebecca Krohn on Her Retiremen…
	…
	"AP-NORC poll: Privacy debacle …
	"The question at the Olympic Ov…
	"Column: PGA Tour primed for bi…
	"Going, going ... Indians remov…
	"Racing director Eric Boullier …
]

In [351]:
# Indexes every record of the dataset into the "ai-search" index in batches.
#
# Each batch is sent as one bulk request whose body is newline-delimited JSON:
# an action line followed by the document line, for every record. The bulk
# response is inspected so that per-document failures are counted and reported
# instead of being silently ignored.
#
# @param records_dataset [#height] dataset accepted by get_document; +height+
#   gives the number of records
# @param batch_size [Integer] number of documents per bulk request
# @return [nil]
def index_bulk(records_dataset, batch_size: 100)
  index = "ai-search"
  count = records_dataset.height
  failed = 0

  (0...count).each_slice(batch_size) do |batch|
    body = +''
    batch.each do |idx|
      document = get_document(idx, records_dataset)
      action = { index: { _index: index, _id: document['url'] } }

      # Action metadata line, then the document line, each newline-terminated.
      body << action.to_json << "\n" << document.to_json << "\n"
    end

    response = $client.bulk(body: body, refresh: true)

    # A bulk request can succeed as a whole while individual items fail;
    # failed items carry an 'error' entry under their action key.
    next unless response['errors']

    failed += response['items'].count { |item| item.values.first['error'] }
  end

  warn "Bulk indexing reported #{failed} failed document(s)" if failed.positive?
  puts "Total documents indexed: #{count - failed}"
end

:index_bulk

In [352]:
# Bulk-index the whole combined dataset using the default batch size.
index_bulk($records_dataset)

Total documents indexed: 1000


In [349]:
# $records_dataset

# Time to search!

In [344]:
# Builds the search request body for a k-NN query on the title embeddings.
#
# The query text is embedded with the 'query: ' prefix, which the E5 models
# expect for queries ('passage: ' is for the indexed texts). The stored
# embedding vector is excluded from the returned _source.
#
# @param querystring [String] free-text query
# @param k [Integer] number of nearest neighbours requested from the knn clause
# @return [Hash] request body for $client.search
def get_knn_body(querystring, k: 20)
  query_vector = get_embeddings(querystring, prefix: "query: ").first

  {
    "query" => {
      "bool" => {
        "should" => [
          {
            "knn" => {
              "title_embedding" => {
                "vector" => query_vector,
                "k" => k
              }
            }
          }
        ]
      }
    },
    "_source" => { "exclude" => ["title_embedding"] }
  }
end

:get_knn_body

In [345]:
# Renders a search response as an HTML result list.
#
# The query string and every document field are HTML-escaped before being
# placed into the markup, so titles containing characters such as '&' or '<'
# display correctly and cannot inject markup into the notebook output.
#
# @param querystring [String] the query that produced +resp+
# @param resp [Hash] search response with "hits" => { "total" => { "value" }, "hits" => [...] }
# @param k [Integer] maximum number of hits to render
# @param show [Boolean] when true, display the HTML through IRuby; otherwise return it
# @return [String, Object] the HTML string when +show+ is false, else the result of IRuby.display
def serps(querystring, resp, k: 5, show: true)
  escape = ERB::Util.method(:html_escape)
  count = resp["hits"]["total"]["value"]
  results = resp["hits"]["hits"]

  html_str = "<h4>Showing #{count} Results for <em>#{escape.(querystring)}</em></h4><ol>"

  results[0...k].each do |result|
    source = result["_source"]
    score = result["_score"]
    title = escape.(source["title"] || "No title")
    url = escape.(source["url"] || "No URL")
    # Fall back to the first 140 characters of the text when there is no description.
    snippet = escape.(source["description"] || "#{(source["text"] || "")[0...140]}...")

    html_str << "<li><b>#{title}</b>(#{score})<br>#{snippet}<br>"
    html_str << "<span style=\"font-size:0.8em\"><a href=\"#{url}\">#{url}</a></span></li>"
  end

  html_str << "</ol>"

  show ? IRuby.display(IRuby.html(html_str)) : html_str
end

:serps

In [346]:
# Runs a k-NN search for the query against the "ai-search" index and renders the results.
#
# @param querystring [String] free-text query
# @return [Object] whatever serps returns for the response
def search(querystring)
  response = $client.search(index: "ai-search", body: get_knn_body(querystring))
  serps(querystring, response)
end

:search

In [347]:
# Example query: embed the text, run the k-NN search, and display the results.
search("Ballet event")

Processing embeddings: 100% (1/1)
Done!


In [348]:
# A second example query on a different topic.
search("housing market")

Processing embeddings: 100% (1/1)
Done!
