# Lab 4 - Hybrid search over title embeddings and full-text fields in OpenSearch

In [None]:
require 'opensearch-ruby'
require 'sentence_transformers'
require 'tqdm'
require 'date'
require 'json'

In [None]:
# https://github.com/opensearch-project/opensearch-ruby
host = 'ai-search-opensearch-node'
port = 9200
@client = OpenSearch::Client.new(hosts: [{ host: host, port: port }])

# Reader for the shared OpenSearch client.
#
# A top-level local variable is not visible inside `def` bodies, so the
# connection is stored in an instance variable of the top-level object and
# exposed through this method; any method that calls `client` resolves to it.
def client
    @client
end

# Fetch the cluster info and print the distribution name and version it reports.
info = client.info
puts "Welcome to #{info['version']['distribution']} #{info['version']['number']}!"

## Use the same model and method to get the query embedding, with some defaults changed
Remember, the model is `intfloat/e5-small-v2`, and every query must be prefixed with `query: ` before it is embedded.

In [None]:
# The E5 models expect 'query:' and 'passage:' prefixes

# Returns the shared SentenceTransformer for `intfloat/e5-small-v2`.
#
# This is a method rather than a top-level local variable because locals
# defined outside a `def` are not visible inside one. The instance is created
# on first use and reused on every later call.
def model
    @model ||= SentenceTransformer.new('intfloat/e5-small-v2')
end

# Embeds one text or a list of texts with the E5 model.
#
# @param texts [String, Array<String>] a single text or an array of texts;
#   a non-array value is wrapped in a one-element array
# @param prefix [String] text prepended to every input; the E5 models expect
#   either 'query: ' or 'passage: '
# @return the value returned by `model.encode` for the prefixed texts
#   (presumably one vector per input; callers in this file read element 0)
def get_embeddings(texts, prefix: "query: ")
    texts = [texts] unless texts.is_a?(Array)
    prefixed = texts.map { |text| "#{prefix}#{text}" }
    model.encode(prefixed, show_progress_bar: false)
end

## Define different query types
Here we define OpenSearch query bodies for:
 - BM25
 - KNN
 - Hybrid

In [None]:
# Builds the full-text (BM25) search body for +querystring+.
#
# The query is a `bool`/`should` of three `cross_fields` multi_match clauses,
# one per text field, each with its own boost. The `title_embedding` field is
# excluded from the returned `_source`.
#
# @param querystring [String] the user's search text
# @return [Hash] request body for the search API
def get_bm25_body(querystring)
    boosted_fields = [
      ["description", 1.0],
      ["title", 1.1],
      ["title_exactish", 1.2]
    ]

    text_clauses = boosted_fields.map do |field_name, boost|
        {
          "multi_match" => {
            "query" => querystring,
            "type" => "cross_fields",
            "fields" => [field_name],
            "boost" => boost
          }
        }
    end

    {
      "query" => { "bool" => { "should" => text_clauses } },
      "_source" => { "exclude" => ["title_embedding"] }
    }
end

In [None]:
# Builds the vector (k-NN) search body for +querystring+.
#
# The query text is embedded with `get_embeddings` and the first returned
# vector is searched against the `title_embedding` field with k = 20. The
# `title_embedding` field is excluded from the returned `_source`.
#
# @param querystring [String] the user's search text
# @return [Hash] request body for the search API
def get_knn_body(querystring)
    query_vector = get_embeddings(querystring)[0]
    nearest_titles = {
      "knn" => {
        "title_embedding" => { "vector" => query_vector, "k" => 20 }
      }
    }

    {
      "query" => { "bool" => { "should" => [nearest_titles] } },
      "_source" => { "exclude" => ["title_embedding"] }
    }
end

## Hybrid Query

In [None]:
# Builds the hybrid search body for +querystring+.
#
# The `hybrid` query holds two sub-queries, in this order:
#   1. a `bool`/`should` of boosted `cross_fields` multi_match clauses over
#      description, title and title_exactish;
#   2. a k-NN clause on `title_embedding` (k = 100) using the first vector
#      returned by `get_embeddings`.
# NOTE(review): the normalization pipelines in this file list their weights as
# [bm25, knn], which presumably relies on this sub-query order - confirm.
#
# @param querystring [String] the user's search text
# @return [Hash] request body for the search API
def get_hybrid_body(querystring)
    query_vector = get_embeddings(querystring)[0]

    text_clauses = [["description", 1.0], ["title", 1.1], ["title_exactish", 1.2]].map do |field_name, boost|
        {
          "multi_match" => {
            "query" => querystring,
            "type" => "cross_fields",
            "fields" => [field_name],
            "boost" => boost
          }
        }
    end

    lexical_part = { "bool" => { "should" => text_clauses } }
    vector_part = {
      "knn" => {
        "title_embedding" => { "vector" => query_vector, "k" => 100 }
      }
    }

    {
      "query" => { "hybrid" => { "queries" => [lexical_part, vector_part] } },
      "_source" => { "exclude" => ["title_embedding"] }
    }
end

In [None]:
require 'erb'

# Renders a search response as an HTML results list.
#
# Every piece of text taken from the query or the indexed documents is
# HTML-escaped before it is interpolated, so markup characters in a title,
# snippet or URL cannot break or inject into the page.
#
# @param querystring [String] the query text shown in the heading
# @param resp [Hash] search response; reads resp["hits"]["total"]["value"]
#   and resp["hits"]["hits"], and for each hit "_score" and "_source"
# @param k [Integer] maximum number of hits to render
# @param show [Boolean] when true the HTML is displayed through IRuby and the
#   result of IRuby.display is returned; when false the HTML string is returned
# @return [String, Object] the HTML string, or IRuby.display's return value
def serps(querystring, resp, k: 5, show: true)
    count = resp["hits"]["total"]["value"]
    results = resp["hits"]["hits"]

    items = results[0...k].map do |result|
        source = result["_source"]
        score = result["_score"]
        title = ERB::Util.html_escape(source["title"] || "No title")
        url = ERB::Util.html_escape(source["url"] || "No URL")

        # Fall back to the first 140 characters of the text when the
        # description is missing or empty.
        description = source["description"].to_s
        text = source["text"] || ""
        snippet = ERB::Util.html_escape(description.empty? ? "#{text[0...140]}..." : description)

        "<li><b>#{title}</b>(#{score})<br>#{snippet}<br>" \
          "<span style=\"font-size:0.8em\"><a href=\"#{url}\">#{url}</a></span></li>"
    end

    heading = "<h4>Showing #{count} Results for <em>#{ERB::Util.html_escape(querystring)}</em></h4>"
    html_str = "#{heading}<ol>#{items.join}</ol>"

    if show
        IRuby.display(IRuby.html(html_str))
    else
        html_str
    end
end

In [None]:
# Creates (or replaces) a search pipeline that normalizes and combines the
# scores of a hybrid query.
#
# Scores are normalized with `min_max` and combined with a weighted
# `arithmetic_mean`; the weights are sent in the order [bm25_weight, knn_weight].
# The pipeline is PUT to /_search/pipeline/<name> and the response is printed.
#
# @param name [String] pipeline name, used as the last path segment
# @param bm25_weight [Float] weight listed first in the combination weights
# @param knn_weight [Float] weight listed second in the combination weights
# @return [nil] the return value of `puts`
def make_normalization_pipeline(name, bm25_weight: 0.5, knn_weight: 0.5)
    body = {
      "description" => "Post processor for hybrid search with bm25=#{bm25_weight} and knn=#{knn_weight}",
      "phase_results_processors" => [
        {
          "normalization-processor" => {
            "normalization" => {
              "technique" => "min_max"
            },
            "combination" => {
              "technique" => "arithmetic_mean",
              "parameters" => {
                "weights" => [
                  bm25_weight,
                  knn_weight
                ]
              }
            }
          }
        }
      ]
    }
    # perform_request takes positional arguments (method, path, params, body);
    # keyword arguments would arrive as one hash in the `method` slot.
    resp = client.transport.perform_request("PUT", "/_search/pipeline/#{name}", {}, body)
    puts resp
end

In [None]:
# Register three normalization pipelines: equal weights, BM25-leaning and
# KNN-leaning. Each row is [pipeline name, bm25 weight, knn weight].
[
  ["nlp-search-pipeline-equal", 0.5, 0.5],
  ["nlp-search-pipeline-bm25-heavy", 0.6, 0.4],
  ["nlp-search-pipeline-knn-heavy", 0.4, 0.6]
].each do |pipeline_name, bm25, knn|
    make_normalization_pipeline(pipeline_name, bm25_weight: bm25, knn_weight: knn)
end

In [None]:
# Runs +body+ against the "ai-search" index through the given search pipeline.
#
# @param querystring [String] the query text, used only for rendering
# @param body [Hash] the search request body
# @param pipeline [String] name of the search pipeline applied to the request
# @param show [Boolean] when true the results are rendered with `serps`
# @return the response returned by `client.search`
def search(querystring, body, pipeline: "nlp-search-pipeline-equal", show: true)
    # URL parameters such as the search pipeline are top-level arguments of
    # `search`; a nested `params:` hash is not one of its arguments.
    resp = client.search(index: "ai-search", body: body, search_pipeline: pipeline)
    serps(querystring, resp) if show
    resp
end

# Runs the BM25-only query for +querystring+ via `search`, using that
# method's default pipeline.
#
# @param querystring [String] the user's search text
# @param show [Boolean] forwarded to `search`
# @return the value returned by `search`
def search_bm25(querystring, show: true)
    search(querystring, get_bm25_body(querystring), show: show)
end

# Runs the KNN-only query for +querystring+ via `search`, using that
# method's default pipeline.
#
# @param querystring [String] the user's search text
# @param show [Boolean] forwarded to `search`
# @return the value returned by `search`
def search_knn(querystring, show: true)
    search(querystring, get_knn_body(querystring), show: show)
end

# Runs the hybrid query for +querystring+ via `search` with the chosen
# search pipeline.
#
# @param querystring [String] the user's search text
# @param pipeline [String] search pipeline name forwarded to `search`
# @param show [Boolean] forwarded to `search`
# @return the value returned by `search`
def search_hybrid(querystring, pipeline: "nlp-search-pipeline-equal", show: true)
    search(querystring, get_hybrid_body(querystring), pipeline: pipeline, show: show)
end

In [None]:
# Hybrid search for "crypto scandal" with the default pipeline; the results
# are rendered and the raw response is kept in `resp`.
resp = search_hybrid("crypto scandal")

In [None]:
# BM25-only search for the same query; the results are rendered and the raw
# response is kept in `resp`.
resp = search_bm25("crypto scandal")

In [None]:
# KNN-only search for the same query; the results are rendered and the raw
# response is kept in `resp`.
resp = search_knn("crypto scandal")

In [None]:
# Displays the top 5 BM25, KNN and hybrid results for +querystring+ side by
# side in a three-column HTML table.
#
# Only the hybrid search receives +pipeline+; the BM25 and KNN searches are
# called without one. Each search runs with show: false and its response is
# rendered to an HTML fragment by `serps`.
#
# @param querystring [String] the user's search text
# @param pipeline [String] search pipeline used for the hybrid column
# @return the value returned by IRuby.display
def search_compare(querystring, pipeline: "nlp-search-pipeline-equal")
    render = ->(response) { serps(querystring, response, k: 5, show: false) }

    bm25_column = render.call(search_bm25(querystring, show: false))
    knn_column = render.call(search_knn(querystring, show: false))
    hybrid_column = render.call(search_hybrid(querystring, pipeline: pipeline, show: false))

    comparison_html = <<-HTML
        <style>
            .compare li {overflow-x:hidden;width:320px!important;text-align:left;height:200px;border-bottom:1px solid #333;}
        </style>
        <table class="compare">
            <tr><td>BM25</td><td>KNN</td><td>Hybrid (#{pipeline})</td></tr>
            <tr><td>#{bm25_column}</td><td>#{knn_column}</td><td>#{hybrid_column}</td></tr>
        </table>
    HTML

    IRuby.display(IRuby.html(comparison_html))
end

In [None]:
# Side-by-side comparison for "property market"; the hybrid column uses the
# equal-weight pipeline.
search_compare("property market", pipeline: "nlp-search-pipeline-equal")

In [None]:
# Same query; the hybrid column uses the pipeline that weights BM25 at 0.6
# and KNN at 0.4.
search_compare("property market", pipeline: "nlp-search-pipeline-bm25-heavy")

In [None]:
# Same query; the hybrid column uses the pipeline that weights BM25 at 0.4
# and KNN at 0.6.
search_compare("property market", pipeline: "nlp-search-pipeline-knn-heavy")

In [None]:
# Side-by-side comparison for "crypto scandal" with the default pipeline.
search_compare("crypto scandal")

In [None]:
# Side-by-side comparison for "US economic recovery" with the default pipeline.
search_compare("US economic recovery")