In [None]:
# https://www.elastic.co/kr/blog/how-to-deploy-nlp-text-embeddings-and-vector-search
!pip install elasticsearch

In [4]:
# Set Import
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from nltk.tokenize import RegexpTokenizer
from datetime import datetime

In [5]:
# Constant
index_suffix = "_20210623"

# Set Elasticsearch client ( == 8.5.2 )
es = Elasticsearch(hosts='http://127.0.0.1:9200')
print(es.info())

{'name': 'rnd-pc', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'r88RQhCVSc6VhT-0zolRvg', 'version': {'number': '8.11.3', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '64cf052f3b56b1fd4449f5454cb88aca7e739d9a', 'build_date': '2023-12-08T11:33:53.634979452Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [1]:
# https://www.sbert.net/docs/quickstart.html
from sentence_transformers import SentenceTransformer

In [2]:
model = SentenceTransformer('msmarco-MiniLM-L-12-v3')

In [3]:
sentences = ['how is the weather in jamaica', 'How is the weather in Jamaica?']

#Sentences are encoded by calling model.encode()
sentence_embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: how is the weather in jamaica
Embedding: [ 3.34530890e-01 -3.05600852e-01  2.59279668e-01  2.80981958e-01
  4.63598609e-01 -5.73584378e-01  7.16962144e-02 -5.90868294e-02
 -7.89078102e-02  3.62236887e-01 -4.15611207e-01  3.87190819e-01
 -1.75434396e-01 -1.82604298e-01  2.08019122e-01 -6.74968958e-03
 -3.58326972e-01  4.34502922e-02 -2.28772238e-01 -1.80091903e-01
  2.38738880e-01 -3.63834143e-01 -3.32248539e-01  1.49275571e-01
 -4.88743842e-01  3.41952652e-01 -3.79692972e-01  1.63043082e-01
  2.94705331e-01 -3.61424610e-02 -1.87942117e-01  6.86962068e-01
  2.00638026e-01 -3.63157302e-01 -4.24887240e-02  6.47606969e-01
 -4.02464904e-02 -4.62765157e-01 -6.66669369e-01 -1.38681471e-01
 -4.12112594e-01 -3.98140192e-01 -1.46704942e-01  1.83483809e-01
  2.11242616e-01 -3.55386972e-01  1.95356831e-03 -7.32291639e-02
  1.25555441e-01 -4.97678936e-01 -4.19553161e-01  8.86605233e-02
  4.42097545e-01 -2.58165989e-02  1.07392639e-01 -2.34741509e-01
 -1.39024571e-01 -5.39209545e-01  8.583

In [6]:
settings = {
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 0,
    "index.search.slowlog.threshold.query.warn": "3s",
  }
}

def make_index(es, index_name):
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
        print("delete index :", index_name)
    es.indices.create(index=index_name, body=settings)
    print("create index :", index_name)

def get_doc_from_line(line):
    doc = { }
    fields = line.split('\t')
    doc['id'] = fields[0]
    doc['text'] = fields[1].strip()
    text_embedding = { }
    text_embedding['predicted_value'] = model.encode(doc['text'])
    doc['text_embedding'] = text_embedding
    return doc
    
def index_tsv(es, index_name, file_path, make_index_yn=True, mapping=None):
    print("indexing start: {}".format(datetime.today()))
    if make_index_yn:
        make_index(es, index_name)
    
    if mapping is not None:
        resp = es.indices.put_mapping(index=index_name, body={**mapping}, doc_type="_doc")
        print(resp)

    f = open(file_path)                                                                                       
    while True:
        lines = f.readlines(10 * 1024 * 1024) # 10M 단위로 읽기                                                             
        if not lines:                                                                                                      
            break
        json_docs = [get_doc_from_line(line) for line in lines]
        print(json_docs[0])
        ok, err = helpers.bulk(es, json_docs, index=index_name)
        print(ok, err);
    print("indexing end: {}".format(datetime.today()))

In [59]:
collection_index_alias = 'collection-with-embeddings'
collection_index_name = collection_index_alias
#index_tsv(es, collection_index_name, '../data/msmarco-passagetest2019-unique.tsv', make_index_yn=True, mapping=None)

indexing start: 2024-01-11 10:39:42.437582
delete index : collection-with-embeddings
create index : collection-with-embeddings
{'id': '7130104', 'text': 'This is the definition of RNA along with examples of types of RNA molecules. This is the definition of RNA along with examples of types of RNA molecules. RNA Definition', 'text_embedding': {'predicted_value': array([-2.36600339e-01, -7.04580545e-01, -2.21963525e-01, -1.39986843e-01,
        1.00311257e-01,  5.69228716e-02,  2.87480235e-01, -4.22112048e-02,
        4.20530885e-01,  1.86510772e-01, -3.40012878e-01,  4.50721420e-02,
       -2.25252017e-01, -1.10283293e-01, -1.40938684e-02,  2.88654596e-01,
       -4.73513687e-03,  6.76389337e-02, -1.00320458e+00, -1.82432488e-01,
        4.43737209e-01,  2.64385611e-01,  3.66507441e-01,  2.00877059e-02,
       -8.05082172e-02,  3.44885252e-02, -3.31538147e-03,  2.15427224e-02,
       -1.89073533e-01,  6.38620377e-01, -4.71551836e-01,  4.27978724e-01,
        7.97959030e-01,  2.08296031e-

In [68]:
def query_index(index_name, query):
    print(index_name)
    #results = es.search(index=index_name, body={'from':0, 'size':10, 'query': query})
    results = es.search(index=index_name, body={'from':0, 'size':10, **query})
    for result in results['hits']['hits']:
        print(result['_source'])

def knn_query_index(index_name, query):
    print(index_name)
    #results = es.search(index=index_name, body={'from':0, 'size':10, 'query': query})
    results = es.knn_search(index=index_name, body={**query})
    #print(results['hits']['hits'])
    for result in results['hits']['hits']:
        print(result)
        #print(result['_source'])

In [69]:
embedding = model.encode("how is the weather in jamaica")
query = {
  "knn": {
    "field": "text_embedding.predicted_value",
    "query_vector": embedding,
    "k": 10,
    "num_candidates": 100
  },
  "_source": [
    "id",
    "text"
  ]
}
knn_query_index(collection_index_name, query)

collection-with-embeddings
{'_index': 'collection-with-embeddings', '_id': 'dTY59owBxsunP9IbeUCA', '_score': 0.94591546, '_ignored': ['text.keyword'], '_source': {'id': '434125', 'text': 'The climate in Jamaica is tropical and humid with warm to hot temperatures all year round. The average temperature in Jamaica is between 80 and 90 degrees Fahrenheit. Jamaican nights are considerably cooler than the days, and the mountain areas are cooler than the lower land throughout the year. Continue Reading.'}}
{'_index': 'collection-with-embeddings', '_id': 'bjY59owBxsunP9IbbiSM', '_score': 0.9453643, '_ignored': ['text.keyword'], '_source': {'id': '4498474', 'text': 'The climate in Jamaica is tropical and humid with warm to hot temperatures all year round. The average temperature in Jamaica is between 80 and 90 degrees Fahrenheit. Jamaican nights are considerably cooler than the days, and the mountain areas are cooler than the lower land throughout the year.'}}
{'_index': 'collection-with-embed

  results = es.knn_search(index=index_name, body={**query})


## Hybrid search