In [28]:
! pip install elasticsearch fastembed

In [29]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import os
import csv
from fastembed import TextEmbedding
from typing import List
import numpy as np

In [30]:
# Read every CSV row as a dict keyed by the header row. The context manager
# closes the file handle once the rows are loaded, and newline='' is the mode
# the csv module asks for so that line breaks inside quoted fields are
# handed to the parser untouched.
with open('/usr/local/dataset/dataset.csv', newline='') as dataset_file:
    books = list(csv.DictReader(dataset_file))

# Text to embed for each book, in the same order as `books`.
descriptions = [doc["description"] for doc in books]

# The result of embed() is materialised into a list so the vectors can later
# be paired with `books` by position.
embedding_model = TextEmbedding(model_name="BAAI/bge-small-en")
embeddings: List[np.ndarray] = list(embedding_model.embed(descriptions))

In [31]:
# The password for the "elastic" user is read from the PW environment
# variable. Fail fast with a clear message when it is missing instead of
# passing None to the client and getting an authentication error later.
es_password = os.getenv("PW")
if es_password is None:
    raise RuntimeError(
        "Environment variable PW (password of the 'elastic' user) is not set"
    )

# TLS connection verified against the CA certificate at ca_certs.
client = Elasticsearch(
    ['https://elasticsearch-ha-es-http:9200'],
    verify_certs=True,
    ca_certs='/usr/local/cert/ca.crt',
    basic_auth=("elastic", es_password),
)

In [32]:
# Settings and field mappings for the "books" index.
index_scheme = {
    "settings": {
        "number_of_shards": 3,
        "number_of_replicas": 1
    },
    "mappings": {
        "dynamic": "true",
        "_source": {
            "enabled": "true"
        },
        "properties": {
            "title": {
                "type": "text"
            },
            "author": {
                "type": "text"
            },
            "publishDate": {
                "type": "text"
            },
            "description": {
                "type": "text"
            },
            # Holds the embedding of the description; `dims` has to equal the
            # length of the vectors stored in this field.
            "description_vector": {
                "type": "dense_vector",
                "dims": 384
            }
        }
    }
}

# Create the index only when it does not exist yet, so the notebook can be
# re-run against the same cluster without failing on an "already exists"
# error. NOTE: an existing index is left as-is, even if its mapping differs
# from `index_scheme`.
if not client.indices.exists(index="books"):
    client.indices.create(index="books", body=index_scheme)

In [33]:
# Every book needs exactly one embedding; fail loudly rather than silently
# dropping rows if the two lists ever get out of step.
if len(books) != len(embeddings):
    raise ValueError(
        "books and embeddings differ in length: {} != {}".format(
            len(books), len(embeddings)
        )
    )

# Build one bulk "index" action per book. Each action is a fresh dict, so the
# rows in `books` are not modified by adding the bulk metadata or the vector.
documents: list[dict[str, object]] = [
    {
        **book,
        "_op_type": "index",
        "_index": "books",
        "description_vector": vector,
    }
    for book, vector in zip(books, embeddings)
]

In [34]:
# Send all prepared actions to Elasticsearch through the bulk helper; the
# helper's return value is shown as the cell output.
bulk(client=client, actions=documents)

Define a function to query data from Elasticsearch.

It prints each result, followed by a line of dashes, in the following format:

- Title: Title of the book, Author: Author of the book, score: Elasticsearch relevancy score
- Description of the book

In [None]:
def handle_query(query, limit):
    """Run a semantic search against the ``books`` index and print the hits.

    The query text is embedded with the notebook-level ``embedding_model``.
    Every document (``match_all``) is then scored with
    ``cosineSimilarity(query_vector, description_vector) + 1.0``; since cosine
    similarity lies in [-1, 1], the offset keeps the score non-negative.

    Args:
        query: Free-text search phrase to embed.
        limit: Value sent as the search ``size`` (maximum number of hits).

    Returns:
        None. For each hit it prints a title/author/score line, the
        description, and a line of dashes.
    """
    # embed() receives a one-item list; the first vector it produces is used.
    embedded = list(embedding_model.embed([query]))
    query_vector = embedded[0]

    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, 'description_vector') + 1.0",
        "params": {"query_vector": query_vector},
    }
    search_body = {
        "size": limit,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": similarity_script,
            }
        },
        # Only these fields are requested back from the stored document.
        "_source": {"includes": ["description", "title", "author", "body"]},
    }
    response = client.search(index="books", body=search_body)

    header_template = "Title: {}, Author: {}, score: {}"
    for hit in response["hits"]["hits"]:
        source = hit["_source"]
        print(header_template.format(source["title"], source["author"], hit["_score"]))
        print(source["description"])
        print("---------")

Query the Elasticsearch database. The cell below searches for `drama about people and unhappy love` and displays the two best-matching results.

In [None]:
# Show the two highest-scoring books for this phrase.
handle_query(query="drama about people and unhappy love", limit=2)