## Running the Elasticsearch docker container

```bash
docker run -p 9200:9200 -d --name elasticsearch --rm \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "xpack.security.http.ssl.enabled=false" \
  -e "xpack.license.self_generated.type=trial" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.0
```


In [4]:
from elasticsearch import Elasticsearch
import json

es = Elasticsearch("http://localhost:9200")

In [7]:
index_name = "product"

# Delete the index if it exists
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

In [8]:
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 1},
    "mappings": {
        "properties": {
            "id": {"type": "integer"},
            "title": {"type": "text", "similarity": "BM25"},
            "description": {"type": "text", "similarity": "BM25"},
            "category": {"type": "keyword"},
            "price": {"type": "integer"},
            "average_rating": {"type": "float"},
            "embedding": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
                # See https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
                "index_options": {"type": "hnsw", "ef_construction": 200, "m": 16},
            },
        },
    },
}

# Create the index
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'product'})

In [4]:
# Pretty-print the mapping
mapping = es.indices.get_mapping(index=index_name)
mapping.raw

{'product_index': {'mappings': {'properties': {'average_rating': {'type': 'float'},
    'category': {'type': 'keyword'},
    'description': {'type': 'text', 'similarity': 'BM25'},
    'embedding': {'type': 'dense_vector',
     'dims': 384,
     'index': True,
     'similarity': 'cosine',
     'index_options': {'type': 'hnsw', 'm': 16, 'ef_construction': 200}},
    'id': {'type': 'integer'},
    'price': {'type': 'integer'},
    'title': {'type': 'text', 'similarity': 'BM25'}}}}}

## Feed the data to the Elasticsearch


In [None]:
!curl -s -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/_bulk --data-binary "@../dataprep/output-data/es_sample-100.jsonl"; echo

In [9]:
keyword = "algorithms"
hybrid_query = {
    "query": {
        "multi_match": {
            "query": "algorithm",
            "fields": ["title", "description"],  # Fields to search
            "type": "best_fields",  # Use best_fields to treat the fields as a single combined field
            "tie_breaker": 0.5,  # Manage the scoring between fields (set to 0 for no effect of secondary matches)
        }
    },
    "knn": {"field": "embedding", "query_vector": [0.7] * 384, "k": 2},
    "rank": {"rrf": {}},
    "_source": False,
    "fields": ["title", "description"],
}
results = es.search(index=index_name, body=hybrid_query)
print("Hybrid search results:", json.dumps(results.raw, indent=4))

Hybrid search results: {
    "took": 85,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 2,
            "relation": "eq"
        },
        "max_score": null,
        "hits": [
            {
                "_index": "product",
                "_id": "843195",
                "_score": null,
                "_rank": 1,
                "fields": {
                    "description": [
                        "Stickers are printed in high resolution to capture every detail of the image. We know that a sticker is only as good as its adhesive. Like the vinyl we use, we set out to find the best adhesive. All of our stickers have a very strong adhesive that still keeps the surface intact if it needs to be removed. We make our stickers with premium vinyl to withstand exposure to wind, rain and sunlight. The stickers are coated with a protective UV laminat

In [63]:
results = es.search(
    query={
        "bool": {
            "must": {"match": {"description": "computer"}},
            # If we want to filter by price, we can add a filter clause
            "filter": {"range": {"price": {"lt": 150}}},
        }
    },
    knn={
        "field": "embedding",
        "query_vector": [0.7] * 384,
        "k": 3,
        "num_candidates": 5,
    },
    rank={"rrf": {}},
    size=5,
    fields=["title", "price"],
    # Disable source retrieval - only return the fields specified in the `fields` parameter
    _source=False,
)

print("Hybrid search results:", json.dumps(results.raw, indent=4))

Hybrid search results: {
    "took": 33,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 2,
            "relation": "eq"
        },
        "max_score": null,
        "hits": [
            {
                "_index": "product_index",
                "_id": "2",
                "_score": null,
                "_rank": 1,
                "fields": {
                    "price": [
                        100
                    ],
                    "title": [
                        "Introduction to Algorithms"
                    ]
                }
            },
            {
                "_index": "product_index",
                "_id": "1",
                "_score": null,
                "_rank": 2,
                "fields": {
                    "price": [
                        200
                    ],
                    "title": [
  