## Running the Elasticsearch docker container

```bash
docker run -p 9200:9200 -d --name elasticsearch \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "xpack.security.http.ssl.enabled=false" \
  -e "xpack.license.self_generated.type=trial" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.0
```


In [1]:
from elasticsearch import Elasticsearch
import json

es = Elasticsearch("http://localhost:9200")

In [2]:
index_name = "product_index"

# Delete the index if it exists
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

In [3]:
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 1},
    "mappings": {
        "properties": {
            "id": {"type": "integer"},
            "title": {"type": "text", "similarity": "BM25"},
            "description": {"type": "text", "similarity": "BM25"},
            "category": {"type": "keyword"},
            "price": {"type": "integer"},
            "average_rating": {"type": "float"},
            "embedding": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
                # See https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
                "index_options": {"type": "hnsw", "ef_construction": 200, "m": 16},
            },
        },
    },
}

# Create the index
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'product_index'})

In [4]:
# Pretty-print the mapping
mapping = es.indices.get_mapping(index=index_name)
mapping.raw

{'product_index': {'mappings': {'properties': {'average_rating': {'type': 'float'},
    'category': {'type': 'keyword'},
    'description': {'type': 'text', 'similarity': 'BM25'},
    'embedding': {'type': 'dense_vector',
     'dims': 384,
     'index': True,
     'similarity': 'cosine',
     'index_options': {'type': 'hnsw', 'm': 16, 'ef_construction': 200}},
    'id': {'type': 'integer'},
    'price': {'type': 'integer'},
    'title': {'type': 'text', 'similarity': 'BM25'}}}}}

In [5]:
from elasticsearch import helpers

actions = [
    {
        "_index": "product_index",
        "_id": 1,
        "_source": {
            "id": 1,
            "title": "The Art of Computer Programming",
            "description": "Comprehensive computer programming book by Donald Knuth",
            "category": "Computer Science",
            "price": 200,
            "average_rating": 4.8,
            "embedding": [0.5] * 384,
        },
    },
    {
        "_index": "product_index",
        "_id": 2,
        "_source": {
            "id": 2,
            "title": "Introduction to Algorithms",
            "description": "Popular textbook in computer science, covering algorithms and data structures",
            "category": "Computer Science",
            "price": 100,
            "average_rating": 4.5,
            "embedding": [0.8] * 384,
        },
    },
    {
        "_index": "product_index",
        "_id": 3,
        "_source": {
            "id": 3,
            "title": "Clean Code",
            "description": "A handbook of agile software craftsmanship",
            "category": "Software Development",
            "price": 45,
            "average_rating": 4.7,
            "embedding": [0.6] * 384,
        },
    },
    {
        "_index": "product_index",
        "_id": 4,
        "_source": {
            "id": 4,
            "title": "Design Patterns",
            "description": "Elements of Reusable Object-Oriented Software",
            "category": "Software Development",
            "price": 55,
            "average_rating": 4.4,
            "embedding": [0.7] * 384,
        },
    },
    {
        "_index": "product_index",
        "_id": 5,
        "_source": {
            "id": 5,
            "title": "You Don't Know JS",
            "description": "A book series exploring the core mechanisms of JavaScript",
            "category": "Web Development",
            "price": 25,
            "average_rating": 4.6,
            "embedding": [0.9] * 384,
        },
    },
    {
        "_index": "product_index",
        "_id": 6,
        "_source": {
            "id": 6,
            "title": "Cracking the Coding Interview",
            "description": "189 programming questions and solutions",
            "category": "Software Development",
            "price": 30,
            "average_rating": 4.5,
            "embedding": [0.85] * 384,
        },
    },
    {
        "_index": "product_index",
        "_id": 7,
        "_source": {
            "id": 7,
            "title": "Artificial Intelligence: A Modern Approach",
            "description": "The most comprehensive book in AI, used in over 1300 universities worldwide",
            "category": "Artificial Intelligence",
            "price": 120,
            "average_rating": 4.3,
            "embedding": [0.75] * 384,
        },
    },
    {
        "_index": "product_index",
        "_id": 8,
        "_source": {
            "id": 8,
            "title": "The Pragmatic Programmer",
            "description": "Your journey to mastery in software development",
            "category": "Software Development",
            "price": 50,
            "average_rating": 4.8,
            "embedding": [0.65] * 384,
        },
    },
    {
        "_index": "product_index",
        "_id": 9,
        "_source": {
            "id": 9,
            "title": "Algorithms to Live By",
            "description": "The computer science of human decisions",
            "category": "Computer Science",
            "price": 40,
            "average_rating": 4.2,
            "embedding": [0.55] * 384,
        },
    },
    {
        "_index": "product_index",
        "_id": 10,
        "_source": {
            "id": 10,
            "title": "Code Complete",
            "description": "A practical handbook of software construction",
            "category": "Software Development",
            "price": 60,
            "average_rating": 4.9,
            "embedding": [0.6] * 384,
        },
    },
]

# Assuming 'es' is your Elasticsearch client instance:
helpers.bulk(es, actions)

(10, [])

In [6]:
keyword = "algorithms"
hybrid_query = {
    "query": {
        "multi_match": {
            "query": "algorithm",
            "fields": ["title", "description"],  # Fields to search
            "type": "best_fields",  # Use best_fields to treat the fields as a single combined field
            "tie_breaker": 0.5,  # Manage the scoring between fields (set to 0 for no effect of secondary matches)
        }
    },
    "knn": {"field": "embedding", "query_vector": [0.7] * 384, "k": 2},
    "rank": {"rrf": {}},
    "_source": False,
    "fields": ["title", "description"],
}
results = es.search(index=index_name, body=hybrid_query)
print("Hybrid search results:", json.dumps(results.raw, indent=4))

Hybrid search results: {
    "took": 57,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 2,
            "relation": "eq"
        },
        "max_score": null,
        "hits": [
            {
                "_index": "product_index",
                "_id": "5",
                "_score": null,
                "_rank": 1,
                "fields": {
                    "description": [
                        "A book series exploring the core mechanisms of JavaScript"
                    ],
                    "title": [
                        "You Don't Know JS"
                    ]
                }
            },
            {
                "_index": "product_index",
                "_id": "9",
                "_score": null,
                "_rank": 2,
                "fields": {
                    "description": [
                        "

In [63]:
results = es.search(
    query={
        "bool": {
            "must": {"match": {"description": "computer"}},
            # If we want to filter by price, we can add a filter clause
            "filter": {"range": {"price": {"lt": 150}}},
        }
    },
    knn={
        "field": "embedding",
        "query_vector": [0.7] * 384,
        "k": 3,
        "num_candidates": 5,
    },
    rank={"rrf": {}},
    size=5,
    fields=["title", "price"],
    # Disable source retrieval - only return the fields specified in the `fields` parameter
    _source=False,
)

print("Hybrid search results:", json.dumps(results.raw, indent=4))

Hybrid search results: {
    "took": 33,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 2,
            "relation": "eq"
        },
        "max_score": null,
        "hits": [
            {
                "_index": "product_index",
                "_id": "2",
                "_score": null,
                "_rank": 1,
                "fields": {
                    "price": [
                        100
                    ],
                    "title": [
                        "Introduction to Algorithms"
                    ]
                }
            },
            {
                "_index": "product_index",
                "_id": "1",
                "_score": null,
                "_rank": 2,
                "fields": {
                    "price": [
                        200
                    ],
                    "title": [
  