Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.


## Running the Elasticsearch Docker container

```bash
docker run -p 9200:9200 -d --name elasticsearch --rm \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "xpack.security.http.ssl.enabled=false" \
  -e "xpack.license.self_generated.type=trial" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.0
```


In [10]:
from elasticsearch import Elasticsearch
import json

# Client for the local single-node container: port 9200 is the one published
# by the `docker run` command, and security is disabled there, so plain HTTP
# without credentials is used.
es = Elasticsearch(hosts="http://localhost:9200")

In [11]:
index_name = "product"

# Start from a clean slate so the notebook can be re-run from the top.
# `ignore_unavailable=True` turns the delete into a no-op when the index does
# not exist yet, which replaces the separate exists() check: one request
# instead of two, and no gap between the check and the delete.
# The trailing `;` keeps the cell from displaying the response.
es.indices.delete(index=index_name, ignore_unavailable=True);

In [12]:
# Index definition for the product documents.
# - `title` and `description` are text fields using the BM25 similarity.
# - `embedding` is a 384-dimensional dense vector, indexed with HNSW and
#   compared by cosine similarity. For the HNSW options see
#   https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
index_settings = dict(
    settings=dict(number_of_shards=1, number_of_replicas=1),
    mappings=dict(
        properties=dict(
            id=dict(type="integer"),
            title=dict(type="text", similarity="BM25"),
            description=dict(type="text", similarity="BM25"),
            category=dict(type="keyword"),
            price=dict(type="integer"),
            average_rating=dict(type="float"),
            embedding=dict(
                type="dense_vector",
                dims=384,
                index=True,
                similarity="cosine",
                index_options=dict(type="hnsw", ef_construction=200, m=16),
            ),
        ),
    ),
)

# Create the index; as the last expression of the cell, the response is
# shown as the cell output.
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'product'})

In [13]:
# Read back the mapping stored for the index. Ending the cell on
# `mapping.raw` displays the response's `.raw` attribute as the cell output
# (no extra formatting is applied here).
mapping = es.indices.get_mapping(index=index_name)
mapping.raw

{'product': {'mappings': {'properties': {'average_rating': {'type': 'float'},
    'category': {'type': 'keyword'},
    'description': {'type': 'text', 'similarity': 'BM25'},
    'embedding': {'type': 'dense_vector',
     'dims': 384,
     'index': True,
     'similarity': 'cosine',
     'index_options': {'type': 'hnsw', 'm': 16, 'ef_construction': 200}},
    'id': {'type': 'integer'},
    'price': {'type': 'integer'},
    'title': {'type': 'text', 'similarity': 'BM25'}}}}}

## Feed the data to Elasticsearch


In [None]:
# Post the feed file to the `_bulk` endpoint of the local node as NDJSON.
# `-s` hides curl's progress meter, `--data-binary` sends the file contents
# unmodified, and the trailing `echo` ends the output with a newline.
!curl -s -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/_bulk --data-binary "@../dataprep/output-data/final/es_feed-10k.json"; echo

## BM25


In [27]:
# Query text used by the BM25 and hybrid search cells below.
keyword = "monster flag"

In [37]:
# Lexical search: a `multi_match` query of type `best_fields` that looks for
# `keyword` in the title and description fields. `_source` is switched off and
# only the two listed fields are requested, so each hit carries just `fields`.
bm25_query = dict(
    query=dict(
        multi_match=dict(
            query=keyword,
            fields=["title", "description"],
            type="best_fields",
        )
    ),
    size=10,
    fields=["title", "description"],
    _source=False,
)

results = es.search(index=index_name, body=bm25_query)
print(
    "BM25 results: ",
    json.dumps(results.raw, indent=4),
)


BM25 results:  {
    "took": 11,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 8,
            "relation": "eq"
        },
        "max_score": 9.189247,
        "hits": [
            {
                "_index": "product",
                "_id": "312685",
                "_score": 9.189247,
                "fields": {
                    "description": [
                        "     "
                    ],
                    "title": [
                        "American & Dominican Republic Flag Patch, U.S. Flag Patches"
                    ]
                }
            },
            {
                "_index": "product",
                "_id": "75",
                "_score": 7.5551996,
                "fields": {
                    "description": [
                        "0:50 5:56 0:39 5:03 Clean Version 3:22"
                    ],
    

## Semantic Query


In [38]:
# Vespa CLI query noted alongside this cell for reference:
# vespa query "yql=select * from product where ({targetHits:100}nearestNeighbor(embedding,q_embedding))" ranking.profile=closeness "input.query(q_embedding)=[]"
#
# kNN search on the `embedding` field. The query vector is a constant
# 384-component vector of 0.5, the same length as `dims` in the index mapping.
# NOTE(review): an earlier TODO here said to set k to 100 when feeding the
# full dataset; k is already 100, so confirm whether anything is still pending.
semantic_query = dict(
    knn=dict(
        field="embedding",
        query_vector=[0.5] * 384,
        k=100,
    ),
    size=10,
    fields=["title", "description"],
    _source=False,
)
results = es.search(index=index_name, body=semantic_query)
print(
    "Semantic search results:",
    json.dumps(results.raw, indent=4),
)


Semantic search results: {
    "took": 18,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 100,
            "relation": "eq"
        },
        "max_score": 0.5146523,
        "hits": [
            {
                "_index": "product",
                "_id": "3309",
                "_score": 0.5146523,
                "fields": {
                    "description": [
                        "Distant Light"
                    ],
                    "title": [
                        "Distant Light"
                    ]
                }
            },
            {
                "_index": "product",
                "_id": "971",
                "_score": 0.51464224,
                "fields": {
                    "description": [
                        "Audio CD"
                    ],
                    "title": [
                        "Je

## Hybrid search


In [39]:
# Vespa CLI query noted alongside this cell for reference:
# vespa query "yql=select * from product where ({targetHits:10}nearestNeighbor(embedding,q_embedding)) or userQuery()" ranking.profile=hybrid "query=Small MONEY CLIP Leather Wallet" "input.query(q_embedding)=[]"
# NOTE(review): the Vespa example above uses a different query text than
# `keyword`, which is what this cell actually searches for.
#
# Hybrid search: the same `multi_match` text query as the BM25 cell plus a
# kNN clause on `embedding`, with the two result lists combined through the
# `rrf` rank option.
hybrid_query = dict(
    query=dict(
        multi_match=dict(
            query=keyword,
            fields=["title", "description"],
            type="best_fields",
        )
    ),
    knn=dict(
        field="embedding",
        query_vector=[0.5] * 384,
        k=10,
        num_candidates=10,  # per shard, we have only 1 shard
    ),
    rank=dict(rrf={}),
    size=10,
    fields=["title", "description"],
    _source=False,
)
results = es.search(index=index_name, body=hybrid_query)
print(
    "Hybrid search results:",
    json.dumps(results.raw, indent=4),
)


Hybrid search results: {
    "took": 12,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 18,
            "relation": "eq"
        },
        "max_score": null,
        "hits": [
            {
                "_index": "product",
                "_id": "312685",
                "_score": null,
                "_rank": 1,
                "fields": {
                    "description": [
                        "     "
                    ],
                    "title": [
                        "American & Dominican Republic Flag Patch, U.S. Flag Patches"
                    ]
                }
            },
            {
                "_index": "product",
                "_id": "971",
                "_score": null,
                "_rank": 2,
                "fields": {
                    "description": [
                        "Audio CD"
    