Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.


# ES


## Running the Elasticsearch docker container


```bash
docker run -p 9200:9200 -d --name elasticsearch --rm \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "xpack.security.http.ssl.enabled=false" \
  -e "xpack.license.self_generated.type=trial" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.2
```


In [122]:
run_es_docker_command = 'docker run -p 9200:9200 -d --name elasticsearch --rm \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    -e "xpack.security.http.ssl.enabled=false" \
    -e "xpack.license.self_generated.type=trial" \
    docker.elastic.co/elasticsearch/elasticsearch:8.13.2'

In [132]:
import docker

client = docker.from_env()

container = client.containers.run(
    "docker.elastic.co/elasticsearch/elasticsearch:8.13.2",
    detach=True,
    ports={"9200/tcp": 9200},
    environment=[
        "discovery.type=single-node",
        "xpack.security.enabled=false",
        "xpack.security.http.ssl.enabled=false",
        "xpack.license.self_generated.type=trial",
    ],
)
# Wait until container is ready
container.reload()

In [137]:
from elasticsearch import Elasticsearch
import json
import time

es = Elasticsearch("http://localhost:9200")
timeout = 30
while not es.ping():
    timeout -= 1
    if timeout == 0:
        raise TimeoutError("Elasticsearch is not ready")
    time.sleep(1)

In [138]:
index_name = "product"

# Delete the index if it exists
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

In [139]:
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 1},
    "mappings": {
        "properties": {
            "id": {"type": "integer"},
            "title": {"type": "text", "similarity": "BM25"},
            "description": {"type": "text", "similarity": "BM25"},
            "category": {"type": "keyword"},
            "price": {"type": "integer"},
            "average_rating": {"type": "float"},
            "embedding": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
                # See https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
                "index_options": {"type": "hnsw", "ef_construction": 200, "m": 16},
            },
        },
    },
}

# Create the index
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'product'})

In [140]:
# Pretty-print the mapping
mapping = es.indices.get_mapping(index=index_name)
mapping.raw

{'product': {'mappings': {'properties': {'average_rating': {'type': 'float'},
    'category': {'type': 'keyword'},
    'description': {'type': 'text', 'similarity': 'BM25'},
    'embedding': {'type': 'dense_vector',
     'dims': 384,
     'index': True,
     'similarity': 'cosine',
     'index_options': {'type': 'hnsw', 'm': 16, 'ef_construction': 200}},
    'id': {'type': 'integer'},
    'price': {'type': 'integer'},
    'title': {'type': 'text', 'similarity': 'BM25'}}}}}

## Feed the data to the Elasticsearch container


In [141]:
!curl -s -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/_bulk --data-binary "@../dataprep/output-data/final/es_feed-10k.json"; echo

{"errors":false,"took":3027,"items":[{"index":{"_index":"product","_id":"6252","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1,"status":201}},{"index":{"_index":"product","_id":"4684","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":1,"_primary_term":1,"status":201}},{"index":{"_index":"product","_id":"1731","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":2,"_primary_term":1,"status":201}},{"index":{"_index":"product","_id":"4742","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":3,"_primary_term":1,"status":201}},{"index":{"_index":"product","_id":"4521","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":4,"_primary_term":1,"status":201}},{"index":{"_index":"product","_id":"6340","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_

### Common function to retrieve query from file


In [145]:
# Load the JSON data from the file
def get_single_query(application: str = "vespa", query_mode: str = "weak_and") -> dict:
    if application not in ["vespa", "es"]:
        raise ValueError("format must be 'vespa' or 'es'")
    if query_mode not in ["weak_and", "semantic", "hybrid"]:
        raise ValueError("query_mode must be 'weak_and', 'semantic', or 'hybrid'")
    filepath = (
        f"../dataprep/output-data/final/{application}_queries-{query_mode}-1.json"
    )
    with open(filepath, "r") as file:
        endpoint, query = file.read().splitlines()
        query_dict = json.loads(query)
    return query_dict

## Query Elasticsearch


In [146]:
bm25_query = get_single_query(application="es", query_mode="weak_and")
bm25_query

{'size': 10,
 'fields': ['title', 'description'],
 '_source': False,
 'query': {'multi_match': {'query': 'BLACKPINK LISA FIRST SINGLE ALBUM - LALISA [ GOLD VER. ] PHOTOBOOK + LYRICS PAPER + CD + PHOTOCARD + POLAROID + DOUBLE-SIDED POSTER',
   'fields': ['title', 'description'],
   'type': 'most_fields'}}}

In [147]:
results = es.search(index=index_name, body=bm25_query)
print("BM25 results: ", json.dumps(results.raw, indent=4))

BM25 results:  {
    "took": 25,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 3729,
            "relation": "eq"
        },
        "max_score": 46.63423,
        "hits": [
            {
                "_index": "product",
                "_id": "666",
                "_score": 46.63423,
                "fields": {
                    "description": [
                        "  FEATURES           "
                    ],
                    "title": [
                        "BLACKPINK LISA FIRST SINGLE ALBUM - LALISA [ GOLD VER. ] PHOTOBOOK + LYRICS PAPER + CD + PHOTOCARD + POLAROID + DOUBLE-SIDED POSTER"
                    ]
                }
            },
            {
                "_index": "product",
                "_id": "904",
                "_score": 33.27973,
                "fields": {
                    "description": [
    

### Semantic Query


In [148]:
semantic_query = get_single_query(application="es", query_mode="semantic")
results = es.search(index=index_name, body=semantic_query)
print("Semantic search results:", json.dumps(results.raw, indent=4))

Semantic search results: {
    "took": 35,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 100,
            "relation": "eq"
        },
        "max_score": 1.0,
        "hits": [
            {
                "_index": "product",
                "_id": "666",
                "_score": 1.0,
                "fields": {
                    "description": [
                        "  FEATURES           "
                    ],
                    "title": [
                        "BLACKPINK LISA FIRST SINGLE ALBUM - LALISA [ GOLD VER. ] PHOTOBOOK + LYRICS PAPER + CD + PHOTOCARD + POLAROID + DOUBLE-SIDED POSTER"
                    ]
                }
            },
            {
                "_index": "product",
                "_id": "3843",
                "_score": 0.9417677,
                "fields": {
                    "description": [
   

### Hybrid search - RRF


In [149]:
hybrid_query = get_single_query(application="es", query_mode="hybrid")
results = es.search(index=index_name, body=hybrid_query)
print("Hybrid search results:", json.dumps(results.raw, indent=4))

Hybrid search results: {
    "took": 24,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 3729,
            "relation": "eq"
        },
        "max_score": 47.63423,
        "hits": [
            {
                "_index": "product",
                "_id": "666",
                "_score": 47.63423,
                "fields": {
                    "description": [
                        "  FEATURES           "
                    ],
                    "title": [
                        "BLACKPINK LISA FIRST SINGLE ALBUM - LALISA [ GOLD VER. ] PHOTOBOOK + LYRICS PAPER + CD + PHOTOCARD + POLAROID + DOUBLE-SIDED POSTER"
                    ]
                }
            },
            {
                "_index": "product",
                "_id": "904",
                "_score": 33.27973,
                "fields": {
                    "description"

# Vespa


## Starting the Vespa docker container

Be sure that your docker engine is running, and has at least 16GB of memory allocated to it.


In [104]:
from vespa.deployment import VespaDocker

app_package_path = "../app/"

vespa_docker = VespaDocker(port=8080)
app_name = "ecommerce"

app = vespa_docker.deploy_from_disk(
    application_name=app_name, application_root=app_package_path
)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Application is up!
Finished deployment.


## Feed data to Vespa


In [76]:
!vespa config set target local
!vespa feed --progress 5 ../dataprep/output-data/final/vespa_feed-10k.json

{
  "feeder.operation.count": 10000,
  "feeder.seconds": 4.241,
  "feeder.ok.count": 10000,
  "feeder.ok.rate": 2358.203,
  "feeder.error.count": 0,
  "feeder.inflight.count": 0,
  "http.request.count": 10000,
  "http.request.bytes": 23398676,
  "http.request.MBps": 5.518,
  "http.exception.count": 0,
  "http.response.count": 10000,
  "http.response.bytes": 1097780,
  "http.response.MBps": 0.259,
  "http.response.error.count": 0,
  "http.response.latency.millis.min": 296,
  "http.response.latency.millis.avg": 413,
  "http.response.latency.millis.max": 833,
  "http.response.code.counts": {
    "200": 10000
  }
}


## Querying Vespa


### BM25


In [114]:
bm25_query = get_single_query(application="vespa", query_mode="weak_and")
bm25_query

{'yql': 'select * from product where userQuery()',
 'ranking.profile': 'bm25',
 'presentation.summary': 'minimal',
 'query': 'BLACKPINK LISA FIRST SINGLE ALBUM - LALISA ['}

In [115]:
from vespa.io import VespaQueryResponse

response: VespaQueryResponse = app.query(
    # Update to medium presentation summary
    body={**bm25_query, "presentation.summary": "medium"}
)
print(json.dumps(response.hits[:3], indent=4))

[
    {
        "id": "index:search/0/d49f4f339ad5c887b50fdfe8",
        "relevance": 18.105515172044054,
        "source": "search",
        "fields": {
            "sddocname": "product",
            "id": 904,
            "title": "ROSE BLACKPINK ROSE SINGLE ALUMB -R- ( LP ) (Limited Edition)[+EXTRA BLACKPINK PHOTOCARD]",
            "description": "+EXTRA BLACKPINK PHOTOCARD"
        }
    },
    {
        "id": "index:search/0/6b06bb73a068967affaa5e18",
        "relevance": 17.247005511148537,
        "source": "search",
        "fields": {
            "sddocname": "product",
            "id": 666,
            "title": "BLACKPINK LISA FIRST SINGLE ALBUM - LALISA [ GOLD VER. ] PHOTOBOOK + LYRICS PAPER + CD + PHOTOCARD + POLAROID + DOUBLE-SIDED POSTER",
            "description": "  FEATURES           "
        }
    },
    {
        "id": "index:search/0/e2ab7f6fdc2a1d0b6503acda",
        "relevance": 12.360969149846973,
        "source": "search",
        "fields": {
            "

### Semantic search


In [116]:
semantic_query = get_single_query(application="vespa", query_mode="semantic")
semantic_query

{'yql': 'select * from product where ({targetHits:100}nearestNeighbor(embedding,q_embedding))',
 'ranking.profile': 'closeness',
 'presentation.summary': 'minimal',
 'query': 'BLACKPINK LISA FIRST SINGLE ALBUM - LALISA [',
 'input.query(q_embedding)': [0.025907913222908974,
  -0.0113765187561512,
  -0.05289803072810173,
  -0.030958142131567,
  0.04915975406765938,
  0.007972903549671173,
  -0.018939446657896042,
  -0.039253078401088715,
  -0.023456573486328125,
  -0.03065423108637333,
  0.05994176119565964,
  0.10258321464061737,
  -0.0010259866248816252,
  -0.02520928345620632,
  0.007055302616208792,
  -0.051815297454595566,
  0.09250776469707489,
  -0.02454352006316185,
  -0.10023844242095947,
  0.005638183560222387,
  -0.03079683519899845,
  0.008614066056907177,
  0.03516066074371338,
  -0.008583984337747097,
  -0.03768084943294525,
  0.029675718396902084,
  -0.05103428661823273,
  -0.028369322419166565,
  -0.059725306928157806,
  -0.11508891731500626,
  0.030060499906539917,
  0.

In [117]:
response: VespaQueryResponse = app.query(
    # Update to medium presentation summary
    body={**semantic_query, "presentation.summary": "medium"}
)
print(json.dumps(response.hits[:3], indent=4))

[
    {
        "id": "index:search/0/6b06bb73a068967affaa5e18",
        "relevance": 1.0,
        "source": "search",
        "fields": {
            "sddocname": "product",
            "id": 666,
            "title": "BLACKPINK LISA FIRST SINGLE ALBUM - LALISA [ GOLD VER. ] PHOTOBOOK + LYRICS PAPER + CD + PHOTOCARD + POLAROID + DOUBLE-SIDED POSTER",
            "description": "  FEATURES           "
        }
    },
    {
        "id": "index:search/0/643abd579a43bac89519ef50",
        "relevance": 0.6722966714231268,
        "source": "search",
        "fields": {
            "sddocname": "product",
            "id": 3843,
            "title": "BLACKPINK ROSE 1st Single Album [-R-] CD + Photobook + Lyrics Paper + Sticker Set + Postcard + Polaroid + OFFICIAL POSTER",
            "description": " THE ALBUM  "
        }
    },
    {
        "id": "index:search/0/5e479892f9a14cb686efc46b",
        "relevance": 0.6232147937155144,
        "source": "search",
        "fields": {
         

### Hybrid query


In [120]:
hybrid_query = get_single_query(application="vespa", query_mode="hybrid")
hybrid_query

{'yql': 'select * from product where ({targetHits:10}nearestNeighbor(embedding,q_embedding)) or userQuery()',
 'ranking.profile': 'hybrid',
 'presentation.summary': 'minimal',
 'query': 'BLACKPINK LISA FIRST SINGLE ALBUM - LALISA [',
 'input.query(q_embedding)': [0.025907913222908974,
  -0.0113765187561512,
  -0.05289803072810173,
  -0.030958142131567,
  0.04915975406765938,
  0.007972903549671173,
  -0.018939446657896042,
  -0.039253078401088715,
  -0.023456573486328125,
  -0.03065423108637333,
  0.05994176119565964,
  0.10258321464061737,
  -0.0010259866248816252,
  -0.02520928345620632,
  0.007055302616208792,
  -0.051815297454595566,
  0.09250776469707489,
  -0.02454352006316185,
  -0.10023844242095947,
  0.005638183560222387,
  -0.03079683519899845,
  0.008614066056907177,
  0.03516066074371338,
  -0.008583984337747097,
  -0.03768084943294525,
  0.029675718396902084,
  -0.05103428661823273,
  -0.028369322419166565,
  -0.059725306928157806,
  -0.11508891731500626,
  0.0300604999065

In [121]:
response: VespaQueryResponse = app.query(
    # Update to medium presentation summary
    body={**hybrid_query, "presentation.summary": "medium"}
)
print(json.dumps(response.hits[:3], indent=4))

[
    {
        "id": "index:search/0/d49f4f339ad5c887b50fdfe8",
        "relevance": 9.36035176617453,
        "source": "search",
        "fields": {
            "sddocname": "product",
            "id": 904,
            "title": "ROSE BLACKPINK ROSE SINGLE ALUMB -R- ( LP ) (Limited Edition)[+EXTRA BLACKPINK PHOTOCARD]",
            "description": "+EXTRA BLACKPINK PHOTOCARD"
        }
    },
    {
        "id": "index:search/0/6b06bb73a068967affaa5e18",
        "relevance": 9.123502755574268,
        "source": "search",
        "fields": {
            "sddocname": "product",
            "id": 666,
            "title": "BLACKPINK LISA FIRST SINGLE ALBUM - LALISA [ GOLD VER. ] PHOTOBOOK + LYRICS PAPER + CD + PHOTOCARD + POLAROID + DOUBLE-SIDED POSTER",
            "description": "  FEATURES           "
        }
    },
    {
        "id": "index:search/0/e2ab7f6fdc2a1d0b6503acda",
        "relevance": 6.4771115547577915,
        "source": "search",
        "fields": {
            "sdd

### Cleanup and remove Vespa container


In [123]:
vespa_docker.container.stop()
vespa_docker.container.remove()