Based on https://www.pinecone.io/learn/series/faiss/vector-indexes/ and https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html

# Set up Elasticsearch client

In [1]:
%pip install elasticsearch
%pip install humanize

Collecting elasticsearch
  Obtaining dependency information for elasticsearch from https://files.pythonhosted.org/packages/bb/06/81b1d71ba0567ff39d0f98f3637e810846df92f6733aee46004a194b51ea/elasticsearch-8.9.0-py3-none-any.whl.metadata
  Downloading elasticsearch-8.9.0-py3-none-any.whl.metadata (5.2 kB)
Collecting elastic-transport<9,>=8 (from elasticsearch)
  Downloading elastic_transport-8.4.0-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading elasticsearch-8.9.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.5/395.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.4.0 elasticsearch-8.9.0
Note: you may need to restart the kernel to use updated packages.


In [78]:
import os
import json
import shutil
import humanize
from datetime import datetime
import random
from elasticsearch import Elasticsearch, helpers
import numpy as np
import math
import pandas as pd
pd.set_option('display.max_colwidth', 190)
import urllib.request as request
from contextlib import closing
import warnings
warnings.filterwarnings('ignore')

# Load and prepare demo data

In [2]:
# Download the Sift1M archive and stream it to sift.tar.gz in the working directory.
# closing() guarantees the FTP response is released even if the copy fails.
with closing(request.urlopen('ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz')) as response, \
        open('sift.tar.gz', 'wb') as archive_file:
    shutil.copyfileobj(response, archive_file)

In [3]:
import tarfile

# The download leaves us with a tar.gz file; unpack it into the working directory.
# Opening the archive in a with-block closes the file handle once extraction is done.
with tarfile.open('sift.tar.gz', "r:gz") as tar:
    tar.extractall()

In [2]:
import numpy as np

# Reader for the fvecs file format used by the Sift1M dataset
def read_fvecs(fp):
    """Load an fvecs file into a 2-D float32 array.

    The file is read as a flat stream of 32-bit words. The first word is taken
    as the vector dimension ``d``; the stream is then cut into records of
    ``d + 1`` words, the leading word of every record is dropped and the
    remaining ``d`` words are reinterpreted as float32 values.

    Args:
        fp: Path (or open file) accepted by ``np.fromfile``.

    Returns:
        Array with one row per record and ``d`` float32 columns.
    """
    words = np.fromfile(fp, dtype='int32')
    dim = words[0]
    records = words.reshape(-1, dim + 1)
    # copy() makes the sliced block contiguous so it can be viewed as float32
    payload = records[:, 1:].copy()
    return payload.view('float32')

In [3]:
# Base vectors: the collection that gets indexed and searched
xb = read_fvecs('./sift/sift_base.fvecs')  # 1M samples
# Query vectors that ship with the dataset (sift_query.fvecs)
query_vectors = read_fvecs('./sift/sift_query.fvecs')
# Use only the first query vector, kept as a 2-D array with a single row
xq = query_vectors[0][np.newaxis, :]

In [4]:
# The query vector: a single row with one column per vector dimension
xq.shape

(1, 128)

In [5]:
# The vector search space: one row per base vector, one column per dimension
xb.shape

(1000000, 128)

In [50]:
# The input vector to be used for the vector queries,
# wrapped in a one-row DataFrame purely for display
pd.DataFrame({'embedding' : [xq]})

Unnamed: 0,embedding
0,"[[1.0, 3.0, 11.0, 110.0, 62.0, 22.0, 4.0, 0.0, 43.0, 21.0, 22.0, 18.0, 6.0, 28.0, 64.0, 9.0, 11.0, 1.0, 0.0, 0.0, 1.0, 40.0, 101.0, 21.0, 20.0, 2.0, 4.0, 2.0, 2.0, 9.0, 18.0, 35.0, 1.0, ..."


# Upload vector embeddings to Elastic

In [9]:
# Connection settings come from the environment so no credentials are stored in the notebook.
# os.environ[...] raises KeyError right here when a variable is missing, instead of
# silently building an endpoint that contains the string "None".
esuser = os.environ["ESUSER"]
espassword = os.environ["ESPASSWORD"]
eshost = os.environ["ESHOST"]
esport = os.environ["ESPORT"]
client = Elasticsearch(
     f"https://{eshost}:{esport}",  # Elasticsearch endpoint
     # Credentials are passed separately: embedded in the URL, a password containing
     # characters such as '@', ':' or '/' would corrupt the endpoint address.
     basic_auth=(esuser, espassword),
     # NOTE(review): TLS certificate verification is switched off; acceptable for a demo
     # cluster with a self-signed certificate, otherwise supply the CA certificate instead.
     verify_certs=False,
     # Generous timeout (seconds) because bulk indexing requests can be slow
     request_timeout=120
)

In [10]:
# Fetch the cluster's basic info; a successful response confirms the connection works
dict(client.info())

{'name': 'm-2.30378e5f-16fe-488c-b12a-ff1e21906722.f1a236d6fe2348b9a4d2b297d12fbfa5.bc28ac43cf10402584b5f01db462d330.databases.appdomain.cloud',
 'cluster_name': '30378e5f-16fe-488c-b12a-ff1e21906722',
 'cluster_uuid': '2k4t1vNQSr67MnxoJloCbA',
 'version': {'number': '8.7.0',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '09520b59b6bc1057340b55750186466ea715e30e',
  'build_date': '2023-03-27T16:31:09.816451435Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [157]:
# Create an elastic index together with a mapping for an HNSW index for euclidean distance.
# Any existing "embeddings" index is dropped first so the cell can be re-run from scratch;
# ignore_unavailable turns the delete into a no-op when the index does not exist yet.
client.indices.delete(index="embeddings", ignore_unavailable=True)
mapping = {
  "mappings": {
    "properties": {
      "embedding": {
        "type": "dense_vector",
        "dims": 128,              # dimensionality of the SIFT vectors
        "index": True,            # build a kNN index for this field
        "similarity": "l2_norm",  # euclidean distance
        "index_options": {
            "type": "hnsw",
            "m": 16,
            "ef_construction": 100
        }
      }
    }
  }
}
# Pass the mapping through the dedicated `mappings` parameter rather than a raw request body.
client.indices.create(index="embeddings", mappings=mapping["mappings"])

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'embeddings'})

In [158]:
%%time
# Upload the documents into the Elasticsearch index (includes HNSW index building):
document_list = []
batch_size=1000
for i in range(0, len(xb)):
    document = {"_id": i, "embedding": xb[i]}
    document_list.append(document)
    if i % batch_size == batch_size-1:
        helpers.bulk(client, document_list, index='embeddings')
        document_list = []

CPU times: user 1min 9s, sys: 6.06 s, total: 1min 15s
Wall time: 52min 22s


In [159]:
# Make sure the search index is refreshed before the count and search requests that follow,
# so that they see every uploaded document
client.indices.refresh(index="embeddings")

ObjectApiResponse({'_shards': {'total': 2, 'successful': 2, 'failed': 0}})

In [160]:
# Verify the number of documents in the index (expected to equal len(xb) after the upload)
client.count(index='embeddings')["count"]

1000000

In [19]:
# Look up a random document for test: the upload used the row numbers 0 .. len(xb)-1 as ids,
# and randint is inclusive on both ends, hence the -1
client.get(index="embeddings", id=random.randint(0, len(xb)-1))

ObjectApiResponse({'_index': 'embeddings', '_id': '874182', '_version': 1, '_seq_no': 874182, '_primary_term': 1, 'found': True, '_source': {'embedding': [77.0, 1.0, 0.0, 11.0, 40.0, 2.0, 4.0, 34.0, 120.0, 23.0, 11.0, 18.0, 9.0, 0.0, 0.0, 17.0, 11.0, 24.0, 27.0, 47.0, 20.0, 0.0, 1.0, 2.0, 4.0, 12.0, 7.0, 11.0, 26.0, 5.0, 1.0, 0.0, 114.0, 2.0, 0.0, 44.0, 123.0, 1.0, 0.0, 35.0, 134.0, 6.0, 6.0, 21.0, 8.0, 0.0, 0.0, 45.0, 13.0, 4.0, 10.0, 44.0, 77.0, 42.0, 19.0, 3.0, 1.0, 1.0, 1.0, 2.0, 26.0, 96.0, 68.0, 1.0, 103.0, 2.0, 0.0, 40.0, 97.0, 4.0, 8.0, 26.0, 134.0, 7.0, 0.0, 4.0, 17.0, 17.0, 4.0, 23.0, 12.0, 0.0, 0.0, 3.0, 134.0, 134.0, 8.0, 3.0, 0.0, 0.0, 0.0, 3.0, 51.0, 119.0, 52.0, 2.0, 42.0, 3.0, 4.0, 35.0, 18.0, 4.0, 4.0, 6.0, 134.0, 3.0, 2.0, 6.0, 9.0, 48.0, 23.0, 47.0, 0.0, 0.0, 0.0, 1.0, 90.0, 134.0, 25.0, 1.0, 0.0, 0.0, 6.0, 21.0, 35.0, 19.0, 3.0, 1.0]}})

In [86]:
# Report the on-disk size and the accumulated indexing time of the primary shards
index_stats = client.indices.stats(index="embeddings").get('_all').get('primaries')
store_bytes = index_stats.get('store').get('size_in_bytes')
# The stats API reports milliseconds; precisedelta expects seconds
indexing_seconds = index_stats.get('indexing').get('index_time_in_millis')/1000
print("Index size:    " + humanize.naturalsize(store_bytes))
print("Indexing time: " + humanize.precisedelta(indexing_seconds, minimum_unit='minutes'))

Index size:    922.0 MB
Indexing time: 27.30 minutes


## Exact nearest neighbor query with euclidean distance on dense_vector field
I.e., no index is being used.

In [53]:
%%time
query = {
    "script_score": {
        "query" : {
            "match_all": {}
        },
        "script": {
            "source": "1 / (1 + l2norm(params.queryVector, 'embedding'))",
            "params": {
                "queryVector": xq[0]
            }
        }
    }
}
list=dict(client.search(index="embeddings", query=query, size=10))['hits']['hits']
euclidean_baseline=pd.DataFrame([(i.get('_id'), i.get('_score')) for i in list], columns=['id', 'distance'])
euclidean_baseline

CPU times: user 4.1 ms, sys: 2.61 ms, total: 6.71 ms
Wall time: 2.57 s


Unnamed: 0,id,distance
0,932085,0.004276
1,934876,0.004242
2,561813,0.004082
3,708177,0.003899
4,706771,0.003886
5,695756,0.003848
6,435345,0.003813
7,701258,0.00377
8,455537,0.003727
9,872728,0.003717


## Approximate nearest neighbor query with euclidean distance using HNSW index on dense_vector

In [54]:
%%time
query = {
    "field": "embedding",
    "query_vector": xq[0],
    "k": 10,
    "num_candidates": 10
}
dict(client.knn_search(index="embeddings", knn=query))['hits']['hits']
euclidean_hnsw=pd.DataFrame([(i.get('_id'), i.get('_score')) for i in list], columns=['id', 'distance'])
euclidean_hnsw

CPU times: user 4.06 ms, sys: 2.57 ms, total: 6.63 ms
Wall time: 66.8 ms


Unnamed: 0,id,distance
0,932085,0.004276
1,934876,0.004242
2,561813,0.004082
3,708177,0.003899
4,706771,0.003886
5,695756,0.003848
6,435345,0.003813
7,701258,0.00377
8,455537,0.003727
9,872728,0.003717


In [55]:
# Calculate recall percentage for the euclidean distance HNSW lookup: the share of the exact
# top-k document ids that the approximate search returned as well.
# Ids are compared instead of scores: equal floats do not prove that two hits are the same
# document, and scores produced by differently scored queries need not match at all.
baseline_ids = euclidean_baseline["id"].to_numpy()
hnsw_ids = euclidean_hnsw["id"].to_numpy()
# count_nonzero returns a plain int, so the cell displays an ordinary Python float
np.count_nonzero(np.isin(baseline_ids, hnsw_ids)) / baseline_ids.size * 100

100.0

## Exact Euclidean Distance

In [465]:
%%time
# Run a similarity query with euclidean distance:
# NOTE(review): this cell sends SQL through `cur` and interpolates `query_vector`; neither
# name is defined in the cells visible here, which only set up an Elasticsearch client.
# Presumably carried over from a PostgreSQL/pgvector notebook - confirm it belongs here.
# It also reassigns `euclidean_baseline`, replacing the value from the Elasticsearch cell.
sql = f"SELECT id, embedding <-> '[{query_vector}]' as distance FROM embeddings ORDER BY distance LIMIT 10;"
cur.execute(sql)
euclidean_baseline = pd.DataFrame(cur.fetchall(), columns=['id', 'distance'])
euclidean_baseline

CPU times: user 2.05 ms, sys: 3.03 ms, total: 5.08 ms
Wall time: 126 ms


Unnamed: 0,id,distance
0,932085,232.871209
1,934876,234.71472
2,561813,243.989754
3,708177,255.460369
4,706771,256.31426
5,695756,258.862898
6,435345,261.241268
7,701258,264.280154
8,455537,267.284493
9,872728,268.069021


In [466]:
# Show scan type used in embedding column in the query plan.
# Reuses the `sql` string assigned by the most recently executed query cell and keeps only
# the plan rows that mention a scan.
# NOTE(review): `engine` is not defined in the cells visible here - confirm where it is created.
df=pd.read_sql_query("EXPLAIN ANALYZE " + sql, engine)
df[df['QUERY PLAN'].str.contains('Scan')]

Unnamed: 0,QUERY PLAN
9,-> Parallel Seq Scan on embeddings (cost=0.00..76637.33 rows=416667 width=12) (actual time=0.108..97.531 rows=333333 loops=3)


## Exact Cosine Distance

In [471]:
%%time
# Run a similarity query with cosine distance:
# NOTE(review): depends on `cur` and `query_vector`, neither of which is defined in the
# cells visible here - presumably left over from a PostgreSQL/pgvector notebook; confirm.
sql = f"SELECT id, embedding <=> '[{query_vector}]' as distance FROM embeddings ORDER BY distance LIMIT 10;"
cur.execute(sql)
cosine_baseline = pd.DataFrame(cur.fetchall(), columns=['id', 'distance'])
cosine_baseline

CPU times: user 1.7 ms, sys: 2.09 ms, total: 3.78 ms
Wall time: 124 ms


Unnamed: 0,id,distance
0,932085,0.105112
1,934876,0.106705
2,561813,0.115366
3,708177,0.126512
4,706771,0.127323
5,695756,0.129983
6,435345,0.132089
7,701258,0.135184
8,455537,0.138418
9,872728,0.138856


In [468]:
# Show scan type used in embedding column in the query plan.
# Reuses the `sql` string assigned by the most recently executed query cell and keeps only
# the plan rows that mention a scan.
# NOTE(review): `engine` is not defined in the cells visible here - confirm where it is created.
df=pd.read_sql_query("EXPLAIN ANALYZE " + sql, engine)
df[df['QUERY PLAN'].str.contains('Scan')]

Unnamed: 0,QUERY PLAN
9,-> Parallel Seq Scan on embeddings (cost=0.00..76637.33 rows=416667 width=12) (actual time=0.114..99.913 rows=333333 loops=3)


## Exact Dot Product Distance

In [472]:
%%time
# Run a similarity query with dot product distance:
# NOTE(review): depends on `cur` and `query_vector`, neither of which is defined in the
# cells visible here - presumably left over from a PostgreSQL/pgvector notebook; confirm.
sql = f"SELECT id, embedding <#> '[{query_vector}]' as distance FROM embeddings ORDER BY distance LIMIT 10;"
cur.execute(sql)
dot_product_baseline = pd.DataFrame(cur.fetchall(), columns=['id', 'distance'])
dot_product_baseline

CPU times: user 2.05 ms, sys: 2.7 ms, total: 4.75 ms
Wall time: 126 ms


Unnamed: 0,id,distance
0,932085,-230843.0
1,934876,-230600.0
2,561813,-228242.0
3,708177,-225288.0
4,706771,-225144.0
5,695756,-224256.0
6,435345,-224214.0
7,701258,-223408.0
8,872728,-222827.0
9,455537,-222341.0


In [470]:
# Show scan type used in embedding column in the query plan.
# Reuses the `sql` string assigned by the most recently executed query cell and keeps only
# the plan rows that mention a scan.
# NOTE(review): `engine` is not defined in the cells visible here - confirm where it is created.
df=pd.read_sql_query("EXPLAIN ANALYZE " + sql, engine)
df[df['QUERY PLAN'].str.contains('Scan')]

Unnamed: 0,QUERY PLAN
9,-> Parallel Seq Scan on embeddings (cost=0.00..76637.33 rows=416667 width=12) (actual time=0.202..102.335 rows=333333 loops=3)


In [524]:
# Ask the database for the human-readable total size of the indexes on the `embeddings`
# table and show the answer as a one-column frame
cur.execute("select pg_size_pretty (pg_indexes_size('embeddings'));")
pd.DataFrame(cur.fetchall(), columns=['pg_size_pretty'])

Unnamed: 0,pg_size_pretty
0,2342 MB
