In [None]:
# !pip install elasticsearch numpy pandas tqdm

### Imports

In [1]:
import json

import numpy as np
import pandas as pd

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

### Instantiate Elasticsearch client

In [2]:
# Connect to the local Elasticsearch node and check that it responds.
es = Elasticsearch(hosts=["http://localhost:9200/"])
es.ping()

True

### Create index

In [3]:
# Name of the index that this notebook creates and queries.
INDEX_NAME = "image-index"

In [4]:
# Drop any index left over from a previous run so the notebook can be re-run
# from a clean state. This goes through the same client as every other call
# (one source of truth for the cluster address), and ignore_unavailable keeps
# the very first run from failing when the index does not exist yet.
es.indices.delete(index=INDEX_NAME, ignore_unavailable=True)

{
  "acknowledged" : true
}


In [5]:
# Index schema: a float `download_count` field and a 512-dimensional
# `image_vector` that is indexed with cosine similarity.
image_index_mappings = {
    "properties": {
        "download_count": {"type": "float"},
        "image_vector": {
            "type": "dense_vector",
            "dims": 512,
            "index": True,
            "similarity": "cosine",
        },
    }
}

resp = es.indices.create(index=INDEX_NAME, mappings=image_index_mappings)
resp

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'image-index'})

### Add data

In [6]:
# Read the documents to index from the local JSON file.
with open("dataset.json", "r") as dataset_file:
    dataset = json.loads(dataset_file.read())

In [7]:
# Number of records loaded from dataset.json.
len(dataset)

20000

In [8]:
# Show the first record to inspect the document structure
# (a `download_count` value plus an `image_vector` list).
dataset[0]

{'download_count': 10.0,
 'image_vector': [0.4687531590461731,
  1.858910083770752,
  0.2487410604953766,
  0.4361421465873718,
  0.06379887461662292,
  -0.6228192448616028,
  -0.27867400646209717,
  0.04752945899963379,
  0.048252467066049576,
  -0.6161901950836182,
  -0.08333898335695267,
  -0.178070068359375,
  0.4829390347003937,
  -0.051558591425418854,
  -0.7764784693717957,
  0.11730130016803741,
  -0.9347928166389465,
  0.4606996774673462,
  0.17599976062774658,
  0.12078317999839783,
  -0.289231538772583,
  0.35409796237945557,
  0.022205233573913574,
  -0.3168144226074219,
  -0.3422653079032898,
  0.5762258172035217,
  -0.34303778409957886,
  0.2701375484466553,
  0.4523313641548157,
  -0.07710081338882446,
  -0.4255029559135437,
  0.36728641390800476,
  -0.39734968543052673,
  -0.2473088502883911,
  -0.07855471968650818,
  0.2128201574087143,
  0.06932312250137329,
  0.05761660635471344,
  0.23390062153339386,
  -0.36632591485977173,
  0.4863947629928589,
  0.050750467926263

In [9]:
def gendata(samples=None, dims=512):
    """Yield validated documents ready for bulk indexing.

    Args:
        samples: Iterable of dicts, each holding a "download_count" and an
            "image_vector" entry. When omitted, the notebook-level
            ``dataset`` is used.
        dims: Required length of every "image_vector". The default of 512
            matches the ``dims`` declared for the field in the index mapping.

    Yields:
        Each sample, unchanged, once it has passed validation.

    Raises:
        AssertionError: If a sample is missing a required key, its
            "download_count" is not a float, or its "image_vector" is not a
            one-dimensional vector of length ``dims``.
    """
    if samples is None:
        samples = dataset

    for position, sample in enumerate(samples):
        assert "download_count" in sample, (
            f"sample {position}: missing 'download_count'"
        )
        assert "image_vector" in sample, (
            f"sample {position}: missing 'image_vector'"
        )

        assert isinstance(sample["download_count"], float), (
            f"sample {position}: 'download_count' must be a float, "
            f"got {type(sample['download_count']).__name__}"
        )
        vector_shape = np.array(sample["image_vector"]).shape
        assert vector_shape == (dims,), (
            f"sample {position}: 'image_vector' has shape {vector_shape}, "
            f"expected {(dims,)}"
        )

        yield sample

In [10]:
# Stream every validated document into the index through the bulk API.
# The target is INDEX_NAME (not a repeated string literal), so the ingest
# always goes to the same index that was created above and is queried below.
for success, info in parallel_bulk(client=es, index=INDEX_NAME, actions=gendata()):
    if not success:
        print('A document failed:', info)

# Make the just-indexed documents visible to search before running any
# queries, rather than relying on the periodic background refresh.
es.indices.refresh(index=INDEX_NAME)

### Debug

In [11]:
# Load query vector
with open("query_vector.json", "r") as query_file:
    query_payload = json.load(query_file)

query_vector = query_payload["query_vector"]

In [12]:
# Inspect the query vector's Python type and its shape when viewed as a NumPy array
type(query_vector), np.array(query_vector).shape

(list, (512,))

In [13]:
# The query vector must be 512-dimensional, i.e. the same length as the
# `dims` configured for `image_vector` in the index mapping.
assert np.array(query_vector).shape == (512,)

In [18]:
# Hybrid search: a kNN clause on `image_vector` combined with a
# constant-score match_all query, with score explanations enabled.

k = 20

knn_clause = {
    "field": "image_vector",
    "query_vector": query_vector,
    "k": k,
    "num_candidates": 100,
}
constant_score_query = {
    "constant_score": {"filter": {"match_all": {}}, "boost": 1.0}
}

resp = es.search(
    index=INDEX_NAME,
    size=k,
    query=constant_score_query,
    knn=knn_clause,
    explain=True,
)

hits = resp.get("hits").get("hits")

# Keep only each hit's final score and its explanation.
df = pd.DataFrame(hits)[["_score", "_explanation"]]
df

Unnamed: 0,_score,_explanation
0,1.658169,"{'value': 1.6581688, 'description': 'sum of:',..."
1,1.651484,"{'value': 1.651484, 'description': 'sum of:', ..."
2,1.648651,"{'value': 1.6486505, 'description': 'sum of:',..."
3,1.646819,"{'value': 1.6468188, 'description': 'sum of:',..."
4,1.638959,"{'value': 1.6389592, 'description': 'sum of:',..."
5,1.638397,"{'value': 1.6383975, 'description': 'sum of:',..."
6,1.638098,"{'value': 1.6380985, 'description': 'sum of:',..."
7,1.637987,"{'value': 1.6379869, 'description': 'sum of:',..."
8,1.635532,"{'value': 1.6355321, 'description': 'sum of:',..."
9,1.635031,"{'value': 1.6350307, 'description': 'sum of:',..."


In [19]:
# Tally identical explanation payloads for the hybrid hits, to compare how
# many include a 'within top k documents' (kNN) detail alongside the
# ConstantScore detail and how many include the ConstantScore detail only.
df["_explanation"].value_counts()

{'value': 1.0, 'description': 'sum of:', 'details': [{'value': 1.0, 'description': 'ConstantScore(*:*)', 'details': []}]}                                                                                         7
{'value': 1.6581688, 'description': 'sum of:', 'details': [{'value': 0.65816873, 'description': 'within top k documents', 'details': []}, {'value': 1.0, 'description': 'ConstantScore(*:*)', 'details': []}]}    1
{'value': 1.651484, 'description': 'sum of:', 'details': [{'value': 0.651484, 'description': 'within top k documents', 'details': []}, {'value': 1.0, 'description': 'ConstantScore(*:*)', 'details': []}]}       1
{'value': 1.6486505, 'description': 'sum of:', 'details': [{'value': 1.0, 'description': 'ConstantScore(*:*)', 'details': []}]}                                                                                   1
{'value': 1.6468188, 'description': 'sum of:', 'details': [{'value': 0.64681876, 'description': 'within top k documents', 'details': []}, {'value': 1.0,

In [20]:
# KNN search: the kNN clause on `image_vector` alone, with score
# explanations enabled.

k = 20

knn_clause = {
    "field": "image_vector",
    "query_vector": query_vector,
    "k": k,
    "num_candidates": 100,
}

resp = es.search(
    index=INDEX_NAME,
    size=k,
    knn=knn_clause,
    explain=True,
)

hits = resp.get("hits").get("hits")

# Keep only each hit's final score and its explanation.
df = pd.DataFrame(hits)[["_score", "_explanation"]]
df

Unnamed: 0,_score,_explanation
0,0.658169,"{'value': 0.65816873, 'description': 'within t..."
1,0.654055,"{'value': 0.0, 'description': 'not in top k do..."
2,0.651484,"{'value': 0.651484, 'description': 'within top..."
3,0.648651,"{'value': 0.0, 'description': 'not in top k do..."
4,0.647027,"{'value': 0.0, 'description': 'not in top k do..."
5,0.646819,"{'value': 0.64681876, 'description': 'within t..."
6,0.646349,"{'value': 0.0, 'description': 'not in top k do..."
7,0.641008,"{'value': 0.0, 'description': 'not in top k do..."
8,0.638959,"{'value': 0.63895917, 'description': 'within t..."
9,0.638397,"{'value': 0.0, 'description': 'not in top k do..."


In [21]:
# Tally identical explanation payloads for the kNN-only hits, to see how many
# are explained as 'within top k documents' versus 'not in top k documents'.
df["_explanation"].value_counts()

{'value': 0.0, 'description': 'not in top k documents', 'details': []}           10
{'value': 0.65816873, 'description': 'within top k documents', 'details': []}     1
{'value': 0.651484, 'description': 'within top k documents', 'details': []}       1
{'value': 0.64681876, 'description': 'within top k documents', 'details': []}     1
{'value': 0.63895917, 'description': 'within top k documents', 'details': []}     1
{'value': 0.6379869, 'description': 'within top k documents', 'details': []}      1
{'value': 0.63553214, 'description': 'within top k documents', 'details': []}     1
{'value': 0.6350307, 'description': 'within top k documents', 'details': []}      1
{'value': 0.6332454, 'description': 'within top k documents', 'details': []}      1
{'value': 0.63233835, 'description': 'within top k documents', 'details': []}     1
{'value': 0.6301397, 'description': 'within top k documents', 'details': []}      1
Name: _explanation, dtype: int64

In [22]:
# Constant score search: the constant-score match_all query alone, with
# score explanations enabled.

k = 20

constant_score_query = {
    "constant_score": {"filter": {"match_all": {}}, "boost": 1.0}
}

resp = es.search(
    index=INDEX_NAME,
    size=k,
    query=constant_score_query,
    explain=True,
)

hits = resp.get("hits").get("hits")

# Keep only each hit's final score and its explanation.
df = pd.DataFrame(hits)[["_score", "_explanation"]]
df

Unnamed: 0,_score,_explanation
0,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."
1,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."
2,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."
3,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."
4,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."
5,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."
6,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."
7,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."
8,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."
9,1.0,"{'value': 1.0, 'description': 'ConstantScore(*..."


In [23]:
# Tally identical explanation payloads for the constant-score hits.
df["_explanation"].value_counts()

{'value': 1.0, 'description': 'ConstantScore(*:*)', 'details': []}    20
Name: _explanation, dtype: int64