This notebook demonstrates the basic operations of PyMilvus, the Python SDK for Milvus. Before running it, make sure that you have a running Milvus instance.

- connect to Milvus
- create collection
- insert data
- create index
- search, query, and hybrid search on entities
- delete entities by PK
- drop collection

https://github.com/milvus-io/pymilvus/blob/master/examples/hello_milvus.ipynb

In [28]:
import numpy as np
import time
from pathlib import Path

from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

# Banner template for section headings; the title is padded to a width of 30.
fmt = "\n=== {:30} ===\n"
# Template for reporting an elapsed search time with four decimal places.
search_latency_fmt = "search latency = {:.4f}s"
# Initial demo sizes: number of entities and vector dimensionality.
num_entities = 3000
dim = 8

In [25]:
# Seeded generator so the values later drawn for the "random" field are reproducible.
rng = np.random.default_rng(19530)

In [43]:
# Open the connection registered under the "default" alias to the local Milvus server.
connections.connect("default", host="localhost", port="19530")

# Check for the collection this notebook actually works with, and report that
# same name so the message cannot drift away from what was checked.
collection_name = "test1"
has = utility.has_collection(collection_name)
print(f"Does collection {collection_name} exist in Milvus: {has}")

Does collection hello_milvus exist in Milvus: False


In [67]:
# %load ../image_retreival/clip_embeddings

from sentence_transformers import SentenceTransformer
import glob
import numpy as np
from PIL import Image

# Load the CLIP image encoder (non-multilingual ViT-B/32 checkpoint).
img_model = SentenceTransformer('clip-ViT-B-32')
# The photos live under ../data, the same directory the other cells in this
# notebook read from and write to; glob.glob already returns a list.
img_names = glob.glob('../data/photos/*.jpg')

def encode_img(img_model, img_names, limit=10):
    """Encode image files with a CLIP model and L2-normalise the embeddings.

    Args:
        img_model: Object exposing ``encode(images, batch_size=...,
            convert_to_tensor=..., show_progress_bar=...)``; the notebook passes
            a ``SentenceTransformer`` loaded with 'clip-ViT-B-32'.
        img_names: Sequence of image file paths.
        limit: Maximum number of leading paths to encode. Defaults to 10, the
            sample size this demo uses; pass ``None`` to encode every path.

    Returns:
        A ``(img_emb, encoded_names)`` tuple: the row-normalised embeddings and
        the paths that were actually encoded, so that row ``i`` of the
        embeddings always corresponds to ``encoded_names[i]``.
    """
    print("Images:", len(img_names))

    # Only the first `limit` paths are encoded; return that same slice so the
    # names and the embedding rows stay aligned.
    encoded_names = list(img_names[:limit])

    images = [Image.open(filepath) for filepath in encoded_names]
    try:
        # non-multilingual CLIP model
        img_emb = img_model.encode(images, batch_size=128, convert_to_tensor=True, show_progress_bar=True)
    finally:
        # Release the file handles once encoding is done (or has failed).
        for image in images:
            image.close()

    # Scale every row to unit L2 norm.
    # NOTE(review): np.linalg.norm presumably needs the tensor on the CPU — confirm before running on a GPU.
    img_emb = img_emb / np.linalg.norm(img_emb, axis=1, keepdims=True)
    return img_emb, encoded_names

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [12]:
# Collect every JPEG in the photo directory; glob.glob already hands back a list.
photo_pattern = '../data/photos/*.jpg'
img_names = glob.glob(photo_pattern)
len(img_names)

24996

In [14]:
# Encode the images; img_names is rebound to the name list returned alongside the embeddings.
img_emb,img_names = encode_img(img_model,img_names)

Images: 24996


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
# Convert the embeddings to a NumPy array for insertion into Milvus.
# NOTE(review): assumes img_emb is a tensor that supports .detach().numpy() (i.e. lives on the CPU) — confirm.
img_emb_np = img_emb.detach().numpy()

In [18]:
# Persist the embeddings to ../data/test_embed.npy so they can be reloaded without re-encoding.
np.save("../data/test_embed.npy",img_emb.detach().numpy())

In [42]:
# Uncomment to drop the "test1" collection and start again from a clean state:
# utility.drop_collection("test1")

In [44]:
# Override the earlier sizes: ten entities, 512-dimensional vectors for the embeddings field.
num_entities = 10
dim = 512

In [54]:
# Integer primary key supplied by the caller (auto_id is off).
# NOTE(review): max_length is normally a VARCHAR setting — confirm it has any effect on an INT64 field.
pk_field = FieldSchema(name="fname", dtype=DataType.INT64, is_primary=True, auto_id=False, max_length=100)
# Scalar column of doubles.
random_field = FieldSchema(name="random", dtype=DataType.DOUBLE)
# Float-vector column whose width is the notebook-level `dim`.
vector_field = FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
fields = [pk_field, random_field, vector_field]

schema = CollectionSchema(fields, "hello_milvus is the simplest demo to introduce the APIs")

# Create (or attach to) the "test1" collection with strong consistency.
hello_milvus = Collection("test1", schema, consistency_level="Strong")

In [48]:
# Map each entity's integer id (its position in img_names) to the image's base
# name without extension. Path(...).stem keeps dots inside the name
# ("a.b.jpg" -> "a.b") and honours the platform's path separator, unlike
# splitting on "/" and "." by hand.
fnames = {idx: Path(name).stem for idx, name in enumerate(img_names[:num_entities])}
len(fnames)

10

In [51]:
# Iterating a dict yields its keys, i.e. the integer ids.
list(fnames)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [52]:
# Column-ordered data, one entry per schema field.
primary_keys = list(fnames.keys())  # provide the fname field because `auto_id` is set to False
random_values = rng.random(num_entities).tolist()  # field random, only supports list
# The last column is field embeddings, which supports numpy.ndarray and list.
entities = [primary_keys, random_values, img_emb_np]
entities

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0.03539747517805536,
  0.7917858816321333,
  0.9634580710517987,
  0.7769574961455377,
  0.9158904944704735,
  0.19604735640309023,
  0.4539412342285505,
  0.7759238287816242,
  0.4751541013084166,
  0.5435670482374748],
 array([[ 0.02169453, -0.02511793, -0.00447961, ...,  0.01452356,
          0.0413039 , -0.00763403],
        [-0.00709092, -0.00226662,  0.00256219, ...,  0.00291399,
          0.01496236,  0.0149281 ],
        [ 0.0128168 ,  0.02356843,  0.0500972 , ...,  0.0153697 ,
         -0.00603879,  0.00370851],
        ...,
        [ 0.01146269,  0.04268388, -0.003989  , ...,  0.01373177,
         -0.0011    , -0.00875279],
        [ 0.04586982,  0.03013872, -0.0162409 , ...,  0.00538639,
          0.02090447,  0.00427006],
        [ 0.0137448 ,  0.03019687,  0.00593366, ...,  0.05473933,
          0.00406447,  0.00419127]], dtype=float32)]

## Insert Data

In [55]:
# Insert the column-ordered entities and keep the returned result object.
insert_result = hello_milvus.insert(entities)

# Read the collection's entity count once, then report it.
entity_count = hello_milvus.num_entities
print(f"Number of entities in Milvus: {entity_count}")

Number of entities in Milvus: 10


In [56]:
# IVF_FLAT index with the L2 metric and nlist set to 128.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)

# Build the index on the vector field; the returned status is the cell's output.
hello_milvus.create_index("embeddings", index)

Status(code=0, message='')

In [57]:
# Load the collection before running the search below.
hello_milvus.load()

In [58]:
# Display the collection's repr (name, partitions, description and schema).
hello_milvus

<Collection>:
-------------
<name>: test1
<partitions>: [{"name": "_default", "collection_name": "test1", "description": ""}]
<description>: hello_milvus is the simplest demo to introduce the APIs
<schema>: {
  auto_id: False
  description: hello_milvus is the simplest demo to introduce the APIs
  fields: [{
    name: fname
    description: 
    type: 5
    is_primary: True
    auto_id: False
  }, {
    name: random
    description: 
    type: 11
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 512}
  }]
}

In [62]:
# Sanity check: count the last two rows of the embeddings column, which the search cell uses as query vectors.
len(entities[-1][-2:])

2

In [65]:
# Query with the last two inserted embeddings.
vectors_to_search = entities[-1][-2:]
search_params = {
    "metric_type": "L2",  # same metric as the index created on "embeddings"
    "params": {"nprobe": 10},
}

# perf_counter is a monotonic, high-resolution clock, so the measured interval
# cannot be skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()
result = hello_milvus.search(vectors_to_search, "embeddings", search_params, limit=3, output_fields=["fname"])
end_time = time.perf_counter()

# One hit list per query vector, each holding up to `limit` hits.
for hits in result:
    for hit in hits:
        # The field requested via output_fields is "fname", so label it as such.
        print(f"hit: {hit}, fname field: {hit.entity.get('fname')}")
print(search_latency_fmt.format(end_time - start_time))

hit: (distance: 0.0, id: 8), random field: 8
hit: (distance: 0.4052489399909973, id: 7), random field: 7
hit: (distance: 0.5453382730484009, id: 1), random field: 1
hit: (distance: 0.0, id: 9), random field: 9
hit: (distance: 0.5733288526535034, id: 5), random field: 5
hit: (distance: 0.6993972063064575, id: 2), random field: 2
search latency = 0.2803s
