In [18]:
from pymilvus import CollectionSchema, FieldSchema, DataType, MilvusClient, DataType
from time import time
import numpy as np
import random
import string
from tqdm import tqdm

client = MilvusClient(
    uri="http://140.112.28.129:19530",
    db_name="default"
)

In [2]:
EMBEDDING_LEN = 2048
schema = CollectionSchema(fields=[
    FieldSchema(name='filename', dtype=DataType.VARCHAR, is_primary =True, max_length=128),
    FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_LEN),
], auto_id=False)

index_params = client.prepare_index_params()
index_params.add_index(
    field_name="embedding", 
    index_type="IVF_FLAT",
    metric_type="COSINE",
    params={"nlist": 1}
)

In [12]:
client.drop_collection(collection_name="collection")
client.create_collection(
    collection_name="collection",
    schema=schema,
    index_params=index_params
)

In [13]:


def insert_random_data(partition_name, num, collection_name = "collection"):
    for _ in tqdm(range(num)):
        random_vector = np.random.rand(EMBEDDING_LEN)
        random_filename = ''.join(random.choice(string.ascii_letters) for x in range(20))
        
        client.insert(
            collection_name=collection_name,
            data = [{
                "filename": random_filename,
                "embedding": random_vector
            }],
            partition_name=partition_name
        )

In [14]:
sizes = [100, 2000, 5000, 10000]

for size in sizes:
    client.create_partition(
        collection_name="collection", 
        partition_name= f"partition_{size}"
    )
    insert_random_data(f"partition_{size}", size)

100%|██████████| 100/100 [00:00<00:00, 103.89it/s]
100%|██████████| 2000/2000 [00:20<00:00, 97.95it/s] 
100%|██████████| 5000/5000 [00:44<00:00, 113.36it/s]
100%|██████████| 10000/10000 [01:29<00:00, 112.34it/s]


In [None]:
# from time import time

# client.release_partitions(
#     collection_name="collection",
#     partition_names=["partition_large"]
# )

# t = time()  
# client.load_partitions(
#     collection_name="collection",
#     partition_names=["partition_large"]
# )
# print(time() - t)



In [15]:
from pymilvus import (
    connections, utility, DataType, FieldSchema, CollectionSchema, Collection, Partition
)
connections.connect(db_name='default', host='140.112.28.129', port='19530')

dic = {}
for size in sizes:
    dic[size] = Partition(collection="collection", name=f"partition_{size}")

In [23]:
times = [[] for _ in range(4)]
times

[[], [], [], []]

In [27]:
time_dic = {}
for size in sizes:
    time_dic[size] = []

for _ in range(10):
    for i, (size, partition) in enumerate(dic.items()):
        t = time()
        partition.load()
        partition.flush()
        partition.release()
        partition.flush()
        due = time() - t
        time_dic[size].append(due)

for size, times in time_dic.items():
    print(f"{size}: {sum(times) / 10}")

100: 3.9702313423156737
2000: 4.011171817779541
5000: 4.011175680160522
10000: 3.988883113861084


In [38]:
param = {
    "metric_type": "COSINE",
    "params": {
        "nprobe": 10000,
        "radius": 1,
        "range_filter": 3,
        "max_empty_result_buckets":65536
    }
}

time_dic = {}
for size in sizes:
    time_dic[size] = []

for i, (size, partition) in enumerate(dic.items()):
    t = time()
    partition.load()

for _ in range(10):
    for i, (size, partition) in enumerate(dic.items()):
        t = time()
        res = partition.search(
            data=[np.random.rand(EMBEDDING_LEN)],
            anns_field="embedding",
            param=param,
            batch_size=1,
            limit=1,
            output_fields=["filename", "embedding"]
        )
        due = time() - t
        time_dic[size].append(due)

for size, times in time_dic.items():
    print(f"{size}: {sum(times) / 10}")

100: 0.005907773971557617
2000: 0.007582974433898926
5000: 0.010500383377075196
10000: 0.013728451728820801
