In [1]:
import os

# Start from a clean slate: delete any previously downloaded copy of the
# example script (presumably so the download in the next cell does not
# collide with a stale file).  On a fresh kernel / fresh checkout the file
# does not exist yet, so a missing file is tolerated instead of letting
# os.remove() raise FileNotFoundError and abort "Restart & Run All".
try:
    os.remove('./hello_milvus.py')
except FileNotFoundError:
    pass  # nothing to clean up on the first run
os.getcwd()

'/Users/derbysofti88/OneDrive - 德比软件（上海）有限公司/chatgpt_retrieval_plugin/milvus_standalone'

In [2]:
# Download the official PyMilvus v2.2.4 example script `hello_milvus.py`
# (shell escape: requires `wget` on PATH and network access).
# Reference: https://milvus.io/docs/example_code.md
!wget https://raw.githubusercontent.com/milvus-io/pymilvus/v2.2.4/examples/hello_milvus.py

--2023-04-06 12:50:36--  https://raw.githubusercontent.com/milvus-io/pymilvus/v2.2.4/examples/hello_milvus.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7380 (7.2K) [text/plain]
Saving to: ‘hello_milvus.py’


2023-04-06 12:50:37 (20.9 MB/s) - ‘hello_milvus.py’ saved [7380/7380]



# hello_milvus.py demonstrates the basic operations of PyMilvus, a Python SDK of Milvus.
# 1. connect to Milvus
# 2. create collection
# 3. insert data
# 4. create index
# 5. search, query, and hybrid search on entities
# 6. delete entities by PK
# 7. drop collection

In [3]:
import time

import numpy as np
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

# Section banner template: the title is left-aligned and padded to 30
# characters between "===" markers, with a newline before and after.
fmt = "\n=== {:30} ===\n"

# Template for reporting an elapsed time in seconds with four decimals.
search_latency_fmt = "search latency = {:.4f}s"

# Number of rows inserted by the demo and dimensionality of each vector.
num_entities = 3000
dim = 8

In [4]:
#################################################################################
# 1. connect to Milvus
# Register the connection alias `default` for the Milvus server.
# "default" is a built-in alias in PyMilvus.  If the server listens on
# `localhost:19530`, you can omit all parameters and call the method as:
# `connections.connect()`.
#
# The server address is read from the MILVUS_HOST / MILVUS_PORT environment
# variables so the notebook can run against any deployment; when they are unset
# it falls back to the address this notebook was recorded against.
# (`os` is imported in the first cell of this notebook.)
#
# Note: the `using` parameter of the following methods defaults to "default".
print(fmt.format("start connecting to Milvus"))
milvus_host = os.environ.get("MILVUS_HOST", "10.200.0.43")
milvus_port = os.environ.get("MILVUS_PORT", "19530")
connections.connect("default", host=milvus_host, port=milvus_port)

# Report whether a `hello_milvus` collection already exists on the server.
has = utility.has_collection("hello_milvus")
print(f"Does collection hello_milvus exist in Milvus: {has}")


=== start connecting to Milvus     ===

Does collection hello_milvus exist in Milvus: False


In [5]:
#################################################################################
# 2. create collection
# The collection is defined with three fields:
# +-+------------+------------+------------------+------------------------------+
# | | field name | field type | other attributes |       field description      |
# +-+------------+------------+------------------+------------------------------+
# |1|    "pk"    |   VarChar  |  is_primary=True |      "primary field"         |
# | |            |            |   auto_id=False  |                              |
# +-+------------+------------+------------------+------------------------------+
# |2|  "random"  |    Double  |                  |      "a double field"        |
# +-+------------+------------+------------------+------------------------------+
# |3|"embeddings"| FloatVector|     dim=8        |  "float vector with dim 8"   |
# +-+------------+------------+------------------+------------------------------+

# Primary key: caller-supplied strings of at most 100 characters.
pk_field = FieldSchema(
    name="pk",
    dtype=DataType.VARCHAR,
    is_primary=True,
    auto_id=False,
    max_length=100,
)
# Scalar attribute used later for filtering.
random_field = FieldSchema(name="random", dtype=DataType.DOUBLE)
# Vector field whose dimensionality comes from the `dim` constant.
embeddings_field = FieldSchema(
    name="embeddings",
    dtype=DataType.FLOAT_VECTOR,
    dim=dim,
)
fields = [pk_field, random_field, embeddings_field]

schema = CollectionSchema(
    fields,
    "hello_milvus is the simplest demo to introduce the APIs",
)

print(fmt.format("Create collection `hello_milvus`"))
hello_milvus = Collection(
    "hello_milvus",
    schema,
    consistency_level="Strong",
)


=== Create collection `hello_milvus` ===



In [6]:
################################################################################
# 3. insert data
# Insert `num_entities` rows into `hello_milvus`.
# The data is organized by field: one sequence per field, listed in the same
# order as the schema (pk, random, embeddings).
#
# The insert() method returns:
# - either automatically generated primary keys by Milvus if auto_id=True in the schema;
# - or the existing primary key field from the entities if auto_id=False in the schema.

print(fmt.format("Start inserting entities"))
rng = np.random.default_rng(seed=19530)

# The pk column must be provided because `auto_id` is set to False.
pk_column = list(map(str, range(num_entities)))
# Keep the two draws in this order: the seeded generator is consumed
# sequentially, so swapping them would change the generated data.
random_column = rng.random(num_entities).tolist()     # field random, only supports list
embeddings_column = rng.random((num_entities, dim))   # field embeddings, supports numpy.ndarray and list
entities = [pk_column, random_column, embeddings_column]

insert_result = hello_milvus.insert(entities)

# The entity count is printed before and after flush(); in the recorded run
# it reads 0 before the flush and 3000 afterwards.
print(f"Number of entities in Milvus: {hello_milvus.num_entities}")
hello_milvus.flush()
print(f"Number of entities in Milvus: {hello_milvus.num_entities}")


=== Start inserting entities       ===

Number of entities in Milvus: 0
Number of entities in Milvus: 3000


In [7]:
# Display the collection's repr: name, partitions, description and schema.
hello_milvus

<Collection>:
-------------
<name>: hello_milvus
<partitions>: [{"name": "_default", "collection_name": "hello_milvus", "description": ""}]
<description>: hello_milvus is the simplest demo to introduce the APIs
<schema>: {
  auto_id: False
  description: hello_milvus is the simplest demo to introduce the APIs
  fields: [{
    name: pk
    description: 
    type: 21
    params: {'max_length': 100}
    is_primary: True
    auto_id: False
  }, {
    name: random
    description: 
    type: 11
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 8}
  }]
}

In [8]:
# Peek at the first five values of each inserted column (pk, random, embeddings).
tuple(column[:5] for column in entities)

(['0', '1', '2', '3', '4'],
 [0.6378742006852851,
  0.43925103574669633,
  0.1321158395732429,
  0.468666676812172,
  0.744296470467782],
 [[0.20963514452725185,
   0.397466580541035,
   0.12019053120587031,
   0.6947491504127892,
   0.9535574841496438,
   0.5454552024547541,
   0.823604477841353,
   0.2109630840483121],
  [0.5232361500562313,
   0.8035404362846539,
   0.7782466231866557,
   0.8036957419750694,
   0.4914802763227951,
   0.8265614091830081,
   0.6145269082588252,
   0.8023454579318446],
  [0.1083407654818962,
   0.7225640094032827,
   0.6481155817809635,
   0.04529253647704867,
   0.3306724099072048,
   0.00914072646802988,
   0.4559419450672991,
   0.4074524517614092],
  [0.6025988148174869,
   0.8369877822102969,
   0.14832204233846147,
   0.7041321547709049,
   0.42856021028726843,
   0.7978481501398528,
   0.9857221101925083,
   0.40442546447158045],
  [0.8030917388404358,
   0.7484994615404847,
   0.7331518929867509,
   0.7597000461295075,
   0.11321863640249508,
 

In [9]:
# Display the result object returned by insert() (insert/delete/upsert counts,
# timestamp, success and error counts).
insert_result

(insert count: 3000, delete count: 0, upsert count: 0, timestamp: 440600268220923907, success count: 3000, err count: 0)

In [10]:
################################################################################
# 4. create index
# Build an IVF_FLAT index (L2 metric, nlist=128) on the `embeddings` field.
# create_index() can only be applied to `FloatVector` and `BinaryVector` fields.

# TODO: review the index type and parameters against https://milvus.io/docs/build_index.md
print(fmt.format("Start Creating index IVF_FLAT"))
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)

hello_milvus.create_index(field_name="embeddings", index_params=index)
# Display the collection again as the cell's output.
hello_milvus


=== Start Creating index IVF_FLAT  ===



<Collection>:
-------------
<name>: hello_milvus
<partitions>: [{"name": "_default", "collection_name": "hello_milvus", "description": ""}]
<description>: hello_milvus is the simplest demo to introduce the APIs
<schema>: {
  auto_id: False
  description: hello_milvus is the simplest demo to introduce the APIs
  fields: [{
    name: pk
    description: 
    type: 21
    params: {'max_length': 100}
    is_primary: True
    auto_id: False
  }, {
    name: random
    description: 
    type: 11
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 8}
  }]
}

In [11]:
################################################################################
# 5. search, query, and hybrid search
# After data has been inserted into Milvus and indexed, you can perform:
# - search based on vector similarity
# - query based on scalar filtering (boolean, int, etc.)
# - hybrid search based on vector similarity and scalar filtering.
#

# Before conducting a search or a query, the data in `hello_milvus` must be loaded into memory.
print(fmt.format("Start loading"))
hello_milvus.load()


=== Start loading                  ===



In [12]:
# -----------------------------------------------------------------------------
# search based on vector similarity
print(fmt.format("Start searching based on vector similarity"))
# Use the last two inserted embeddings as the query vectors.
vectors_to_search = entities[-1][-2:]
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
result = hello_milvus.search(vectors_to_search, "embeddings", search_params, limit=3, output_fields=["random"])
end_time = time.perf_counter()

# `result` holds one list of hits per query vector (2 queries, up to 3 hits each).
last_hit = None
for hits in result:
    for hit in hits:
        print(f"hit: {hit}, random field: {hit.entity.get('random')}")
        last_hit = hit
print(search_latency_fmt.format(end_time - start_time))
print(result)
# Show the entity payload of the final hit.  Tracking it explicitly (instead of
# relying on the loop variable leaking out of the loops) avoids a NameError
# when the search returns no hits at all.
if last_hit is not None:
    print(last_hit.entity)


=== Start searching based on vector similarity ===

hit: (distance: 0.0, id: 2998), random field: 0.9728033590489911
hit: (distance: 0.08883658051490784, id: 1262), random field: 0.2978858685751561
hit: (distance: 0.09590047597885132, id: 1265), random field: 0.3042039939240304
hit: (distance: 0.0, id: 2999), random field: 0.02316334456872482
hit: (distance: 0.05628091096878052, id: 1580), random field: 0.3855988746044062
hit: (distance: 0.08096685260534286, id: 2377), random field: 0.8745922204004368
search latency = 0.2387s
["['(distance: 0.0, id: 2998)', '(distance: 0.08883658051490784, id: 1262)', '(distance: 0.09590047597885132, id: 1265)']", "['(distance: 0.0, id: 2999)', '(distance: 0.05628091096878052, id: 1580)', '(distance: 0.08096685260534286, id: 2377)']"]
id: 2377, distance: 0.08096685260534286, entity: {'random': 0.8745922204004368}


In [13]:
# -----------------------------------------------------------------------------
# query based on scalar filtering (boolean, int, etc.)
print(fmt.format("Start querying with `random > 0.5`"))

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
result = hello_milvus.query(expr="random > 0.5", output_fields=["embeddings", ])
end_time = time.perf_counter()

print(f"query result:\n-{result[0]}")
# Number of rows matching the filter vs. number of rows inserted.
print(len(result),len(entities[0]))
print(search_latency_fmt.format(end_time - start_time))


=== Start querying with `random > 0.5` ===

query result:
-{'embeddings': [0.20963514, 0.39746657, 0.12019053, 0.6947492, 0.9535575, 0.5454552, 0.82360446, 0.21096309], 'pk': '0'}
1531 3000
search latency = 0.4394s


In [14]:
# -----------------------------------------------------------------------------
# pagination
# Run the same filter twice: once taking the first four matches, and once
# skipping the first match and taking the next three (also returning the vectors).
pagination_filter = "random > 0.5"

first_four = hello_milvus.query(
    expr=pagination_filter,
    limit=4,
    output_fields=["random"],
)
skip_one_take_three = hello_milvus.query(
    expr=pagination_filter,
    offset=1,
    limit=3,
    output_fields=["random", 'embeddings'],
)

print(f"query pagination(limit=4):\n\t{first_four}")
print(f"query pagination(offset=1, limit=3):\n\t{skip_one_take_three}")

query pagination(limit=4):
	[{'random': 0.6378742006852851, 'pk': '0'}, {'random': 0.5763523024650556, 'pk': '100'}, {'random': 0.9425935891639464, 'pk': '1000'}, {'random': 0.7893211256191387, 'pk': '1001'}]
query pagination(offset=1, limit=3):
	[{'random': 0.5763523024650556, 'embeddings': [0.5860017, 0.24227226, 0.8318699, 0.0060517574, 0.27727962, 0.5513293, 0.47201252, 0.6331349], 'pk': '100'}, {'random': 0.9425935891639464, 'embeddings': [0.06456853, 0.121232815, 0.0850102, 0.2686066, 0.3113729, 0.21573599, 0.94793427, 0.8368486], 'pk': '1000'}, {'random': 0.7893211256191387, 'embeddings': [0.5073896, 0.2198741, 0.76709545, 0.27336067, 0.88386256, 0.6357631, 0.35280448, 0.6349966], 'pk': '1001'}]


In [15]:
# -----------------------------------------------------------------------------
# hybrid search
# Vector similarity search restricted to rows that satisfy a scalar filter.
# Reuses `vectors_to_search` and `search_params` from the vector-search cell.
print(fmt.format("Start hybrid searching with `random > 0.5`"))

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
result = hello_milvus.search(vectors_to_search, "embeddings", search_params, limit=3, expr="random > 0.5", output_fields=["random"])
end_time = time.perf_counter()

# One list of hits per query vector.
for hits in result:
    for hit in hits:
        print(f"hit: {hit}, random field: {hit.entity.get('random')}")
print(search_latency_fmt.format(end_time - start_time))


=== Start hybrid searching with `random > 0.5` ===

hit: (distance: 0.0, id: 2998), random field: 0.9728033590489911
hit: (distance: 0.14606499671936035, id: 747), random field: 0.5648774800635661
hit: (distance: 0.1530652642250061, id: 2527), random field: 0.8928974315571507
hit: (distance: 0.08096685260534286, id: 2377), random field: 0.8745922204004368
hit: (distance: 0.20354536175727844, id: 2034), random field: 0.5526117606328499
hit: (distance: 0.21908017992973328, id: 958), random field: 0.6647383716417955
search latency = 0.4048s


In [16]:
###############################################################################
# 6. delete entities by PK
# Entities are deleted by primary key, selected through a boolean expression.
ids = insert_result.primary_keys

# Select the first two inserted primary keys.
expr = 'pk in ["{}" , "{}"]'.format(ids[0], ids[1])
print(fmt.format(f"Start deleting with expr `{expr}`"))

# Fetch the two rows before deleting them ...
result = hello_milvus.query(expr=expr, output_fields=["random", "embeddings"])
first_row = result[0]
second_row = result[1]
print(f"query before delete by expr=`{expr}` -> result: \n-{first_row}\n-{second_row}\n")

hello_milvus.delete(expr)

# ... and run the same query again afterwards (empty list in the recorded run).
result = hello_milvus.query(expr=expr, output_fields=["random", "embeddings"])
print(f"query after delete by expr=`{expr}` -> result: {result}\n")


=== Start deleting with expr `pk in ["0" , "1"]` ===

query before delete by expr=`pk in ["0" , "1"]` -> result: 
-{'random': 0.6378742006852851, 'embeddings': [0.20963514, 0.39746657, 0.12019053, 0.6947492, 0.9535575, 0.5454552, 0.82360446, 0.21096309], 'pk': '0'}
-{'random': 0.43925103574669633, 'embeddings': [0.52323616, 0.8035404, 0.77824664, 0.80369574, 0.4914803, 0.8265614, 0.6145269, 0.80234545], 'pk': '1'}

query after delete by expr=`pk in ["0" , "1"]` -> result: []



In [17]:
# List every collection on the server and the indexes built on `hello_milvus`.
print(utility.list_collections())
print(Collection('hello_milvus').indexes)

# Also show the schema of one pre-existing collection.  This name is specific
# to the server the notebook was recorded against, so only open it when it is
# actually present instead of failing on any other deployment.
other_collection = 'c0b1ff1082ad74ae999092fece02c1a00'
if utility.has_collection(other_collection):
    print(Collection(other_collection))
else:
    print(f"collection {other_collection} not found on this server; skipping")

['c0b1ff1082ad74ae999092fece02c1a00', 'c26a7fb16cf2545e5b8fcb401762d0366', 'ca9dc8b95bc5c456daa5ed7a16b76bbe8', 'hello_milvus', 'c3230df20f9aa43208a9083bf35dbfd8e']
[<pymilvus.orm.index.Index object at 0x7f8d4121a2c0>]
<Collection>:
-------------
<name>: c0b1ff1082ad74ae999092fece02c1a00
<partitions>: [{"name": "_default", "collection_name": "c0b1ff1082ad74ae999092fece02c1a00", "description": ""}]
<description>: 
<schema>: {
  auto_id: True
  description: 
  fields: [{
    name: pk
    description: 
    type: 5
    is_primary: True
    auto_id: True
  }, {
    name: embedding
    description: 
    type: 101
    params: {'dim': 1536}
  }, {
    name: text
    description: 
    type: 21
    params: {'max_length': 65535}
  }, {
    name: document_id
    description: 
    type: 21
    params: {'max_length': 65535}
  }, {
    name: source_id
    description: 
    type: 21
    params: {'max_length': 65535}
  }, {
    name: id
    description: 
    type: 21
    params: {'max_length': 65535}
 

In [18]:
###############################################################################
# 7. drop collection
# Finally, drop the `hello_milvus` collection; the collection listing in the
# following cell no longer includes it.
print(fmt.format("Drop collection `hello_milvus`"))
utility.drop_collection("hello_milvus")


=== Drop collection `hello_milvus` ===



In [19]:
# List the collections remaining on the server after the drop.
utility.list_collections()

['c3230df20f9aa43208a9083bf35dbfd8e',
 'ca9dc8b95bc5c456daa5ed7a16b76bbe8',
 'c0b1ff1082ad74ae999092fece02c1a00',
 'c26a7fb16cf2545e5b8fcb401762d0366']