In [None]:
!python3 -m pip install pymilvus

# restart the runtime after the installation completes.
# To do so, choose Runtime > Restart runtime from the main menu.

Collecting pymilvus
  Downloading pymilvus-2.2.14-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.9/147.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting grpcio<=1.56.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.56.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl (12 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow>=3.0.0 (from environs<=9.5.0->pymilvus)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

## Connect to cluster

When creating a dedicated cluster, you need to configure a cluster credential consisting of a pair of username and password. Be sure to take note of these details, as you’ll need them to connect to the cluster.

If you prefer to connect over a private link, replace the public endpoint in `uri` with your private link. Before doing so, ensure that you have access to the private link. For details, see [Set up Private Link](https://docs.zilliz.com/docs/set-up-a-private-link).

In [14]:
import json, time
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

CLUSTER_ENDPOINT="YOUR_CLUSTER_ENDPOINT" # Set your cluster endpoint
TOKEN="YOUR_CLUSTER_TOKEN" # Set your token
COLLECTION_NAME="medium_articles_2020" # Set your collection name
# The download cell in this notebook saves the dataset into the current
# working directory, so point at that file rather than at the parent directory.
DATASET_PATH="medium_articles_2020_dpr.json" # Set your dataset path


# Connect to cluster
# The connection is registered under the 'default' alias.
connections.connect(
  alias='default',
  # Public endpoint obtained from Zilliz Cloud
  uri=CLUSTER_ENDPOINT,
  # API key or a colon-separated cluster username and password
  token=TOKEN,
)

## Create collection

Dynamic data models are designed to reduce the learning curve and complexity in inserting entities. For production environments, we recommend that you use a custom schema instead of a dynamic one to ensure that all your data is stored as expected.

You can define a custom data model by specifying the name and data type of each field in a collection.

In [4]:
# 1. Define fields
# Each entry is (field name, data type, extra FieldSchema keyword arguments).
field_specs = [
    ("id", DataType.INT64, {"is_primary": True}),
    ("title", DataType.VARCHAR, {"max_length": 512}),
    ("title_vector", DataType.FLOAT_VECTOR, {"dim": 768}),
    ("link", DataType.VARCHAR, {"max_length": 512}),
    ("reading_time", DataType.INT64, {}),
    ("publication", DataType.VARCHAR, {"max_length": 512}),
    ("claps", DataType.INT64, {}),
    ("responses", DataType.INT64, {}),
]

# Turn every spec into a FieldSchema, keeping the declaration order.
fields = [
    FieldSchema(name=field_name, dtype=field_type, **extra)
    for field_name, field_type, extra in field_specs
]

After you have defined the fields, create a schema for the collection.

In [5]:
# 2. Build the schema
# The schema wraps the field list defined earlier; dynamic fields are
# switched off, so only the declared fields are part of the schema.
schema = CollectionSchema(
    fields=fields,
    description="Schema of Medium articles",
    enable_dynamic_field=False,
)

Finally, you can create a collection using the collection schema just defined.

In [6]:
# 3. Create collection
# Bind the collection name to the schema built in the previous step.
collection_description = "Medium articles published between Jan and August in 2020 in prominent publications"

collection = Collection(
    COLLECTION_NAME,
    schema,
    description=collection_description,
)

E1103 18:00:26.003186000 6220509184 ssl_transport_security.cc:1420]    Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.
E1103 18:00:26.008460000 6220509184 ssl_transport_security.cc:1420]    Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.


## Index collection

Indexes are necessary to achieve extremely high performance in ANN searches on Zilliz Cloud. Zilliz Cloud clusters support indexing only on the vector field. Indexing a collection is equivalent to indexing the vector field in that collection.

The only supported index type for Zilliz Cloud clusters is AUTOINDEX. If you specify an index type other than AUTOINDEX, AUTOINDEX is applied automatically. For details, see [AUTOINDEX Explained](https://docs.zilliz.com/docs/autoindex-explained).

In [8]:
# 4. Index collection
# 'index_type' selects the index algorithm; AUTOINDEX is the only option.
#
# 'metric_type' selects how the distance between vectors is measured.
#    Possible values are L2, IP, and Cosine, and it defaults to Cosine.
index_params = dict(
    index_type="AUTOINDEX",
    metric_type="L2",
    params={},
)

# Build the index on the vector field and give it an explicit name.
collection.create_index(
    "title_vector",
    index_params,
    index_name="title_vector_index",
)

Status(code=0, message=)

## Load and release collection

For collections created using SDKs, you should load them before you can perform searches and queries. You can also release collections to save the expense if the collections are not needed temporarily.

In [9]:
# 5. Load collection
collection.load()

# Ask the server how far loading has progressed and show the answer.
progress = utility.loading_progress(collection_name=COLLECTION_NAME)
print(progress)

{'loading_progress': '100%'}


E1103 18:01:15.328923000 6220509184 ssl_transport_security.cc:1420]    Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.
E1103 18:01:15.331390000 6220509184 ssl_transport_security.cc:1420]    Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.


To release a collection, do as follows:

In [None]:
# collection.release()

## Insert entities

To add items to a collection, make sure that the format of these items complies with the schema of the collection.

### Prepare data

You may arrange your data as either a list of rows or a list of columns.

- Arrange your data as a list of rows.

  To arrange your data in a list of rows, each row should be a dictionary where the field name serves as the key and the field value is its corresponding value. The following code snippet loads all rows from the [Example Dataset](https://docs.zilliz.com/2df10a1ffe454c52a2be0ec069f4424e) and prints the first three of them. You can slice `rows` to include only part of the dataset.

In [None]:
# Download the example dataset into the current working directory.
# Make sure DATASET_PATH points to the downloaded file before loading it.

!curl https://assets.zilliz.com/medium_articles_2020_dpr_a13e0377ae.json \
    --output medium_articles_2020_dpr.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 60.4M  100 60.4M    0     0  44.3M      0  0:00:01  0:00:01 --:--:-- 44.3M


In [10]:
# 6. Prepare data

# Prepare a list of rows
# Read the file explicitly as UTF-8 so that non-ASCII text in the dataset
# decodes the same way on every platform, whatever the locale default is.
with open(DATASET_PATH, encoding="utf-8") as f:
    data = json.load(f)

# The entities to insert live under the top-level 'rows' key.
rows = data['rows']

print(rows[:3])

[{'id': 0, 'title': 'The Reported Mortality Rate of Coronavirus Is Not Important', 'title_vector': [0.041732933, 0.013779674, -0.027564144, -0.013061441, 0.009748648, 0.00082446384, -0.00071647146, 0.048612226, -0.04836573, -0.04567751, 0.018008126, 0.0063936645, -0.011913628, 0.030776596, -0.018274948, 0.019929802, 0.020547243, 0.032735646, -0.031652678, -0.033816382, -0.051087562, -0.033748355, 0.0039493158, 0.009246126, -0.060236514, -0.017136049, 0.028754413, -0.008433934, 0.011168004, -0.012391256, -0.011225835, 0.031775184, 0.002929508, -0.007448661, -0.005337719, -0.010999258, -0.01515909, -0.005130484, 0.0060212007, 0.0034560722, -0.022935811, -0.04970116, -0.0155887455, 0.06627353, -0.006052789, -0.051570725, -0.109865054, 0.033205193, 0.00041118253, 0.0029823708, 0.036160238, -0.011256539, 0.00023560718, 0.058322437, 0.022275906, 0.015206677, -0.02884609, 0.0016338055, 0.0049200393, 0.014388571, -0.0049061654, -0.04664761, -0.027454877, 0.017526226, -0.005100602, 0.018090058,

- Arrange your data as a list of columns.

  To arrange your data in a list of columns, represent each column as a list containing the values of all rows for that field. The following code snippet builds `columns` from all records in the [Example Dataset](https://docs.zilliz.com/2df10a1ffe454c52a2be0ec069f4424e) and `columns_demo` from the first three records for display. You can change the slicing to include more or fewer records.

In [11]:
# Prepare a list of columns
# The columns are derived from the already-loaded `rows`, so the dataset file
# does not need to be opened again here.
keys = list(rows[0].keys())

# One inner list per field, in the key order of the first row; each inner
# list holds that field's value for every row.
columns = [[row[key] for row in rows] for key in keys]

# Same layout restricted to the first three rows, small enough to print.
columns_demo = [[row[key] for row in rows[:3]] for key in keys]

print(columns_demo)

[[0, 1, 2], ['The Reported Mortality Rate of Coronavirus Is Not Important', 'Dashboards in Python: 3 Advanced Examples for Dash Beginners and Everyone Else', 'How Can We Best Switch in Python?'], [[0.041732933, 0.013779674, -0.027564144, -0.013061441, 0.009748648, 0.00082446384, -0.00071647146, 0.048612226, -0.04836573, -0.04567751, 0.018008126, 0.0063936645, -0.011913628, 0.030776596, -0.018274948, 0.019929802, 0.020547243, 0.032735646, -0.031652678, -0.033816382, -0.051087562, -0.033748355, 0.0039493158, 0.009246126, -0.060236514, -0.017136049, 0.028754413, -0.008433934, 0.011168004, -0.012391256, -0.011225835, 0.031775184, 0.002929508, -0.007448661, -0.005337719, -0.010999258, -0.01515909, -0.005130484, 0.0060212007, 0.0034560722, -0.022935811, -0.04970116, -0.0155887455, 0.06627353, -0.006052789, -0.051570725, -0.109865054, 0.033205193, 0.00041118253, 0.0029823708, 0.036160238, -0.011256539, 0.00023560718, 0.058322437, 0.022275906, 0.015206677, -0.02884609, 0.0016338055, 0.00492003

### Insert data

Once your data is ready, you can insert it as follows:

In [15]:
# 7. Insert data
# Insert the row-based data prepared earlier in this notebook.
results = collection.insert(rows)
# If you have prepared your data in columns, you can do as follows instead:
# results = collection.insert(columns)

print(f"Data inserted successfully! Inserted rows: {results.insert_count}")

# NOTE(review): presumably this pause gives the cluster time to make the
# inserted entities searchable before the next step — TODO confirm.
time.sleep(5)


Data inserted successfully! Inserted rows: 5979


## Search and query

A single-vector search request involves using only one vector and asking for the top-K entities that are most similar to the input query vector.

You can also conduct a bulk search by providing multiple query vectors in a single request. In most cases, bulk search is more efficient than conducting single-vector searches because the total latency is much lower than searching against these query vectors in individual requests.

Before searching a collection, you must define the search parameters. Ensure that the metric type matches the one defined in the index parameters. Then, reference the search parameters in the search request and set the query vector, vector field name, limits, and any other applicable parameters.

The following uses a single-vector search as an example. The results display the top 5 most similar entities, along with their primary keys and distances.

In [17]:
# 8. Search data
# The metric type here should be the same as the one given in the
# index parameters when the index was created.
search_params = dict(metric_type="L2")

# Use the vector of the first row as the single query vector.
query_vector = rows[0]["title_vector"]

results = collection.search(
    data=[query_vector],
    anns_field="title_vector",
    param=search_params,
    limit=5,
    output_fields=["title", "link"],
)

# results[0] holds the hits for the first (and only) query vector in 'data'.
first_hits = results[0]

# IDs of all returned entities
ids = first_hits.ids
print(ids)

# Distances from each returned vector to the query vector
distances = first_hits.distances
print(distances)

# Values of the output fields requested in the search
entities = [hit.entity.to_dict()["entity"] for hit in first_hits]
print(entities)

[0, 3177]
[0.0, 0.29999834299087524]
[{'title': 'The Reported Mortality Rate of Coronavirus Is Not Important', 'link': 'https://medium.com/swlh/the-reported-mortality-rate-of-coronavirus-is-not-important-369989c8d912'}, {'title': 'Following the Spread of Coronavirus', 'link': 'https://towardsdatascience.com/following-the-spread-of-coronavirus-23626940c125'}]


E1103 18:06:07.353191000 6220509184 ssl_transport_security.cc:1420]    Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.


## Drop collection

You can drop the collection as follows:

In [18]:
# 9. Drop collection
# Remove the collection identified by COLLECTION_NAME from the cluster.
utility.drop_collection(collection_name=COLLECTION_NAME)