In [None]:
!python3 -m pip install pymilvus

# You'd better restart the runtime after installing pymilvus.
# To do so, choose "Runtime > Restart Runtime" from the above main menu.

Collecting pymilvus
  Downloading pymilvus-2.2.14-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.9/147.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting grpcio<=1.56.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.56.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl (12 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow>=3.0.0 (from environs<=9.5.0->pymilvus)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

## Connect to cluster

When creating a dedicated cluster, you need to configure a cluster credential consisting of a pair of username and password. Be sure to take note of these details, as you’ll need them to connect to the cluster.

If you prefer to connect through a private link, replace the URI with your private link endpoint. Before doing so, make sure you have access to that private link. For details, see [Set up Private Link](https://docs.zilliz.com/docs/set-up-a-private-link).

In [None]:
import json
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

# Connection settings: replace both placeholders with the values of your cluster.
CLUSTER_ENDPOINT = "replace-this-with-your-cluster-endpoint"
TOKEN = "replace-this-with-your-token"

# Connect to cluster (registered under the `default` alias)
connections.connect(
    alias='default',
    uri=CLUSTER_ENDPOINT,  # Public endpoint obtained from Zilliz Cloud
    token=TOKEN,           # Username and password specified when you created this cluster
    secure=True,
    # The legacy `user` and `password` arguments can still be used in place of `token`:
    # user='',
    # password=''
)

## Create collection with partition key enabled

To demonstrate the use of partition keys, we will continue to use the [example dataset](https://docs.zilliz.com/docs/example-dataset) that contains over 5,000 articles, and `publication` will serve as the partition key. The schema of the collection to be created is similar to the one specified in [Use Customized Schema](https://docs.zilliz.com/docs/use-customized-schema) except for the settings of the `publication` field.

There are two ways to set up a partition key:

- Set `is_partition_key` of the target field to `True` in the corresponding `FieldSchema`, or
- Set `partition_key_field` in the `CollectionSchema` to the name of the target field.

In [None]:
# 2. Define fields
# Each entry gives the field name and data type first, followed by the
# type-specific settings (primary key flag, max length, vector dimension).
fields = [
    FieldSchema("id", DataType.INT64, is_primary=True),
    FieldSchema("title", DataType.VARCHAR, max_length=512),
    FieldSchema("title_vector", DataType.FLOAT_VECTOR, dim=768),
    FieldSchema("link", DataType.VARCHAR, max_length=512),
    FieldSchema("reading_time", DataType.INT64),
    # `publication` is the partition key: `is_partition_key` is set to `True`
    FieldSchema(
        "publication",
        DataType.VARCHAR,
        max_length=512,
        is_partition_key=True,
    ),
    FieldSchema("claps", DataType.INT64),
    FieldSchema("responses", DataType.INT64),
]

After you have defined the fields, create a schema for the collection.

In [None]:
# 3. Build the schema
# `partition_key_field` names the partition key at the schema level; it is an
# alternative to setting `is_partition_key` in the field settings.
schema = CollectionSchema(
    fields=fields,
    partition_key_field="publication",
    description="Schema of Medium articles",
)

Finally, you can create a collection using the collection schema just defined.

In [None]:
# 4. Create collection
# The collection is named "medium_articles" and uses the `schema` object.
collection = Collection(
    "medium_articles",
    schema,
    description="Medium articles published between Jan and August in 2020 in prominent publications",
)

## Index and load collection

In this section, we will index and load the collection before inserting data.

In [None]:
# 4. index the vector field
# AUTOINDEX with the L2 metric; no additional index parameters are passed.
index_params = dict(index_type="AUTOINDEX", metric_type="L2", params={})
collection.create_index(field_name="title_vector", index_params=index_params)

# 5. load the collection
collection.load()

# Get loading progress and report it
progress = utility.loading_progress("medium_articles")
print(f"Collection loaded successfully: {progress}")



Collection loaded successfully: {'loading_progress': '100%'}


## Insert data

In this section, we will insert the data from the [example dataset](https://docs.zilliz.com/docs/example-dataset) into the above created collection.

In [None]:
# Download the dataset

!curl https://assets.zilliz.com/medium_articles_2020_dpr_a13e0377ae.json \
    --output medium_articles_2020_dpr.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 60.4M  100 60.4M    0     0   108M      0 --:--:-- --:--:-- --:--:--  108M


In [None]:
# Load the downloaded dataset. JSON text is read as UTF-8 explicitly so the
# result does not depend on the platform's default encoding.
with open("medium_articles_2020_dpr.json", encoding="utf-8") as f:
    data = json.load(f)

# The entities are stored under the top-level "rows" key.
list_of_rows = data['rows']

# Prepare the list of entities to insert: every row of the dataset
# (a shallow copy of the list).
data_rows = list(list_of_rows)

# 6. insert data
collection.insert(data_rows)

# The flush operation is not always necessary, only if you need to perform a search immediately after data insertion.
collection.flush()

print("Entity counts: ", collection.num_entities)

# Output (on a freshly created collection)
# Entity counts:  5979
#
# NOTE: running this cell against a collection that already holds the rows
# appears to insert them again, so the count grows (e.g. 11958 = 2 x 5979).

Entity counts:  11958


## Conduct ANN search using partition key

Once you have indexed and loaded the collection as well as inserted data as described in [Use Customized Schema](https://docs.zilliz.com/docs/use-customized-schema), you can conduct an ANN search using the partition key.

To conduct an ANN search using the partition key, you should include either of the following in the boolean expression of the search request:

```python
expr='<partition_key>=="xxxx"'

expr='<partition_key> in ["xxx", "xxx"]'
```

Replace `<partition_key>` with the name of the field that is designated as the partition key.

In [None]:
# 8. Search data
# The `expr` filter restricts the search by the partition key `publication`
# (via `in [...]`) in addition to the conditions on `claps` and `reading_time`.
# The first inserted row's title vector is used as the query vector.
# NOTE(review): `nprobe` is normally an IVF-style search parameter; confirm
# whether it has any effect on a field indexed with AUTOINDEX.
result = collection.search(
    data=[data_rows[0]['title_vector']],
    anns_field="title_vector",
    param={"metric_type": "L2", "params": {"nprobe": 10}},
    limit=3,
    expr='claps > 30 and reading_time < 10 and publication in ["Towards Data Science", "Personal Growth"]',
    output_fields=["title", "link"],
)

# result[0] indicates the result
# of the first query vector in the 'data' list
hits = result[0]

# Get all returned IDs
ids = hits.ids

print(ids)

# Output (from the recorded run):
#
# [5641, 938, 842]

# Get the distance from
# all returned vectors to the query vector.
distances = hits.distances

print(distances)

# Output (from the recorded run):
#
# [0.37674015760421753, 0.436093807220459, 0.49443870782852173]

# Get the values of the output fields
# specified in the search request
for hit in hits:
    print(hit.entity.get("title"))
    print(hit.entity.get("link"))

# Output (from the recorded run):
#
# Why The Coronavirus Mortality Rate is Misleading
# <https://towardsdatascience.com/why-the-coronavirus-mortality-rate-is-misleading-cc63f571b6a6>
# Mortality Rate As an Indicator of an Epidemic Outbreak
# <https://towardsdatascience.com/mortality-rate-as-an-indicator-of-an-epidemic-outbreak-704592f3bb39>
# Choosing the right performance metrics can save lives against Coronavirus
# <https://towardsdatascience.com/choosing-the-right-performance-metrics-can-save-lives-against-coronavirus-2f27492f6638>

[5641, 938, 842]
[0.37674015760421753, 0.436093807220459, 0.49443870782852173]
Why The Coronavirus Mortality Rate is Misleading
https://towardsdatascience.com/why-the-coronavirus-mortality-rate-is-misleading-cc63f571b6a6
Mortality Rate As an Indicator of an Epidemic Outbreak
https://towardsdatascience.com/mortality-rate-as-an-indicator-of-an-epidemic-outbreak-704592f3bb39
Choosing the right performance metrics can save lives against Coronavirus
https://towardsdatascience.com/choosing-the-right-performance-metrics-can-save-lives-against-coronavirus-2f27492f6638


## Drop Collection

You can drop the collection as follows:

In [None]:
# Drop the collection created in this notebook. The name must match the one
# passed to `Collection(...)` ("medium_articles"); dropping a different name
# leaves the collection behind, and re-running the notebook then inserts the
# rows into it a second time.
res = utility.drop_collection("medium_articles")

print(res)

None
