In [None]:
!python3 -m pip install pymilvus

# You'd better restart the runtime after installing pymilvus.
# To do so, choose "Runtime > Restart Runtime" from the above main menu.

Collecting pymilvus
  Downloading pymilvus-2.2.14-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.9/147.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting grpcio<=1.56.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.56.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl (12 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow>=3.0.0 (from environs<=9.5.0->pymilvus)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

## Connect to cluster

When creating a dedicated cluster, you need to configure a cluster credential consisting of a pair of username and password. Be sure to take note of these details, as you’ll need them to connect to the cluster.

If you prefer to connect over a private link, replace the `uri` value with your private link endpoint. Before doing so, make sure you have access to that private link. For details, see [Set up Private Link](https://docs.zilliz.com/docs/set-up-a-private-link).

In [1]:
import json, time
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

# Notebook configuration: replace the placeholders with your own values.
CLUSTER_ENDPOINT = "YOUR_CLUSTER_ENDPOINT"  # Set your cluster endpoint
TOKEN = "YOUR_CLUSTER_TOKEN"  # Set your token
COLLECTION_NAME = "medium_articles_2020"  # Set your collection name
# Set your dataset path. The download cell saves the file into the current
# working directory, so the default points there (not to the parent directory).
DATASET_PATH = "medium_articles_2020_dpr.json"

# Gather the connection arguments first, then open the 'default' connection.
connection_settings = {
    # Public endpoint obtained from Zilliz Cloud
    "uri": CLUSTER_ENDPOINT,
    # API key or a colon-separated cluster username and password
    "token": TOKEN,
}

connections.connect(alias='default', **connection_settings)

## Define JSON field

To define a JSON field, simply follow the same procedure as defining fields of other types.

In the following code, `article_meta` is a JSON field because its `dtype` is set to `DataType.JSON`.

In [2]:
# 2. Define fields
fields = [
    # Primary key; auto_id is enabled, so rows are inserted without an id value.
    # max_length is a VARCHAR setting and is therefore not set on this INT64 field.
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=768),
    # The following field is a JSON field
    FieldSchema(name="article_meta", dtype=DataType.JSON)
]

# 3. Create schema
# Dynamic fields are not enabled here; every inserted key must match one of
# the fields defined above (metadata goes inside the article_meta JSON field).
schema = CollectionSchema(
    fields,
    description="The schema for a medium news collection",
)

# 4. Create collection
collection = Collection(name=COLLECTION_NAME, schema=schema)

# 5. Index collection
# Index settings for the vector field
index_params = dict(
    index_type="AUTOINDEX",
    metric_type="L2",
    params={},
)

collection.create_index(field_name="title_vector", index_params=index_params)

# Load the collection once the index has been created
collection.load()

## Insert field values

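After creating a collection from the `CollectionSchema` object, you can insert rows into it as dictionaries whose keys match the field names. The value of the `article_meta` key is itself a dictionary, which is stored in the JSON field.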

In [None]:
# Download the dataset

!curl https://assets.zilliz.com/medium_articles_2020_dpr_a13e0377ae.json \
    --output medium_articles_2020_dpr.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 60.4M  100 60.4M    0     0   122M      0 --:--:-- --:--:-- --:--:--  122M


In [4]:
# 6. Prepare data
# Keys of each source row that are moved into the article_meta JSON field
ARTICLE_META_KEYS = ("link", "reading_time", "publication", "claps", "responses")

# Read the dataset with an explicit UTF-8 encoding so that decoding does not
# depend on the platform's default encoding; the file is closed right after loading.
with open(DATASET_PATH, encoding="utf-8") as f:
    data = json.load(f)

list_of_rows = data['rows']

data_rows = []
for row in list_of_rows:
    # Remove the id field because auto-id is enabled for the primary key
    del row['id']
    # Create the article_meta field and move the metadata keys into it
    row['article_meta'] = {key: row.pop(key) for key in ARTICLE_META_KEYS}
    # Append this row to the data_rows list
    data_rows.append(row)

# 7. Insert data
# Insert every prepared row in one call, then flush the collection.
result = collection.insert(data=data_rows)
collection.flush()

# Report how many rows the insert call counted.
print(f"Data inserted successfully! Inserted counts: {result.insert_count}")


Data inserted successfully! Inserted counts: 5979


## Search within JSON field

Once all of your data has been added, you can conduct searches using the keys in the JSON field in the same manner as you would with a standard scalar field. Simply follow these steps:

In [5]:
# 8. Search data
# Query with the title vector of the first prepared row
query_vectors = [data_rows[0]['title_vector']]
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
# Access the keys in the JSON field
filter_expr = 'article_meta["claps"] > 30 and article_meta["reading_time"] < 10'

result = collection.search(
    data=query_vectors,
    anns_field="title_vector",
    param=search_params,
    limit=3,
    expr=filter_expr,
    # Include the JSON field in the output to return
    output_fields=["title", "article_meta"],
)

# One inner list per query vector, each holding the hits converted to dictionaries
print([[hit.entity.to_dict() for hit in hits] for hits in result])

[[{'id': 445383736951035424, 'distance': 0.36103832721710205, 'entity': {'article_meta': {'link': 'https://medium.com/swlh/the-hidden-side-effect-of-the-coronavirus-b6a7a5ee9586', 'reading_time': 8, 'publication': 'The Startup', 'claps': 83, 'responses': 0}, 'title': 'The Hidden Side Effect of the Coronavirus'}}, {'id': 445383736951035458, 'distance': 0.37674015760421753, 'entity': {'article_meta': {'link': 'https://towardsdatascience.com/why-the-coronavirus-mortality-rate-is-misleading-cc63f571b6a6', 'reading_time': 9, 'publication': 'Towards Data Science', 'claps': 2900, 'responses': 47}, 'title': 'Why The Coronavirus Mortality Rate is Misleading'}}, {'id': 445383736951033258, 'distance': 0.416297972202301, 'entity': {'article_meta': {'link': 'https://medium.com/swlh/coronavirus-shows-what-ethical-amazon-could-look-like-7c80baf2c663', 'reading_time': 4, 'publication': 'The Startup', 'claps': 51, 'responses': 0}, 'title': 'Coronavirus shows what ethical Amazon could look like'}}]]


## Drop collection

You can drop the collection as follows:

In [6]:
# 9. Drop collection
# Remove the collection created by this notebook, identified by its name.
utility.drop_collection(collection_name=COLLECTION_NAME)