In [None]:
!python3 -m pip install pymilvus

# Restart the runtime after the installation completes.
# To do so, choose "Runtime > Restart runtime" from the main menu.

Collecting pymilvus
  Downloading pymilvus-2.2.14-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.9/147.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting grpcio<=1.56.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.56.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl (12 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow>=3.0.0 (from environs<=9.5.0->pymilvus)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

## Connect to cluster

When creating a dedicated cluster, you need to configure cluster credentials consisting of a username and a password. Be sure to take note of these details, as you’ll need them to connect to the cluster.

If you prefer to connect over a private link, replace the URI with your private link endpoint. Before doing so, make sure you have access to the private link. For details, see [Set up Private Link](https://docs.zilliz.com/docs/set-up-a-private-link).

In [9]:
import json
import os
import time

from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

# Connection and dataset settings. The endpoint and token can be supplied
# through environment variables so that real credentials do not have to be
# saved in the notebook; the fallbacks are the values this notebook has
# always used.
CLUSTER_ENDPOINT = os.environ.get("MILVUS_URI", "http://localhost:19530") # Set your cluster endpoint
TOKEN = os.environ.get("MILVUS_TOKEN", "root:Milvus") # Set your token
COLLECTION_NAME = "medium_articles_2020" # Set your collection name
DATASET_PATH = "../medium_articles_2020_dpr.json" # Set your dataset path

# Register the connection under the 'default' alias.
connections.connect(
    alias='default',
    # Endpoint of the cluster to connect to
    uri=CLUSTER_ENDPOINT,
    # API key or a colon-separated cluster username and password
    token=TOKEN,
)

## Enable dynamic schema

To create a collection using a dynamic schema, set `enable_dynamic_field` to `True` when defining the data model. After that, all undefined fields and their values in the inserted data entities are treated as if they were pre-defined fields. We use the term "dynamic fields" to refer to these key-value pairs.

With these dynamic fields, you can ask Milvus to output them in search/query results and include them in search and query filter expressions, just as if they were already defined in the collection schema.

In [10]:
# 2. Define fields
# Only these three fields are declared explicitly. `id` is an auto-generated
# INT64 primary key, so it takes no `max_length` (that setting applies to
# VARCHAR fields such as `title`).
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=768)
]

# 3. Create schema with dynamic field enabled
schema = CollectionSchema(
    fields,
    "The schema for a medium news collection",
    enable_dynamic_field=True
)

# 4. Create collection
collection = Collection(COLLECTION_NAME, schema)

# 5. Index collection
index_params = {
    "index_type": "AUTOINDEX",
    "metric_type": "L2",
    "params": {}
}

collection.create_index(
    field_name="title_vector",
    index_params=index_params
)

# Load the collection, then report how far loading has progressed.
collection.load()

# Get loading progress
progress = utility.loading_progress(COLLECTION_NAME)

print(progress)

{'loading_progress': '100%'}


## Insert dynamic data

Once the collection is created, you can start inserting data, including dynamic data, into it.

### Prepare data

Now we need to prepare suitable data from the [Example Dataset](https://docs.zilliz.com/docs/example-dataset).

In [None]:
# Download the dataset

!curl https://assets.zilliz.com/medium_articles_2020_dpr_a13e0377ae.json \
    --output medium_articles_2020_dpr.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 60.4M  100 60.4M    0     0  36.1M      0  0:00:01  0:00:01 --:--:-- 36.1M


In [11]:
# 6. Prepare data
# Read the file as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(DATASET_PATH, encoding="utf-8") as f:
    data = json.load(f)

list_of_rows = data['rows']

# Build the rows to insert without the id field, because the primary key has
# auto_id enabled. Copies are made so the loaded rows stay untouched, and a
# row that has no id is simply passed through.
# Other keys except the title and title_vector fields in the row
# will be treated as dynamic fields.
data_rows = [
    {key: value for key, value in row.items() if key != 'id'}
    for row in list_of_rows
]

### Insert data

Then you can safely insert the data into the collection.

In [12]:
# 7. Insert data
# Insert the prepared rows, then flush the collection.
result = collection.insert(data_rows)
collection.flush()

# Report how many entities the insert call says it wrote.
message = f"Data inserted successfully! Inserted counts: {result.insert_count}"
print(message)

Data inserted successfully! Inserted counts: 5979


## Search with dynamic fields

Once you have created the `medium_articles_2020` collection with dynamic fields enabled, inserted data containing dynamic fields, and indexed and loaded the collection, you can use dynamic fields in the filter expression of a search or a query as follows:

In [13]:
# 8. Search data
# The vector of the first prepared row is used as the query vector.
query_vector = data_rows[0]['title_vector']
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}

result = collection.search(
    data=[query_vector],
    anns_field="title_vector",
    param=search_params,
    limit=3,
    # Access dynamic fields in the boolean expression
    expr='claps > 30 and reading_time < 10',
    # Include dynamic fields in the output to return
    output_fields=["title", "reading_time", "claps"],
)

# Turn every hit of every query into a plain dictionary for printing.
result = [[hit.entity.to_dict() for hit in hits] for hits in result]

print(result)

# get collection info
print("Entity counts: ", collection.num_entities)

# Output
#
# Entity counts:  5979

[[{'id': 445383736951017465, 'distance': 0.36103832721710205, 'entity': {'title': 'The Hidden Side Effect of the Coronavirus', 'reading_time': 8, 'claps': 83}}, {'id': 445383736951017499, 'distance': 0.37674015760421753, 'entity': {'title': 'Why The Coronavirus Mortality Rate is Misleading', 'reading_time': 9, 'claps': 2900}}, {'id': 445383736951015299, 'distance': 0.416297972202301, 'entity': {'title': 'Coronavirus shows what ethical Amazon could look like', 'reading_time': 4, 'claps': 51}}]]
Entity counts:  5979


It is worth noting that `claps` and `reading_time` are not present in the schema you defined. This does not prevent you from using them in the filter expression or including them in the output fields, as long as the inserted data entities have these fields; you use them just as you would use schema-defined fields.

If the key of a dynamic field contains characters other than digits, letters, and underscores (e.g. plus signs, asterisks, or dollar signs), you need to include the key within `$meta[]` when using it in a boolean expression or including it in the output fields, for example `expr='$meta["#key"] in ["a", "b", "c"]'` and `output_fields=['$meta["#key"]']`.

## Drop Collection

You can drop the collection as follows:

In [14]:
# 9. Drop collection
# Remove the collection named by COLLECTION_NAME.
utility.drop_collection(collection_name=COLLECTION_NAME)