## 创建客户端

In [1]:
from pymilvus import MilvusClient, DataType


# Define server address
SERVER_ADDR = "http://localhost:19530"

client = MilvusClient(uri = SERVER_ADDR)

## 定义集合模式schema

In [2]:
schema = client.create_schema(
    auto_id=False,
    enable_dynamic_fields=True
)

# 'vector_field1' 支持null并且默认值设为'Unknown' 。 'vector_field2支持null并且不设置默认值'
schema.add_field(
    field_name = 'varchar_field1',
    datatype=DataType.VARCHAR,
    max_length=100,
    nullable=True,
    default_value="Unknown"
)
schema.add_field(field_name="varchar_field2", datatype=DataType.VARCHAR, max_length=200, nullable=True)

# 设置"pk"字段作为primary key
schema.add_field(
    field_name="pk",
    is_primary=True,
    datatype=DataType.INT64,
    auto_id=True
)
# 添加vector field "embedding" datatype为FLOAT_VECTOR
schema.add_field(
    field_name="embedding",
    datatype=DataType.FLOAT_VECTOR,
    dim=3
)

{'auto_id': False, 'description': '', 'fields': [{'name': 'varchar_field1', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}, 'default_value': string_data: "Unknown"
, 'nullable': True}, {'name': 'varchar_field2', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 200}, 'nullable': True}, {'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 3}}], 'enable_dynamic_field': False}

## 设置索引参数 index params 配置向量字段和字符串字段的度量类型

In [3]:
index_params = client.prepare_index_params()

# Index `varchar_field1` with AUTOINDEX
index_params.add_index(
    field_name="varchar_field1",
    index_type="AUTOINDEX",
    index_name="autoindex_varchar_field1",    
)

# Index `embedding` with COSINE 
index_params.add_index(
    field_name="embedding",
    index_type="AUTOINDEX",
    metric_type="COSINE",       # # Specify similarity metric type, options include L2, COSINE, or IP
)

# 检查集合是否已存在，如果存在则删除
if client.has_collection("my_collection"):
    client.drop_collection("my_collection")
    print("已删除现有集合 'my_collection'")

# 创建集合collections
client.create_collection(
    collection_name="my_collection",
    schema=schema,
    index_params=index_params,
)

print("成功创建集合 'my_collection'")

已删除现有集合 'my_collection'
成功创建集合 'my_collection'


In [4]:
# 插入数据
data = [
    {"varchar_field1": "Product A", "varchar_field2": "High quality product", "embedding": [0.1, 0.2, 0.3]},
    {"varchar_field1": "Product B", "embedding": [0.4, 0.5, 0.6]}, # varchar_field2 field is missing, which should be NULL
    {"varchar_field1": None, "varchar_field2": None,"embedding": [0.2, 0.3, 0.1]},  # `varchar_field1` should default to `Unknown`, `varchar_field2` is NULL
    {"varchar_field1": "Product C", "varchar_field2": None,  "embedding": [0.5, 0.7, 0.2]},  # `varchar_field2` is NULL
    {"varchar_field1": None, "varchar_field2": "Exclusive deal",  "embedding": [0.6, 0.4, 0.8]},  # `varchar_field1` should default to `Unknown`
    {"varchar_field1": "Unknown", "varchar_field2": None, "embedding": [0.8, 0.5, 0.3]},  # `varchar_field2` is NULL
    {"varchar_field1": "", "varchar_field2": "Best seller", "embedding": [0.8, 0.5, 0.3]}, # Empty string is not treated as NULL
]

client.insert(
    collection_name="my_collection",
    data=data
)
print("数据插入成功")

数据插入成功


In [5]:
# 带过滤表达式的向量搜索
# Filter `varchar_field2` with value "Best seller"
filter = 'varchar_field2 == "Best seller"'

search_params = {
    "metric_type": "COSINE",
    "params": {"nprobe": 10},
}

results = client.search(
    collection_name="my_collection",
    data=[[0.3, -0.6, 0.1]],
    search_params=search_params,
    filter=filter,
    output_fields=["pk", "varchar_field1", "varchar_field2", "embedding"],
    limit=3
)
print(results)

data: [[{'pk': 460327157461156626, 'distance': -0.04468163847923279, 'entity': {'varchar_field2': 'Best seller', 'varchar_field1': '', 'pk': 460327157461156626, 'embedding': [0.800000011920929, 0.5, 0.30000001192092896]}}]]


In [6]:
# 使用筛选表达式进行查询
results = client.query(
    collection_name="my_collection",
    filter='varchar_field1 == "Unknown"',
    output_fields=["pk", "varchar_field1", "varchar_field2", "embedding"],
)
print("查询结果:")
print(results)

查询结果:
data: ["{'pk': 460327157461156622, 'varchar_field1': 'Unknown', 'varchar_field2': None, 'embedding': [0.20000000298023224, 0.30000001192092896, 0.10000000149011612]}", "{'pk': 460327157461156624, 'varchar_field1': 'Unknown', 'varchar_field2': 'Exclusive deal', 'embedding': [0.6000000238418579, 0.4000000059604645, 0.800000011920929]}", "{'pk': 460327157461156625, 'varchar_field1': 'Unknown', 'varchar_field2': None, 'embedding': [0.800000011920929, 0.5, 0.30000001192092896]}"], extra_info: {}


In [7]:
filter = "pk in [1, 3, 5]"
results = client.query(
    collection_name="my_collection",
    filter=filter,
    output_fields=["pk", "varchar_field1", "varchar_field2", "embedding"],
)
print("查询结果:")
print(results)  

查询结果:
data: [], extra_info: {}
