In [None]:
from pymilvus import MilvusClient, client, DataType

# 不支持windows
client = MilvusClient("../data/milvus_demo.db")
# client = MilvusClient(uri="http://dbconn.sealosgzg.site:45034")

In [8]:
if client.has_collection(collection_name="demo_shcema_collection"):
    client.drop_collection(collection_name="demo_shcema_collection")

# 3.1. Create schema
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)

# 3.2. Add fields to schema
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=768)
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=512)
# schema.add_field(field_name="subject", datatype=DataType.VARCHAR, max_length=512)

index_params = client.prepare_index_params()

# 3.4. Add indexes - 修复索引类型
index_params.add_index(field_name="id")  # 主键字段不需要指定索引类型

index_params.add_index(
    field_name="vector", 
    index_type="IVF_FLAT",  # 使用有效的索引类型
    metric_type="COSINE",
    params={"nlist": 1024}  # IVF_FLAT 需要 nlist 参数
)

client.create_collection(
    collection_name="demo_shcema_collection", schema=schema, index_params=index_params
)

res = client.get_load_state(collection_name="demo_shcema_collection")

print(res)

{'state': <LoadState: Loaded>}


In [13]:
# import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
    "你好",
    "新增的测试数据",
]

# 设置 Hugging Face 镜像
# export HF_ENDPOINT=https://hf-mirror.com
# from pymilvus import model
# embedding_fn = model.DefaultEmbeddingFunction()
# vectors = embedding_fn.encode_documents(docs)

# pip install sentence-transformers
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('all-MiniLM-L6-v2')  # 这个模型比较小
# vectors = model.encode(docs)

import numpy as np

np.random.seed(42)  # 保证可重复
vectors = [np.random.rand(768).astype(np.float32) for _ in docs]

print("Dim:", vectors[0].shape)  # Dim: 768 (768,)

data = [
    {"id": i, "vector": vectors[i], "text": docs[i], "subject": "a"}
    for i in range(len(vectors))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

Dim: (768,)
Data has 5 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


In [14]:
res = client.insert(collection_name="demo_shcema_collection", data=data)

print(res)

{'insert_count': 5, 'ids': [0, 1, 2, 3, 4]}


In [16]:
query_vectors = embedding_fn.encode_queries(["Who is Alan Turing?"])

res = client.search(
    collection_name="demo_shcema_collection",  # target collection
    data=query_vectors,  # query vectors
    limit=2,  # number of returned entities
    output_fields=["text", "subject"],  # specifies fields to be returned
)

print(res)

NameError: name 'embedding_fn' is not defined

In [6]:
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
    {"id": 4 + i, "vector": vectors[i], "text": docs[i], "subject": "biology"}
    for i in range(len(vectors))
]

client.insert(collection_name="demo_shcema_collection", data=data)

res = client.search(
    collection_name="demo_shcema_collection",
    data=embedding_fn.encode_queries(["tell me AI related information"]),
    filter="subject == 'biology'",
    limit=2,
    output_fields=["text", "subject"],
)

print(res)

data: [[{'id': 5, 'distance': 0.2703055143356323, 'entity': {'text': 'Computational synthesis with AI algorithms predicts molecular properties.', 'subject': 'biology'}}, {'id': 4, 'distance': 0.1642589271068573, 'entity': {'text': 'Machine learning has been used for drug design.', 'subject': 'biology'}}]]


In [7]:
res = client.query(
    collection_name="demo_shcema_collection",
    filter="subject == 'history'",
    output_fields=["text", "subject"],
)

print(res)

data: [], extra_info: {}
