In [22]:
# from pymilvus import model, MilvusClient, DataType, FieldSchema, CollectionSchema
# import pandas as pd
from pymilvus import model, MilvusClient, DataType, FieldSchema, CollectionSchema
import pandas as pd

In [18]:
def init_embedding():
    # 初始化嵌入模型
    ef = model.DefaultEmbeddingFunction()  # 正確引用model內的方法
    client = MilvusClient("./milvus_data.db")
    return ef, client

In [38]:
# 初始化组件
ef = model.DefaultEmbeddingFunction()  # 确保已安装 pymilvus[model]
client = MilvusClient("./dqe_milvus_data.db")  # 必须包含.db后缀

# 定义字段结构（关键修正点）
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="module", dtype=DataType.VARCHAR, max_length=50),
    FieldSchema(name="severity", dtype=DataType.VARCHAR, max_length=1),
    FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=2000),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=ef.dim)
]

# 创建集合（新API规范）
collection_name = "quality_issues"
if client.has_collection(collection_name):
    client.drop_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    schema=CollectionSchema(fields, description="质量知识库"),  # 单一路径传递schema
    # 不再需要单独传递fields参数
)

# 数据插入示例
df = pd.read_csv("deq_learn_refine2_correct.csv")
data = [{
    "module": row["模块"],
    "severity": str(row["严重度"]),
    "description": row["问题现象描述"],
    "vector": ef.encode_documents([row["问题现象描述"]])[0]
} for _, row in df.iterrows()]

client.insert(collection_name, data)

print(f"成功插入 {len(df)} 条数据，向量维度={ef.dim}")


成功插入 28 条数据，向量维度=768


#### 建立index

In [43]:
# 索引創建參數
index_params = client.prepare_index_params()
index_params.add_index(
    field_name="vector",
    index_type="IVF_FLAT",
    metric_type="COSINE",
    params={"nlist": 256}  # 典型值 128-4096
)

# 搜索參數對應關係
search_params = {
    "nprobe": 32  # 值範圍 [1, nlist]
}


#### Test Codes

In [41]:
def test_basic_operations():
    # 连接验证
    assert client.has_collection(collection_name), "集合创建失败"
    
    # 数据量验证
    count = client.query(collection_name, filter="", output_fields=["count(*)"])[0]["count(*)"]
    assert count == len(df), f"数据量不符 ({count} vs {len(df)})"
    
    # 向量维度验证
    collection_info = client.describe_collection(collection_name)
    
    # assert collection_info["vector_field"]["dim"] == ef.dim, "向量维度错误"
    vector_fields = [f for f in collection_info['fields'] if f['type'] == DataType.FLOAT_VECTOR]
    assert len(vector_fields) > 0, "未找到向量字段"
    assert vector_fields[0]['params']['dim'] == ef.dim, "向量維度錯誤"
    
    # 随机抽样验证
    sample = client.query(collection_name, filter="pk < 5", output_fields=["*"])
    for item in sample:
        assert len(item["vector"]) == ef.dim, "向量长度异常"
        assert item["description"], "描述字段为空"

test_basic_operations()


In [44]:
def test_hybrid_search():
    # 向量搜索 + 標量過濾
    query = "電池鼓包問題"
    query_vec = ef.encode_documents([query])[0]
    
    results = client.search(
        collection_name=collection_name,
        data=[query_vec],
        expr="severity == 'A'",  # 獨立傳遞篩選條件
        limit=3,
        output_fields=["module", "severity"],
        param={"nprobe": 32}  # IVF_FLAT 專用參數
    )
    
    assert len(results) > 0, "混合查詢無結果"
    for hit in results:
        assert hit.entity.get("severity") == 'A', "嚴重度過濾失效"
        print(f"找到相似案例: {hit.entity.module} | 相似度: {hit.distance:.4f}")

test_hybrid_search()

2025-04-10 18:03:50,074 [ERROR][handler]: Unexpected error: [search], GrpcHandler.search() got multiple values for argument 'param', <Time: {'RPC start': '2025-04-10 18:03:50.074042', 'Exception': '2025-04-10 18:03:50.074083'}> (decorators.py:158)
2025-04-10 18:03:50,074 [ERROR][search]: Failed to search collection: quality_issues (milvus_client.py:415)


MilvusException: <MilvusException: (code=1, message=Unexpected error, message=<GrpcHandler.search() got multiple values for argument 'param'>)>

In [30]:
import time

def test_performance():
    # 查询延迟测试
    start = time.time()
    client.query(collection_name, filter="pk < 100", limit=100)
    print(f"标量查询延迟: {time.time()-start:.4f}s")

    # 向量搜索压力测试
    test_vectors = [ef.random_vector(ef.dim) for _ in range(100)]
    start = time.time()
    client.search(collection_name, data=test_vectors, limit=3)
    print(f"批量向量搜索延迟: {time.time()-start:.4f}s")

test_performance()

标量查询延迟: 0.0019s


AttributeError: 'OnnxEmbeddingFunction' object has no attribute 'random_vector'