In [2]:
from pymilvus import MilvusClient
# Instantiate the Milvus client, pointing it at "milvus_demo.db".
client = MilvusClient(uri="milvus_demo.db")

## 创建好client 和 collections

In [3]:
# A collection stores the vectors together with their metadata. A schema and
# index parameters can additionally configure the vector spec (dimension,
# index type, distance metric); only the dimension is set here.
# Any existing "demo_collection" is dropped first so it is created fresh.
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")

client.create_collection(collection_name="demo_collection", dimension=768)

In [4]:
# Point HF_ENDPOINT at the hf-mirror.com mirror, intended for when
# https://huggingface.co/ cannot be reached. The lines are active here (not
# commented out); remove them if the mirror is not wanted.
import os

os.environ.update({"HF_ENDPOINT": "https://hf-mirror.com"})

# This will download a small embedding model "paraphrase-albert-small-v2" (~50MB).

## 加载模型并且对数据进行embedding向量化

In [5]:
from pymilvus import model


# Corpus of text strings to search from.
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

# This downloads a small embedding model, "paraphrase-albert-small-v2" (~50MB).
embedding_fn = model.DefaultEmbeddingFunction()

# Turn the raw documents into vectors with the embedding model.
vectors = embedding_fn.encode_documents(docs)

# The output vectors have 768 dimensions. The embedding model's output
# dimension must match the `dimension` the collection was created with.
print("Dim:", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Dim: 768 (768,)


In [6]:
# Report the length of the first document embedding.
embedding_length = len(vectors[0])
print(f"向量的长度为：{embedding_length}")

向量的长度为：768


In [7]:
# Records to insert into Milvus, one per document:
#   id      - primary key; unique per record, must not repeat
#   vector  - embedding of the text
#   text    - the original text
#   subject - a label, free to customise
data = [
    {"id": idx, "vector": vec, "text": doc, "subject": "history"}
    for idx, (vec, doc) in enumerate(zip(vectors, docs))
]

# data[0].keys() lists the fields of the first record, i.e. the fields the
# collection schema is expected to have.
print(f"Data has {len(data)} entities,each with fields:{data[0].keys()}")

Data has 3 entities,each with fields:dict_keys(['id', 'vector', 'text', 'subject'])


In [8]:
# Take the vector of the first record and print its length.
first_vector = data[0]["vector"]
print("Vector dim:", len(first_vector))

Vector dim: 768


## 让我们把数据插入 Collections：

In [9]:
# Insert the prepared records into the collection.
res = client.insert(collection_name="demo_collection", data=data)

# `res` is a mapping, so len(res) would count its keys rather than the rows
# written; the number of inserted entities is stored under "insert_count".
print(f"Inserted {res['insert_count']} entities")

# insert_count: how many records were written to the database.
# ids: the primary keys of the inserted records.
print(res)

Inserted 3 entities
{'insert_count': 3, 'ids': [0, 1, 2]}


## 语义搜索

### 1.向量搜索

In [10]:
# Milvus accepts one or several vector search requests at once: `query_vectors`
# is a list of vectors, each of which is an array of floats.
queries = ["who is Alan Turing?"]
query_vectors = embedding_fn.encode_queries(queries)

# Run the search with the query embeddings (not the records inserted earlier).
# The fields named in output_fields are returned under each hit's "entity" key.
result = client.search(
    collection_name="demo_collection",
    data=query_vectors,
    output_fields=["text", "subject"],
    limit=2,
)
print(result)

# 输出结果是一个结果列表，每个结果映射到一个向量搜索查询。
# 每个查询都包含一个结果列表，其中每个结果都包含实体主键、到查询向量的距离以及指定output_fields 的实体详细信息。


data: [[{'id': 2, 'distance': 0.5859946012496948, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]]


### 2.带元数据过滤的向量搜索

In [11]:
# NOTE: this cell rebinds `docs`, `vectors` and `data`, replacing the history
# documents prepared earlier with a second batch labelled "biology".
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)

# Primary keys start at 3, after the ids 0-2 used by the history records.
data = [
    {"id": idx, "vector": vec, "text": doc, "subject": "biology"}
    for idx, (vec, doc) in enumerate(zip(vectors, docs), start=3)
]
client.insert(collection_name="demo_collection", data=data)

# Vector search combined with a metadata filter on the "subject" field.
res = client.search(
    collection_name="demo_collection",
    data=embedding_fn.encode_queries(["tell me AI related information"]),
    output_fields=["text", "subject"],
    filter="subject == 'biology'",
    limit=2,
)
print(res)


data: [[{'id': 4, 'distance': 0.2703055739402771, 'entity': {'text': 'Computational synthesis with AI algorithms predicts molecular properties.', 'subject': 'biology'}}, {'id': 3, 'distance': 0.16425904631614685, 'entity': {'text': 'Machine learning has been used for drug design.', 'subject': 'biology'}}]]


## 查询

In [12]:
# Query by filter expression: fetch the entities whose subject is "history",
# returning their text and subject fields.
res = client.query(
    collection_name="demo_collection", filter="subject == 'history'", output_fields=["text", "subject"]
)

In [13]:
# Display the result of the filter query assigned to `res` in the previous cell.
res

data: ["{'id': 0, 'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history'}", "{'id': 1, 'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}", "{'id': 2, 'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}"], extra_info: {}

In [14]:
# Query by primary key: fetch the entities with ids 0 and 2, returning their
# text and subject fields.
res = client.query(collection_name="demo_collection", ids=[0, 2], output_fields=["text", "subject"])

In [15]:
# Display the result of the primary-key query assigned to `res` in the previous cell.
res

data: ["{'id': 0, 'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history'}", "{'id': 2, 'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}"], extra_info: {}

In [16]:
# Remove the entities whose primary keys are 0 and 2, then show what the
# delete call returned.
res = client.delete(collection_name="demo_collection", ids=[0, 2])
print(res)

[0, 2]


In [17]:
# Remove every entity matching a filter expression (subject is "biology"),
# then show what the delete call returned.
res = client.delete(collection_name="demo_collection", filter="subject == 'biology'")
print(res)

[3, 4, 5]


In [None]:
# Clean up: drop the demo collection.
client.drop_collection("demo_collection")