# Milvus lite

In [1]:
from pymilvus import MilvusClient

client = MilvusClient("milvus_demo.db")

## Create Collection
简易快速构建，但仍然需要预先设定dimension

In [2]:
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")
client.create_collection(
    collection_name="demo_collection",
    dimension=768,# The vectors we will use in this demo has 768 dimensions
)

### 按需安装

In [None]:
%pip install "pymilvus[model]"

## Embedding Function
#### pymilvus默认方案  

In [4]:
from pymilvus import model

# If connection to https://huggingface.co/ failed, uncomment the following path
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# # This will download a small embedding model "paraphrase-albert-small-v2" (~50MB).
embedding_fn = model.DefaultEmbeddingFunction()

# Text strings to search from.
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

vectors = embedding_fn.encode_documents(docs)
# The output vector has 768 dimensions, matching the collection that we just created.print("Dim:", embedding_fn.dim, vectors[0].shape)# Dim: 768 (768,)# Each entity has id, vector representation, raw text, and a subject label that we use# to demo metadata filtering later.
data = [
    {"id": i, "vector": vectors[i], "text": docs[i], "subject": "history"}
    for i in range(len(vectors))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

res = client.insert(collection_name="demo_collection", data=data)

print(res)


  from .autonotebook import tqdm as notebook_tqdm


Data has 3 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768
{'insert_count': 3, 'ids': [0, 1, 2]}


#### ollama方案  

In [8]:
from langchain_ollama import OllamaEmbeddings

# need to download the model first
# ollama pull llama3
# 同样支持其他模型，如 Qwen2.5:14b mxbai-embed-large

embeddings = OllamaEmbeddings(
    model="Qwen2.5:14b",
)



vectors1 = embeddings.embed_documents(docs)
print(vectors1)
# print every vector's element type
print([type(v) for v in vectors1[0]])


data1 = [
    {"id": i+100, "vector": vectors1[i], "text": docs[i], "subject": "history"}
    for i in range(len(vectors1))
]

res1 = client.insert(collection_name="demo_collection", data=data1)

print(res1)

ResponseError:  (status code: 502)

In [None]:
# Testing
response= 

# Search & Query
#### Search
这里的search类似于chroma中的query，是针对向量vector相似度进行的查询操作


In [7]:
search_vectors = embedding_fn.encode_queries(["Who is Alan Turing?"])
# If you don't have the embedding function you can use a fake vector to finish the demo:# query_vectors = [ [ random.uniform(-1, 1) for _ in range(768) ] ]

res = client.search(
    collection_name="demo_collection",# target collection
    data=search_vectors,# query vectors
    limit=2,# number of returned entities
    output_fields=["text", "subject"],# specifies fields to be returned
)

print(res)

data: ["[{'id': 2, 'distance': 0.5859944820404053, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]"]


#### Search中添加filter

In [8]:
# Insert more docs in another subject.
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
    {"id": 3 + i, "vector": vectors[i], "text": docs[i], "subject": "biology"}
    for i in range(len(vectors))
]

client.insert(collection_name="demo_collection", data=data)

# This will exclude any text in "history" subject despite close to the query vector.
res = client.search(
    collection_name="demo_collection",
    data=embedding_fn.encode_queries(["tell me AI related information"]),
    filter="subject == 'biology'",
    limit=2,
    output_fields=["text", "subject"],
)

print(res)


data: ["[{'id': 4, 'distance': 0.27030569314956665, 'entity': {'text': 'Computational synthesis with AI algorithms predicts molecular properties.', 'subject': 'biology'}}, {'id': 3, 'distance': 0.16425910592079163, 'entity': {'text': 'Machine learning has been used for drug design.', 'subject': 'biology'}}]"]


#### Query
这里的Query更接近一种过滤筛查，是针对标量scalar进行的查询操作

In [9]:
res = client.query(
    collection_name="demo_collection",
    filter="subject == 'history'",
    output_fields=["text", "subject"],
)
print(res)

data: ["{'id': 0, 'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history'}", "{'id': 1, 'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}", "{'id': 2, 'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}", "{'id': 100, 'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history'}", "{'id': 101, 'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}", "{'id': 102, 'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}"]


In [10]:
res = client.query(
    collection_name="demo_collection",
    ids=[0, 2],
    output_fields=["vector", "text", "subject"],
)
print(res)

data: ["{'id': 0, 'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history', 'vector': [0.010727874, -0.0358951, 0.018749742, 0.016348792, 0.0365169, 0.0035882175, -0.00040048725, 0.028529387, 0.0022745791, 0.0018363232, 0.00422582, 0.027173955, -0.003684388, 0.030791475, 0.0045053908, 0.04422817, 0.010503829, -0.029494507, -0.06707342, -0.020526454, 0.015322794, -0.006004949, -0.062285483, -0.039614767, 0.014206327, 0.032707665, -0.020834524, -0.0441743, -0.028339896, 0.029424444, -0.028087296, -0.020809013, 0.017159738, 0.0021116603, 0.021823762, -0.0015776412, -0.03769679, 0.041460752, -0.025056409, 0.08333673, -0.01597924, 0.0098138405, -0.026605474, 0.00061898155, 0.0037359172, -0.034155168, 0.05870825, -0.023721974, 0.00674594, -0.035841778, -0.017559981, 0.022803072, 0.0026646056, 0.025119307, 0.04694526, 0.012622844, 0.018337501, -0.007165574, 0.042811256, 0.0050310106, 0.0570555, -0.014866017, 0.100454755, 0.0064572743, -0.06832724,

## Delete entities

In [15]:
# Delete entities by primary key
res = client.delete(collection_name="demo_collection", ids=[0, 2])

print(res)

# Delete entities by a filter expression
res = client.delete(
    collection_name="demo_collection",
    filter="subject == 'biology'",
)

print(res)


[0, 2]
[3, 4, 5]


## Drop Collection

In [17]:
# Drop collection
client.drop_collection(collection_name="demo_collection")
