In [1]:
# Load environment variables from the .env file two directories up before the
# project helper below is imported.
# NOTE(review): presumably the helper reads its API credentials from the
# environment — confirm against llm/openai_hepler.
from dotenv import load_dotenv
load_dotenv("../../.env", verbose=True)

# Append ../../src/ to the import path; this has to run before the `llm`
# import further down (presumably that package lives under ../../src/).
import sys
sys.path.append('../../src/')

import pandas as pd

# NOTE(review): the module name is spelled `openai_hepler` (not `helper`);
# presumably it matches the file on disk — confirm before renaming anything.
from llm.openai_hepler import encode_queries

## Encode texts to embeddings

In [2]:

# Text strings to search from.
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

vectors = encode_queries(docs)
# Each embedding has 3072 dimensions (see the "Vector dim" printout in the next
# cell), not 768; the Milvus collection created later must use the same size.
# NOTE(review): the bare `vectors` expression below displays every float of all
# embeddings — consider showing only a short slice to keep the output readable.
vectors


[[0.009537402540445328,
  -0.011060362681746483,
  -0.005924422759562731,
  -0.028450191020965576,
  0.022833600640296936,
  -0.003585977014154196,
  0.002272288780659437,
  0.056468334048986435,
  0.027521293610334396,
  0.02875262312591076,
  -0.017595048993825912,
  -0.019776878878474236,
  0.02974632754921913,
  0.017249412834644318,
  -0.07664486020803452,
  0.01870756596326828,
  0.016201702877879143,
  -0.006059437058866024,
  -0.01497037336230278,
  -0.02901184931397438,
  -0.012021663598716259,
  0.000781057111453265,
  -0.032857052981853485,
  0.01227009017020464,
  -0.010363689623773098,
  0.006880323402583599,
  0.006540087517350912,
  0.0007135500200092793,
  -0.027780519798398018,
  0.043139733374118805,
  8.809676364762709e-05,
  -0.018286321312189102,
  -0.01474354974925518,
  -0.03128008916974068,
  0.038711268454790115,
  0.024410566315054893,
  -0.03369954228401184,
  0.016277311369776726,
  -0.045494381338357925,
  0.0252530537545681,
  0.02873102016746998,
  0.0230

In [3]:
# Assemble the records to insert: one dict per document holding its primary
# key, embedding, raw text, and a subject label that the metadata-filtering
# demo relies on later.
data = [
    dict(id=idx, vector=vec, text=docs[idx], subject="history")
    for idx, vec in enumerate(vectors)
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

Data has 3 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 3072


# 1 Milvus

## 1.1 Set up Vector DataBase

To create a local Milvus vector database, simply instantiate a MilvusClient by specifying a file name to store all data, such as "milvus_demo.db".

In [4]:
from pymilvus import MilvusClient

# Passing a local file name as the URI stores all data in that file
# ("milvus_demo.db" in the working directory).
client = MilvusClient(uri="milvus_demo.db")

# Alternative: connect to a Milvus server instead of the local file, e.g.
# client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")

## 1.2 Create a Collection

默认设置和选项：

- 主键和向量字段：
该集合会自动包含一个主键字段和一个向量字段，它们的默认名称分别是 "id"（主键）和 "vector"（向量）。

- 距离度量类型：
距离度量类型默认设置为 COSINE（余弦相似度），这是一种计算向量相似度的常用方法，特别是在高维空间中，余弦相似度通过计算两个向量之间夹角的余弦值来衡量它们的相似度。

- 主键行为：
默认情况下，主键字段接受整数输入，并且不会自动递增（即没有启用自动 ID 功能）。如果你想使用特定的 ID，需要手动为每个记录指定主键。Milvus 还提供自动 ID 功能，但在这个例子中没有启用。

In [5]:
# Start from a clean slate so the notebook can be re-run from the top without
# colliding with a collection left over from a previous run.
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")

# The collection dimension must equal the embedding size. Derive it from the
# encoded vectors (3072 for the embeddings produced above) instead of
# hard-coding a number that silently goes stale when the embedding model
# changes — the previous inline comment still claimed 768.
client.create_collection(
    collection_name="demo_collection",
    dimension=len(vectors[0]),
)

## 1.3 Insert

In [6]:
# Write the prepared entities into the collection and show the response
# (insert count and the primary keys that were written).
res = client.insert(collection_name="demo_collection", data=data)
print(res)

{'insert_count': 3, 'ids': [0, 1, 2]}


## 1.4 Search

Milvus accepts one or multiple vector search requests at the same time. The value of the query_vectors variable is a list of vectors, where each vector is an array of float numbers.

The output is a list of result lists, one per query vector. Each entry in a result list contains the entity primary key, the distance to the query vector, and the entity details for the specified `output_fields`.


In [7]:
# Embed the question with the same encoder that produced the document vectors.
query_vectors = encode_queries(["Who is Alan Turing?"])

# Ask for the two closest entities and return their text and subject fields
# alongside the primary key and distance.
res = client.search(
    collection_name="demo_collection",
    data=query_vectors,
    output_fields=["text", "subject"],
    limit=2,
)

# Only one query vector was sent, so show the hits of the first result list.
pd.DataFrame(res[0])

Unnamed: 0,id,distance,entity
0,1,0.613731,{'text': 'Alan Turing was the first person to ...
1,2,0.511313,"{'text': 'Born in Maida Vale, London, Turing w..."


### 1.4.1 Vector Search with Metadata Filtering

By default, scalar fields are not indexed. If you need to run metadata-filtered searches on a large dataset, consider using a fixed schema and enabling a scalar [index](https://milvus.io/docs/scalar_index.md) to improve search performance.

In [14]:
# Add a second batch of documents under a different subject.
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
vectors = encode_queries(docs)

# Primary keys start at 3 for this batch.
data = [
    dict(id=3 + idx, vector=vec, text=docs[idx], subject="biology")
    for idx, vec in enumerate(vectors)
]
client.insert(collection_name="demo_collection", data=data)

# The filter restricts the search to the "biology" subject, so "history"
# entities are excluded even when they are close to the query vector.
res = client.search(
    collection_name="demo_collection",
    data=encode_queries(["tell me AI related information"]),
    filter="subject == 'biology'",
    limit=2,
    output_fields=["text", "subject"],
)

pd.DataFrame(res[0])

Unnamed: 0,id,distance,entity
0,4,0.360059,{'text': 'Computational synthesis with AI algo...
1,3,0.257998,{'text': 'Machine learning has been used for d...


## 1.5 Query

A query() is an operation that retrieves all entities matching a criterion, such as a [filter expression](https://milvus.io/docs/boolean.md) or a list of ids.

For example, retrieving all entities whose scalar field has a particular value:

In [18]:
# Retrieve every entity whose `subject` equals "history", returning the text
# and subject fields.
res = client.query(
    collection_name="demo_collection",
    filter="subject == 'history'",
    output_fields=["text", "subject"],
)

pd.DataFrame(res)

Unnamed: 0,id,text,subject
0,0,Artificial intelligence was founded as an acad...,history
1,1,Alan Turing was the first person to conduct su...,history
2,2,"Born in Maida Vale, London, Turing was raised ...",history


In [19]:
# Look up specific entities by primary key and include the stored vector in
# the returned fields.
res = client.query(
    collection_name="demo_collection",
    ids=[0, 2],
    output_fields=["vector", "text", "subject"],
)

pd.DataFrame(res)

Unnamed: 0,id,text,subject,vector
0,0,Artificial intelligence was founded as an acad...,history,"[0.009537403, -0.011060363, -0.0059244228, -0...."
1,2,"Born in Maida Vale, London, Turing was raised ...",history,"[-0.011374049, 0.045968324, -0.0060196617, -0...."


## 1.6 Delete Entities

If you'd like to purge data, you can delete entities by specifying their primary keys, or delete all entities matching a particular filter expression.

In [20]:
# Remove the entities with primary keys 0 and 2, then show the response.
res = client.delete(collection_name="demo_collection", ids=[0, 2])
print(res)


[0, 2]


In [21]:
# Remove every entity whose `subject` equals "biology", then show the response.
res = client.delete(collection_name="demo_collection", filter="subject == 'biology'")
print(res)

[3, 4, 5]


## 1.7 Drop the collection

In [23]:
# Clean up: drop the "demo_collection" collection that this walkthrough created.
client.drop_collection(collection_name="demo_collection")