In [1]:
!pip install --upgrade pymilvus openai requests tqdm
!pip install torch
!pip install llama-index
!pip install transformers
!pip install sentence-transformers 


!pip install llama-index-embeddings-huggingface 
!pip install llama-index-embeddings-instructor 

Collecting pymilvus
  Using cached pymilvus-2.5.1-py3-none-any.whl.metadata (5.7 kB)
Collecting openai
  Using cached openai-1.58.1-py3-none-any.whl.metadata (27 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting grpcio<=1.67.1,>=1.49.1 (from pymilvus)
  Using cached grpcio-1.67.1-cp312-cp312-macosx_10_9_universal2.whl.metadata (3.9 kB)
Collecting protobuf>=3.20.0 (from pymilvus)
  Using cached protobuf-5.29.2-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Collecting python-dotenv<2.0.0,>=1.0.1 (from pymilvus)
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Using cached ujson-5.10.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.3 kB)
Collecting pandas>=1.2.4 (from pymilvus)
  Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus)
  Using cached milvus_lite-2.4.10-py3-none-macosx_11_0_arm64.whl.metadata (9.

In [2]:
# ! wget https://github.com/milvus-io/milvus-docs/releases/download/v2.4.6-preview/milvus_docs_2.4.x_en.zip


In [3]:
# ! unzip milvus_docs_2.4.x_en.zip

## Prepare Data
We load all markdown files from the folder `en/faq`. For each document, we simply split the file content on "# ", which roughly separates the main sections of the markdown file.

This separation is very simplistic - something like docling will do a better job.

In [2]:
from glob import glob

# Collected text chunks from every FAQ markdown file.
text_lines = []

# Sort the matches so the chunk order is the same on every run; glob() returns
# files in arbitrary order. The pattern has no "**", so no recursive flag is needed.
for file_path in sorted(glob("en/faq/*.md")):
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        file_text = file.read()

    # Naive chunking: cut wherever "# " occurs in the file.
    text_lines += file_text.split("# ")

## Prepare embedding model

The original example used an OpenAI embedding model; here we use a Hugging Face model instead.

In [4]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Hugging Face model used to embed both the documents and the question.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)


Testing embeddings

In [14]:
# Embed a throwaway sentence to check that the model works and to record the
# length of the vectors it produces.
embeddings = embed_model.get_text_embedding("Hello World!")
embedding_dim = len(embeddings)

print(embedding_dim)
print(embeddings[:5])  # first few components only

384
[-0.003275700146332383, -0.011690796352922916, 0.04155922681093216, -0.038148149847984314, 0.024183079600334167]


## Load data in Milvus
Create the connection and the collection, using the local, in-library version of Milvus (Milvus Lite).

In [6]:
from pymilvus import MilvusClient

# The URI is a local file path rather than a server address.
MILVUS_URI = "./milvus_demo.db"
milvus_client = MilvusClient(uri=MILVUS_URI)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
# Name of the Milvus collection that the following cells drop, create, fill and search.
collection_name = "my_rag_collection"

As for the argument of MilvusClient:

* Setting the uri to a local file, e.g. `./milvus.db`, is the most convenient method, as it automatically uses Milvus Lite to store all data in this file.
* If you have a large amount of data, you can set up a more performant Milvus server on Docker or Kubernetes. In this setup, use the server URI, e.g. `http://localhost:19530`, as your uri.
* If you want to use Zilliz Cloud, the fully managed cloud service for Milvus, adjust the uri and token, which correspond to the Public Endpoint and API key in Zilliz Cloud.

In [10]:
# Start from a clean slate: if a collection with this name is already present,
# drop it.
collection_exists = milvus_client.has_collection(collection_name)
if collection_exists:
    milvus_client.drop_collection(collection_name)

Create a new collection with specified parameters.

If we don't specify any field information, Milvus will automatically create a default id field for primary key, and a vector field to store the vector data. A reserved JSON field is used to store non-schema-defined fields and their values.

Filtering is also possible: https://milvus.io/docs/filtered-search.md

In [25]:
# Settings for the new collection, gathered in one place before the call.
collection_settings = {
    "collection_name": collection_name,
    "dimension": embedding_dim,  # length of the vectors produced by the embedding model
    "metric_type": "IP",  # inner product
    "consistency_level": "Strong",
}
milvus_client.create_collection(**collection_settings)

## Insert data

Iterate through the text lines, create embeddings, and then insert the data into Milvus.

Here `text` is a new field that is not defined in the collection schema. It will be automatically added to the reserved JSON dynamic field, which can be treated as a normal field at a high level.

In [16]:
from tqdm import tqdm

# One record per text chunk: the chunk's position is its id, the embedding is
# the vector, and the raw chunk is kept under "text".
data = [
    {"id": idx, "vector": embed_model.get_text_embedding(chunk), "text": chunk}
    for idx, chunk in enumerate(tqdm(text_lines, desc="Creating embeddings"))
]

# Last expression of the cell, so the insert summary is displayed.
milvus_client.insert(collection_name=collection_name, data=data)

Creating embeddings: 100%|█████████████████████████████████████████████████████████████████████| 72/72 [00:06<00:00, 11.19it/s]


{'insert_count': 72, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71], 'cost': 0}

# Build the RAG
## Retrieve data for a query
Let's try a milvus query

In [17]:
# The user question: it is embedded for the vector search and later placed in the LLM prompt.
question = "How is data stored in milvus"


In [18]:
# Turn the question into a vector with the same embedding model used for the
# documents, then ask Milvus for the 3 best matches.
question_vector = embed_model.get_text_embedding(question)

search_result = milvus_client.search(
    collection_name=collection_name,
    data=[question_vector],  # a list holding the single query vector
    limit=3,
    search_params={"metric_type": "IP", "params": {}},
    output_fields=["text"],  # include the stored chunk text in every hit
)

In [21]:
import json

# Only one query vector was searched, so the hits of interest are in the first entry.
hits = search_result[0]

# Pair each hit's text with its distance value.
retrieved_lines_with_distances = []
for hit in hits:
    retrieved_lines_with_distances.append((hit["entity"]["text"], hit["distance"]))

print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        " Where does Milvus store data?\n\nMilvus deals with two types of data, inserted data and metadata. \n\nInserted data, including vector data, scalar data, and collection-specific schema, are stored in persistent storage as incremental log. Milvus supports multiple object storage backends, including [MinIO](https://min.io/), [AWS S3](https://aws.amazon.com/s3/?nc1=h_ls), [Google Cloud Storage](https://cloud.google.com/storage?hl=en#object-storage-for-companies-of-all-sizes) (GCS), [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs), [Alibaba Cloud OSS](https://www.alibabacloud.com/product/object-storage-service), and [Tencent Cloud Object Storage](https://www.tencentcloud.com/products/cos) (COS).\n\nMetadata are generated within Milvus. Each Milvus module has its own metadata that are stored in etcd.\n\n###",
        0.8531745076179504
    ],
    [
        "How does Milvus flush data?\n\nMilvus returns success when inserted data are loaded to t

# RAG it up with LLM
Convert the retrieved documents to String

In [34]:
from openai import OpenAI

# Point the OpenAI client at a server on localhost instead of the default endpoint.
LOCAL_LLM_BASE_URL = "http://localhost:1234/v1"
LOCAL_LLM_API_KEY = "lm-studio"  # NOTE(review): looks like a placeholder, not a real secret - confirm
client = OpenAI(api_key=LOCAL_LLM_API_KEY, base_url=LOCAL_LLM_BASE_URL)

In [23]:
# Keep only the text of each (text, distance) pair and join the texts with newlines.
context = "\n".join(
    text for text, _distance in retrieved_lines_with_distances
)

Define the system and user prompts for the Language Model. The user prompt is assembled with the documents retrieved from Milvus.



In [24]:
# Text sent as the "system" message of the chat request.
# NOTE(review): the text starts with "Human:", which looks like a leftover role
# label inside a system message - confirm it is intended.
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
# Text sent as the "user" message. This is an f-string, so the current values of
# `context` and `question` are substituted when this cell runs; re-run the cell
# after changing either of them.
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

Use the OpenAI client to call the local LLM and generate a response based on the prompts.
I am using LM Studio; you can list the available models by running: `curl localhost:1234/v1/models`

In [36]:
# Conversation sent to the model: the system instructions first, then the
# user prompt that carries the retrieved context and the question.
chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]

response = client.chat.completions.create(
    model="granite-3.1-8b-instruct",
    messages=chat_messages,
)

# Show the text of the first choice.
answer = response.choices[0].message.content
print(answer)

In Milvus, inserted data such as vector data, scalar data, and collection-specific schema are stored in persistent storage as incremental logs. These can be stored across multiple object storage backends including MinIO, AWS S3, Google Cloud Storage, Azure Blob Storage, Alibaba Cloud OSS, and Tencent Cloud COS. Metadata generated within Milvus is stored in etcd. Incremental data, found in the growing segments buffered in memory before reaching a persistence threshold, and historical data from sealed segments stored in object storage are both loaded into memory for query processing.
