This notebook demonstrates basic operations with Milvus DB

### Setup prerequisites

### 03.01. Connecting to Milvus

In [1]:
#Open a named connection to the local Milvus server

#Import the pymilvus package
from pymilvus import connections

#Alias under which the connection is registered; other cells pass it as `using=`
connection_id="learn"

#Settings for the connection
#NOTE(review): pymilvus appears to document the credential key as "user"
#              rather than "username" - confirm. Keys are kept unchanged here.
learn_config = {
    "host": "localhost",
    "port": "19530",
    "username" : "",
    "password" : ""
}

#Register the settings under the alias (same as add_connection(learn={...}))
connections.add_connection(**{connection_id: learn_config})

#Connect using the registered alias
connections.connect(connection_id)

#List all connections
connections.list_connections()

MilvusException: <MilvusException: (code=2, message=Fail connecting to server on localhost:19530, illegal connection params or server unavailable)>

### 03.02. Creating databases and users

In [3]:
#Database operations
from pymilvus import db

#Name of the database this notebook works in
db_name="banned_db"

#Databases currently available to the connection
current_dbs=db.list_database(using=connection_id)
print("Current databases: ", current_dbs)

#Create the database only when it is not listed yet
if not (db_name in current_dbs):
    print("Creating database :", db_name)
    wiki_db = db.create_database(db_name, using=connection_id)

#Switch the connection over to the database
db.using_database(db_name, using=connection_id)


Current databases:  ['default']
Creating database : banned_db


In [4]:
#User management
from pymilvus import Role,utility

#User to create and attach to the "public" role
new_user = "banned_public"

#Users currently known to the server
current_users=utility.list_usernames(using=connection_id)
print("Current user list: ", current_users)

#Create the user only when it is not listed yet
#NOTE(review): the password is a hardcoded demo value - replace it outside of a tutorial
if not (new_user in current_users):
    utility.create_user(new_user, "12345678", using=connection_id)

#Handle to the "public" role; report whether the role exists
public_role = Role("public", using=connection_id)
role_exists = public_role.is_exist()
print(" Role public exists? ", role_exists)

#Add user to role
public_role.add_user(new_user)


Current user list:  ['root']
 Role public exists?  True


### 03.03. Creating collections

In [5]:
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
from pymilvus import utility
import json

#Define fields

#Primary key of a course.
#An INT64 field takes no type parameters: the `max_length=32` that used to be
#passed here is a VARCHAR setting and did not show up in the created schema,
#so it has been removed to avoid suggesting a length limit on the key.
course_id = FieldSchema(
    name="course_id",
    dtype=DataType.INT64,
    is_primary=True)

#Course title, stored as text of up to 256 characters
title= FieldSchema(
    name="title",
    dtype=DataType.VARCHAR,
    max_length=256)

#Course description, stored as text of up to 2048 characters
description= FieldSchema(
    name="description",
    dtype=DataType.VARCHAR,
    max_length=2048)

#Vector field holding the embedding of the description.
#Dim should match the embedding size
desc_embedding = FieldSchema(
    name="desc_embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=1536
)

#Define schema; the field order here is the column order expected on insert
wiki_schema=CollectionSchema(
    fields=[course_id, title, description, desc_embedding],
    description="Courses List",
    enable_dynamic_field=True
)

collection_name="courses_list"

#Create the collection on the connection identified by connection_id
wiki_collection=Collection(
    name=collection_name,
    schema=wiki_schema,
    using=connection_id,
    shard_num=2
)

#List all collections
print("Current collections: ",utility.list_collections(using=connection_id))

#Set up the existing collection in another object (no schema passed) and show its schema
r_collection=Collection(collection_name, using=connection_id)
print("\n", r_collection.schema)


Current collections:  ['courses_list']

 {'auto_id': False, 'description': 'Courses List', 'fields': [{'name': 'course_id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'title', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'description', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048}}, {'name': 'desc_embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1536}}], 'enable_dynamic_field': True}


### 03.04. Inserting data into Milvus

In [6]:
#Load the course catalogue from its CSV file
import pandas as pd

csv_path = "course-descriptions.csv"
course_descriptions = pd.read_csv(csv_path)

#Preview the first rows (head() defaults to 5)
course_descriptions.head(5)

Unnamed: 0,Course ID,Title,Description
0,1001,Architecting Big Data applications,Learn how to architect both simple and complex...
1,1002,MLOps Essentials: Monitoring Model Drift and Bias,As more and more ML models are developed and d...
2,1003,Apache Kafka Essential Training: Getting Started,"In this course, instructor introduces Apache..."
3,1004,Applied AI: Building NLP Apps with Hugging Fac...,Explore models designed for common NLP use cas...
4,1005,Deep Learning : Getting started,Deep learning as a technology has grown leaps ...


In [7]:
#Use langchain to create embeddings.
import os
from getpass import getpass

from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

#Setup the OpenAI API key. Use your own key for OpenAI.
#
#The key must never be written into the notebook: anything stored in a cell is
#shared with every copy of the file. Any key that was previously saved in this
#cell should be treated as compromised and revoked.
#
#The key is taken from the OPENAI_API_KEY environment variable when it is set;
#otherwise it is asked for interactively without echoing the input.
#
#If you use the free tier, you may hit rate limits with the number of requests
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
if not openai_api_key:
    raise ValueError("An OpenAI API key is required to create embeddings")

#Export the key through the environment, as the original cell did, before
#the embeddings model is created
os.environ["OPENAI_API_KEY"] = openai_api_key

embeddings_model = OpenAIEmbeddings()

In [8]:
#Prepare data for insert

#Pull each CSV column of interest out as a plain Python list
i_course_id, i_title, i_description = (
    course_descriptions[column].tolist()
    for column in ("Course ID", "Title", "Description")
)

#One embedding per description, one embed_query call per entry
i_desc_embedding = list(map(embeddings_model.embed_query, i_description))

#Format for data input: one list per column
insert_data=[
    i_course_id,
    i_title,
    i_description,
    i_desc_embedding,
]

In [9]:
#Get a handle on the existing collection, then insert the prepared data
course_collection = Collection(name=collection_name, using=connection_id)

#Insert; the returned result object is kept in `mr`
mr = course_collection.insert(data=insert_data)

#Flush the data after insert (called with timeout=180)
print("Inserted data. Now flushing")
course_collection.flush(timeout=180)

Inserted data. Now flushing


### 03.05. Build an index

In [10]:
#Build an index on the embedding field

#Index-specific settings, nested under "params" below
ivf_settings = {"nlist":1024}

#Index definition: metric, index type and the settings above
index_params = {"metric_type":"L2", "index_type":"IVF_FLAT", "params" :ivf_settings}

course_collection.create_index(field_name="desc_embedding", index_params=index_params)

#Show the index building progress for the collection
utility.index_building_progress(collection_name,using=connection_id)

{'total_rows': 5, 'indexed_rows': 5, 'pending_index_rows': 0}

### 03.06. Querying scalar data

In [11]:
#Load the collection so that the query and search cells can run against it
# NOTE: A collection should first be loaded into memory before
#       queries can be executed against it

course_collection.load()
#Confirmation message, printed once load() has returned
print("Course collection loaded..")

Course collection loaded..


In [12]:
#Scalar query: fetch title and description of the course with id 1001
id_filter = "course_id == 1001"
q_result = course_collection.query(expr=id_filter, output_fields=["title","description"])
print(q_result)

#Show the Python type of a single result row
first_row = q_result[0]
print("\n Result object :", type(first_row))

[{'title': 'Architecting Big Data applications', 'description': 'Learn how to architect both simple and complex batch processing applications, as you discover the basic principles of big data architectures such as horizontal scaling, distributed processing, technology selection and integration, and scheduling.', 'course_id': 1001}]

 Result object : <class 'dict'>


In [13]:
#Scalar query combining a title prefix match with a numeric condition
title_and_id_filter = "(title LIKE 'MLOps%') && (course_id > 1001) "
q_result2 = course_collection.query(expr=title_and_id_filter, output_fields=["title","description"])
print(q_result2)

[{'title': 'MLOps Essentials: Monitoring Model Drift and Bias', 'description': 'As more and more ML models are developed and deployed, the need arises to ensure that the models are effective and safe and that they perform as desired. Model monitoring, a core function of MLOps, helps data scientists and MLOps engineers to meet this need. In this course, data analytics expert   discusses the types of monitoring needed for ML models. ', 'course_id': 1002}]


### 03.07. Searching Vector fields

In [14]:
#Vector search. Make sure that the collection is already loaded.

#Embed the input search string
search_string = "machine learning"
search_embed=embeddings_model.embed_query(search_string)

#Search settings; also used by the next search cell
search_params = {
    "metric_type": "L2",
    "offset": 0,
    "ignore_growing": False,
    "params": {"nprobe": 10}
}

#Perform search
s_results=course_collection.search(
    data=[search_embed],           #input query to search for
    anns_field="desc_embedding",   #field to search with ANN
    param=search_params,
    limit=10,                      #Limit output
    expr=None,                     #Use additional scalar conditions
    output_fields=["title"],
    consistency_level="Strong"
)

#A single query vector was passed, so its hits are the first entry of the results
query_hits = s_results[0]
print("Search result object:", type(query_hits),"\n")

#Print results in order of match
for hit in query_hits:
    print(hit.id, str(round(hit.distance, 2)), "\t",hit.entity.get("title"))

Search result object: <class 'pymilvus.client.abstract.Hits'> 

1005 0.33 	 Deep Learning : Getting started
1004 0.34 	 Applied AI: Building NLP Apps with Hugging Face Transformers
1002 0.38 	 MLOps Essentials: Monitoring Model Drift and Bias
1001 0.4 	 Architecting Big Data applications
1003 0.5 	 Apache Kafka Essential Training: Getting Started


In [15]:
#Search with a query unrelated to the course catalogue, for comparison

#Embed the input search string
search_string2 = "best movies of the year"
search_embed2=embeddings_model.embed_query(search_string2)

#Perform search with the same search_params as the previous search
s_results2=course_collection.search(
    data=[search_embed2],          #input query to search for
    anns_field="desc_embedding",   #field to search with ANN
    param=search_params,
    limit=10,                      #Limit output
    expr=None,                     #Use additional scalar conditions
    output_fields=["title"],
    consistency_level="Strong"
)

#Print results in order of match
unrelated_hits = s_results2[0]
for hit in unrelated_hits:
    print(hit.id, str(round(hit.distance, 2)), "\t",hit.entity.get("title"))

1004 0.62 	 Applied AI: Building NLP Apps with Hugging Face Transformers
1001 0.63 	 Architecting Big Data applications
1003 0.64 	 Apache Kafka Essential Training: Getting Started
1005 0.64 	 Deep Learning : Getting started
1002 0.65 	 MLOps Essentials: Monitoring Model Drift and Bias


### 03.08. Deleting objects and entities

In [16]:
#Delete the single record whose course_id is 1002
course_collection.delete(expr="course_id in [1002]")

(insert count: 0, delete count: 1, upsert count: 0, timestamp: 446800336012443649, success count: 0, err count: 0)

In [17]:
#Drop the collection created earlier in this notebook
utility.drop_collection(collection_name=collection_name, using=connection_id)

In [18]:
#Drop the database
#Make sure to drop all collections in the database first
db.drop_database(db_name=db_name, using=connection_id)