In [1]:
from pymilvus import AnnSearchRequest, DataType, Function, FunctionType, MilvusClient, model, RRFRanker

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
embedding_fn = model.DefaultEmbeddingFunction()
CHUNK_SIZE = 1000
VECTOR_DIM = embedding_fn.dim
COLLECTION_NAME = 'pdf_collection'

In [3]:
client = MilvusClient(uri='http://localhost:19530')

# Testing HybridSearch

## Create Document Schema and Collection

In [4]:
schema = client.create_schema(auto_id=True)

In [5]:
bm25_function = Function(
    name="text_bm25_emb",
    input_field_names=["text"],
    output_field_names=["vector_sparse"],
    function_type=FunctionType.BM25, # currently the only function type for sparse in Milvus
)
schema.add_function(bm25_function) # to allow full-text-search

{'auto_id': True, 'description': '', 'fields': [], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['vector_sparse'], 'params': {}}]}

In [6]:
schema.add_field(
    field_name='id',
    datatype=DataType.INT64,
    is_primary=True,
    description='document id',
    auto_id=True,
)
schema.add_field(
    field_name='text',
    datatype=DataType.VARCHAR,
    max_length=int(CHUNK_SIZE*1.1), # give 110%
    enable_analyzer=True, # enable full-text search
    description='document chunked text for full-text search',
)
schema.add_field(
    field_name='vector_dense',
    datatype=DataType.FLOAT_VECTOR,
    dim=VECTOR_DIM,
    enable_analyzer=True,
    description='document chunked text dense vector',
)
schema.add_field(
    field_name='vector_sparse',
    datatype=DataType.SPARSE_FLOAT_VECTOR,
    description='document chunked text sparse embedding auto-generated by the built-in BM25 Function',
)
schema.add_field(
    field_name='metadata',
    datatype=DataType.JSON,
    description='document metadata',
)

{'auto_id': True, 'description': '', 'fields': [{'name': 'id', 'description': 'document id', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': 'document chunked text for full-text search', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1100, 'enable_analyzer': True}}, {'name': 'vector_dense', 'description': 'document chunked text dense vector', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768, 'enable_analyzer': True}}, {'name': 'vector_sparse', 'description': 'document chunked text sparse embedding auto-generated by the built-in BM25 Function', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}, {'name': 'metadata', 'description': 'document metadata', 'type': <DataType.JSON: 23>}], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['vector_sparse'], 'params': {}}]}

In [7]:
index_params = client.prepare_index_params()

In [8]:
index_params.add_index(
    field_name='vector_dense',
    index_name='vector_dense_index',
    index_type='AUTOINDEX', # Need to compare with IVF_FLAT
    metric_type='IP', # Need to compare with L2
)
index_params.add_index(
    field_name='vector_sparse',
    index_name='vector_sparse_index',
    index_type='SPARSE_INVERTED_INDEX',
    metric_type='BM25',
    params={"inverted_index_algo": "DAAT_MAXSCORE"}, # need to compare another algo
)

In [9]:
client.has_collection(collection_name=COLLECTION_NAME) and client.drop_collection(collection_name=COLLECTION_NAME)
client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema,
    index_params=index_params,
)

## Load Documents

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_SIZE/5, # Overlap to maintain context between chunks
    length_function=len,
    is_separator_regex=False,
)

In [11]:
from langchain_community.document_loaders import PyPDFLoader
import os
def load(filepath):
    if not os.path.isfile(filepath):
        return None
    docs = PyPDFLoader(filepath).load()
    chunks = text_splitter.split_documents(docs)
    data = []
    for i in range(len(chunks)):
        chunk = chunks[i]
        vector = embedding_fn.encode_documents([chunk.page_content])
        d = {
            "vector_dense": vector[0],
            "text": chunk.page_content[:CHUNK_SIZE],
            "metadata": chunk.metadata,
        }
        data.append(d)
    client.insert(
        collection_name=COLLECTION_NAME,
        data=data,
    )
load('./datasets/documents/datasets/How we built ‘BARITO’ to enhance logging - 5 min read.pdf')
load('./datasets/documents/datasets/GoPay.sh_ A glimpse into Indonesia’s leading e-wallet GoPay’s Developer Experience - The Jakarta Post.pdf')

Ignoring wrong pointing object 68 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)


## Search Documents

In [12]:
def search(query, limit=2, output_fields=['text', 'metadata']):
    vector_search = {
        "data": embedding_fn.encode_documents([query]),
        "anns_field": "vector_dense",
        "param": {"nprobe": 10},
        "limit": limit,
    }
    fulltext_search = {
        "data": [query],
        "anns_field": "vector_sparse",
        "param": {"drop_ratio_search": 0.2},
        "limit": limit
    }
    reqs = [
        AnnSearchRequest(**vector_search),
        AnnSearchRequest(**fulltext_search),
    ]
    
    res = client.hybrid_search(
        collection_name=COLLECTION_NAME,
        reqs=reqs,
        ranker=RRFRanker(),
        limit=limit,
        output_fields=output_fields,
    )
    return [
        hit
        for hits in res
        for hit in hits
    ]
    # for hits in res:
    #     print("TopK results:")
    #     for hit in hits:
    #         print(hit)
    #         print('====')

In [13]:
for hit in search('why do we need barito?', limit=3):
    print(f'id: {hit.id}\ndistance: {hit.distance}\ntext: {hit.entity.text}\nmetadata: {hit.metadata}\n')

id: 462408598834930262
distance: 0.03226646035909653
text: H Ho ow w  w we e  b bu ui il lt t  ‘ ‘B BA AR RI IT TO O’ ’  t to o  e en nh ha an nc ce e  l lo og gg gi in ng g
Building a logging infrastructure commensurate to GO-JEK’s scale
5 MIN READ   |   DEC 08, 2018 Share: 
BB yy   TT aa r r aa   BB aa s s kk aa r r aa
Imagine coming home from a tiring day of work and your white-fenced house is
suddenly pink. You’d have questions. Many of them. Did I arrive at the correct
house? Who did this? When did this happen? For what? And why pink? You’d
look around frantically for clues. Did any of your neighbours see anything? You’d
How we built ‘BARITO’ to enhance logging - 5 min read https://www.gojek.io/blog/how-we-built-barito-to-enhance-logging
1 of 10 20/11/25, 10.27
metadata: {'producer': 'macOS Version 15.7.2 (Build 24G325) Quartz PDFContext', 'creator': 'Firefox', 'creationdate': "D:20251120032747Z00'00'", 'title': 'How we built ‘BARITO’ to enhance logging - 5 min read', 'moddate': "