In [2]:
# Report the version of the interpreter that is actually running this kernel.
# A `!python` shell escape is resolved through the shell PATH and can point at a
# different installation than the kernel, so ask the running interpreter instead.
import platform

print(f"Python {platform.python_version()}")

Python 3.10.9


## Milvus

In [1]:
from pymilvus import utility, Collection, DataType, IndexType, Milvus, connections

# Address of the Milvus server used by the pymilvus helpers in this notebook.
_HOST, _PORT = 'localhost', '19530'

print("\nCreate connection...")
# No alias is passed here; the listing below shows which alias was registered.
conn = connections.connect(host=_HOST, port=_PORT)

print("\nList connections:")
print(connections.list_connections())


Create connection...

List connections:
[('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x0000028A22707340>)]


In [2]:
def list_collections():
    """Print a header followed by whatever ``utility.list_collections()`` returns."""
    print("=== list collections ===")
    # The trailing "\n" leaves a blank line after the listing.
    print(f"{utility.list_collections()}\n")

def use_collection(collection_name: str):
    """Return a ``Collection`` handle for ``collection_name``.

    Only the handle is constructed here; this helper does not call ``.load()``.
    """
    return Collection(name=collection_name)
    
def get_collection_entity(collection_name: str='tls_milvus') -> None:
    """Print the ``num_entities`` value reported for ``collection_name``."""
    print(f"=== {collection_name} entity ===")
    entity_count = use_collection(collection_name).num_entities
    print(f"{collection_name} - The number of entity: {entity_count}\n")
    
def get_collection_info(collection_name: str) -> None:
    """Print the entity count and the schema fields of a collection.

    The collection is reached through ``use_collection``, i.e. through the
    connection the notebook has already opened. No separate client is created
    here, so the function does not depend on any hard-coded server address or
    on what the name ``Milvus`` is currently bound to.

    Args:
        collection_name: Name of the collection to describe.
    """
    print(f"=== {collection_name} info ===")
    collection = use_collection(collection_name)

    print(f'Num of entity: {collection.num_entities}')
    print("schema ---")
    for idx, field in enumerate(collection.schema.fields):
        print(idx, field)
    # Blank line to separate this report from whatever is printed next.
    print()
    
def get_collection_data(collection_name: str, condition: str="page in [1,2]") -> list:
    """Query ``collection_name`` with the filter ``condition`` and return the result.

    The query asks for the "source", "page", "doc_id", "vector" and "text"
    fields and uses the "Strong" consistency level.
    """
    wanted_fields = ["source", "page", "doc_id", "vector", "text"]
    return use_collection(collection_name).query(
        expr=condition,
        output_fields=wanted_fields,
        consistency_level="Strong",
    )
    
def drop_collection(collection_name: str) -> None:
    """Call ``drop()`` on the collection named ``collection_name``."""
    use_collection(collection_name).drop()
    
def delete_eneity(collection_name: str, condition: str):
    """Call ``delete(condition)`` on the collection named ``collection_name``.

    The result of the delete call is discarded; nothing is returned.
    """
    target = use_collection(collection_name)
    target.delete(condition)
    
def disconnect(db="default"):
    """Disconnect the pymilvus connection identified by ``db``."""
    connection_name = db
    connections.disconnect(connection_name)

In [3]:
# Print the collections currently reported by the connected Milvus server.
list_collections()

=== list collections ===
[]



In [4]:
# Destructive: uncomment only to permanently drop the 'test_5g2' collection.
# drop_collection('test_5g2')

## PDF to Milvus

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceHubEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import Milvus

class PdfToMilvus:
    """Load a PDF, split it into overlapping chunks and store the chunks in Milvus.

    Typical use is ``load_pdf`` -> ``split_text`` ->
    ``transform_doc_to_vector_into_milvus``.

    Args:
        doc_id: Identifier written into every stored chunk's metadata under the
            key ``"doc_id"``.
        collection_name: Milvus collection that receives the chunks.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.
        store_chunk: Number of chunks sent to Milvus per insert call.
        milvus_host: Host of the Milvus server.
        milvus_port: Port of the Milvus server.
        model_name: Model name handed to ``HuggingFaceEmbeddings``.

    Raises:
        ValueError: If ``store_chunk`` is smaller than 1.
    """

    def __init__(
        self,
        doc_id: int,
        collection_name: str,
        chunk_size: int = 500,
        chunk_overlap: int = 250,
        store_chunk: int = 100,
        milvus_host: str = "localhost",
        milvus_port: int = 19530,
        model_name: str = "sentence-transformers/all-mpnet-base-v2",
    ):
        # Validate before building the embedding model. A non-positive batch size
        # would otherwise only surface later, or silently store nothing.
        if store_chunk < 1:
            raise ValueError("store_chunk must be at least 1")

        self.doc_id = doc_id
        self.collection_name = collection_name
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # The original attribute spellings are kept so existing readers of these
        # attributes keep working.
        self.store_chunck = store_chunk
        self.MilvusHost = milvus_host
        self.MilvusPort = milvus_port

    def load_pdf(self, filepath: str) -> list:
        """Remember ``filepath`` on the instance and return ``PyPDFLoader(filepath).load()``."""
        self.filepath = filepath
        return PyPDFLoader(self.filepath).load()

    def split_text(self, doc) -> list:
        """Split ``doc`` with a ``RecursiveCharacterTextSplitter`` and return the pieces.

        The splitter is configured with this instance's ``chunk_size`` and
        ``chunk_overlap``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
        return splitter.split_documents(doc)

    def transform_doc_to_vector_into_milvus(self, sp_doc):
        """Store ``sp_doc`` in the Milvus collection in batches of ``store_chunck``.

        Each chunk's ``metadata`` dict is updated in place with this instance's
        ``doc_id`` before it is stored, so the identifier given to the
        constructor actually travels with the data.

        Args:
            sp_doc: Sequence of chunk documents, each exposing a ``metadata``
                dict (as produced by ``split_text``).

        Returns:
            None.
        """
        # Tag every chunk with the owning document id. Previously doc_id was
        # accepted by the constructor but never written anywhere.
        for chunk in sp_doc:
            chunk.metadata["doc_id"] = self.doc_id

        # Insert in fixed-size batches rather than in one single call.
        for start in range(0, len(sp_doc), self.store_chunck):
            Milvus.from_documents(
                documents=sp_doc[start : start + self.store_chunck],
                embedding=self.embeddings,
                collection_name=self.collection_name,
                connection_args={
                    "host": self.MilvusHost,
                    "port": self.MilvusPort,
                },
            )

In [9]:
# ----- Run configuration: edit these values for each document -----
doc_id = 1
collection_name = "test_5g"
filepath = "data/5G頻率政策與產業發展白皮書.pdf"
# -------------------------------------------------------------------

# Build the pipeline object for this document / collection pair.
pdf_to_milvus = PdfToMilvus(doc_id=doc_id, collection_name=collection_name)

# Load the PDF, then split it into chunks.
doc = pdf_to_milvus.load_pdf(filepath)
sp_doc = pdf_to_milvus.split_text(doc)

# Show how many chunks were produced and the container type, one per line.
print(len(sp_doc), type(sp_doc), sep="\n")

# Embed the chunks and write them to the Milvus collection.
pdf_to_milvus.transform_doc_to_vector_into_milvus(sp_doc)

50
<class 'list'>
