In [1]:
from pymilvus import (
    connections,
    Collection,
    FieldSchema,
    CollectionSchema,
    DataType,
    utility,
)
import pandas as pd
from scipy import sparse
import numpy as np

# Address of the Milvus server; replace with your own server's host/IP and port.
MILVUS_URI = "http://localhost:19530"
connections.connect(uri=MILVUS_URI)

In [2]:
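# Destructive reset: uncomment to drop the existing collection (and all data in it)
# before re-creating it in the next cell.
# utility.drop_collection("thu_vien_phap_luat")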

In [3]:
# Values shared by several fields of the schema.
COLLECTION_NAME = "thu_vien_phap_luat"
DENSE_DIM = 1024  # dim of both dense vector fields
TEXT_MAX_LENGTH = 8012  # max_length of every VARCHAR field

# Auto-generated primary key plus the source URL.
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
url_field = FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=TEXT_MAX_LENGTH)

# Title: dense vector, sparse vector, raw text.
title_fields = [
    FieldSchema(name="title_dense", dtype=DataType.FLOAT_VECTOR, dim=DENSE_DIM),
    FieldSchema(name="title_sparse", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="title_text", dtype=DataType.VARCHAR, max_length=TEXT_MAX_LENGTH),
]

# Content: same three representations as the title.
content_fields = [
    FieldSchema(name="content_dense", dtype=DataType.FLOAT_VECTOR, dim=DENSE_DIM),
    FieldSchema(name="content_sparse", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="content_text", dtype=DataType.VARCHAR, max_length=TEXT_MAX_LENGTH),
]

# Field order: id, url, title_*, content_*.
fields = [id_field, url_field, *title_fields, *content_fields]

schema = CollectionSchema(fields=fields, enable_dynamic_field=False)

collection = Collection(name=COLLECTION_NAME, schema=schema)

In [4]:
# Both dense vector fields share one HNSW / cosine index configuration.
dense_index = dict(
    index_type="HNSW",
    metric_type="COSINE",
    M=64,
    efConstruction=80,
)
for dense_field in ("title_dense", "content_dense"):
    collection.create_index(dense_field, dense_index)

# Both sparse vector fields share one SPARSE_WAND / inner-product configuration.
sparse_index = dict(index_type="SPARSE_WAND", metric_type="IP")
collection.create_index("title_sparse", sparse_index)
# Kept as the final bare expression so the cell displays the returned status.
collection.create_index("content_sparse", sparse_index)

Status(code=0, message=)

In [5]:
from glob import glob

# Every entry under ../features, in lexicographic order.
folders = sorted(glob("../features/*"))
folders

['../features\\p0-9999',
 '../features\\p10000-19999',
 '../features\\p20000-29999',
 '../features\\p30000-39999',
 '../features\\p50000-59999',
 '../features\\p60000-69999',
 '../features\\p70000-79999']

In [6]:
def load_data(folder):
    """Assemble the artefacts stored in ``folder`` into a single DataFrame.

    Reads ``encoded_data.parquet``, ``title_sparse.npz``, ``content_sparse.npz``
    and ``content_dense.npy`` from ``folder``.

    The title sparse rows are attached before ``content_text`` is exploded, so
    each one is repeated for every row produced from the same source row. The
    content sparse and dense rows are attached after the explode, positionally,
    one per exploded row.

    Returns the exploded DataFrame; its index is whatever ``explode`` leaves
    behind (the original labels, repeated).
    """
    documents = pd.read_parquet(f"{folder}/encoded_data.parquet")
    title_rows = sparse.load_npz(f"{folder}/title_sparse.npz")
    # Iterating the loaded sparse matrix yields one element per row.
    documents["title_sparse"] = list(title_rows)

    chunks = documents.explode("content_text")

    content_rows = sparse.load_npz(f"{folder}/content_sparse.npz")
    dense_rows = np.load(f"{folder}/content_dense.npy")

    chunks["content_sparse"] = list(content_rows)
    chunks["content_dense"] = list(dense_rows)
    return chunks

In [7]:
def insert_to_collection(df, collection, batch_size=1000):
    """Insert the rows of ``df`` into ``collection`` in batches.

    Args:
        df: DataFrame containing at least the columns ``url``,
            ``title_sparse``, ``title_dense``, ``title``, ``content_sparse``,
            ``content_dense`` and ``content_text``. Other columns are ignored.
        collection: Object exposing ``insert(records)``; each batch is passed
            to it as a list of per-row dicts.
        batch_size: Maximum number of rows sent per ``insert`` call.

    The ``title`` column is sent under the key ``title_text``. Rows are taken
    by position, so a non-unique index on ``df`` is fine. An empty ``df``
    results in no ``insert`` calls and no output.
    """
    total = len(df)
    if total == 0:
        return

    # Selecting and renaming columns does not depend on the batch, so do it
    # once instead of once per batch.
    payload = df[
        [
            "url",
            "title_sparse",
            "title_dense",
            "title",
            "content_sparse",
            "content_dense",
            "content_text",
        ]
    ].rename(columns={"title": "title_text"})

    for start in range(0, total, batch_size):
        # Clamp the upper bound so the final progress line reports the rows
        # actually sent rather than running past the end of the frame.
        stop = min(start + batch_size, total)
        print(f"Inserting {start}-{stop}...")
        collection.insert(payload.iloc[start:stop].to_dict("records"))

In [8]:
# Load each feature folder in turn and push its rows into the collection.
for folder in folders:
    df = load_data(folder)
    print(folder, df.shape)  # folder path and (rows, columns) of the loaded frame
    insert_to_collection(df, collection)
    # Completion message followed by one blank line to separate folders.
    print("complete inserted df\n")

../features\p10000-19999 (85206, 11)
Inserting 0-1000...
Inserting 1000-2000...
Inserting 2000-3000...
Inserting 3000-4000...
Inserting 4000-5000...
Inserting 5000-6000...
Inserting 6000-7000...
Inserting 7000-8000...
Inserting 8000-9000...
Inserting 9000-10000...
Inserting 10000-11000...
Inserting 11000-12000...
Inserting 12000-13000...
Inserting 13000-14000...
Inserting 14000-15000...
Inserting 15000-16000...
Inserting 16000-17000...
Inserting 17000-18000...
Inserting 18000-19000...
Inserting 19000-20000...
Inserting 20000-21000...
Inserting 21000-22000...
Inserting 22000-23000...
Inserting 23000-24000...
Inserting 24000-25000...
Inserting 25000-26000...
Inserting 26000-27000...
Inserting 27000-28000...
Inserting 28000-29000...
Inserting 29000-30000...
Inserting 30000-31000...
Inserting 31000-32000...
Inserting 32000-33000...
Inserting 33000-34000...
Inserting 34000-35000...
Inserting 35000-36000...
Inserting 36000-37000...
Inserting 37000-38000...
Inserting 38000-39000...
Inserting 

RPC error: [describe_collection], <MilvusException: (code=<bound method _MultiThreadedRendezvous.code of <_MultiThreadedRendezvous of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "failed to connect to all addresses; last error: UNAVAILABLE: ipv6:%5B::1%5D:19530: ConnectEx: Connection refused (No connection could be made because the target machine actively refused it.
 -- 10061)"
	debug_error_string = "UNKNOWN:Error received from peer  {grpc_message:"failed to connect to all addresses; last error: UNAVAILABLE: ipv6:%5B::1%5D:19530: ConnectEx: Connection refused (No connection could be made because the target machine actively refused it.\r\n -- 10061)", grpc_status:14, created_time:"2024-09-26T19:00:43.3314423+00:00"}"
>>, message=Retry run out of 75 retry times, message=failed to connect to all addresses; last error: UNAVAILABLE: ipv6:%5B::1%5D:19530: ConnectEx: Connection refused (No connection could be made because the target machine actively refused it.
 -- 1

MilvusException: <MilvusException: (code=<bound method _MultiThreadedRendezvous.code of <_MultiThreadedRendezvous of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "failed to connect to all addresses; last error: UNAVAILABLE: ipv6:%5B::1%5D:19530: ConnectEx: Connection refused (No connection could be made because the target machine actively refused it.
 -- 10061)"
	debug_error_string = "UNKNOWN:Error received from peer  {grpc_message:"failed to connect to all addresses; last error: UNAVAILABLE: ipv6:%5B::1%5D:19530: ConnectEx: Connection refused (No connection could be made because the target machine actively refused it.\r\n -- 10061)", grpc_status:14, created_time:"2024-09-26T19:00:43.3314423+00:00"}"
>>, message=Retry run out of 75 retry times, message=failed to connect to all addresses; last error: UNAVAILABLE: ipv6:%5B::1%5D:19530: ConnectEx: Connection refused (No connection could be made because the target machine actively refused it.
 -- 10061))>