In [9]:
# Required libraries installation
!pip install -q einops==0.7.0 langchain==0.1.9 pypdf==4.0.2 pymilvus==2.3.6 sentence-transformers==2.4.0 redis

# Importing necessary libraries
import os
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
# Base parameters for the Milvus connection.
# Every value can be overridden with an environment variable of the same name, so real
# credentials never have to be saved inside the notebook. The fallbacks are the values this
# notebook used previously, which keeps existing runs working unchanged.
MILVUS_HOST = os.environ.get("MILVUS_HOST", "vectordb-milvus.milvus.svc.cluster.local")
MILVUS_PORT = int(os.environ.get("MILVUS_PORT", "19530"))
MILVUS_USERNAME = os.environ.get("MILVUS_USERNAME", "root")
MILVUS_PASSWORD = os.environ.get("MILVUS_PASSWORD", "Milvus")
MILVUS_COLLECTION = os.environ.get("MILVUS_COLLECTION", "catalogo_ba_gov")

# Path to your local Markdown file
markdown_file_path = "markdown/IPTU.md"

# Chunking parameters. RecursiveCharacterTextSplitter measures length with `len` unless a
# custom length function is supplied, so both values are counted in CHARACTERS, not tokens.
# NOTE(review): an earlier comment described this as "256 tokens with a 30-token overlap"
# for Qwen/Qwen2-0.5B-Instruct - confirm that 512 characters is the intended budget.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 30

# Read the Markdown file content
try:
    with open(markdown_file_path, "r", encoding="utf-8") as f:
        markdown_content = f.read()
    print(f"Successfully loaded Markdown file: {markdown_file_path}")
except (OSError, UnicodeError) as e:
    # Report which file failed, then re-raise so the notebook stops here instead of
    # continuing with an undefined `markdown_content`.
    print(f"Error reading the file {markdown_file_path}: {e}")
    raise

# Create the embeddings function.
# No model name is passed, so the library's default embedding model is used.
# NOTE(review): trust_remote_code=True lets the model repository execute its own Python code
# on load - keep it only if the chosen model actually requires it.
embeddings = HuggingFaceEmbeddings(model_kwargs={"trust_remote_code": True}, show_progress=True)

# Initialize Milvus connection and collection.
# drop_old=False keeps any existing collection; auto_id=True lets Milvus assign primary keys.
db = Milvus(
    embedding_function=embeddings,
    connection_args={
        "host": MILVUS_HOST,
        "port": MILVUS_PORT,
        "user": MILVUS_USERNAME,
        "password": MILVUS_PASSWORD
    },
    collection_name=MILVUS_COLLECTION,
    metadata_field="metadata",
    text_field="page_content",
    auto_id=True,
    drop_old=False
)

# Split the Markdown content into overlapping chunks; every chunk carries the source path
# as metadata.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = text_splitter.create_documents(
    [markdown_content],
    metadatas=[{"source": markdown_file_path}]
)
print(f"Split {markdown_file_path} into {len(chunks)} chunk(s)")

Successfully loaded Markdown file: markdown/IPTU.md


In [13]:
# Adding the document chunks to Milvus.
# NOTE(review): the collection is opened with auto_id=True and drop_old=False, so re-running
# this cell presumably appends the same chunks again instead of replacing them - verify
# before executing it more than once.
if chunks:
    inserted_ids = db.add_documents(chunks)
    print(f"Inserted {len(inserted_ids)} of {len(chunks)} chunk(s)")
    print("Markdown content uploaded to Milvus successfully!")
else:
    # Nothing was produced by the splitter (e.g. an empty Markdown file): do not report
    # a successful upload when no data was written.
    print("No chunks to upload; nothing was written to Milvus.")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Markdown content uploaded to Milvus successfully!
