In [9]:
%pip install pymupdf
%pip install pymilvus
%pip install pytesseract
%pip install langchain-community
%pip install langchain_community
%pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.15.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3

In [3]:
import os
import re
import fitz  # PyMuPDF

# === Configuration ===
# Folder whose direct entries are scanned for *.pdf files by the driver loop below.
PDF_INPUT_DIR = "./pdf_inputs"
# Root folder for extracted text; create_output_folder() adds one subfolder per PDF.
OUTPUT_DIR = "./pdf_extracted_text"
# Create the output root up front; exist_ok=True keeps re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def create_output_folder(title):
    """Create (if needed) and return the output folder for a document title.

    The title is reduced to a filesystem-safe folder name: every run of
    characters outside ``[a-zA-Z0-9]`` becomes a single underscore, and
    leading/trailing underscores are stripped.

    Args:
        title: Document title, typically the PDF file name without extension.

    Returns:
        Path of the folder, located directly under ``OUTPUT_DIR``.

    Note:
        Distinct titles can sanitize to the same name (e.g. "a b" and "a-b");
        they then share one folder.
    """
    folder_name = re.sub(r'[^a-zA-Z0-9]+', '_', title).strip('_')
    if not folder_name:
        # A title with no ASCII alphanumerics (e.g. "???" or a non-Latin name)
        # sanitizes to "". Joining "" onto OUTPUT_DIR would return the root
        # itself, so the extracted text would not get its own subfolder.
        folder_name = "untitled"
    folder_path = os.path.join(OUTPUT_DIR, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    return folder_path

def save_text(content, path):
    """Write ``content`` to ``path`` as UTF-8 text, replacing any existing file.

    Args:
        content: Text to store.
        path: Destination file path; its parent folder must already exist.
    """
    with open(path, mode='w', encoding='utf-8') as out_file:
        out_file.write(content)

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator inserted between pages.
    """
    # The context manager closes the document (releasing the file handle) even
    # if extracting a page raises; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in doc)

def process_pdf(pdf_path):
    """Extract one PDF's text and store it as ``document.md`` in its own folder.

    The output folder is derived from the PDF's file name without extension
    (see ``create_output_folder``). A progress line is printed before any
    work starts.

    Args:
        pdf_path: Path of the PDF file to convert.
    """
    filename = os.path.basename(pdf_path)
    print(f"Processing: {filename}")

    # The file stem doubles as the document title / folder name.
    stem, _extension = os.path.splitext(filename)
    target_dir = create_output_folder(stem)

    # Pull the text out of the PDF and write it next to nothing else in target_dir.
    save_text(extract_text_from_pdf(pdf_path), os.path.join(target_dir, 'document.md'))

if __name__ == '__main__':
    # Collect the PDFs sitting directly in PDF_INPUT_DIR; the extension is
    # matched case-insensitively so ".PDF" files are picked up too.
    pdf_files = list(filter(lambda entry: entry.lower().endswith(".pdf"),
                            os.listdir(PDF_INPUT_DIR)))
    for pdf_file in pdf_files:
        full_path = os.path.join(PDF_INPUT_DIR, pdf_file)
        try:
            process_pdf(full_path)
        except Exception as err:
            # Best-effort batch: report the failing file and carry on with the rest.
            print(f"Error processing {pdf_file}: {err}")


Processing: 1_Explaining Relationships Among Research Papers 2024.pdf
Processing: 2305.08487v2.pdf
Processing: 2305.08828v2.pdf
Processing: 2405.10936v2.pdf
Processing: 2502.09457v1.pdf
Processing: 2502.17956v1.pdf
Processing: 2503.10267v2.pdf
Processing: chatcite.pdf
Processing: compressive summarizer .pdf
Processing: Research Papers_ update .pdf


In [4]:
import os

from pymilvus import connections, utility
from dotenv import load_dotenv

# Read a local .env file (if one exists) into the process environment.
# Previously `load_dotenv` was imported but never called, so the lookups below
# only saw variables exported before the kernel started. By python-dotenv's
# default, variables that are already set are not overridden.
load_dotenv()

MILVUS_URI = os.getenv("MILVUS_URI")
TOKEN = os.getenv("MILVUS_TOKEN")
# Disconnect any existing 'default' connection first
connections.disconnect("default")

# Then connect with the new URI and token
connections.connect("default", uri=MILVUS_URI, token=TOKEN)

collection_name = "knowledge_base"
dim = 384  # Dimension of the embedding model

# Check and reset collection if it already exists.
# NOTE: destructive - every run of this cell deletes the existing collection
# and all data stored in it.
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)


In [5]:
from pymilvus import FieldSchema, DataType, CollectionSchema, Collection
# Schema of the knowledge-base collection: one row per text chunk.
# Primary key; auto_id=True means the ids are generated rather than supplied.
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
# Raw chunk text, capped by max_length.
content_field = FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=10000)
# Embedding vector; its width is the notebook-level `dim`.
embedding_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim)

fields = [id_field, content_field, embedding_field]
schema = CollectionSchema(fields, description="Knowledge base embeddings")
collection = Collection(name=collection_name, schema=schema)

In [6]:
import os
from langchain.docstore.document import Document

EXTRACTED_TEXT_DIR = "./pdf_extracted_text"

def load_documents_from_extracted_text(folder_path=EXTRACTED_TEXT_DIR):
    """Load every ``<subfolder>/document.md`` under ``folder_path`` as a Document.

    Only direct subfolders are inspected. Entries that are not directories,
    subfolders without a ``document.md``, and files that are empty after
    stripping whitespace are skipped. A file that cannot be read is reported
    on stdout and skipped, so one bad file does not abort the whole load.

    Args:
        folder_path: Root folder holding one subfolder per extracted PDF.

    Returns:
        A list of ``Document`` objects in sorted subfolder-name order. Each has
        the stripped file text as ``page_content`` and
        ``{"source": <subfolder name>}`` as ``metadata``.
    """
    documents = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document order (and everything derived from it) reproducible across runs.
    for subfolder in sorted(os.listdir(folder_path)):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        md_file_path = os.path.join(subfolder_path, "document.md")
        if not os.path.exists(md_file_path):
            continue

        try:
            with open(md_file_path, "r", encoding="utf-8") as f:
                text = f.read().strip()
            if not text:
                # Nothing extractable was written for this PDF.
                continue
            documents.append(Document(
                page_content=text,
                metadata={"source": subfolder}
            ))
        except Exception as e:
            print(f"Error reading {md_file_path}: {e}")

    return documents


In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings  # updated import

# Load the extracted text and cut it into overlapping chunks for embedding.
documents = load_documents_from_extracted_text("./pdf_extracted_text")
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(documents)
if not chunks:
    # Fail early with an actionable message instead of sending an empty insert.
    raise ValueError(
        "No text chunks to index: ./pdf_extracted_text contains no non-empty "
        "document.md files. Run the PDF extraction cell first."
    )

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
contents = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(contents)

# The collection's vector field was declared with width `dim`; catch a model /
# schema mismatch here rather than as a server-side insert error.
if len(embeddings[0]) != dim:
    raise ValueError(
        f"Embedding dimension {len(embeddings[0])} does not match collection dim {dim}"
    )

# Insert in bounded batches: a single request carrying every chunk may become
# too large for the server on big corpora.
# NOTE: re-running this cell without recreating the collection inserts the
# same chunks again (ids are auto-generated).
INSERT_BATCH_SIZE = 1000
for start in range(0, len(contents), INSERT_BATCH_SIZE):
    stop = start + INSERT_BATCH_SIZE
    # Column order matches the schema minus the auto-generated id: content, embedding.
    collection.insert([contents[start:stop], embeddings[start:stop]])
collection.flush()

# Create an index for efficient vector search
index_params = {"index_type": "AUTOINDEX", "metric_type": "IP", "params": {}}
collection.create_index("embedding", index_params)
collection.load()


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
