In [1]:
import os
import docx
import pymupdf
import json

from chromadb import PersistentClient
from chromadb.utils.embedding_functions import GoogleGenerativeAiEmbeddingFunction
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [2]:
# On-disk Chroma client; no path argument is passed, so Chroma's default location is used.
vector_store = PersistentClient()

# Embedding function backed by Google Generative AI; the key is read from the environment.
_google_embedding_fn = GoogleGenerativeAiEmbeddingFunction(
    api_key=os.getenv("GOOGLE_API_KEY"),
    model_name="models/embedding-001",
)

# Reuse the "my_database" collection when it already exists, otherwise create it.
collection = vector_store.get_or_create_collection(
    name="my_database",
    embedding_function=_google_embedding_fn,
)

  from .autonotebook import tqdm as notebook_tqdm


## **Chunking Strategy**
- `.txt` and `.docx`: sliding window
- `.md`: recursive split 
- `.pdf`: LLM-based arrangement
- `.json`: pure qa pairs

### **Load Markdown**

In [3]:
def get_md_chunks(text: str) -> list:
    """Split Markdown text into chunks along heading boundaries.

    Heading markers are first rewritten into explicit tokens, longest marker
    first so that "###" is not consumed by the "##" or "#" rules, and blank
    lines are collapsed. The text is then split recursively on the h2 token,
    the h3 token and finally on newlines, with a chunk size of 256.

    Note: the rewrite is a plain string replacement, so every "#" character
    in the text is converted, not only those that start a heading.

    Args:
        text: Raw Markdown content.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    # Order matters: longer markers must be replaced before shorter ones.
    substitutions = (
        ("###", "---h3---"),
        ("##", "---h2---"),
        ("#", "---h1---"),
        ("\n\n", "\n"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    return RecursiveCharacterTextSplitter(
        separators=["---h2---", "---h3---", "\n"],
        chunk_size=256,
    ).split_text(text)

### **Load Pdf**

In [4]:
from langchain_google_genai import GoogleGenerativeAI
from langchain.prompts import PromptTemplate

# Prompt (written in Chinese) that asks the model to tidy the supplied text and
# return it as "project topic: <topic>" followed by "**title: <title> content:
# <content>**" entries separated by blank lines. `{pdf_text}` is the only
# template variable and is appended directly after the instruction.
prompt_template = PromptTemplate.from_template("""
    請整理以下文字，用專案主題：<專案主題>\n**標題：<標題>內容：<內容>**\n\n**標題：<標題>內容：<內容>**\n\n ...... 的格式回傳。{pdf_text}""")
# Chain: fill the prompt, then send it to the Gemini model named below.
pdf_arranger = prompt_template | GoogleGenerativeAI(model="gemini-2.5-flash-lite")

def get_pdf_chunks(text: str) -> list:
    """Rearrange extracted PDF text with the LLM chain, then chunk the result.

    The raw text is passed to `pdf_arranger` as the `pdf_text` variable. The
    chain's output is split on blank lines first and single newlines second,
    using a chunk size of 256 with an overlap of 128.

    Args:
        text: Plain text extracted from a PDF.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    arranged = pdf_arranger.invoke({"pdf_text": text})
    return RecursiveCharacterTextSplitter(
        chunk_size=256,
        chunk_overlap=128,
        separators=["\n\n", "\n"],
    ).split_text(arranged)

### **Load Docx and txt**

In [5]:
def get_docx_and_txt_chunks(text: str) -> list:
    """Chunk plain text (from .txt or .docx files) with an overlapping window.

    The text is split on newlines with a chunk size of 256 and an overlap of
    128 between consecutive chunks.

    Args:
        text: Plain text content of the document.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    return CharacterTextSplitter(
        separator="\n",
        chunk_size=256,
        chunk_overlap=128,
    ).split_text(text)

### **Load Json**

In [6]:
def get_json_chunks(data: dict) -> list:
    """Turn parsed JSON data into one text chunk per entry.

    Args:
        data: Parsed JSON content. Either a mapping, where each key/value
            pair becomes one chunk, or a sequence such as a list of Q&A
            records, where each element becomes one chunk.

    Returns:
        A list of strings, one per entry. Empty input gives an empty list.
    """
    if isinstance(data, dict):
        # Iterating a dict yields only its keys, which would drop every value.
        # Emit each pair as a single-entry dict so both sides are kept and the
        # chunk format matches the one produced for a list of records.
        return [str({key: value}) for key, value in data.items()]
    return [str(item) for item in data]

# **Add Documents**

In [7]:
# Ingest every supported file under raw_database/ into the Chroma collection.
# The listing is sorted so files are processed in the same order on every run.
for file in sorted(os.listdir("raw_database")):
    file_path = os.path.join("raw_database", file)
    if file.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        chunks = get_docx_and_txt_chunks(text)
    elif file.endswith(".docx"):
        doc = docx.Document(file_path)
        text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
        chunks = get_docx_and_txt_chunks(text)
    elif file.endswith(".md"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        chunks = get_md_chunks(text)
    elif file.endswith(".pdf"):
        # The context manager closes the PDF handle once its text is extracted.
        with pymupdf.open(file_path) as doc:
            text = "\n".join(page.get_text() for page in doc)
        chunks = get_pdf_chunks(text)
    elif file.endswith(".json"):
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        chunks = get_json_chunks(data)
    else:
        # Unsupported extension: nothing to ingest.
        continue

    # Chroma rejects empty id/document lists, so a file that yields no chunks
    # (for example an empty document) is skipped instead of aborting the run.
    if not chunks:
        continue

    # upsert instead of add: ids are derived from the file name and chunk
    # index, so re-running this cell refreshes existing entries rather than
    # leaving stale content in place.
    collection.upsert(
        documents=chunks,
        metadatas=[{"source": file, "chunk_index": i} for i in range(len(chunks))],
        ids=[f"{file}_{i}" for i in range(len(chunks))]
    )