In [2]:
import os
import docx
import pymupdf
import json
import faiss

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.docstore.document import Document

# **初始化 vector store**

In [3]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

index = faiss.IndexFlatL2(len(embeddings.embed_query("test")))  # faiss.IndexIVF, faiss.IndexIVFPQ
    
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# **Chunking**
Strategies (example):
- `.txt` and `.docx`: sliding window
- `.md`: recursive split 
- `.pdf`: llm based split
- `.json`: pure qa pairs

### **Load Markdown**

In [4]:
def get_md_chunks(text: str) -> list:
    text = text.replace("###", "---h3---").replace("##", "---h2---").replace("#", "---h1---").replace("\n\n", "\n")
    recursive_splitter_md = RecursiveCharacterTextSplitter(separators=["---h2---", "---h3---", "\n"], chunk_size=256)
    md_chunks = recursive_splitter_md.split_text(text)
    return md_chunks

### **Load Pdf**

In [5]:
from langchain_google_genai import GoogleGenerativeAI
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate.from_template("""
    請整理以下文字，用專案主題：<專案主題>\n**標題：<標題>內容：<內容>**\n\n**標題：<標題>內容：<內容>**\n\n ...... 的格式回傳。{pdf_text}""")
pdf_arranger = prompt_template | GoogleGenerativeAI(model="gemini-2.5-flash-lite")

def get_pdf_chunks(text: str) -> list:
    formatted_text = pdf_arranger.invoke({"pdf_text": text})
    splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=256, chunk_overlap=128)
    pdf_chunks = splitter.split_text(formatted_text)
    return pdf_chunks

### **Load Docx and txt**

In [6]:
def get_docx_and_txt_chunks(text: str) -> list:
    splitter = CharacterTextSplitter(chunk_size=256, chunk_overlap=128, separator="\n")
    txt_chunks = splitter.split_text(text)
    return txt_chunks

### **Load Json**

In [7]:
def get_json_chunks(data: dict) -> list:
    return [str(item) for item in data]

# **Add Documents**

In [8]:
import time 

docs = []
ids = []

for file in os.listdir("../raw_database"):
    file_path = os.path.join("../raw_database", file)
    if file.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        chunks = get_docx_and_txt_chunks(text)
    elif file.endswith(".docx"):
        doc = docx.Document(file_path)
        text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
        chunks = get_docx_and_txt_chunks(text)
    elif file.endswith(".md"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        chunks = get_md_chunks(text)
    elif file.endswith(".pdf"):
        doc = pymupdf.open(file_path)
        text = "\n".join(page.get_text() for page in doc)
        chunks = get_pdf_chunks(text)
    elif file.endswith(".json"):
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        chunks = get_json_chunks(data)
    else:
        continue
    
    # wrap in a Document object
    docs.extend(
        [
            Document(
                page_content=chunk, 
                metadata={
                    "source": file,
                    "chunk_index": i,
                    "last_update": time.time(),
                }
            ) 
            for i, chunk in enumerate(chunks)
        ]
    )

    # create unique IDs for each document
    ids.extend(
        [
            f"{file}_{i}"
            for i in range(len(chunks))
        ]
    )

In [9]:
vector_store.add_documents(docs, ids=ids)

['course_python_data_analysis_intro.txt_0',
 'course_python_data_analysis_intro.txt_1',
 'project_marketing_comp.pdf_0',
 'project_marketing_comp.pdf_1',
 'project_marketing_comp.pdf_2',
 'project_football_ig.pdf_0',
 'project_football_ig.pdf_1',
 'project_football_ig.pdf_2',
 'project_football_ig.pdf_3',
 'course_digital_marketing_notes.docx_0',
 'course_digital_marketing_notes.docx_1',
 'course_digital_marketing_notes.docx_2',
 'course_digital_marketing_notes.docx_3',
 'course_digital_marketing_notes.docx_4',
 'course_digital_marketing_notes.docx_5',
 'course_digital_marketing_notes.docx_6',
 'course_digital_marketing_notes.docx_7',
 'course_digital_marketing_notes.docx_8',
 'course_digital_marketing_notes.docx_9',
 'course_digital_marketing_notes.docx_10',
 'course_digital_marketing_notes.docx_11',
 'course_digital_marketing_notes.docx_12',
 'course_digital_marketing_notes.docx_13',
 'course_digital_marketing_notes.docx_14',
 'course_digital_marketing_notes.docx_15',
 'course_digita

In [10]:
vector_store.save_local("../vector_store")

In [11]:
# vector_store.delete(ids=[...])