# 构建向量数据库
读取文本文件，构建嵌入式向量数据库

In [1]:
import os
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. 加载 Markdown 文件
loader = DirectoryLoader(
    path="data/",
    glob="**/*.md",  # 仅匹配 .md 文件
    loader_cls=UnstructuredMarkdownLoader,
    show_progress=True
)
documents = loader.load()

  0%|          | 0/75 [00:00<?, ?it/s]Error loading file data\08.组织级工作区\01.EPG工作区\01.过程改进计划及建议\03-组织过程改进计划.doc.md


ModuleNotFoundError: No module named 'unstructured'

In [None]:
# 2. 分块（结合 Markdown 标题和字符分割）
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3")
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = []
for doc in documents:
    md_chunks = markdown_splitter.split_text(doc.page_content)
    for chunk in md_chunks:
        chunk.metadata.update(doc.metadata)
        chunks.extend(text_splitter.split_documents([chunk]))

In [None]:
# 3. 嵌入和向量存储
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db"
)
print("向量存储已构建并保存到 ./chroma_db")