In [42]:
from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
import glob

# Configuration
# Input locations (relative paths / glob patterns).
PDF_FILES_PATH = "KB3/pdfs/*.pdf"
TXT_FILES_PATH = "KB3/pdfs/test1.txt"

# Embedding model name and the directory the FAISS index is saved to.
EMBEDDING_MODEL = "nomic-embed-text"
FAISS_DB_PATH = "faiss_db"

# Chunking parameters for text splitting.
CHUNK_SIZE = 1_000
CHUNK_OVERLAP = 100


def load_and_split_files(pdf_path, txt_path, chunk_size=None, chunk_overlap=None,
                         include_pdfs=False):
    """Load the matching source files and split them into overlapping chunks.

    Args:
        pdf_path: Glob pattern for PDF files. Matching PDFs are only loaded
            when ``include_pdfs`` is true.
        txt_path: Glob pattern (or plain path) for text files.
        chunk_size: Chunk size handed to the splitter. Defaults to the
            module-level ``CHUNK_SIZE``.
        chunk_overlap: Overlap between neighbouring chunks. Defaults to the
            module-level ``CHUNK_OVERLAP``.
        include_pdfs: When true, every file matched by ``pdf_path`` is loaded
            with ``PDFPlumberLoader`` in addition to the text files. Off by
            default, so only text files are ingested.

    Returns:
        The list of split documents, or ``[]`` when no file matched, no
        document could be loaded, or any step raised (the error is printed,
        not propagated, because the caller treats an empty list as "nothing
        to index").
    """
    try:
        pdf_files = glob.glob(pdf_path)
        txt_files = glob.glob(txt_path)

        if not pdf_files and not txt_files:
            print(f"未找到符合路径 {pdf_path} 的 PDF 文件和符合路径 {txt_path} 的 TXT 文件。")
            return []

        # Fall back to the notebook configuration instead of magic numbers so
        # that editing CHUNK_SIZE / CHUNK_OVERLAP actually changes the chunking.
        if chunk_size is None:
            chunk_size = CHUNK_SIZE
        if chunk_overlap is None:
            chunk_overlap = CHUNK_OVERLAP

        all_docs = []

        # glob order depends on the filesystem; sort so the chunk order (and
        # therefore the saved index) is reproducible across runs.
        if include_pdfs:
            for file in sorted(pdf_files):
                all_docs.extend(PDFPlumberLoader(file).load())

        for file in sorted(txt_files):
            # autodetect_encoding is only a fallback: the loader's default
            # decoding is tried first and detection runs if that fails.
            all_docs.extend(TextLoader(file, autodetect_encoding=True).load())

        # Files matched but nothing was loaded (e.g. only PDFs were found while
        # include_pdfs is off): say so instead of reporting a successful split.
        if not all_docs:
            print("未加载到任何文档，跳过切分。")
            return []

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
        )
        all_splits = text_splitter.split_documents(all_docs)
        print(f"成功加载并分割 {len(all_splits)} 个文本块。")
        return all_splits
    except Exception as e:
        print(f"加载和切分文件时出错: {e}")
        return []


def build_vector_store():
    """Embed the configured documents into a FAISS index and save it to disk.

    Chunks come from ``load_and_split_files(PDF_FILES_PATH, TXT_FILES_PATH)``;
    they are embedded with ``OllamaEmbeddings(model=EMBEDDING_MODEL)`` and the
    resulting index is written to ``FAISS_DB_PATH``.

    Returns:
        The FAISS vector store, or ``None`` when there are no chunks to index
        or when embedding / saving raised (the error is printed).
    """
    chunks = load_and_split_files(PDF_FILES_PATH, TXT_FILES_PATH)

    store = None
    if chunks:
        try:
            embedder = OllamaEmbeddings(model=EMBEDDING_MODEL)
            # normalize_L2=True is forwarded to the FAISS wrapper; presumably it
            # L2-normalises vectors before indexing/search — confirm in the docs.
            built = FAISS.from_documents(
                documents=chunks,
                embedding=embedder,
                normalize_L2=True,
            )
            built.save_local(FAISS_DB_PATH)
            print("向量存储构建完成并持久化到目录")
            store = built
        except Exception as err:
            print(f"初始化向量存储时出错: {err}")
    return store

    

In [43]:
# Build the index and save it to FAISS_DB_PATH; the returned store (or None on
# failure) is the cell's displayed value.
build_vector_store()

成功加载并分割 2 个文本块。
向量存储构建完成并持久化到目录


<langchain_community.vectorstores.faiss.FAISS at 0x7366e4d64fe0>