In [None]:
pip install langchain chromadb fastapi uvicorn pandas PyPDF2 openpyxl requests

第二步：读取和分块文档

In [None]:
from langchain.document_loaders import PyPDFLoader, UnstructuredExcelLoader
from langchain.text_splitter import RecursiveJsonSplitter, RecursiveCharacterTextSplitter

def load_and_split(file_path, chunk_size=512, chunk_overlap=64):
    """Load a PDF or Excel file and split its contents into overlapping chunks.

    Args:
        file_path: Path of the file to load, as a string. Files ending in
            ``.pdf``, ``.xlsx`` or ``.xls`` are supported; the extension check
            is case-insensitive.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 512.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 64.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``
        for the loaded documents.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    # str.endswith only accepts several suffixes as a tuple; passing them as
    # separate arguments makes the second one the `start` index and raises
    # TypeError before the unsupported-type branch can be reached.
    elif lowered.endswith(('.xlsx', '.xls')):
        loader = UnstructuredExcelLoader(file_path, mode='elements')
    else:
        raise ValueError(f"Unsupported file type: {file_path}")

    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return text_splitter.split_documents(documents)

# Example: load the sample PDF and split it into chunks
chunks = load_and_split("./data/电动汽车安全要求.pdf")

# Print the first 100 characters of every chunk as a quick sanity check
for chunk in chunks:
    preview = chunk.page_content[:100]
    print(preview)
    

第三步：使用 Ollama 的 bge-m3 模型生成 Embedding

In [None]:
from langchain.embeddings.base import Embeddings
import requests

class OllamaEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by an Ollama HTTP server.

    Each text is POSTed to ``{base_url}/api/embeddings`` and the ``embedding``
    field of the JSON response is used as its vector.
    """

    def __init__(
        self,
        model_name: str = 'bge-m3:latest',
        base_url: str = 'http://localhost:11434',
        timeout: float = 120.0,
    ):
        """Store the connection settings; no request is made here.

        Args:
            model_name: Model name sent in the ``model`` field of each request.
            base_url: Root URL of the Ollama server.
            timeout: Seconds to wait for each HTTP request before ``requests``
                raises a timeout error. Without it an unresponsive server
                would block the caller indefinitely.
        """
        self.model_name = model_name
        self.base_url = base_url
        self.timeout = timeout

    def embed_documents(self, texts):
        """Embed every text in ``texts``, one HTTP request per text.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list with one embedding per input text, in input order.

        Raises:
            RuntimeError: If the server answers with a non-200 status code.
        """
        # Strip trailing slashes so a base URL such as "http://host:11434/"
        # does not produce "//api/embeddings".
        endpoint = f"{self.base_url.rstrip('/')}/api/embeddings"
        embeddings = []
        for text in texts:
            response = requests.post(
                endpoint,
                json={
                    "model": self.model_name,
                    "prompt": text
                },
                timeout=self.timeout,
            )
            if response.status_code != 200:
                raise RuntimeError(f"Error embedding text: {response.text}")
            embeddings.append(response.json()['embedding'])
        return embeddings

    def embed_query(self, text):
        """Embed a single query string and return its vector."""
        return self.embed_documents([text])[0]
    
# Usage example
embeddings = OllamaEmbeddings(model_name="bge-m3:latest")
query_embedding = embeddings.embed_query("高压维修断开装置要求有哪些？")
embedding_dim = len(query_embedding)
print(embedding_dim)  # vector dimensionality, usually 1024
         

第四步：将向量数据写入 Chroma 数据库

In [None]:
from langchain.vectorstores import Chroma

# Create the Chroma vector store, stored under a local directory
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

# Insert the document chunks into the store, then persist it
vectorstore.add_documents(chunks)
vectorstore.persist()

In [5]:
from fastapi import FastAPI
from langchain.vectorstores import Chroma
from langchain.embeddings.base import Embeddings
import requests
import os

# FastAPI application instance; the /search route below is registered on it.
app = FastAPI()

# 自定义 Ollama Embeddings 类（同上）
class OllamaEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by an Ollama HTTP server.

    Each text is POSTed to ``{base_url}/api/embeddings`` and the ``embedding``
    field of the JSON response is used as its vector.
    """

    def __init__(
        self,
        model_name="bge-m3:latest",
        base_url="http://localhost:11434",
        timeout=120.0,
    ):
        """Store the connection settings; no request is made here.

        Args:
            model_name: Model name sent in the ``model`` field of each request.
            base_url: Root URL of the Ollama server.
            timeout: Seconds to wait for each HTTP request before ``requests``
                raises a timeout error. Without it an unresponsive server
                would block the caller indefinitely.
        """
        self.model_name = model_name
        self.base_url = base_url
        self.timeout = timeout

    def embed_documents(self, texts):
        """Embed every text in ``texts``, one HTTP request per text.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list with one embedding per input text, in input order.

        Raises:
            RuntimeError: If the server answers with a non-200 status code.
        """
        # Strip trailing slashes so a base URL such as "http://host:11434/"
        # does not produce "//api/embeddings".
        endpoint = f"{self.base_url.rstrip('/')}/api/embeddings"
        embeddings = []
        for text in texts:
            response = requests.post(
                endpoint,
                json={
                    "model": self.model_name,
                    "prompt": text
                },
                timeout=self.timeout,
            )
            if response.status_code != 200:
                raise RuntimeError(f"Error embedding text: {response.text}")
            embeddings.append(response.json()['embedding'])
        return embeddings

    def embed_query(self, text):
        """Embed a single query string and return its vector."""
        return self.embed_documents([text])[0]

# Load the vector database from its local directory
embeddings = OllamaEmbeddings()
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

@app.get("/search")
async def search(query: str, k: int = 5):
    """Return the stored chunks most similar to ``query`` together with their scores.

    Args:
        query: Text to search for.
        k: Number of results requested from the vector store. Defaults to 5.

    Returns:
        A dict ``{"results": [...]}`` where each item has the chunk text under
        ``"content"`` and the score reported by the vector store under ``"score"``.
    """
    # Imported here so the block is self-contained; fastapi is already a
    # dependency of this cell.
    from fastapi.concurrency import run_in_threadpool

    # The similarity search is synchronous, and the store's embedding function
    # uses a blocking requests.post. Calling it directly inside a coroutine
    # would stall the event loop for every other request, so it is run in a
    # worker thread instead.
    results = await run_in_threadpool(
        vectorstore.similarity_search_with_score, query, k=k
    )
    # float() keeps the score JSON-serialisable whatever numeric type the
    # store returns.
    formatted = [
        {"content": doc.page_content, "score": float(score)}
        for doc, score in results
    ]
    return {"results": formatted}

In [None]:
uvicorn main:app --reload