In [82]:
# 環境設置（需先安裝 llama-index-core 與 langchain）
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
from llama_index.core.node_parser import SimpleNodeParser
from langchain.agents import initialize_agent, Tool
# from langchain_openai import ChatOpenAI
import logging
from langchain_ollama import OllamaLLM
from langchain_community.utilities import SQLDatabase
import numpy as np

In [84]:
from sentence_transformers import SentenceTransformer
import faiss
from llama_index.vector_stores.faiss import FaissVectorStore
# from llama_index.core.embeddings.mock_embed_model import MockEmbedding
# from llama_index.core import Settings
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [99]:
import os

In [64]:
class LLMInitializer:
    """Factory for the LLM clients used in this notebook."""

    def __init__(self):
        # Logger named after the current module; its configuration is left to
        # whoever runs the notebook.
        self.logger = logging.getLogger(__name__)

    def init_ollama_model(
        self,
        model: str = "deepseek-r1:7b",
        base_url: str = "http://localhost:11434",
        **kwargs
    ) -> OllamaLLM:
        """Create an ``OllamaLLM`` client.

        Args:
            model: Name of the Ollama model to use.
            base_url: URL of the Ollama service.
            **kwargs: Extra keyword arguments forwarded to ``OllamaLLM``.
                ``streaming`` defaults to ``True`` but may be overridden here.

        Returns:
            The constructed ``OllamaLLM`` instance.

        Raises:
            Exception: Whatever ``OllamaLLM`` raised during construction. The
                error is logged and then re-raised unchanged.
        """
        # setdefault keeps the previous default while letting callers pass
        # their own ``streaming`` value without a duplicate-keyword TypeError.
        kwargs.setdefault("streaming", True)
        try:
            return OllamaLLM(
                model=model,
                base_url=base_url,
                **kwargs
            )
        except Exception as e:
            self.logger.error(f"OllamaLLM模型初始化失敗: {str(e)}")
            # Re-raise instead of falling through: an implicit ``None`` return
            # contradicts the annotated return type and only moves the failure
            # to the first place the model is used.
            raise

In [70]:
# embed_model = SentenceTransformer("../.././../../../Embedding_Models/paraphrase-multilingual-MiniLM-L12-v2/")
# embed_model = SentenceTransformer("../.././../../../Embedding_Models/text2vec-base-chinese/")

In [71]:

# 自定義文件處理流程
# def custom_embedding_pipeline(documents):
#     # 生成嵌入向量
#     texts = [doc.text for doc in documents]
#     embeddings = embed_model.encode(texts, convert_to_tensor=False)
#     # 建立 FAISS 索引
#     dimension = embeddings.shape[1]
#     faiss_index = faiss.IndexFlatL2(dimension)
#     faiss_index.add(np.array(embeddings).astype('float32'))
#     # 建立向量儲存上下文
#     vector_store = FaissVectorStore(faiss_index=faiss_index)
#     storage_context = StorageContext.from_defaults(vector_store=vector_store)
    
#     return storage_context

In [104]:
# 修正後的嵌入流程與索引建立
# 全局設定嵌入模型
# Settings.embed_model = SentenceTransformer("shibing624/text2vec-base-chinese")
# Settings.embed_model=HuggingFaceEmbedding(model_name="../.././../../../Embedding_Models/text2vec-base-chinese/")

def custom_embedding_pipeline(documents):
    """Create and persist a FAISS-backed ``StorageContext`` for ``documents``.

    The FAISS index is sized from ``Settings.embed_model`` and returned empty.
    Vectors are meant to be inserted through ``VectorStoreIndex`` (for example
    ``VectorStoreIndex.from_documents(..., storage_context=...)``), which keeps
    the index struct and the vector positions in step. Vectors added straight
    to the FAISS index have no node mapping, so they are no longer pre-loaded
    here.

    Args:
        documents: Non-empty sequence of documents exposing a ``text``
            attribute. Only the first one is embedded, to learn the vector
            size.

    Returns:
        A ``StorageContext`` whose vector store is an empty ``IndexFlatL2``
        FAISS index. Its initial state is written to ``./storage``.

    Raises:
        ValueError: If ``documents`` is empty or the embedding model returns
            no vector for the probe text.
    """
    # Fail early with a clear message instead of an IndexError further down.
    if not documents:
        raise ValueError("documents must not be empty")

    persist_dir = "./storage"
    os.makedirs(persist_dir, exist_ok=True)

    # A single embedding is enough to learn the dimensionality of the model.
    probe_embedding = Settings.embed_model.get_text_embedding(documents[0].text)
    if not probe_embedding:
        raise ValueError("embedding model returned no vector for the first document")
    dimension = len(probe_embedding)

    # Build the (still empty) FAISS index and wrap it as a vector store.
    faiss_index = faiss.IndexFlatL2(dimension)
    vector_store = FaissVectorStore(faiss_index=faiss_index)

    # ``persist_dir`` is deliberately NOT passed to ``from_defaults``: that
    # argument makes it load existing doc/index stores from the directory,
    # which fails on a fresh directory and pulls in stale data on reruns.
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Writes the initial state only; persist again after building an index on
    # this context to save the indexed nodes.
    storage_context.persist(persist_dir=persist_dir)
    return storage_context




In [None]:
# 1. Build the legal-clause index with LlamaIndex
legal_docs_dir = "../../../misc/law"
# Load the files found under the directory as documents.
legal_docs = SimpleDirectoryReader(legal_docs_dir).load_data()
# Register the embedding model globally so the helper below and the index
# builder use the same one.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="shibing624/text2vec-base-chinese",
    embed_batch_size=32  # number of texts embedded per batch
)
# No embed_model argument is passed here; embeddings are generated while the
# index is built, on top of the FAISS-backed storage context from the helper.
legal_index = VectorStoreIndex.from_documents(
    documents=legal_docs,
    storage_context=custom_embedding_pipeline(legal_docs),
    show_progress=True
)
# Persist after indexing: the helper persists before any node has been added
# through the index, so without this call the built index is never saved.
legal_index.storage_context.persist(persist_dir="./storage")

# NOTE(review): no `llm` is passed and Settings.llm is not configured in the
# visible cells — confirm which model the query engine should use.
query_engine = legal_index.as_query_engine(similarity_top_k=3)

# 2. 封裝成 LangChain 工具
def legal_retriever(query: str) -> str:
    """Send ``query`` to the module-level ``query_engine`` and return its answer as text.

    Args:
        query: The question passed to ``query_engine.query``.

    Returns:
        ``str()`` of whatever the query engine returned.
    """
    return str(query_engine.query(query))

# Expose the retriever function to LangChain agents as a tool.
legal_tool = Tool(
    name="Legal Clause Retriever",
    func=legal_retriever,
    description="檢索最新法律條文與司法解釋"
)

# 3. Build the customer-service agent with LangChain
# llm = ChatOpenAI(model="gpt-4-turbo", temperature=0.3)
# LLMInitializer is only a factory; the agent needs the model object it
# creates, not the factory instance itself.
llm = LLMInitializer().init_ollama_model()
if llm is None:
    # Report a missing model here rather than deep inside agent construction.
    raise RuntimeError("Ollama model could not be initialised; see the log for details")

tools = [legal_tool]

agent = initialize_agent(
    tools,
    llm,
    agent="structured-chat-zero-shot-react-description",
    verbose=True
)

# 4. End-to-end example: ask the agent a question and print its answer.
response = agent.run(
    "用戶詢問合約中的不可抗力條款，請檢索民法相關規定並用台灣口語解釋"
)
print(response)


In [None]:
# Connection wrapper around the SQLite contracts database.
db_tool = SQLDatabase.from_uri("sqlite:///contracts.db")

# A SQLDatabase is not itself an agent tool, so wrap its `run` method in a
# Tool before adding it to the tool list.
# NOTE(review): this executes whatever SQL the agent produces — consider a
# read-only connection or query validation before using it on real data.
contract_sql_tool = Tool(
    name="Contract Database Query",
    func=db_tool.run,
    description="對合約資料庫執行 SQL 查詢並回傳結果"
)
# NOTE(review): an agent constructed before this append presumably keeps the
# tool set it was created with — re-create the agent to make use of this tool.
tools.append(contract_sql_tool)