In [13]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
from langchain_community.vectorstores import FAISS

In [16]:
def create_faiss_index(questions, model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", index_path="./db/qa_index_new.faiss"):
    """
    建立 FAISS 索引
    
    參數:
    questions: list of str - 問題列表
    model_name: str - 使用的模型名稱
    index_path: str - 索引儲存路徑
    """
    # 初始化模型
    model = SentenceTransformer(model_name)
    
    # 生成嵌入向量
    print("正在生成問題嵌入向量...")
    question_embeddings = []
    batch_size = 32
    
    for i in tqdm(range(0, len(questions), batch_size)):
        batch = questions[i:i + batch_size]
        embeddings = model.encode(batch, convert_to_tensor=True)
        question_embeddings.append(embeddings.cpu().numpy())
    
    question_embeddings = np.vstack(question_embeddings)
    
    # 確保向量類型為 float32
    question_embeddings = question_embeddings.astype(np.float32)
    
    # 獲取向量維度
    dimension = question_embeddings.shape[1]  # 對於 mpnet-base-v2 應該是 768
    print(f"向量維度: {dimension}")
    
    # 建立 FAISS 索引
    print("建立 FAISS 索引...")
    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, 100)  # 100 是聚類中心數量
    
    # 訓練索引
    print("訓練索引...")
    index.train(question_embeddings)
    
    # 添加向量到索引
    print("添加向量到索引...")
    index.add(question_embeddings)
    
    # 儲存索引
    print(f"儲存索引到 {index_path}")
    faiss.write_index(index, index_path)
    
    print(f"索引建立完成，共包含 {index.ntotal} 個向量")
    return index

# 使用示例
def main():
    # 假設你有一個問題列表
    # questions = [
    #     "如何處理系統錯誤？",
    #     "如何重置密碼？",
    #     # ... 更多問題
    # ]
    qKey = "Question"
    qAns = "Answer"
    df = pd.read_csv("./data/qa.csv")
    df_clean = df.groupby(qKey)[qAns].apply(lambda x: '\n'.join(x.unique())).reset_index()
    df_clean[qAns] = df_clean[qAns].str.replace(r'(\w+)\s*=\s*(\d+)', r'\1=\2', regex=True)
    questions = df_clean[qKey].str.strip().tolist()
    answers = df_clean[qAns].str.strip().tolist()
    model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
    question_embeddings = model.encode(
        questions,
        convert_to_tensor=False,
        show_progress_bar=True
    )
    print(question_embeddings.shape)
    dimension = question_embeddings.shape[1]#(#__1)  # 向量維度(通常為384/768)
    index = faiss.IndexFlatL2(dimension)
    # quantizer = faiss.IndexFlatL2(dimension)
    # index = faiss.IndexIVFFlat(quantizer, dimension, 3900)
    # index.train(question_embeddings)
    index.add(question_embeddings)
    faiss.write_index(index, "./db2/q_index.faiss")
    print("index created and saved");
    # 建立Answer Vector DB
    metadatas = [{
        "answer": ans, 
        "source": "內部技術資料庫",
        "last_updated": "2025-02"
    } for ans in answers]
    # 建立可持久化向量庫
    vector_db = FAISS.from_embeddings(
        text_embeddings=list(zip(questions, question_embeddings)),
        embedding=SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2"),
        metadatas=metadatas
    )
    # 保存完整向量庫
    vector_db.save_local("./db2/qa_vecdb_faiss_new")
    print("vector db saved")
    

# if __name__ == "__main__":
#     main()


In [17]:
main()

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 26.30it/s]


(145, 768)
index created and saved


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


vector db saved
