In [41]:
# !pip install langchain-huggingface

In [2]:
import pandas as pd
from langchain_community.vectorstores import FAISS
# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# !pip install langchain-community langchain-core

In [4]:
# def process_excel(excel_file=None, save_file=None):
#     try:
#         # 讀取 Excel 檔案
#         df = pd.read_excel(excel_file)
        
#         # 只選取需要的欄位
#         selected_columns = ['Idx', 'Question', 'Answer']
#         df_selected = df[selected_columns]
        
#         # 移除空值的列
#         df_selected = df_selected.dropna(how='all')
        
#         # 儲存為 CSV 檔案，使用逗號作為分隔符號
#         # output_file = 'processed_issues.csv'
#         df_selected.to_csv(save_file, index=False, encoding='utf-8')
        
#         print(f'成功將資料儲存至 {save_file}')
        
#     except Exception as e:
#         print(f'處理過程中發生錯誤: {str(e)}')


In [5]:
# 'issue checklist_all_1226a4_0204_146_MOD.xlsx'
# if __name__ == "__main__":
#     _save_path = "./qa.csv"
#     _excel_file = "./checklist_all_1226a4_0204_146_MOD.xlsx"
#     process_excel(excel_file=_excel_file, save_file=_save_path)

In [6]:
# 建立參數映射表：
"""
這裡建立了一個字典，將技術參數名稱映射到中文描述。這樣做的目的是提高可讀性，
讓非技術人員或需要本地化的使用者更容易理解參數的實際功能。例如，
在檢索結果中顯示“電源管理開關”而不是原始的參數名稱，有助於快速理解。

需要考慮用戶的場景，他們處理的是技術問答數據，涉及BIOS/EC設定，
參數名稱通常複雜且不易理解。因此，格式標準化和參數映射是提升系統可用性的重要步驟。
此外，這些處理能增強向量化後的一致性，提高檢索的準確性，因為統一的格式和清晰的參數描述有助於模型更好地理解內容。
"""
# Lookup table from raw firmware parameter identifiers to the short Chinese
# descriptions that should be shown in their place.
param_mapping = {
    "PcdCfgPeApmEnable": "電源管理開關",
    "gEfiAmdAgesaPkgTokenSpaceGuid.PcdAcpController": "音頻控制器配置",
}

In [1]:
def embeddingQA_from_csv(
    csv_file=None,
    qKey="Question",
    qAns="Answer",
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    index_path="qa_index_.faiss",
    db_path="tech_support_faiss",
    nlist=100,
):
    """Embed the questions of a Q&A CSV and persist them as FAISS indexes.

    Two artifacts are written: a raw FAISS IVF index file (``index_path``) and
    a LangChain FAISS vector store folder (``db_path``) whose metadata carries
    the answer for every question.

    Args:
        csv_file: Path of the CSV file to read.
        qKey: Name of the question column.
        qAns: Name of the answer column.
        model_name: Sentence-transformers model used both to encode the stored
            questions and as the query embedder registered on the vector
            store. The default matches the model that ``load_local_db`` passes
            when the store is reloaded.
        index_path: Destination file of the raw FAISS index.
        db_path: Destination folder of the LangChain vector store.
        nlist: Requested number of IVF clusters; clamped to the number of
            questions so that training cannot fail on small data sets.

    Returns:
        The LangChain FAISS vector store that was saved to ``db_path``.

    Raises:
        ValueError: If the CSV contains no row with both a question and an
            answer.
    """
    df = pd.read_csv(csv_file)

    # Rows without a question or an answer cannot be indexed; a NaN answer
    # would also make the string join below raise TypeError.
    df = df.dropna(subset=[qKey, qAns]).copy()
    # Strip before grouping so that whitespace variants of the same question
    # are merged instead of ending up as near-duplicate entries.
    df[qKey] = df[qKey].astype(str).str.strip()
    df[qAns] = df[qAns].astype(str)
    df = df[df[qKey] != ""]
    if df.empty:
        raise ValueError(f"No usable question/answer rows found in {csv_file!r}")

    # Merge the distinct answers of duplicated questions, one answer per line.
    df_clean = (
        df.groupby(qKey)[qAns]
        .apply(lambda x: '\n'.join(x.unique()))
        .reset_index()
    )

    # Normalize parameter assignments such as "PcdCfgPeApmEnable = 0" to
    # "PcdCfgPeApmEnable=0" so every answer uses one consistent format.
    df_clean[qAns] = df_clean[qAns].str.replace(r'(\w+)\s*=\s*(\d+)', r'\1=\2', regex=True)

    questions = df_clean[qKey].tolist()
    answers = df_clean[qAns].str.strip().tolist()

    # The same model must produce the stored vectors and the later query
    # vectors; otherwise searches compare embeddings from different spaces.
    model = SentenceTransformer(model_name)
    question_embeddings = model.encode(
        questions,
        convert_to_tensor=False,
        show_progress_bar=True
    )
    print(question_embeddings.shape)
    num_vectors, dimension = question_embeddings.shape

    # IVF training needs at least as many vectors as clusters, so the
    # requested cluster count is capped by the amount of data available.
    n_clusters = max(1, min(int(nlist), num_vectors))
    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, n_clusters)
    index.train(question_embeddings)
    index.add(question_embeddings)
    faiss.write_index(index, index_path)

    # LangChain integration: one metadata record per question, holding its
    # merged answer.
    metadatas = [{
        "answer": ans,
        "source": "內部技術資料庫",
        "last_updated": "2025-02"
    } for ans in answers]

    # Build the persistable vector store from the precomputed embeddings.
    vector_db = FAISS.from_embeddings(
        text_embeddings=list(zip(questions, question_embeddings)),
        embedding=HuggingFaceEmbeddings(model_name=model_name),
        metadatas=metadatas
    )
    vector_db.save_local(db_path)
    return vector_db

In [8]:
# embeddingQA_from_csv("qa.csv")

In [45]:
"""
import faiss
import numpy as np

def load_faiss_index(index_path):
    #載入預訓練的FAISS索引
    try:
        index = faiss.read_index(index_path)
        print(f"成功載入FAISS索引，包含 {index.ntotal} 個向量")
        return index
    except Exception as e:
        print(f"索引載入失敗: {str(e)}")
        return None

# 使用示例
index = load_faiss_index("tech_support_index.faiss")


from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

def load_vector_db(db_folder):
    #載入完整的向量資料庫
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )
    
    try:
        db = FAISS.load_local(
            folder_path=db_folder,
            embeddings=embeddings,
            allow_dangerous_deserialization=True  # 必要安全參數
        )
        print(f"載入成功，共 {db.index.ntotal} 筆技術問答")
        return db
    except Exception as e:
        print(f"向量庫載入異常: {str(e)}")
        return None

# 使用示例
vector_db = load_vector_db("tech_support_faiss")

"""

'\nimport faiss\nimport numpy as np\n\ndef load_faiss_index(index_path):\n    #載入預訓練的FAISS索引\n    try:\n        index = faiss.read_index(index_path)\n        print(f"成功載入FAISS索引，包含 {index.ntotal} 個向量")\n        return index\n    except Exception as e:\n        print(f"索引載入失敗: {str(e)}")\n        return None\n\n# 使用示例\nindex = load_faiss_index("tech_support_index.faiss")\n\n\nfrom langchain_community.vectorstores import FAISS\nfrom langchain_community.embeddings import HuggingFaceEmbeddings\n\ndef load_vector_db(db_folder):\n    #載入完整的向量資料庫\n    embeddings = HuggingFaceEmbeddings(\n        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"\n    )\n    \n    try:\n        db = FAISS.load_local(\n            folder_path=db_folder,\n            embeddings=embeddings,\n            allow_dangerous_deserialization=True  # 必要安全參數\n        )\n        print(f"載入成功，共 {db.index.ntotal} 筆技術問答")\n        return db\n    except Exception as e:\n        print(f"向量庫載入異常: {str(e)}")

In [None]:
# Load the parameter mapping table dynamically from a file.
def load_parameter_mappings(config_path):
    """Read a JSON file and return its parsed content.

    Args:
        config_path: Path of a UTF-8 encoded JSON file.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    # json is not among the notebook's top-level imports, so it is imported
    # here to keep this cell runnable on its own.
    import json

    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)


In [11]:
def read_embedding_and_query(idx_path=None, local_db_path=None):
    """Load the raw FAISS index and the LangChain vector store.

    Args:
        idx_path: Path passed on to ``load_index``.
        local_db_path: Folder passed on to ``load_local_db``.

    Returns:
        A ``(index, db)`` tuple holding the results of the two loaders, so the
        loaded objects are available to the caller instead of being discarded.
    """
    index = load_index(idx_path=idx_path)
    db = load_local_db(local_db_path=local_db_path)
    return index, db

def load_index(idx_path=None):
    """Read a FAISS index file, reporting the outcome on stdout.

    Args:
        idx_path: Path of the index file handed to ``faiss.read_index``.

    Returns:
        The loaded index, or ``None`` when any exception occurred.
    """
    try:
        index = faiss.read_index(idx_path)
        # Report how many vectors the loaded index contains.
        print(f"成功載入FAISS索引，包含 {index.ntotal} 個向量")
    except Exception as e:
        # Best-effort loader: describe the failure and signal it with None.
        print(f"索引載入失敗: {str(e)}")
        return None
    return index

def load_local_db(local_db_path=None):
    """Load a saved LangChain FAISS vector store, reporting the outcome on stdout.

    Args:
        local_db_path: Folder handed to ``FAISS.load_local``.

    Returns:
        The loaded vector store, or ``None`` when loading raised an exception.
    """
    query_embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )
    # NOTE(review): allow_dangerous_deserialization appears to opt in to
    # pickle-based loading - confirm the folder is always one created locally.
    load_kwargs = dict(
        folder_path=local_db_path,
        embeddings=query_embedder,
        allow_dangerous_deserialization=True,
    )
    try:
        db = FAISS.load_local(**load_kwargs)
        # Report how many entries the underlying index holds.
        print(f"載入成功，共 {db.index.ntotal} 筆技術問答")
    except Exception as e:
        # Best-effort loader: describe the failure and signal it with None.
        print(f"向量庫載入異常: {str(e)}")
        return None
    return db
    

In [12]:
# Locations of the raw FAISS index file and of the LangChain vector store folder.
# NOTE(review): embeddingQA_from_csv writes "qa_index_.faiss" (trailing
# underscore), which is not the file read here - confirm which name is intended.
_idx_path = "./qa_index.faiss"
_local_db = "./tech_support_faiss/"
read_embedding_and_query(
    idx_path=_idx_path,
    local_db_path=_local_db,
)

成功載入FAISS索引，包含 145 個向量
載入成功，共 145 筆技術問答
