In [1]:
import os

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

from langchain_openai import ChatOpenAI

chat = ChatOpenAI(
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
    openai_api_base=os.environ.get('CHATGPT_API_ENDPOINT')   
)



In [2]:
from langchain.document_loaders import PyPDFLoader

loaders = [
    PyPDFLoader("docs/01.pdf"),
    PyPDFLoader("docs/02.pdf"),
    PyPDFLoader("docs/03.pdf"),
    PyPDFLoader("docs/04.pdf")
]

docs = []

for loader in loaders:
    docs.extend(loader.load())

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

chunks = text_splitter.split_documents(docs)

len(chunks)    

109

In [3]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    base_url=os.environ.get('CHATGPT_API_ENDPOINT')
)

In [4]:
from langchain.llms import Ollama
chat = Ollama(model="openchat:latest")

  chat = Ollama(model="openchat:latest")


In [5]:
from langchain.vectorstores import  FAISS

faissVectorDb = FAISS.from_documents(docs, embeddings)

In [6]:
query = "在香港有什麼美味的食物？ 如果有，請給出該店的地址"

docs = faissVectorDb.similarity_search(query)

docs[0].page_content[:200]

'1110\n小店美食\n13  添好運點心專門店\n全球最便宜的米芝蓮星級餐廳之一，雖然在台\n灣已經開了多間分店，但要品嚐正港風味的點\n心，還是要到本地的街坊小店。菜單上的選擇\n或許不如茶樓多，但價格親民，招牌點心酥皮\n焗叉燒包，保證讓人一試愛上。\n地址：深水埗福榮街9至11號 \n電話：+852 2788 1226\n網址：www.timhowan.com.hk\n14  新香園\n來到這家老字號茶餐廳，必吃'

In [7]:
from langchain.chains import RetrievalQA

In [8]:
retriever = faissVectorDb.as_retriever()

In [9]:
model = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=chat,
    chain_type="stuff",
    verbose=True
)

In [10]:
response = model(
    { "query": query },
    return_only_outputs=True
)
print(response["result"])

  response = model(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 在香港，有很多美味的食物。以下是一些建議：

1. 蘇記車仔麵 - 深水埗福榮街109、111-117E、121及123C號。
2. 文記車仔麵 - 同一條街的三家店位於深水埗福榮街109、111-117E、121及123C號。
3. 飛鷹餐廳 - 深水埗荔枝角道256號。
4. 綠林甜品 - 深水埗元州街77至79號。

這些店家提供了傳統和新型的香港美食，如車仔麵、牛排、湯頭、甜點等。


如何保存和加載FAISS

In [11]:
!rm -rf ./db

In [38]:
persist_directory = "./db"

faissVectorDb.save_local(persist_directory)

localdb = FAISS.load_local(
    folder_path=persist_directory,
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)

查看FAISS VectorDb的內容

In [13]:
import pandas as pd

In [65]:
def store_to_df(vectordb):
    v_dict = faissVectorDb.docstore._dict
    data_rows = []
    for k in v_dict.keys():
        doc_name = v_dict[k].metadata["source"].split("/")[-1]
        page_number = v_dict[k].metadata["page"] + 1
        page_content = v_dict[k].page_content
        data_rows.append(
            {
                "chunk_id": k,
                "doc_name": doc_name,
                "page_number": page_number,
                "page_content": page_content
            }
        )

    vector_df = pd.DataFrame(data_rows)
    return vector_df

def show_vectorStore(vectordb):
    vector_df = store_to_df(vectordb)
    display(vector_df)

show_vectorStore(localdb)

Unnamed: 0,chunk_id,doc_name,page_number,page_content
0,2aed02b6-0cbf-4013-8440-3cfa4277a67a,01.pdf,1,衝\n一\n波\n$$07&3JOEE$$07&3...
1,a4133326-b13e-42fe-a7a3-7dd6bf650dfd,01.pdf,2,網路報導專區\n下載PDF手冊\n閱讀\n動態電子書\n23　中　環\t 蘭桂坊Bar Ho...
2,8514b71e-1d5f-4c34-90d3-5bbd99fd06c6,01.pdf,3,04\n地址：香港九龍太子砵蘭街418號地下 電話：＋852-2392-9283\n營業時間...
3,1220f9a9-178e-4894-bbbf-c2726c5ee5c1,01.pdf,4,06\n地址：香港九龍旺角通菜街1A-1L 威達 Deli2商業大廈1字樓 B 舖 電話：＋...
4,7ef97d1e-2612-4073-b51e-07da9f4f6698,01.pdf,5,08\n地址：香港銅鑼灣謝斐道477-481號肇明大廈地下及1樓 A 室 電話：＋852-2...
...,...,...,...,...
83,327de8f1-dde2-413c-95ea-46657d115137,04.pdf,25,4746\n添好運點心專門店 \np.11\n寶華扎作 \np.34\n汝州街 \n（珠仔街...
84,47ae1904-9a4d-41d1-a726-be047b871b2a,04.pdf,26,4948\n 4948\nDoughnut \np.9\n福榮街 \np.31\n合益泰小...
85,5955aed7-927a-426b-95fb-77ed771b6bee,04.pdf,27,5150\n福榮街 \np.31\n公和荳品廠 \np.10\n1\n2\n3\n汝州街 \...
86,0167bad6-7384-4c2a-9135-78c7e315a8aa,04.pdf,28,香港旅遊發展局旅客服務\nHONG KONG TRAVEL BUDDY \n歡迎旅客掃描QR...


如何刪除Vector DB中的檔案

In [40]:
#刪除對應PDF的文檔
def delete_document(store, document_name):
    vector_df = store_to_df(store)
    chunk_lists = vector_df.loc[vector_df["doc_name"]==document_name]["chunk_id"].tolist()
    store.delete(ids=chunk_lists)

#更新向量數據庫，同時更新RetrievalQA(答問系統)
def refresh_model(new_store):
    retriever = new_store.as_retriever()
    model = RetrievalQA.from_chain_type(
        llm=chat,
        chain_type="stuff",
        retriever=retriever,
        verbose=True
    )
    return model

In [63]:
delete_document(localdb, "01.pdf")

ValueError: Some specified ids do not exist in the current store. Ids not found: {'d6ecac64-6f21-4029-9595-5ac1392d1819', '2b773050-edf7-44e3-8d98-d30c2c54861d', '2552946a-d283-4400-a940-8377c4b86792', 'a9c11743-3320-4276-8e6c-5eb1f51a2f77', 'da3c88a7-1784-443e-8b70-48bd1c53a6b5', 'd0e2b6f9-4dfe-4a19-a3f0-b66cc21aed58', '161bea2b-c5af-4166-b3aa-a9b31ebe7ea2', '1220f9a9-178e-4894-bbbf-c2726c5ee5c1', 'a3d3bce2-1539-471f-81aa-30bde8c5f3ef', '10a16c5b-1c04-4654-b28b-1931011d3249', '082904f6-ef86-4e51-8fca-d0a4ca96c6cb', '43d974df-1b8d-456d-a38b-d437132f5b3f', 'a4133326-b13e-42fe-a7a3-7dd6bf650dfd', '3d32ba34-95de-4d8d-b0d2-1a74886b074b', '7ef97d1e-2612-4073-b51e-07da9f4f6698', '8514b71e-1d5f-4c34-90d3-5bbd99fd06c6', '2aed02b6-0cbf-4013-8440-3cfa4277a67a', '0cc79b5e-a53e-41af-a2a3-55bf1e61aead', '1755041a-31ab-494b-81dc-08df5093ab19', '27413390-f656-46b8-b831-130e6897f591', '3b5d950f-ef86-4eda-93f3-f5a54f886d05', '73aa854f-0213-45c4-bf0f-5258457a927a', '335a5260-5afd-499c-91a4-d554cf277bee'}

In [66]:
show_vectorStore(localdb)

Unnamed: 0,chunk_id,doc_name,page_number,page_content
0,2aed02b6-0cbf-4013-8440-3cfa4277a67a,01.pdf,1,衝\n一\n波\n$$07&3JOEE$$07&3...
1,a4133326-b13e-42fe-a7a3-7dd6bf650dfd,01.pdf,2,網路報導專區\n下載PDF手冊\n閱讀\n動態電子書\n23　中　環\t 蘭桂坊Bar Ho...
2,8514b71e-1d5f-4c34-90d3-5bbd99fd06c6,01.pdf,3,04\n地址：香港九龍太子砵蘭街418號地下 電話：＋852-2392-9283\n營業時間...
3,1220f9a9-178e-4894-bbbf-c2726c5ee5c1,01.pdf,4,06\n地址：香港九龍旺角通菜街1A-1L 威達 Deli2商業大廈1字樓 B 舖 電話：＋...
4,7ef97d1e-2612-4073-b51e-07da9f4f6698,01.pdf,5,08\n地址：香港銅鑼灣謝斐道477-481號肇明大廈地下及1樓 A 室 電話：＋852-2...
...,...,...,...,...
83,327de8f1-dde2-413c-95ea-46657d115137,04.pdf,25,4746\n添好運點心專門店 \np.11\n寶華扎作 \np.34\n汝州街 \n（珠仔街...
84,47ae1904-9a4d-41d1-a726-be047b871b2a,04.pdf,26,4948\n 4948\nDoughnut \np.9\n福榮街 \np.31\n合益泰小...
85,5955aed7-927a-426b-95fb-77ed771b6bee,04.pdf,27,5150\n福榮街 \np.31\n公和荳品廠 \np.10\n1\n2\n3\n汝州街 \...
86,0167bad6-7384-4c2a-9135-78c7e315a8aa,04.pdf,28,香港旅遊發展局旅客服務\nHONG KONG TRAVEL BUDDY \n歡迎旅客掃描QR...


In [49]:
model = refresh_model(localdb)

query = "泰昌餅家的地址是哪裡？"

response = model(
    { "query": query},
    return_only_outputs=True
)
print(response['result'])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 I don't know


如何添加資料到VectorDB

In [58]:
def add_vector_store(store, directory):
    loader = PyPDFLoader(directory)

    doc = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    chunks = text_splitter.split_documents(doc)

    extension = FAISS.from_documents(chunks, embeddings)
    store.merge_from(extension)

In [59]:
add_vector_store(localdb, "docs/05.pdf")

In [60]:
show_vectorStore(localdb)

Unnamed: 0,chunk_id,doc_name,page_number,page_content
0,2aed02b6-0cbf-4013-8440-3cfa4277a67a,01.pdf,1,衝\n一\n波\n$$07&3JOEE$$07&3...
1,a4133326-b13e-42fe-a7a3-7dd6bf650dfd,01.pdf,2,網路報導專區\n下載PDF手冊\n閱讀\n動態電子書\n23　中　環\t 蘭桂坊Bar Ho...
2,8514b71e-1d5f-4c34-90d3-5bbd99fd06c6,01.pdf,3,04\n地址：香港九龍太子砵蘭街418號地下 電話：＋852-2392-9283\n營業時間...
3,1220f9a9-178e-4894-bbbf-c2726c5ee5c1,01.pdf,4,06\n地址：香港九龍旺角通菜街1A-1L 威達 Deli2商業大廈1字樓 B 舖 電話：＋...
4,7ef97d1e-2612-4073-b51e-07da9f4f6698,01.pdf,5,08\n地址：香港銅鑼灣謝斐道477-481號肇明大廈地下及1樓 A 室 電話：＋852-2...
...,...,...,...,...
83,327de8f1-dde2-413c-95ea-46657d115137,04.pdf,25,4746\n添好運點心專門店 \np.11\n寶華扎作 \np.34\n汝州街 \n（珠仔街...
84,47ae1904-9a4d-41d1-a726-be047b871b2a,04.pdf,26,4948\n 4948\nDoughnut \np.9\n福榮街 \np.31\n合益泰小...
85,5955aed7-927a-426b-95fb-77ed771b6bee,04.pdf,27,5150\n福榮街 \np.31\n公和荳品廠 \np.10\n1\n2\n3\n汝州街 \...
86,0167bad6-7384-4c2a-9135-78c7e315a8aa,04.pdf,28,香港旅遊發展局旅客服務\nHONG KONG TRAVEL BUDDY \n歡迎旅客掃描QR...


In [64]:
model = refresh_model(localdb)

query = "和昌飯店有什麼好吃的？ 它的地址和電話是什麼?"

response = model(
    { "query": query},
    return_only_outputs=True
)
print(response['result'])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 和昌飯店（Good Chop Restaurant）位於香港上環區域，提供著名的擔肉蠻類食物。它的地址為：1/F, 18-20 Wellington Street, Central, Hong Kong。電話號碼是：+852 2869 3888。
