# 超簡易RAG實作

## 0. 前置作業

In [28]:
import os
import shutil
from langchain_community.document_loaders import PyPDFDirectoryLoader # 讀取pdf檔案並轉換成文字
from langchain_text_splitters import RecursiveCharacterTextSplitter   # 將長文字分割成句子
from langchain_chroma import Chroma                                   # 將向量儲存在向量資料庫
from langchain_core.documents import Document                         # 文件格式
from langchain_core.prompts import ChatPromptTemplate                 # 對話模板
from langchain_community.llms.ollama import Ollama


In [3]:
# Paths: directory where the Chroma vector store is persisted, and directory holding the source PDFs
CHROMA_PATH = "chroma"
DATA_PATH = "data"

## 1. 將資料儲存進向量資料庫
![](./static/PreProduction.excalidraw.svg)

### ❗刪除資料庫 (非必要)

In [13]:
# Ask for confirmation before wiping the persisted Chroma directory.
# `str.lower` has to be *called*: comparing the bound method itself to "y" is
# always False, which meant the directory could never be removed.
# Surrounding whitespace is stripped so an answer like "y " is still accepted.
if input("❗Do you want to remove the chroma directory? (y/n): ").strip().lower() == "y":
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    else:
        print("Chroma directory does not exist.")

### 導入文件

In [5]:
# Read the PDF files under DATA_PATH and convert them to text documents.
documents = PyPDFDirectoryLoader(DATA_PATH).load()

### 將文件分塊

In [6]:
# Configure the splitter: chunk size 800 and overlap 80, both measured with
# len(); separators are treated as literal strings rather than regexes.
chunker = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=80,
    length_function=len,
    is_separator_regex=False,
)
# Break the loaded documents into chunks.
chunks = chunker.split_documents(documents)

### 定義一個Embedding模型

In [5]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
def get_embedding_function(model="nomic-embed-text"):
    """Build the Ollama embedding model handed to the Chroma vector store.

    Args:
        model: Name of the Ollama embedding model to use. Defaults to
            "nomic-embed-text"; "mxbai-embed-large" is an alternative.

    Returns:
        An ``OllamaEmbeddings`` instance configured for ``model``.
    """
    return OllamaEmbeddings(model=model)

### 儲存資料到ChromaDB

In [8]:
# Open the Chroma store persisted at CHROMA_PATH, using the embedding helper.
db = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=get_embedding_function(),
)

為每個分塊計算 ID，避免之後查看時無法找到對應的來源

ID格式如下-> 哪個文件 : 第幾頁 : 第幾塊

In [9]:
def calculate_chunk_ids(chunks):
    """Assign a unique ``id`` to every chunk's metadata, in place.

    IDs have the form ``"<source>:<page>:<index>"``, for example
    ``"data/monopoly.pdf:6:2"``, where ``index`` counts the chunks seen so far
    for that source/page pair, starting at 0.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict that may
            contain ``"source"`` and ``"page"`` keys (missing keys are
            rendered as ``None`` in the ID).

    Returns:
        The same ``chunks`` object, after ``metadata["id"]`` has been set on
        each element.
    """
    # Track the next free index per page instead of only comparing with the
    # previous chunk, so IDs stay unique even when chunks from the same page
    # are not adjacent in the input.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the ID alongside the rest of the chunk's metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [10]:
# Tag every chunk with its "source:page:index" ID (stored under metadata["id"]).
chunks_with_ids = calculate_chunk_ids(chunks)

顯示現有的向量資料庫內容個數

In [11]:
# Look up what is already stored so the next step can skip chunks that were added before.
existing_items = db.get(include=[])  # IDs are always included by default
existing_ids = set(existing_items["ids"])
print(f"Number of existing documents in DB: {len(existing_ids)}")

Number of existing documents in DB: 0


只新增從未出現過的文件，有新增過的文件不再新增

In [12]:
# Keep only the chunks whose ID is not already stored in the database.
new_chunks = [
    candidate
    for candidate in chunks_with_ids
    if candidate.metadata["id"] not in existing_ids
]

if new_chunks:
    print(f"👉 Adding new documents: {len(new_chunks)}")
    # Pass explicit IDs so each stored entry is keyed by its chunk ID.
    new_chunk_ids = [candidate.metadata["id"] for candidate in new_chunks]
    db.add_documents(new_chunks, ids=new_chunk_ids)
    print("✅ Done")
else:
    print("✅ No new documents to add")

👉 Adding new documents: 41


## 2. 尋找向量資料庫內容
![](./static/InProduction1.excalidraw.svg)

### 載入資料庫

In [18]:
# Reopen the persisted Chroma store for querying.
db = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=get_embedding_function(),
)

### 搜尋資料庫

In [19]:
# Enter the question to ask
user_query = "How much total money does a player start with in Monopoly? (Answer with the number only)"

In [22]:
# Search the vector store for the chunks closest to the query and list their scores and metadata.
# NOTE(review): the recorded output prints scores in ascending order, so the value labelled
# "SIM" looks like a distance (lower = closer) rather than a similarity — confirm against the Chroma docs.
results = db.similarity_search_with_score(user_query, k=5) # k=5 keeps the five most similar results (adjust as needed)
for result,score in results:
    print(f"SIM:{score:.3f}\t metadata: {result.metadata}")
    # print(result.page_content) # uncomment this line to also see the original text

SIM:227.124	 metadata: {'id': 'data\\monopoly.pdf:0:0', 'page': 0, 'source': 'data\\monopoly.pdf'}
SIM:249.217	 metadata: {'id': 'data\\monopoly.pdf:0:1', 'page': 0, 'source': 'data\\monopoly.pdf'}
SIM:261.994	 metadata: {'id': 'data\\monopoly.pdf:2:0', 'page': 2, 'source': 'data\\monopoly.pdf'}
SIM:265.650	 metadata: {'id': 'data\\monopoly.pdf:1:2', 'page': 1, 'source': 'data\\monopoly.pdf'}
SIM:268.354	 metadata: {'id': 'data\\monopoly.pdf:2:1', 'page': 2, 'source': 'data\\monopoly.pdf'}


## 3. 讓LLM根據搜尋結果回答問題
![](./static/InProduction2.excalidraw.svg)

#### 建立一個提示詞模板

In [26]:
# Prompt skeleton with two placeholders: {context} for the retrieved text and {question} for the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

#### 將找到的答案套入模板

In [27]:
# Join the text of the retrieved chunks, separated by "---" rules, to form the context.
context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)

# Fill the template with that context and the user's question.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=user_query)

# Bare expression so the notebook displays the final prompt.
prompt

'Human: \nAnswer the question based only on the following context:\n\nMONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ \n2 to 8 Players \nContents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance \nand Community Chest cards, Title Deed cards, play money and a Banker\'s tray. \nNow there\'s a faster way to play MONOPOLY. Choose to play by \nthe classic rules for buying, renting and selling properties or use the \nSpeed Die to get into the action faster. If you\'ve never played the classic \nMONOPOLY game, refer to the Classic Rules beginning on the next page. \nIf you already know how to play and want to use the Speed Die, just \nread the section below for the additional Speed Die rules. \nSPEED DIE RULES \nLearnins how to Play with the S~eed Die IS as \n/ \nfast as playing with i\'t. \n1. When starting the game, hand out an extra $1,000 to each player\n\n---\n\n1. When starting the game, hand out an extra $1,000 to each player \n(two $5005 should work). The game

#### 建立大語言模型做回覆

In [29]:
# Send the assembled prompt to the llama3.1 model through Ollama and keep its reply.
model = Ollama(model="llama3.1")
response_text = model.invoke(prompt)

In [31]:
# Collect the ID of every retrieved chunk (None when a chunk has no ID) to cite as sources.
sources = [doc.metadata.get("id") for doc, _score in results]

# Show the model's answer together with the chunks it was grounded on.
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response: 1500
Sources: ['data\\monopoly.pdf:0:0', 'data\\monopoly.pdf:0:1', 'data\\monopoly.pdf:2:0', 'data\\monopoly.pdf:1:2', 'data\\monopoly.pdf:2:1']
