<a href="https://colab.research.google.com/github/zhe0/prac/blob/main/simple_rag_by_gguf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

RAG chatbot example
* 偏小的語言模型
* 減少幻覺程度

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# 1. Define the Transformer autoencoder model
# --------------------------------------------
class TransformerEncoderLayer(nn.Module):
    """Post-norm Transformer encoder layer.

    Applies multi-head self-attention followed by a two-layer ReLU
    feed-forward network. Each sub-block is wrapped in dropout, a residual
    connection and a LayerNorm applied *after* the residual sum.

    The attention module is built with ``batch_first=True``, so inputs are
    laid out as ``(batch, seq_len, d_model)``.

    Args:
        d_model: Width of the token representation.
        nhead: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside attention and on both
            sub-block outputs.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout):
        super().__init__()
        # Attention sub-block.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        # Feed-forward sub-block: d_model -> dim_feedforward -> d_model.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        # One LayerNorm and one residual dropout per sub-block.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Encode ``src`` and return a tensor of the same shape.

        Args:
            src: Input sequence.
            src_mask: Optional attention mask forwarded as ``attn_mask``.
            src_key_padding_mask: Optional padding mask forwarded as
                ``key_padding_mask``.
        """
        # Self-attention; the attention weights (second item) are not needed.
        attended, _ = self.self_attn(
            src,
            src,
            src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )
        hidden = self.norm1(src + self.dropout1(attended))

        # Position-wise feed-forward network.
        expanded = self.activation(self.linear1(hidden))
        projected = self.linear2(self.dropout(expanded))
        return self.norm2(hidden + self.dropout2(projected))

class TransformerDecoderLayer(nn.Module):
    """Post-norm Transformer decoder layer.

    Runs three sub-blocks in order: self-attention over ``tgt``,
    cross-attention from ``tgt`` (queries) to ``memory`` (keys/values), and a
    two-layer ReLU feed-forward network. Each sub-block is followed by
    dropout, a residual connection and a LayerNorm.

    Both attention modules are built with ``batch_first=True``, so inputs are
    laid out as ``(batch, seq_len, d_model)``.

    Args:
        d_model: Width of the token representation.
        nhead: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside attention and on every
            sub-block output.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout):
        super().__init__()
        # Self-attention and encoder-decoder (cross) attention.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        # Feed-forward sub-block: d_model -> dim_feedforward -> d_model.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        # One LayerNorm and one residual dropout per sub-block.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Decode ``tgt`` against ``memory`` and return a tensor shaped like ``tgt``.

        Args:
            tgt: Decoder input sequence.
            memory: Encoder output attended to by the cross-attention block.
            tgt_mask: Optional ``attn_mask`` for the self-attention block.
            memory_mask: Optional ``attn_mask`` for the cross-attention block.
            tgt_key_padding_mask: Optional padding mask for ``tgt``.
            memory_key_padding_mask: Optional padding mask for ``memory``.
        """
        # 1) Self-attention over the target sequence.
        self_attended, _ = self.self_attn(
            tgt,
            tgt,
            tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask,
        )
        hidden = self.norm1(tgt + self.dropout1(self_attended))

        # 2) Cross-attention: queries from the target, keys/values from memory.
        cross_attended, _ = self.multihead_attn(
            hidden,
            memory,
            memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )
        hidden = self.norm2(hidden + self.dropout2(cross_attended))

        # 3) Position-wise feed-forward network.
        expanded = self.activation(self.linear1(hidden))
        projected = self.linear2(self.dropout(expanded))
        return self.norm3(hidden + self.dropout3(projected))


class TransformerAutoencoder(nn.Module):
    """Transformer encoder/decoder that reconstructs its own input.

    The input features are projected to ``d_model``, passed through a stack of
    encoder layers, then through a stack of decoder layers that attend to the
    encoder output, and finally projected back to ``feature_size``.

    Args:
        feature_size: Size of the last dimension of the input (and output).
        d_model: Internal embedding width.
        nhead: Number of attention heads in every layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward network.
        dropout: Dropout probability passed to every layer.
    """

    def __init__(self, feature_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super().__init__()
        self.embedding_encoder = nn.Linear(feature_size, d_model)  # input features -> d_model
        self.embedding_decoder = nn.Linear(d_model, feature_size)  # d_model -> reconstructed features

        # Encoder layers take a single tensor, so nn.Sequential can chain them.
        encoder_layers = [TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout) for _ in range(num_encoder_layers)]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder layers take (tgt, memory). nn.Sequential forwards exactly one
        # argument, so calling it with two raises TypeError; keep the layers in
        # a ModuleList and chain them by hand instead. The parameter names
        # ("decoder.<i>....") are the same as with nn.Sequential.
        decoder_layers = [TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout) for _ in range(num_decoder_layers)]
        self.decoder = nn.ModuleList(decoder_layers)

    def forward(self, src):
        """Return the reconstruction of ``src``.

        Args:
            src: Tensor whose last dimension is ``feature_size``; the training
                code in this notebook passes ``(batch, 1, feature_size)``.

        Returns:
            Tensor with the same shape as ``src``.
        """
        # Encoder: embed the features, then run the encoder stack.
        enc_src = self.embedding_encoder(src)
        memory = self.encoder(enc_src)

        # Decoder: the encoder output seeds the target stream, and every layer
        # cross-attends to the same, unmodified encoder output.
        dec_output = memory
        for layer in self.decoder:
            dec_output = layer(dec_output, memory)

        # Project back to the original feature dimension.
        return self.embedding_decoder(dec_output)



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m809.1/809.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.9/36.9 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.1/286.1 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [None]:
# 2. Load data from a CSV file and prepare the train/test split
# --------------------------------------------------------------
def load_and_prepare_data(csv_file='transaction_data.csv', feature_cols=None, test_size=100):
    """Load a CSV file, select feature columns and split off a trailing test set.

    The split is positional (no shuffling): the last ``test_size`` rows become
    the test set and everything before them the training set.

    Args:
        csv_file (str): Path to the CSV file.
        feature_cols (list | None): Names of the feature columns to use.
            ``None`` selects ``['V1', 'V2']``.
        test_size (int): Number of trailing rows to hold out for testing.
            ``0`` keeps every row for training; a value larger than the number
            of rows puts every row in the test set.

    Returns:
        tuple: ``(train_data, test_data, test_labels)``. The first two are
        ``float32`` tensors. ``test_labels`` is an ``int64`` tensor taken from
        the ``'Class'`` column for the test rows, or ``None`` when the CSV has
        no ``'Class'`` column.

    Raises:
        ValueError: If ``test_size`` is negative.
    """
    if test_size < 0:
        raise ValueError(f"test_size must be non-negative, got {test_size}")
    # Resolve the default here rather than in the signature so that no mutable
    # list object is shared between calls.
    if feature_cols is None:
        feature_cols = ['V1', 'V2']

    df = pd.read_csv(csv_file)

    X_tensor = torch.tensor(df[feature_cols].values, dtype=torch.float32)
    y_tensor = None  # stays None when the CSV carries no labels
    if 'Class' in df.columns:
        y_tensor = torch.tensor(df['Class'].values, dtype=torch.long)

    # Use an explicit split index: slicing with -test_size breaks for
    # test_size == 0 because x[:-0] is empty and x[-0:] is everything.
    split = max(len(X_tensor) - test_size, 0)
    train_data = X_tensor[:split]
    test_data = X_tensor[split:]
    test_labels = y_tensor[split:] if y_tensor is not None else None

    return train_data, test_data, test_labels


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


gemma-2b-it-q8_0.gguf:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 21 key-value pairs and 164 tensors from /root/.cache/huggingface/hub/models--lmstudio-ai--gemma-2b-it-GGUF/snapshots/a0b140bfb922a743f89dd0682a24a17516071ab9/gemma-2b-it-q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma
llama_model_loader: - kv   1:                               general.name str              = gemma-2b-it
llama_model_loader: - kv   2:                       gemma.context_length u32              = 8192
llama_model_loader: - kv   3:                          gemma.block_count u32              = 18
llama_model_loader: - kv   4:                     gemma.embedding_length u32              = 2048
llama_model_loader: - kv   5:                  gemma.feed_forward_length u32              = 16384
llama_model_loader: - kv   6:                 gemma.attention.he

'Sure, here is the introduction and translation of the question and answer:\n\n**Question:** 你有沒有聽過「FUBON」在台灣的說法嗎？\n\n**Answer:** 我沒有聽過「FUBON」在台灣的說法。'

In [None]:
# 3. Hyperparameters, model, optimizer and loss function
# -------------------------------------------------------
# --- data layout ---
feature_size = 2      # input feature dimension (V1, V2)
sequence_length = 1   # each row is treated as a length-1 sequence (rows are handled independently)

# --- model architecture ---
d_model = 64              # embedding dimension inside the Transformer
nhead = 2                 # number of attention heads
num_encoder_layers = 2    # encoder depth
num_decoder_layers = 2    # decoder depth
dim_feedforward = 128     # hidden width of the feed-forward networks
dropout = 0.1

# --- optimisation ---
learning_rate = 0.001
epochs = 30               # raised to give the model more training passes
batch_size = 32

# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = TransformerAutoencoder(
    feature_size,
    d_model,
    nhead,
    num_encoder_layers,
    num_decoder_layers,
    dim_feedforward,
    dropout,
)
model = model.to(device)

# Adam optimizer; mean squared error serves as the reconstruction loss.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss(reduction='mean')




Document(page_content='1 \n 金融業運用 人工智慧 (AI)之核心原則與相關推動 政策  \n一、前言  \n近來 AI1在金融服務 領域的應用日益增加， 為金融產業\n提供客戶服務 帶來效益，亦同時衍生一些新的風險問題 及監\n理挑戰。 為協助金融機構善用 AI科技優勢，並 能有效管理\n風險、確保公平、保護消費者權益、維護系統安全及實現永\n續發展，本會 依據行政院「 臺灣 AI行動計畫 2.0」政策規劃\n及「數位政策法制協調 專案會議」之推動策略 ，並參考全球\n主要國家監理機關及國際組織之相關指導原則， 及結合我國\n金融市場發展狀況及本會監理政策方向，擬定適合我國金融\n業的 6項AI應用核心原則，以 期引導金融業在兼顧消費者\n權益、金融市場秩序及社會責任下，積極投入科技創新，促\n進金融服務升級。  \n以下就AI之影響及國際組織或 主要國家對運用 AI之立\n場與規定、我國金融業運用 AI現況、訂定 AI原則及政策 之\n必要性、我國金融業運用 AI之6項核心原則 ，以及本會因\n應AI發展推動之配套政策等事項進行說明。  \n \n \n1 由於 AI 技術與日俱進，因此各國際組織或各國並未定義 AI，反而聚焦「 AI 系統」，本文交替\n使用 AI與AI系統。依據經濟合作暨發展組織 (OECD)對「AI 系統」之定義，係指一種以機器為\n基礎的系統，在給予設定之一組目的下，能透過產製輸出品 (例如預測、建議或決策 )來影響環境。\n它使用以機器或人類為基礎的數據與輸入品來 (1)感知真實或虛擬環境； (2)萃取這些感知，並透\n過自動分析 (例如，使用機器學習 )或人工分析，再轉化為模型；以及 (3)使用模型推理來形成結果\n的選項。 AI系統係設計以不同的自主程度來運作。  ', metadata={'source': './ai.pdf', 'page': 0})

In [None]:
# 4. Load the data and build the DataLoaders
# -------------------------------------------
def _reconstruction_loader(samples, *, shuffle):
    """Return ``(dataset, loader)`` where every sample is its own target."""
    # An autoencoder reconstructs its input, so inputs and targets are the same tensor.
    dataset = torch.utils.data.TensorDataset(samples, samples)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataset, loader


# Features V1 and V2 from transaction_data.csv; the last 100 rows form the test set.
train_data, test_data, test_labels = load_and_prepare_data(
    'transaction_data.csv',
    ['V1', 'V2'],
    100,
)

# Training batches are shuffled; test batches keep file order so they stay
# aligned with test_labels.
train_dataset, train_dataloader = _reconstruction_loader(train_data, shuffle=True)
test_dataset, test_dataloader = _reconstruction_loader(test_data, shuffle=False)




In [None]:
# 5. Train the model
# -------------------
history = {'train_loss': []}  # mean training loss per epoch

for epoch in range(epochs):
    model.train()  # enable dropout
    batch_losses = []

    for inputs, targets in train_dataloader:
        # Insert a sequence axis -> (batch_size, seq_len=1, feature_size) and
        # move the batch to the selected device.
        inputs = inputs.unsqueeze(1).to(device)
        targets = targets.unsqueeze(1).to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)  # reconstruction error
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Average of the per-batch mean losses.
    train_loss = sum(batch_losses)
    avg_train_loss = train_loss / len(train_dataloader)
    history['train_loss'].append(avg_train_loss)
    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}")




In [None]:
# 6. Anomaly detection and evaluation on the test set
# ----------------------------------------------------
# Samples whose reconstruction error lies above this percentile of the test-set
# errors are flagged as anomalies (tunable).
ANOMALY_PERCENTILE = 85

model.eval()  # disable dropout
reconstruction_errors = []  # one reconstruction error (MSE) per test sample

with torch.no_grad():  # no gradients needed for evaluation
    for inputs, targets in test_dataloader:
        # Insert a sequence axis -> (batch, 1, feature_size) and move to the device.
        inputs = inputs.unsqueeze(1).to(device)
        targets = targets.unsqueeze(1).to(device)
        outputs = model(inputs)

        # `criterion` uses reduction='mean' and returns a single scalar per
        # batch, which can neither be iterated nor attributed to samples.
        # Compute the squared error per sample instead, averaged over all
        # non-batch dimensions.
        per_sample_mse = (outputs - targets).pow(2).flatten(start_dim=1).mean(dim=1)
        reconstruction_errors.extend(per_sample_mse.cpu().numpy())

reconstruction_errors_samples = np.asarray(reconstruction_errors, dtype=np.float32)

# Derive the threshold once from the complete error distribution so that the
# decision for a sample does not depend on which batch it happened to be in.
if reconstruction_errors_samples.size:
    threshold = np.percentile(reconstruction_errors_samples, ANOMALY_PERCENTILE)
else:
    threshold = float('nan')  # empty test set: nothing to flag

# 1 = anomaly (error above the threshold), 0 = normal.
predicted_anomalies_samples = (reconstruction_errors_samples > threshold).astype(int)
predicted_anomalies = predicted_anomalies_samples.tolist()

# The test DataLoader is not shuffled, so labels line up with the errors by position.
true_labels_list = [] if test_labels is None else test_labels.cpu().numpy().tolist()

# Evaluate only when the test set carries ground-truth labels.
if test_labels is not None and len(true_labels_list) > 0:
    print("\n--- 評估結果 (基於測試集真實標籤) ---")
    print("Confusion Matrix:\n", confusion_matrix(true_labels_list, predicted_anomalies_samples))
    print("\nClassification Report:\n", classification_report(true_labels_list, predicted_anomalies_samples))






HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
), model_name='amu/tao-8k', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [None]:
# 7. Plot the training-loss curve and the reconstruction-error distribution (optional)
# --------------------------------------------------------------------------------------
fig, (loss_ax, error_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: mean training loss per epoch.
loss_ax.plot(history['train_loss'])
loss_ax.set_title('Training Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')

# Right panel: histogram of test reconstruction errors with the anomaly
# threshold drawn as a dashed red line.
error_ax.hist(reconstruction_errors_samples, bins=50)
error_ax.axvline(x=threshold, color='r', linestyle='--', label=f'Threshold ({threshold:.2f})')
error_ax.set_title('Test Reconstruction Error Distribution')
error_ax.set_xlabel('Reconstruction Error')
error_ax.set_ylabel('Frequency')
error_ax.legend()

fig.tight_layout()
plt.show()


for message in (
    "\n--- 程式碼執行完畢 ---",
    "請查看訓練 Loss 曲線、測試集重建誤差分佈、以及評估結果 (如果測試集有真實標籤)。",
    "異常判斷閾值設定為重建誤差的 85 百分位數，您可以調整這個閾值來改變異常偵測的靈敏度。",
):
    print(message)

共 476 頁
最後一頁內容:  page_content='當之教育及培訓，使員工能適應 AI帶來之變革，並盡\n可能維護其 工作權益。' metadata={'source': './ai.pdf', 'page': 25}
全部內容:  現象，保護自然環境，從而促進包容性成長、永續發
展及社會福祉。 (二)金融機構在 AI系統運用過程中， 宜對一般員工 提供適 當之教育及培訓，使員工能適應 AI帶來之變革，並盡
可能維護其 工作權益。


(1,
 '。 (二)金融機構 使用 AI與消費者直接互動 時，應適當揭露 。 \n六、促進永續發展 (一)金融機構在運用 AI系統時，應確保其發展策略及執行 與永續發展之原則相結合 ，包括減少經濟、社會等不平 等現象，保護自然環境，從而促進包容性成長、永續發\n展及社會福祉。 (二)金融機構在 AI系統運用過程中， 宜對一般員工 提供適 當之教育及培訓，使員工能適應 AI帶來之變革，並盡\n可能維護其 工作權益。')

In [None]:
%%time
# Build a Chroma vector store from `docs`, embedding them with `embeddings`;
# both names come from cells outside this view. `persist_directory="db"` names
# the on-disk location handed to Chroma.
# NOTE(review): the recorded output of this cell shows ~12 min wall time, so
# avoid re-running it unnecessarily.
from langchain.vectorstores import Chroma
db = Chroma.from_documents(docs, embeddings, persist_directory="db")
db  # last expression of the cell: displays the store's repr

CPU times: user 6min 29s, sys: 5min 7s, total: 11min 36s
Wall time: 11min 49s


<langchain_community.vectorstores.chroma.Chroma at 0x7bccd6c3fd90>

In [None]:
# from langchain_community.vectorstores import FAISS
# results = db.similarity_search("", k=3)
# for i in range(len(results)):
#   print(results[i])
#   print('='*10)
# # results[-1]

# retriever = db.as_retriever(search_type="mmr")
# docs = retriever.get_relevant_documents("what about SupTech")
# docs

In [None]:
from langchain import PromptTemplate
from langchain.chains.retrieval_qa.base import RetrievalQA

# Prompt for the document-combining step of the QA chain. The chain fills
# {question} with the user query and {context} with the retrieved chunks.
# NOTE(review): the trailing "temperature=0.0" line is plain prompt text; it
# does not configure sampling on `llm`.
template = """
role: 你是台灣的金融監督管理委員會之「窗口」，並且說著正體中文
question: {question}
answer: 回覆之前請檢視自己的答案，不可以捏造回答。並且詳盡可能回答你的客戶
context: {context}
temperature=0.0
"""


qa_prompt = PromptTemplate(
    input_variables=["question", "context"],
    template=template,
)


# Retrieve the 2 most similar chunks from `db` and keep them in the result so
# the answer can be traced back to its sources. `llm` and `db` come from
# earlier cells.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(search_kwargs={'k': 2}),
    chain_type_kwargs={"prompt": qa_prompt},
    return_source_documents=True,
)

# RetrievalQA takes the user question under the "query" key and fetches the
# context itself through the retriever, so no "context" value is passed in.
# The answer is under result["result"], the chunks under result["source_documents"].
myquery = "what about SupTech"
result = qa_chain.invoke({"query": myquery})


print(qa_chain)


# NOTE(review): `prompt` is not defined in this cell — presumably it comes from
# an earlier cell; confirm before running.
res=llm.invoke(prompt)
print(res)

NameError: name 'myquery' is not defined

**您好！我是台灣金融監督管理委員會的「窗口」，我專門為您解答有關監理科技 (SupTech) 的相關問題。**

**SuperTech 的定義是什麼？**

SuperTech 是指利用數碼技術和網路技術來建立和營運金融產品和服務的領域。

**SuperTech 的監理範圍是什麼？**

SuperTech 的監理範圍涵蓋但不限於：

* 產品設計與開發
* 交易處理
* 投資管理
* 客戶服務
* 安全與合規性

**SuperTech 的監理義務是什麼？**

SuperTech 的監理義務包括：

* 定期監控產品和服務的設計與開發過程
* 定期監控交易處理的過程
* 定期監控投資管理的過程
* 定期監控客戶服務的過程
* 定期監控安全與合規性的狀況
* 採取適當的措施來確保產品和服務安全與合規

**如何遵守 SuperTech 的監理義務？**

SuperTech 的監理義務可以透過以下方式實現：

* 確保產品和服務的設計與開發過程符合安全與合規標準
* 建立嚴格的交易處理流程
* 建立完善的投資管理制度
* 建立嚴格的客戶服務系統
* 建立完善的安全與合規性管理系統

**如何與 SuperTech 的監理相關聯繫？**

您可以透過以下方式與 SuperTech 的監理相關聯繫：

* 金管會網站上的資訊中心
* 金管會辦理的監理報告
* 金管會辦理的公開會議


## Test Library Setup

Next, let's create our test "library."

For simplicity's sake, let's say that our "library" is simply a **nested directory of `.epub` files**. We can easily see this solution generalizing to, say, a Calibre library with a `metadata.db` database file. We'll leave that extension as an exercise for the reader. 😇

Let's pull two `.epub` files from [Project Gutenberg](https://www.gutenberg.org/) for our library.

In [None]:
# Create one directory per author under .test/library, then download one
# image-free .epub from Project Gutenberg into each.
!mkdir -p ".test/library/jane-austen"
!mkdir -p ".test/library/victor-hugo"
!wget https://www.gutenberg.org/ebooks/1342.epub.noimages -O ".test/library/jane-austen/pride-and-prejudice.epub"
!wget https://www.gutenberg.org/ebooks/135.epub.noimages -O ".test/library/victor-hugo/les-miserables.epub"

## RAG with LlamaIndex

RAG with LlamaIndex, at its core, consists of the following broad phases:

1. **Loading**, in which you tell LlamaIndex where your data lives and how to
   load it;
2. **Indexing**, in which you augment your loaded data to facilitate querying, e.g. with vector embeddings;
3. **Querying**, in which you configure an LLM to act as the query interface for
   your indexed data.

This explanation only scratches the surface of what's possible with
LlamaIndex. For more in-depth details, I highly recommend reading the
["High-Level Concepts" page of the LlamaIndex
documentation](https://docs.llamaindex.ai/en/stable/getting_started/concepts.html).

### Loading

Naturally, let's start with the **loading** phase.

I mentioned before that LlamaIndex is designed specifically for RAG. This
immediately becomes obvious from its
[`SimpleDirectoryReader`](https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader.html)
construct, which ✨ **magically** ✨ supports a whole host of multi-modal file
types for free. Conveniently for us, `.epub` is in the supported set.

In [None]:
from llama_index.core import SimpleDirectoryReader

# Read the test library created under ./.test/. `recursive=True` is needed
# because the books sit in nested per-author directories, and `required_exts`
# restricts loading to .epub files.
loader = SimpleDirectoryReader(
    input_dir="./.test/",
    recursive=True,
    required_exts=[".epub"],
)

# Load the matching files as LlamaIndex documents.
documents = loader.load_data()

`SimpleDirectoryReader.load_data()` converts our ebooks into a set of [`Document`s](https://docs.llamaindex.ai/en/stable/api/llama_index.core.schema.Document.html) for LlamaIndex to work with.

One important thing to note here is that the documents **have not been chunked at this stage** -- that will happen during indexing. Read on...

### Indexing

Next up after **loading** the data is to **index** it. This will allow our RAG pipeline to look up the relevant context for our query to pass to our LLM to **augment** its generated response. This is also where document chunking will take place.

[`VectorStoreIndex`](https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index.html)
is a "default" entrypoint for indexing in LlamaIndex. By default,
`VectorStoreIndex` uses a simple, in-memory dictionary to store the indices, but
LlamaIndex also supports [a wide variety of vector storage
solutions](https://docs.llamaindex.ai/en/stable/module_guides/storing/vector_stores.html)
for you to graduate to as you scale.

<Tip>
By default, LlamaIndex uses a chunk size of 1024 and a chunk overlap of
20. For more details, see the [LlamaIndex
documentation](https://docs.llamaindex.ai/en/stable/optimizing/basic_strategies/basic_strategies.html#chunk-sizes).
</Tip>


As mentioned before, we'll use the
[`BAAI/bge-small-en-v1.5`](https://huggingface.co/BAAI/bge-small-en-v1.5) model to
generate our embeddings. By default, [LlamaIndex uses
OpenAI](https://docs.llamaindex.ai/en/stable/getting_started/starter_example.html)
(specifically `gpt-3.5-turbo`), which we'd like to avoid given our desire for a lightweight, locally-runnable end-to-end solution.

Thankfully, LlamaIndex supports retrieving embedding models from Hugging Face through the convenient `HuggingFaceEmbedding` class, so we'll use that here.

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model fetched from Hugging Face by name, used instead of the
# OpenAI default.
embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

We'll pass that in to `VectorStoreIndex` as our embedding model to circumvent the OpenAI default behavior.

In [None]:
from llama_index.core import VectorStoreIndex

# Build the vector index over the loaded documents, passing our own embedding
# model explicitly so the OpenAI default is not used.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embedding_model,
)

### Querying

Now for the final piece of the RAG puzzle -- wiring up the query layer.

We'll use Llama 2 for the purposes of this recipe, but I encourage readers to play around with different models to see which produces the "best" responses here.

First let's start up the Ollama server. Unfortunately, there is no support in the [Ollama Python client](https://github.com/ollama/ollama-python) for actually starting and stopping the server itself, so we'll have to pop out of Python land for this.

In a separate terminal, run: `ollama serve`. Remember to terminate this after we're done here!

Now let's hook Llama 2 up to LlamaIndex and use it as the basis of our query engine.

In [None]:
from llama_index.llms.ollama import Ollama

# Client for the "llama2" model served by Ollama; `ollama serve` must already
# be running in a separate terminal.
# NOTE(review): request_timeout is presumably in seconds — confirm against the
# Ollama integration docs.
llama = Ollama(
    model="llama2",
    request_timeout=40.0,
)

# Query interface over the index that answers with the Ollama-served model.
query_engine = index.as_query_engine(llm=llama)

## Final Result

With that, our basic RAG librarian is set up and we can start asking questions about our library. For example:

In [None]:
# Ask for the library's contents and for the context used to answer.
print(query_engine.query("What are the titles of all the books available? Show me the context used to derive your answer."))

Based on the context provided, there are two books available:

1. "Pride and Prejudice" by Jane Austen
2. "Les Misérables" by Victor Hugo

The context used to derive this answer includes:

* The file path for each book, which provides information about the location of the book files on the computer.
* The titles of the books, which are mentioned in the context as being available for reading.
* A list of words associated with each book, such as "epub" and "notebooks", which provide additional information about the format and storage location of each book.


In [None]:
# Ask a question whose answer has to come from the text of one of the books.
print(query_engine.query("Who is the main character of 'Pride and Prejudice'?"))

The main character of 'Pride and Prejudice' is Elizabeth Bennet.


## Conclusion and Future Improvements

We've demonstrated how to build a basic RAG-based "librarian" that runs entirely locally, even on Apple silicon Macs. In doing so, we've also carried out a "grand tour" of LlamaIndex and how it streamlines the process of setting up RAG-based applications.

That said, we've really only scratched the surface of what's possible here. Here are some ideas of how to refine and build upon this foundation.

### Forcing Citations

To guard against the risk of our librarian hallucinating, how might we require that it provide citations for everything that it says?

### Using Extended Metadata

Ebook library management solutions like [Calibre](https://calibre-ebook.com/) create additional metadata for ebooks in a library. This can provide information such as publisher or edition that might not be readily available in the text of the book itself. How could we extend our RAG pipeline to account for additional sources of information that aren't `.epub` files?

### Efficient Indexing

If we were to collect everything we built here into a script/executable, the resulting script would re-index our library on each invocation. For our tiny test library of two files, this is "fine," but for any library of non-trivial size this will very quickly become annoying for users. How could we persist the embedding indices and only update them when the contents of the library have meaningfully changed, e.g. new books have been added?