In [None]:
%pip install -qU  langchain langchain_community langchain_nvidia_ai_endpoints langchain_milvus pymupdf ragas

## 載入Embedding

In [None]:
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
# Embedding model shared by the whole notebook: it is handed to the Milvus
# vector store and, through the retriever, reused by the RAGAS evaluation.
# NOTE(review): presumably the NVIDIA client needs credentials or a reachable
# endpoint configured outside this notebook -- confirm before running.
embeddings = NVIDIAEmbeddings(model="nemollm-embedding")

# Alternative (disabled): a HuggingFace embedding model instead of the NVIDIA one.
# from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_community.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-zh-v1.5")

## 創建Milvus的資料庫
若已有則無須創建

In [None]:
from langchain_milvus import Milvus

# from langchain_community.vectorstores import Milvus

# The easiest way is to use Milvus Lite where everything is stored in a local file.
# If you have a Milvus server you can use the server URI such as "http://localhost:19530".
# URI = "./milvus_example.db"
# URI is also reused later by the pymilvus connection that counts stored entities.
URI = "http://localhost:19530"

# Vector store used for ingestion, search and retrieval throughout the notebook.
# It embeds with `embeddings`, connects to `URI`, and requests a FLAT index with
# the L2 metric. auto_id=True means callers do not pass ids when adding documents.
vector_store = Milvus(
    embedding_function=embeddings,
    connection_args={"uri": URI},
    index_params={"index_type": "FLAT", "metric_type": "L2"},
    auto_id=True,
)

## 定義Text Splitter

In [None]:
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# Settings passed positionally to get_text_splitter() when the splitter is built.
# NOTE(review): this model name differs from the NVIDIA embedding model used for
# the vector store -- confirm the two are meant to differ.
EMBEDDING_MODEL = "BAAI/bge-large-zh-v1.5"
CHUNK_SIZE = 100  # forwarded as tokens_per_chunk
CHUNK_OVERLAP = 20  # forwarded as chunk_overlap


def get_text_splitter(
    embedding_model_name="BAAI/bge-large-zh-v1.5", chunk_size=506, chunk_overlap=200
) -> SentenceTransformersTokenTextSplitter:
    """Build a LangChain token-based text splitter.

    Args:
        embedding_model_name: Forwarded to the splitter as ``model_name``.
        chunk_size: Forwarded to the splitter as ``tokens_per_chunk``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        SentenceTransformersTokenTextSplitter: The configured splitter instance.
    """
    # Collect the keyword arguments first so the mapping from this function's
    # parameter names to the splitter's own names is visible in one place.
    splitter_options = dict(
        model_name=embedding_model_name,
        tokens_per_chunk=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return SentenceTransformersTokenTextSplitter(**splitter_options)

# Splitter instance shared by the PDF and plain-text ingestion cells.
text_splitter = get_text_splitter(EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP)

## 資料存取一: PDF
### 讀取資料夾內所有的PDF

In [None]:
import os
from langchain_community.document_loaders import PyMuPDFLoader

def find_pdfs(folder_path):
    """Recursively collect the paths of all PDF files under ``folder_path``.

    Args:
        folder_path: Root directory to search; sub-directories are included.

    Returns:
        list[str]: One path per file whose name ends with ".pdf"
        (case-insensitive), built by joining the containing directory with the
        file name. The list is sorted so repeated runs process the files in the
        same order. It is empty when nothing matches.
    """
    pdf_files = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(folder_path)
        for file in files
        if file.lower().endswith(".pdf")
    ]
    # os.walk yields entries in a filesystem-dependent order; sorting makes the
    # ingestion order reproducible.
    pdf_files.sort()
    return pdf_files

# Root folder scanned for PDFs.
# NOTE(review): hard-coded absolute path -- adjust it to your environment.
folder_path = "/workspace/pdf/"

# Paths of every PDF found under folder_path, including sub-folders.
pdf_paths = find_pdfs(folder_path)

### 分割PDF的內容後，存入Milvus資料庫中

In [None]:
for pdf_path in pdf_paths:
    # Load the PDF into LangChain documents.
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()

    # Cut the documents into token-bounded chunks.
    texts = text_splitter.split_documents(docs)

    # A PDF can produce no chunks at all (for instance when no text could be
    # extracted). Skip it explicitly instead of sending an empty batch to the
    # embedding service and the vector store.
    if not texts:
        print(f"Skipping '{pdf_path}': no text chunks were produced.")
        continue

    vector_store.add_documents(documents=texts)

## 資料存取二: 純文字

In [None]:
# Sample plain text to ingest. The backslash right after the opening quotes
# keeps the string from starting with a newline.
text = """\
今天天氣真好"""

### 分割純文字的內容後，存入Milvus資料庫中

In [None]:
# Split the raw text into documents and attach one metadata record with empty
# placeholder values, then store the result in Milvus.
# NOTE(review): presumably these keys mirror the metadata of the PDF documents
# so both kinds of entry share one collection schema -- confirm.
documents = text_splitter.create_documents(
    [text],
    metadatas=[
        {
            'source': '',
            'file_path': '',
            'page': 0,
            'total_pages': 0,
            'format': '',
            'title': '',
            'author': '',
            'subject': '',
            'keywords': '',
            'creator': '',
            'producer': '',
            'creationDate': '',
            'modDate': '',
            'trapped': '',
        }
    ],
)
vector_store.add_documents(documents=documents)

## 查詢Milvus資料庫內有多少個Chunks?

In [None]:
from pymilvus import Collection, connections

# Connect to Milvus through pymilvus, using the same URI as the vector store.
connections.connect(uri=URI)
# connections.connect("LangChainCollection", host="localhost", port="19530")

# Open the collection that the LangChain vector store writes to.
collection_name = vector_store.collection_name
collection = Collection(collection_name)

# num_entities is not a real-time value: rows inserted just before this cell
# may not be counted until they are flushed, so flush first.
collection.flush()

# Report the total number of stored entities (chunks).
print(f"Collection '{collection.name}' has {collection.num_entities} entities.")

## 刪除Milvus資料庫內的文件

In [None]:
# Delete all entries.
# WARNING: this cell runs before the search, RAG and evaluation cells, so on a
# top-to-bottom run they will find nothing unless data is ingested again.
# NOTE(review): "pk >= 0" is presumably meant to match every auto-generated
# primary key -- confirm.
vector_store.delete(expr="pk >= 0")

# Delete by metadata instead (disabled).
# filename = "tweet"
# vector_store.delete(expr=f"source == '{filename}'")

## 搜尋資料庫中的文件
### 方法一: vector_store.similarity_search()

In [None]:
# Number of chunks to return per query; the retriever cell reuses this value.
K = 5

# Similarity search over the whole collection, without a filter.
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=K,
)

# Variant restricted by a metadata filter expression (disabled).
# results = vector_store.similarity_search(
#     "LangChain provides abstractions to make working with LLMs easy",
#     k=K,
#     expr='source == "tweet"',
# )

# Print each hit's text followed by its metadata.
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

### 方法二: retriever.invoke()

In [None]:
# Retriever view of the vector store returning K chunks per query; the RAG
# chain and the evaluation cells reuse it.
retriever = vector_store.as_retriever(
    search_kwargs={"k": K}
)

# Variant restricted by a metadata filter expression (disabled).
# retriever = vector_store.as_retriever(
#     search_kwargs={"k": K, "expr": 'source == "tweet"'}
# )

# Same query as the similarity_search example, issued through the retriever.
retrieved_docs = retriever.invoke(
    "LangChain provides abstractions to make working with LLMs easy"
)

# Print each hit's text followed by its metadata.
for res in retrieved_docs:
    print(f"* {res.page_content} [{res.metadata}]")

## 建立RAG的流程

In [None]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_nvidia_ai_endpoints import ChatNVIDIA

def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string.

    Returns:
        str: Every ``page_content`` in input order, separated by a blank line.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
    

# Chat model reached at base_url; the RAGAS evaluation reuses it as the judge.
# NOTE(review): "0.0.0.0" as a client-side address is not portable everywhere;
# presumably "localhost" is meant, as in the Milvus URI -- confirm.
llm = ChatNVIDIA(
    base_url="http://0.0.0.0:8000/v1",
    model="meta/llama3-8b-instruct",
    temperature=0.1,
    max_tokens=1000,
    top_p=1.0,
)

# Prompt pulled from the LangChain hub by name.
# NOTE(review): presumably this needs network access -- confirm.
prompt = hub.pull("rlm/rag-prompt")

# RAG pipeline. The input question feeds two prompt variables: "context" is the
# retriever's hits joined by format_docs, and "question" is the input passed
# through unchanged. The filled prompt goes to the LLM, then to StrOutputParser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Try the chain with an English question; the answer is shown as the cell output.
rag_chain.invoke("What is Task Decomposition?")

In [None]:
# Ask a question in Chinese ("Which department manages the probe cards?").
question = "請問探針卡是哪個處室管理的?"
rag_chain.invoke(question)

## 評估RAG準確度

In [None]:
# Questions used to evaluate the RAG pipeline; each one is run through the
# retriever and the chain when the evaluation records are built.
sample_queries = [
    "Who introduced the theory of relativity?",
    "Who was the first computer programmer?",
    "What did Isaac Newton contribute to science?",
    "Who won two Nobel Prizes for research on radioactivity?",
    "What is the theory of evolution by natural selection?",
]

In [None]:
# Build one evaluation record per query: the question, the text of the chunks
# the retriever returned for it, and the answer produced by the RAG chain.
dataset = []

for query in sample_queries:
    relevant_docs = retriever.invoke(query)
    response = rag_chain.invoke(query)
    contexts = [page.page_content for page in relevant_docs]
    record = dict(
        user_input=query,
        retrieved_contexts=contexts,
        response=response,
    )
    dataset.append(record)

## 使用RAGAS來協助評估RAG的準確度

In [None]:
from ragas import EvaluationDataset
# Convert the list of record dicts into a RAGAS EvaluationDataset.
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [None]:
from ragas import evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
# Faithfulness is imported but not in the metric list below; add Faithfulness()
# to `metrics` to score it as well.
from ragas.metrics import AnswerRelevancy, ContextUtilization, Faithfulness

# Wrap both LangChain objects in their RAGAS adapters so the judge LLM and the
# embeddings reach evaluate() in the same explicit form. The embeddings are the
# ones the vector store behind the retriever was built with.
evaluator_llm = LangchainLLMWrapper(llm)
evaluator_embeddings = LangchainEmbeddingsWrapper(retriever.vectorstore.embeddings)

# Score every record of the evaluation dataset with the selected metrics.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[AnswerRelevancy(), ContextUtilization()],
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
)
# Last expression of the cell: display the evaluation result.
result