In [None]:

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.schema import Document
from langchain.embeddings import AzureOpenAIEmbeddings
from dotenv import load_dotenv,find_dotenv
import logging
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import FastEmbedEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import AzureChatOpenAI
from datasets import Dataset

In [None]:
def readPDF(source_path):
    """Load a PDF through ``PyPDFLoader`` and return the result of its ``load()`` call.

    Args:
        source_path: Location of the PDF file handed to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(source_path).load()`` produces, or ``None`` when
        any exception is raised (the exception is logged, not re-raised).
    """
    try:
        pdf_loader = PyPDFLoader(source_path)
        loaded_pages = pdf_loader.load()
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None.
        logging.error(f'Error readPDF(): {e}')
        return None
    return loaded_pages


In [None]:
def readMSWord(source_path, one_page_size=300):
    """Load a Word document and cut it into fixed-size pseudo-pages.

    The text of every document returned by ``UnstructuredWordDocumentLoader`` is
    split on single spaces and regrouped into ``Document`` objects holding at most
    ``one_page_size`` words each. This custom paging exists because, per the
    original author's note, the Word loader does not return per-page objects the
    way the PDF loader does.

    Args:
        source_path: Location of the Word file handed to the loader.
        one_page_size: Maximum number of space-separated words per generated
            page. Must be a positive integer; defaults to 300.

    Returns:
        A list of ``Document`` objects whose metadata carries the original
        ``source`` and a zero-based ``page`` index (restarting for each loaded
        document), or ``None`` when any exception is raised (it is logged).
    """
    try:
        if one_page_size < 1:
            raise ValueError(f'one_page_size must be >= 1, got {one_page_size}')

        loaded_docs = UnstructuredWordDocumentLoader(source_path).load()
        paged_docs = []

        for loaded in loaded_docs:
            # Split the text into words on single spaces.
            words = loaded.page_content.split(' ')

            # Walk the words in strides of one_page_size. Unlike iterating over
            # `len(words) // one_page_size + 1` pages, this never emits an empty
            # trailing page when the word count is an exact multiple of the size.
            for page_no, start in enumerate(range(0, len(words), one_page_size)):
                page_text = ' '.join(words[start:start + one_page_size])
                paged_docs.append(
                    Document(page_content=page_text,
                             metadata={"source": loaded.metadata["source"], "page": page_no})
                )

        return paged_docs
    except Exception as e:
        logging.error(f'Error readMSWord(): {e}')
        return None

In [None]:
def getDocumentExtension(documentPath):
    """Return the file extension of ``documentPath`` without the leading dot.

    Only the final path component is inspected and the original letter case is
    kept, e.g. ``'./doc/Report.PDF'`` gives ``'PDF'`` and ``'a.tar.gz'`` gives
    ``'gz'``.

    Args:
        documentPath: Path (or bare file name) of the document.

    Returns:
        The extension as a string, ``''`` when the file name has no extension,
        or ``None`` when the path cannot be processed (the error is logged).
    """
    try:
        file_name = os.path.basename(documentPath)
        # splitext() yields '' for extension-less names, whereas taking the last
        # element of split('.') would hand back the whole file name instead.
        return os.path.splitext(file_name)[1][1:]
    except Exception as e:
        logging.error(f'Error getDocumentExtension(): {e}')
        return None

In [None]:
def getEmbeddingEntireDoc(documentPath):
    """Load a document with the reader that matches its file extension.

    Despite the name, no embedding is computed here: the function only picks
    ``readPDF()`` for ``pdf`` files or ``readMSWord()`` for ``docx``/``doc`` files
    (extension compared case-insensitively) and returns that reader's result.
    Word files go through the custom reader because, per the original author's
    note, they carry no page metadata and are not split into pages by default.

    Args:
        documentPath: Path of the document to load.

    Returns:
        The value returned by the selected reader, or ``None`` when the
        extension cannot be determined, the type is unsupported, or an
        exception is raised. Every ``None`` path is logged.
    """
    try:
        extension = getDocumentExtension(documentPath)
        if extension is None:
            logging.error(f'Error getEmbeddingEntireDoc(): could not determine the extension of {documentPath}')
            return None

        docType = extension.lower()

        if docType == 'pdf':
            return readPDF(documentPath)
        if docType in ('docx', 'doc'):
            return readMSWord(documentPath)

        # Make the fall-through visible instead of returning None silently.
        logging.error(f'Error getEmbeddingEntireDoc(): unsupported document type "{docType}" for {documentPath}')
        return None
    except Exception as e:
        logging.error(f'Error getEmbeddingEntireDoc(): {e}')
        return None

In [None]:

docPath = './doc/故宫简介.pdf'
documents = getEmbeddingEntireDoc(docPath)

# getEmbeddingEntireDoc() returns None when the file cannot be read or its type
# is unsupported. Stop here with a clear message instead of letting the
# splitter fail on a None/empty input further down.
if not documents:
    raise RuntimeError(f'No pages could be loaded from {docPath}; check the path and the file type.')

# Split the loaded pages into chunks using the settings below.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

print(f'Number of chunks: {len(chunks)}')


In [None]:


# Load the OpenAI API credentials/settings from the nearest .env file.
load_dotenv(find_dotenv())

# Azure OpenAI embedding client, configured entirely from environment variables.
azure_embeddings = AzureOpenAIEmbeddings(
    model=os.environ.get("AZURE_EMBEDDING_MODEL"),
    azure_deployment=os.environ.get("AZURE_EMBEDDING_DEPLOYMENT"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    openai_api_version=os.environ.get("AZURE_OPENAI_API_VERSION"),
)

# Alternative kept for reference: index with the Azure embeddings and persist to disk.
# vector_store = Chroma.from_documents(documents=chunks, embedding=azure_embeddings, persist_directory="./chroma_db")

# Index the chunks in Chroma using FastEmbed embeddings.
vector_store = Chroma.from_documents(embedding=FastEmbedEmbeddings(), documents=chunks)

# Retriever settings: similarity_score_threshold search with k=3 and score_threshold=0.5.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 3, "score_threshold": 0.5},
    search_type="similarity_score_threshold",
)


In [None]:
# Define LLM. Only the API version and deployment are passed explicitly;
# NOTE(review): endpoint and key are presumably picked up from the environment - confirm.
llm = AzureChatOpenAI(
    openai_api_version=os.environ.get("AZURE_OPENAI_API_VERSION"),
    azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
)


# Define prompt template
template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use two sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)


def _format_docs(docs):
    """Join the text of the retrieved documents into one plain-text block."""
    return "\n\n".join(doc.page_content for doc in docs)


# Assemble the pipeline: retrieve -> format context -> prompt -> LLM -> plain string.
# The retriever output is piped through _format_docs so that {context} receives the
# chunk text only, not the repr of a list of Document objects (metadata included).
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [None]:

# Evaluation set: each question is paired by position with its reference answer(s).
questions = [
    "故宫的长宽和面积是多少?",
    "故宫的三大殿是什么?",
    "御花园原名叫什么?",
]
ground_truths = [
    ["故宫东西宽７５０米，南北长９６０米，面积达到７２万平方米"],
    ["太和殿，中和殿和保和殿"],
    ["宫后苑"],
]
answers = []
contexts = []

# Inference: for every question, store the chain's answer and, separately,
# the text of the chunks the retriever returns for the same query.
for query in questions:
    answers.append(rag_chain.invoke(query))
    contexts.append([hit.page_content for hit in retriever.get_relevant_documents(query)])

# Collect the four parallel lists under the column names used for the dataset.
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truths=ground_truths,
)

# Convert dict to dataset
dataset = Dataset.from_dict(data)

print(f"result: {data}")


In [None]:
# RAGAS entry point plus the four metrics scored below.
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

# Score the collected question/answer/context samples, using the Azure chat model
# and the Azure embeddings defined earlier in the notebook.
result = evaluate(
    dataset=dataset,
    llm=llm,
    embeddings=azure_embeddings,
    metrics=[context_precision, context_recall, faithfulness, answer_relevancy],
)

# Show the first rows of the scores as a DataFrame.
df = result.to_pandas()
df.head()
