In [None]:
import os
from dotenv import load_dotenv

# Load settings from a local .env file (python-dotenv) before reading them.
load_dotenv()

# Both settings are mandatory: indexing os.environ raises KeyError if one is missing.
OPENAI_API_KEY, OPENAI_API_BASE = (
    os.environ['OPENAI_API_KEY'],
    os.environ['OPENAI_API_BASE'],
)

In [None]:
from langchain.document_loaders import PyPDFLoader

# Loader for the source paper; the path is relative to the notebook's working directory.
loader = PyPDFLoader(file_path='pdf_data/893.pdf')

# Read the PDF and split it into a list of documents.
pages = loader.load_and_split()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunks of up to 1000 characters; chunk_overlap=0 means adjacent chunks share no characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1000)

# Re-split the loaded pages into the chunks that will be embedded.
split_docs = text_splitter.split_documents(pages)

# Report how many chunks were produced.
print(len(split_docs), 'split_docs')

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embedding client used both to build the index here and to embed queries later.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Directory where the Chroma index for this PDF is stored on disk.
persist_directory = "chroma_storage_893"

# Build the index only when the storage directory does not exist yet, so
# re-running the notebook does not re-embed every chunk.
# NOTE(review): an existing but empty/incomplete directory is treated as a
# finished index; delete it manually to force a rebuild.
if not os.path.exists(persist_directory):
    # Turn the chunks into embeddings and store them in Chroma.
    vectorstore = Chroma.from_documents(
        split_docs, embeddings, persist_directory=persist_directory)
    # Persist the result to disk.
    vectorstore.persist()

In [None]:
# Question to ask about the paper.
query = '这篇论文和 Transformer 的相关性是什么？'

# Load the document data that was saved locally.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)

# Retrieve the 2 documents most relevant to the question.
search_docs = vectordb.similarity_search(query, k=2)
print(search_docs)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
# Build the chat model.
# temperature=0 asks for the most conservative answer, reducing the chance
# of the model hallucinating.
# The endpoint is passed as `openai_api_base`, the parameter ChatOpenAI
# declares for it; `api_base` is not a declared field, so it is not a
# reliable way to set the endpoint.
llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY,
                 model_name='gpt-3.5-turbo', openai_api_base=OPENAI_API_BASE)

# Question-answering chain of type 'stuff', run over the retrieved documents.
chain = load_qa_chain(llm, chain_type='stuff')
results = chain.run(input_documents=search_docs, question=query)

# Show the question and the model's answer.
print(f'Q: {query}')
print(f'A: {results}')

In [None]:
import xml.etree.ElementTree as ET

# Root <speak> element with the SSML namespaces, version and document language.
speak = ET.Element('speak', {
    'xmlns': 'http://www.w3.org/2001/10/synthesis',
    'xmlns:mstts': 'http://www.w3.org/2001/mstts',
    'xmlns:emo': 'http://www.w3.org/2009/10/emotionml',
    'version': '1.0',
    'xml:lang': 'en-US',
})

# <voice> names the voice to synthesize with.
voice = ET.SubElement(speak, 'voice', {'name': 'zh-TW-YunJheNeural'})

# <prosody> sets rate and pitch and holds the text to speak:
# the question immediately followed by the answer.
prosody = ET.SubElement(voice, 'prosody', {'rate': '20%', 'pitch': '20%'})
prosody.text = query + results

# Print the document for inspection, then save it inside the Chroma storage directory.
ET.dump(speak)
tree = ET.ElementTree(speak)
tree.write(persist_directory + '/SSML.xml', encoding='utf-8')