In [1]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain, LLMChain, StuffDocumentsChain
from langchain.memory import ConversationBufferMemory
from chainlit.input_widget import Slider, Select, TextInput
from langchain.prompts import PromptTemplate
from rank_bm25 import BM25Okapi
from langchain.schema import BaseRetriever, Document
from typing import List
import chainlit as cl

load_dotenv()

True

In [2]:
file_path = f'C:/Users/JiaxinSun/OneDrive/Documents/MyLLM/report.pdf'

docs = PyMuPDFLoader(file_path).load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800, chunk_overlap=100
)
split_docs = text_splitter.split_documents(docs)

In [3]:
# 创建嵌入模型
embeddings = OpenAIEmbeddings()  # 创建嵌入模型

# 创建文档向量存储，指定本地存储路径
docsearch = await cl.make_async(FAISS.from_documents)(
    split_docs, embeddings
)

2024-10-09 10:16:02 - HTTP Request: POST https://api.bianxieai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-10-09 10:16:03 - Loading faiss with AVX2 support.
2024-10-09 10:16:03 - Successfully loaded faiss with AVX2 support.


In [4]:
docsearch.save_local(folder_path='./vector_store', index_name='report.pdf')

In [5]:
try:
    docsearch = FAISS.load_local(
        folder_path = './vector_store',
        embeddings = embeddings,
        index_name = 'report.pdf',
        allow_dangerous_deserialization = True
    )
    # 如果加载成功，输出成功消息
    print("FAISS索引加载成功！")
    
except Exception as e:
    # 如果加载失败
    print(f"FAISS索引加载失败: {e}")
    docsearch = await cl.make_async(FAISS.from_documents)(
        split_docs, embeddings
    )

FAISS索引加载成功！


In [2]:
client = chromadb.PersistentClient(path='./vector_store')
collections = client.list_collections()  # 使用实例调用方法

2024-10-08 19:22:36 - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [3]:
collections

[Collection(id=77ed1261-388e-45fa-aacb-56290c9a32c0, name=report.pdf)]

In [9]:
collection_name = 'report.pdf'
docsearch = await cl.make_async(client.get_collection)(collection_name)  # 使用实例调用方法

In [10]:
embeddings = OpenAIEmbeddings()
query = '最近的市场行情'
query_embedding = embeddings.embed_query(query)

2024-10-08 19:32:28 - HTTP Request: POST https://api.bianxieai.com/v1/embeddings "HTTP/1.1 200 OK"


In [17]:
doc_embeddings = docsearch.query(
    query_embeddings=query_embedding,
    n_results=5,
)

In [14]:
corpus = [doc.page_content for doc in split_docs]
bm25 = BM25Okapi(corpus)

In [15]:
bm25_scores = bm25.get_scores(query.split())
ranked_docs = sorted(zip(docs, bm25_scores), key=lambda x: x[1], reverse=True)


In [16]:
[doc for doc, _ in ranked_docs[:5]]

[Document(metadata={'source': 'C:/Users/JiaxinSun/OneDrive/Documents/MyLLM/report.pdf', 'file_path': 'C:/Users/JiaxinSun/OneDrive/Documents/MyLLM/report.pdf', 'page': 0, 'total_pages': 15, 'format': 'PDF 1.7', 'title': '私行全球资产每周聚焦', 'author': '平安私人银行投研团队', 'subject': '', 'keywords': '', 'creator': 'Microsoft® PowerPoint® 2019', 'producer': 'Microsoft® PowerPoint® 2019', 'creationDate': "D:20240527091139+08'00'", 'modDate': "D:20240527091139+08'00'", 'trapped': ''}, page_content='平安银行私行财富部\n2024年5月24日\n经济震荡期，政策观察期\n——2024年6月大类资产配置展望\n'),
 Document(metadata={'source': 'C:/Users/JiaxinSun/OneDrive/Documents/MyLLM/report.pdf', 'file_path': 'C:/Users/JiaxinSun/OneDrive/Documents/MyLLM/report.pdf', 'page': 1, 'total_pages': 15, 'format': 'PDF 1.7', 'title': '私行全球资产每周聚焦', 'author': '平安私人银行投研团队', 'subject': '', 'keywords': '', 'creator': 'Microsoft® PowerPoint® 2019', 'producer': 'Microsoft® PowerPoint® 2019', 'creationDate': "D:20240527091139+08'00'", 'modDate': "D:20240527091139+08'00'", 'tr