In [1]:
! pip install langchain openai chromadb tiktoken

Collecting langchain
  Downloading langchain-0.1.19-py3-none-any.whl.metadata (13 kB)
Collecting openai
  Downloading openai-1.27.0-py3-none-any.whl.metadata (21 kB)
Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl.metadata (7.3 kB)
Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain-community<0.1,>=0.0.38 (from langchain)
  Downloading langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-core<0.2.0,>=0.1.52 (from langchain)
  Downloading langchain_core-0.1.52-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downloading langchain_text_splitters-0.0.1-py3-none-any.whl.metadata (2.0 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.56-py3-none-any.whl.metadata (13 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.1-py3-none-any.whl.metadata (4.

In [2]:
from langchain.retrievers import ParentDocumentRetriever

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore

# Load pdf
loader = PyPDFLoader("https://arxiv.org/pdf/2309.10305v2.pdf")
data = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data[:6])

import os
from getpass import getpass

OPENAI_API_KEY = getpass()

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


 ························································


### 检索完整的文档

In [5]:
# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=OpenAIEmbeddings()
)
# The storage layer for the parent documents
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

In [6]:
retriever.add_documents(data[:6], ids=None)

In [15]:
len(list(store.yield_keys()))

6

In [16]:
sub_docs = vectorstore.similarity_search("What is baichuan2 ？")

In [17]:
sub_docs

[Document(page_content='2-7B achieves nearly 30% higher performance\ncompared to Baichuan 1-7B. Specifically, Baichuan\n2 is optimized to improve performance on math\nand code problems. On the GSM8K (Cobbe\net al., 2021) and HumanEval (Chen et al., 2021)\nevaluations, Baichuan 2 nearly doubles the results\nof the Baichuan 1. In addition, Baichuan 2 also\ndemonstrates strong performance on medical and', metadata={'doc_id': '8c98dba1-dd9a-441d-a0dd-8c15a1e1f4c3', 'page': 1, 'source': 'https://arxiv.org/pdf/2309.10305v2.pdf'}),
 Document(page_content='2-7B achieves nearly 30% higher performance\ncompared to Baichuan 1-7B. Specifically, Baichuan\n2 is optimized to improve performance on math\nand code problems. On the GSM8K (Cobbe\net al., 2021) and HumanEval (Chen et al., 2021)\nevaluations, Baichuan 2 nearly doubles the results\nof the Baichuan 1. In addition, Baichuan 2 also\ndemonstrates strong performance on medical and', metadata={'doc_id': '09ce6004-3fea-431b-b3ca-6f163e3c269b', 'pa

In [18]:
len(sub_docs[0].page_content)

372

In [19]:
retrieved_docs = retriever.get_relevant_documents("What is baichuan2 ？")


In [20]:
len(retrieved_docs[0].page_content)

4526

In [21]:
len(retrieved_docs)

1

####  检索较大的文本块

In [22]:
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=OpenAIEmbeddings()
)
# The storage layer for the parent documents
store = InMemoryStore()

In [23]:
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [24]:
retriever.add_documents(data[:6], ids=None)

In [25]:
len(list(store.yield_keys()))

15

In [26]:
sub_docs = vectorstore.similarity_search("What is baichuan2 ？")

In [27]:
len(sub_docs[0].page_content)

263

In [28]:
retrieved_docs = retriever.get_relevant_documents("What is baichuan2 ？")

In [29]:
len(retrieved_docs[0].page_content)

1964