In [1]:
from langchain.retrievers import ParentDocumentRetriever

In [2]:
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [8]:
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import OllamaLLM

# Text-generation model served by the local Ollama instance, run with a low
# sampling temperature (0.1). No later cell shown here uses it.
ollama_llm = OllamaLLM(
    base_url="http://localhost:11434",
    model="llama3.2:latest",
    temperature=0.1,
)

# Embedding model served by the same local Ollama endpoint; this is the
# object handed to the Chroma vector stores as their embedding function.
ollama_embedding = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="nomic-embed-text:latest",
)




In [6]:
# Load the two sample essays into a single flat list of documents.
# The encoding is pinned to UTF-8 so the files are decoded the same way on
# every platform; without it TextLoader falls back to the locale's default
# encoding, which can raise a decode error on non-UTF-8 locales (e.g. Windows).
# NOTE(review): assumes both sample files are stored as UTF-8 - confirm.
loaders = [
    TextLoader("../../00-example_data/paul_graham_essay.txt", encoding="utf-8"),
    TextLoader("../../00-example_data/state_of_the_union.txt", encoding="utf-8"),
]
docs = []
for loader in loaders:
    # Each loader returns a list of documents; extend keeps `docs` flat.
    docs.extend(loader.load())

In [9]:
# Docstore that will hold the parent documents (kept in memory only)
store = InMemoryStore()
# Vector store in which the embedded child chunks are indexed
vectorstore = Chroma(
    embedding_function=ollama_embedding,
    collection_name="full_documents",
)
# Splitter that produces the child chunks (chunk_size=400)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# No parent_splitter is supplied here, only a child splitter
retriever = ParentDocumentRetriever(
    docstore=store,
    vectorstore=vectorstore,
    child_splitter=child_splitter,
)

In [10]:
# Index the loaded documents; `ids` is left at its default of None.
retriever.add_documents(documents=docs)

This should yield two keys, because we added two documents.

In [None]:
# Materialize the docstore's key iterator so the keys are shown as cell output.
[*store.yield_keys()]

Let's now call the vector store search functionality - we should see that it returns small chunks (since we're storing the small chunks).

In [12]:
# Query the vector store directly, bypassing the retriever.
sub_docs = vectorstore.similarity_search(query="justice breyer")

In [None]:
# Length of the first returned chunk's page_content.
len(sub_docs[0].page_content)

In [None]:
# Show the text of the first chunk returned by the vector store search.
print(sub_docs[0].page_content)

Let's now retrieve from the overall retriever. This should return large documents - since it returns the documents where the smaller chunks are located.

In [14]:
# Same query as the direct vector store search, now through the retriever.
retrieved_docs = retriever.invoke(input="justice breyer")

In [None]:
# Length of the first retrieved document's page_content, for comparison
# with the length of the chunk returned by the vector store.
len(retrieved_docs[0].page_content)

#### Retrieving larger chunks

In [17]:
# Fresh in-memory docstore for the parent chunks
store = InMemoryStore()
# Separate Chroma collection in which the child chunks are indexed
vectorstore = Chroma(
    embedding_function=ollama_embedding,
    collection_name="split_parents",
)
# Splitter for the parent documents (chunk_size=2000)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# Splitter for the child documents (chunk_size=400);
# its chunks should be smaller than the parent chunks
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [18]:
# Rebuild the retriever, this time with both a parent and a child splitter.
retriever = ParentDocumentRetriever(
    docstore=store,
    vectorstore=vectorstore,
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
)

In [19]:
# Index the same loaded documents through the new retriever.
retriever.add_documents(documents=docs)

We can see that there are many more than two documents now - these are the larger chunks.

In [None]:
# Count the keys in the docstore without building an intermediate list.
sum(1 for _key in store.yield_keys())

Let's make sure the underlying vector store still retrieves the small chunks.

In [21]:
# Query the new vector store directly, bypassing the retriever.
sub_docs = vectorstore.similarity_search(query="justice breyer")

In [None]:
# Show the text of the first chunk returned by the vector store search.
print(sub_docs[0].page_content)

In [23]:
# Run the same query through the retriever configured with a parent splitter.
retrieved_docs = retriever.invoke(input="justice breyer")

In [None]:
# Length of the first retrieved document's page_content.
len(retrieved_docs[0].page_content)

In [None]:
# Show the text of the first document returned by the retriever.
print(retrieved_docs[0].page_content)