In [1]:
import os
import uuid
from dotenv import load_dotenv

import tiktoken
from transformers import AutoTokenizer

from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.retrievers import ParentDocumentRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever, SearchType
from langchain.storage import InMemoryByteStore, InMemoryStore
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import DistanceStrategy
from langchain_community.document_loaders import TextLoader
from langchain_community.storage import SQLStore
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_aws import BedrockEmbeddings, ChatBedrockConverse
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
load_dotenv()

True

In [3]:
database_uri: str = os.getenv("POSTGRES_URI")
database_uri = f"postgresql+psycopg://{database_uri.split('//')[-1]}"

In [4]:
loaders = [
    TextLoader("data/paul_graham_essay.txt"),
    TextLoader("data/state_of_the_union.txt"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())
print(len(docs))

2


In [5]:
for doc in docs:
    print(doc, end="\n\n")

page_content='

What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.

The language we used was an early version of Fortran. You had to type programs on punch cards, then s

In [6]:
openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
embeddings = BedrockEmbeddings(model_id="cohere.embed-multilingual-v3")

# Token function

In [7]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="Cohere/multilingual-22-12")
text = "Hellö World, this is my input string!"
enc = tokenizer(text)
print("Encoded input:")
print(enc)

inv_vocab = {v: k for k, v in tokenizer.vocab.items()}
tokens = [inv_vocab[token_id] for token_id in enc['input_ids']]
print("Tokens:")
print(tokens)

number_of_tokens = len(enc['input_ids'])
print("Number of tokens:", number_of_tokens)


Encoded input:
{'input_ids': [101, 72172, 24459, 17153, 117, 15272, 15002, 15456, 34429, 32542, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Tokens:
['[CLS]', 'Hell', '##ö', 'World', ',', 'this', 'is', 'my', 'input', 'string', '!', '[SEP]']
Number of tokens: 12


In [8]:
pretrained_model_name = "Cohere/multilingual-22-12"

In [9]:
def number_tokens_from_text(text: str, pretrained_model_name: str = pretrained_model_name) -> int:
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name)
    enc = tokenizer(text)
    
    return len(enc['input_ids'])

In [10]:
print(number_tokens_from_text(docs[0].page_content))

Token indices sequence length is longer than the specified maximum sequence length for this model (16903 > 512). Running this sequence through the model will result in indexing errors


16903


# Retrieving full documents

In [42]:
# This text splitter is used to create the child documents
# child_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50, length_function=number_tokens_from_text, add_start_index=True)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=50, add_start_index=True)
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    embeddings=embeddings,
    connection=database_uri,
    collection_name="full_documents",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

In [43]:
retriever.add_documents(docs, ids=None)

In [44]:
list(store.yield_keys())

['f819d07d-d23c-4a1e-b4cc-bc2bd5008d57',
 '377a126c-db06-4061-8b06-baeae802f7e2']

In [45]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [46]:
print(sub_docs[0].page_content)

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. 

And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. 

We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  

We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  

We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. 

We’re securing commitm

In [47]:
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content, end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

1920 And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. 

And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. 

We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  

We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  

We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. 

We’re securing co

In [48]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

1


In [49]:
len(retrieved_docs[0].page_content)

38540

# Retrieving larger chunks

In [50]:
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=8192, add_start_index=True)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, add_start_index=True)
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    embeddings=embeddings,
    connection=database_uri,
    collection_name="split_parents",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = InMemoryStore()

In [51]:
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [52]:
retriever.add_documents(docs, ids=None)

In [53]:
len(list(store.yield_keys()))

15

In [54]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [55]:
print(sub_docs[0].page_content)

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. 

And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. 

We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  

We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  

We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. 

We’re securing commitm

In [56]:
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content, end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

1920 And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. 

And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. 

We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  

We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  

We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. 

We’re securing co

In [57]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

2


In [58]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

7994

----------------------------------------------------------------------------------------------------

8170

----------------------------------------------------------------------------------------------------



In [59]:
for doc in retrieved_docs:
    # print(len(doc.page_content), end="\n\n" + "-"*100 + "\n\n")
    print(doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

If you’re immunocompromised or have some other vulnerability, we have treatments and free high-quality masks. 

We’re leaving no one behind or ignoring anyone’s needs as we move forward. 

And on testing, we have made hundreds of millions of tests available for you to order for free.   

Even if you already ordered free tests tonight, I am announcing that you can order more from covidtests.gov starting next week. 

Second – we must prepare for new variants. Over the past year, we’ve gotten much better at detecting new variants. 

If necessary, we’ll be able to deploy new vaccines within 100 days instead of many more months or years.  

And, if Congress provides the funds we need, we’ll have new stockpiles of tests, masks, and pills ready if needed. 

I cannot promise a new variant won’t come. But I can promise you we’ll do everything within our power to be ready if it does.  

Third – we can end the shutdown of schools and businesses. We have the tools we need. 

It’s time for American

# PostgreSQL Docstore

In [7]:
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=8192, chunk_overlap=80, add_start_index=True)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=20, add_start_index=True)
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    embeddings=embeddings,
    connection=database_uri,
    collection_name="split_parents2",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents

In [8]:
store = SQLStore(namespace="split_parents2", db_url=database_uri)

In [9]:
store.create_schema()

In [10]:
store.mset([("key1", b"value1"), ("key2", b"value2")])

In [11]:
# store.mset([("key3", "value3")])

In [12]:
values = store.mget(["key1", "key2"])  # Returns [b"value1", b"value2"]
for value in values:
    print(value)

b'value1'
b'value2'


In [13]:
store.mdelete(["key1"])

In [14]:
for key in store.yield_keys():
    print(key)

key2


In [15]:
store.mdelete(["key1", "key2"])

In [16]:
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [17]:
retriever.add_documents(docs, ids=None)

In [18]:
len(list(store.yield_keys()))

15

In [19]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [20]:
print(sub_docs[0].page_content)

Ban assault weapons and high-capacity magazines. 

Repeal the liability shield that makes gun manufacturers the only industry in America that can’t be sued. 

These laws don’t infringe on the Second Amendment. They save lives. 

The most fundamental right in America is the right to vote – and to have it counted. And it’s under assault. 

In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. 

We cannot let this happen. 

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a Pre

In [21]:
for doc in sub_docs:
    print(len(doc.page_content), doc.metadata, doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

1920 {'doc_id': 'ad278fa4-45ce-4a7a-b4c6-71234c69e04b', 'source': 'data/state_of_the_union.txt', 'start_index': 4039} Ban assault weapons and high-capacity magazines. 

Repeal the liability shield that makes gun manufacturers the only industry in America that can’t be sued. 

These laws don’t infringe on the Second Amendment. They save lives. 

The most fundamental right in America is the right to vote – and to have it counted. And it’s under assault. 

In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. 

We cannot let this happen. 

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supr

In [22]:
for doc in sub_docs:
    print(doc.metadata, end="\n\n" + "-" * 100 + "\n\n")

{'doc_id': 'ad278fa4-45ce-4a7a-b4c6-71234c69e04b', 'source': 'data/state_of_the_union.txt', 'start_index': 4039}

----------------------------------------------------------------------------------------------------

{'doc_id': '9772c3a0-a9df-4ec2-9b52-4180262016c1', 'source': 'data/state_of_the_union.txt', 'start_index': 3991}

----------------------------------------------------------------------------------------------------

{'doc_id': 'ad278fa4-45ce-4a7a-b4c6-71234c69e04b', 'source': 'data/state_of_the_union.txt', 'start_index': 5963}

----------------------------------------------------------------------------------------------------

{'doc_id': 'f8f03ea0-1361-4fa2-84fe-ade5c72212ca', 'source': 'data/state_of_the_union.txt', 'start_index': 4043}

----------------------------------------------------------------------------------------------------



In [23]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

3


In [24]:
for doc in retrieved_docs:
    print(len(doc.page_content), doc.metadata, doc.page_content, end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

8133 {'source': 'data/state_of_the_union.txt', 'start_index': 24368} Second – we must prepare for new variants. Over the past year, we’ve gotten much better at detecting new variants. 

If necessary, we’ll be able to deploy new vaccines within 100 days instead of many more months or years.  

And, if Congress provides the funds we need, we’ll have new stockpiles of tests, masks, and pills ready if needed. 

I cannot promise a new variant won’t come. But I can promise you we’ll do everything within our power to be ready if it does.  

Third – we can end the shutdown of schools and businesses. We have the tools we need. 

It’s time for Americans to get back to work and fill our great downtowns again.  People working from home can feel safe to begin to return to the office.   

We’re doing that here in the federal government. The vast majority of federal workers will once again work in person. 

Our schools are open. Let’s keep it that way. Our kids need to be in school. 

And with 75% of

In [25]:
for doc in retrieved_docs:
    # print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    print(doc.metadata, end="\n\n" + "-"*100 + "\n\n")

{'source': 'data/state_of_the_union.txt', 'start_index': 24368}

----------------------------------------------------------------------------------------------------

{'source': 'data/state_of_the_union.txt', 'start_index': 16252}

----------------------------------------------------------------------------------------------------

{'source': 'data/state_of_the_union.txt', 'start_index': 32469}

----------------------------------------------------------------------------------------------------



# MultiVector Retriever

In [41]:
loaders = [
    TextLoader("data/paul_graham_essay.txt"),
    TextLoader("data/state_of_the_union.txt"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)
print(len(docs))

12


In [42]:
for doc in docs:
    print(len(doc.page_content), doc, end="\n\n" + "-" * 100 + "\n\n")

9728 page_content='What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.

The language we used was an early version of Fortran. You had to type programs on punch cards, the

## Smaller chunks

In [48]:
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    embeddings=embeddings,
    connection=database_uri,
    collection_name="split_parents3",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = SQLStore(namespace="split_parents3", db_url=database_uri)
# store = InMemoryByteStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)

doc_ids = [str(uuid.uuid4()) for _ in docs]

In [49]:
# The splitter to use to create smaller chunks
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=2028)

In [50]:
sub_docs = []
for i, doc in enumerate(docs):
    _id = doc_ids[i]
    _sub_docs = child_text_splitter.split_documents([doc])
    for _doc in _sub_docs:
        _doc.metadata[id_key] = _id
    sub_docs.extend(_sub_docs)

In [51]:
num_embeddings = retriever.vectorstore.add_documents(sub_docs)

In [52]:
len(num_embeddings)

69

In [53]:
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [54]:
len(list(store.yield_keys()))

12

In [55]:
# Vectorstore alone retrieves the small chunks
retriever.vectorstore.similarity_search("justice breyer")[0]

Document(id='378de358-9d59-4789-a064-ca16b719abc9', metadata={'doc_id': '335779ac-4099-4c93-a067-279d928fbcbe', 'source': 'data/state_of_the_union.txt'}, page_content='In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. \n\nWe cannot let this happen. \n\nTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.')

In [56]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [57]:
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

737 In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. 

We cannot let this happen. 

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.

----------------------------------------------------------------------------------------------------

1939 One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did

In [58]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

2


In [59]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")

9875

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------



In [60]:
retriever.search_type = SearchType.mmr

In [61]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

4


In [62]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")

9875

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------

9840

----------------------------------------------------------------------------------------------------

6975

----------------------------------------------------------------------------------------------------



## Summary

In [63]:
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    # | ChatOpenAI(model_name="gpt-4o", max_retries=0)
    | ChatBedrockConverse(model="anthropic.claude-3-5-sonnet-20240620-v1:0", temperature=0)
    | StrOutputParser()
)

In [64]:
summaries = chain.batch(docs, {"max_concurrency": 5})

In [65]:
print(len(summaries))

12


In [66]:
for summary in summaries:
    print(len(summary), summary, end="\n\n" + "-" * 100 + "\n\n")

1349 This document is a personal essay by Paul Graham, detailing his early experiences with programming and his journey through education and career choices. Here's a summary of the key points:

1. Graham's early interests were writing and programming, starting with an IBM 1401 in junior high school.

2. He got his first personal computer, a TRS-80, around 1980 and began programming more seriously.

3. In college, he initially studied philosophy but found it disappointing and switched to AI.

4. He learned Lisp, which greatly expanded his concept of programming.

5. For his undergraduate thesis, he reverse-engineered SHRDLU, an early natural language understanding program.

6. In graduate school at Harvard, he realized that AI, as practiced at the time, was not going to achieve true natural language understanding.

7. Disillusioned with AI, he focused on Lisp and began writing a book about it.

8. He struggled with the choice between theoretical and systems work in Computer Science, pr

In [75]:
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    embeddings=openai_embeddings,
    connection=database_uri,
    collection_name="summaries",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = SQLStore(namespace="summaries", db_url=database_uri)
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [76]:
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries)
]

In [77]:
for doc in summary_docs:
    print(doc, end="\n\n" + "-" * 100 + "\n\n")

page_content='This document is a personal essay by Paul Graham, detailing his early experiences with programming and his journey through education and career choices. Here's a summary of the key points:

1. Graham's early interests were writing and programming, starting with an IBM 1401 in junior high school.

2. He got his first personal computer, a TRS-80, around 1980 and began programming more seriously.

3. In college, he initially studied philosophy but found it disappointing and switched to AI.

4. He learned Lisp, which greatly expanded his concept of programming.

5. For his undergraduate thesis, he reverse-engineered SHRDLU, an early natural language understanding program.

6. In graduate school at Harvard, he realized that AI, as practiced at the time, was not going to achieve true natural language understanding.

7. Disillusioned with AI, he focused on Lisp and began writing a book about it.

8. He struggled with the choice between theoretical and systems work in Computer Sc

In [78]:
num_embeddings = retriever.vectorstore.add_documents(summary_docs)

In [79]:
len(num_embeddings)

12

In [80]:
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [81]:
# We can also add the original chunks to the vectorstore if we so want
for i, doc in enumerate(docs):
    doc.metadata[id_key] = doc_ids[i]
retriever.vectorstore.add_documents(docs)

['5c9c702d-2cc6-487e-80d2-1b2a0895cad3',
 '196fb7e9-e781-460b-8e48-8a53e3b2f7b6',
 '8614b946-de41-4962-b45b-76c5d25bdd4e',
 '88247436-5c63-40bd-9dbd-8a0e9e795aff',
 'd8174992-77a7-4d81-b17c-01d82326cac6',
 '2afe4901-ad2f-43f8-9135-a164dfb05fa0',
 '653cae1b-1fd2-413a-976f-afea60a44a9e',
 '4b37c125-637d-49cd-a8db-db7e3f1dae2d',
 'd069b04e-6118-45c6-996a-ae281bb59145',
 'd44acd11-dcf3-4658-be91-9cfc613b9ea3',
 'd60a86ec-a3a3-426c-a7c6-5009ddc384e7',
 '8ed49ab8-4cab-4d70-abbe-d1d05ccb0e9a']

In [82]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [83]:
for doc in sub_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

9875

----------------------------------------------------------------------------------------------------

1427

----------------------------------------------------------------------------------------------------

1157

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------



In [84]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

2


In [85]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

9875

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------



## Hypothetical Queries

In [86]:
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "items": {"type": "string"},
                },
            },
            "required": ["questions"],
        },
    }
]

In [87]:
chain = (
    {"doc": lambda x: x.page_content}
    # Only asking for 3 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        "Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
    )
    | ChatOpenAI(max_retries=0, model="gpt-4o").bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [88]:
len(docs)

12

In [89]:
chain.invoke(docs[0])

['What were some of the challenges and limitations faced by early computer programmers using the IBM 1401?',
 "How did the transition from punch card computers to microcomputers impact the author's programming experience?",
 'What realizations led the author to shift focus from AI to Lisp during their graduate studies?']

In [90]:
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})

In [92]:
len(hypothetical_questions)

12

In [93]:
for questions in hypothetical_questions:
    print(questions, end="\n\n" + "-" * 100 + "\n\n")

["How did the author's early experiences with computers influence their career path?", 'What led the author to transition from studying philosophy to artificial intelligence?', "How did the author's perspective on AI change during their graduate studies?"]

----------------------------------------------------------------------------------------------------

['What challenges might someone face when transitioning from a computer science PhD program to a career in art?', "How can the experience of attending prestigious educational institutions differ from one's expectations?", "What are the potential impacts of working in a job that doesn't align with one's skills or interests?"]

----------------------------------------------------------------------------------------------------

['What are the potential drawbacks of technology companies being run by sales people instead of product people?', "How can the concept of 'the low end eating the high end' apply to business strategy in startups

In [94]:
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    embeddings=openai_embeddings,
    connection=database_uri,
    collection_name="hypo-questions",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = SQLStore(namespace="hypo-questions", db_url=database_uri)
# store = InMemoryByteStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [95]:
question_docs = []
for i, question_list in enumerate(hypothetical_questions):
    question_docs.extend(
        [Document(page_content=s, metadata={id_key: doc_ids[i]}) for s in question_list]
    )

In [96]:
len(question_docs)

36

In [97]:
for doc in question_docs:
    print(doc, end="\n\n" + "-" * 100 + "\n\n")

page_content='How did the author's early experiences with computers influence their career path?' metadata={'doc_id': '6b697c48-457a-496f-b174-670317a8c906'}

----------------------------------------------------------------------------------------------------

page_content='What led the author to transition from studying philosophy to artificial intelligence?' metadata={'doc_id': '6b697c48-457a-496f-b174-670317a8c906'}

----------------------------------------------------------------------------------------------------

page_content='How did the author's perspective on AI change during their graduate studies?' metadata={'doc_id': '6b697c48-457a-496f-b174-670317a8c906'}

----------------------------------------------------------------------------------------------------

page_content='What challenges might someone face when transitioning from a computer science PhD program to a career in art?' metadata={'doc_id': '2536a746-be20-4cfa-9e31-41f7c0e8589d'}

---------------------------------

In [98]:
num_embeddings = retriever.vectorstore.add_documents(question_docs)

In [99]:
len(num_embeddings)

36

In [100]:
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [101]:
for i in list(zip(doc_ids, docs)):
    print(type(i))
    print(i, end="\n\n" + "-" * 100 + "\n\n")

<class 'tuple'>
('6b697c48-457a-496f-b174-670317a8c906', Document(metadata={'source': 'data/paul_graham_essay.txt', 'doc_id': '2794fc67-3f50-4f5e-9fe6-c81771d1ad41'}, page_content='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card

In [102]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [103]:
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

106 What are the proposed measures to reform the U.S. immigration system and provide a pathway to citizenship?

----------------------------------------------------------------------------------------------------

112 How does the Bipartisan Infrastructure Law aim to enhance America's infrastructure and economic competitiveness?

----------------------------------------------------------------------------------------------------

79 What factors influenced the author's decision to eventually leave Y Combinator?

----------------------------------------------------------------------------------------------------

101 In what ways did the creation of Hacker News impact the operations and stress levels at Y Combinator?

----------------------------------------------------------------------------------------------------



In [104]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

4


In [105]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

9194

----------------------------------------------------------------------------------------------------

9902

----------------------------------------------------------------------------------------------------

9844

----------------------------------------------------------------------------------------------------

9675

----------------------------------------------------------------------------------------------------



In [106]:
for doc in retrieved_docs:
    # print(len(doc.page_content), end="\n\n" + "-"*100 + "\n\n")
    print(doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. 

And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. 

We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  

We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  

We’re putting i