In [26]:
import os
import uuid

from dotenv import load_dotenv
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.retrievers import ParentDocumentRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever, SearchType
from langchain.storage import InMemoryByteStore, InMemoryStore
from langchain.vectorstores.pgvector import DistanceStrategy, PGVector
from langchain_community.document_loaders import TextLoader
from langchain_community.storage import SQLStore
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
load_dotenv()

True

In [3]:
database_uri: str = os.getenv("DATABASE_URL")

In [4]:
loaders = [
    TextLoader("paul_graham_essay.txt"),
    TextLoader("state_of_the_union.txt"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())
print(len(docs))

2


In [5]:
for doc in docs:
    print(doc, end="\n\n")

page_content='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an early version of Fortran. You had to type programs on punch card

In [6]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Retrieving full documents

In [26]:
# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400, add_start_index=True)
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="full_documents",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

  warn_deprecated(


In [27]:
retriever.add_documents(docs, ids=None)

In [28]:
list(store.yield_keys())

['0b8d9e38-55c3-4b9a-83c3-6977432e6070',
 'f3658d11-d817-4adc-bd7c-13662a8ded45']

In [29]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [30]:
print(sub_docs[0].page_content)

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.


In [31]:
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content, end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

390 Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.

----------------------------------------------------------------------------------------------------

332 One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.

----------------------------------------------------------------------------------------------------

324 A former top litigator in private practice. A former feder

In [32]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

1


In [33]:
len(retrieved_docs[0].page_content)

38540

# Retrieving larger chunks

In [17]:
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, add_start_index=True)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400, add_start_index=True)
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="split_parents",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = InMemoryStore()

  warn_deprecated(


In [18]:
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [36]:
retriever.add_documents(docs, ids=None)

In [37]:
len(list(store.yield_keys()))

66

In [38]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [39]:
print(sub_docs[0].page_content)

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.


In [40]:
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content, end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

390 Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.

----------------------------------------------------------------------------------------------------

332 One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.

----------------------------------------------------------------------------------------------------

324 A former top litigator in private practice. A former feder

In [41]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

2


In [42]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

1849

----------------------------------------------------------------------------------------------------

1897

----------------------------------------------------------------------------------------------------



In [43]:
for doc in retrieved_docs:
    # print(len(doc.page_content), end="\n\n" + "-"*100 + "\n\n")
    print(doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. 

We cannot let this happen. 

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal publi

# PostgreSQL Docstore

In [19]:
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, add_start_index=True)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400, add_start_index=True)
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="split_parents2",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents

In [20]:
store = SQLStore(namespace="split_parents2", db_url=database_uri)

In [21]:
store.create_schema()

In [145]:
store.mset([("key1", b"value1"), ("key2", b"value2")])

In [1]:
# store.mset([("key3", "value3")])

In [147]:
values = store.mget(["key1", "key2"])  # Returns [b"value1", b"value2"]
for value in values:
    print(value)

b'value1'
b'value2'


In [148]:
store.mdelete(["key1"])

In [149]:
for key in store.yield_keys():
    print(key)

key2


In [157]:
store.mdelete(["key1", "key2"])

In [22]:
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [23]:
retriever.add_documents(docs, ids=None)

In [24]:
len(list(store.yield_keys()))

198

In [170]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [171]:
print(sub_docs[0].page_content)

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.


In [172]:
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

390 Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.

----------------------------------------------------------------------------------------------------

332 One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.

----------------------------------------------------------------------------------------------------

324 A former top litigator in private practice. A former feder

In [173]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

2


In [174]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

1849

----------------------------------------------------------------------------------------------------

1897

----------------------------------------------------------------------------------------------------



In [175]:
for doc in retrieved_docs:
    # print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. 

We cannot let this happen. 

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal publi

# MultiVector Retriever

In [68]:
loaders = [
    TextLoader("paul_graham_essay.txt"),
    TextLoader("state_of_the_union.txt"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)
print(len(docs))

12


In [69]:
for doc in docs:
    print(len(doc.page_content), doc, end="\n\n" + "-" * 100 + "\n\n")

9728 page_content='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an early version of Fortran. You had to type programs on punch

## Smaller chunks

In [70]:
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="full_documents2",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)

doc_ids = [str(uuid.uuid4()) for _ in docs]

In [71]:
# The splitter to use to create smaller chunks
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [72]:
sub_docs = []
for i, doc in enumerate(docs):
    _id = doc_ids[i]
    _sub_docs = child_text_splitter.split_documents([doc])
    for _doc in _sub_docs:
        _doc.metadata[id_key] = _id
    sub_docs.extend(_sub_docs)

In [73]:
num_embeddings = retriever.vectorstore.add_documents(sub_docs)

In [74]:
len(num_embeddings)

489

In [75]:
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [76]:
len(list(store.yield_keys()))

12

In [77]:
# Vectorstore alone retrieves the small chunks
retriever.vectorstore.similarity_search("justice breyer")[0]

Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'source': 'state_of_the_union.txt', 'doc_id': 'c46d11e4-80e6-4ac3-8729-11463b70cdca'})

In [78]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [79]:
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

390 Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.

----------------------------------------------------------------------------------------------------

332 One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.

----------------------------------------------------------------------------------------------------

324 A former top litigator in private practice. A former feder

In [80]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

2


In [81]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")

9875

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------



In [89]:
retriever.search_type = SearchType.mmr

In [90]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

3


In [91]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")

9875

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------

9947

----------------------------------------------------------------------------------------------------



## Summary

In [92]:
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatOpenAI(model_name="gpt-4o", max_retries=0)
    | StrOutputParser()
)

In [93]:
summaries = chain.batch(docs, {"max_concurrency": 5})

In [73]:
print(len(summaries))

12


In [94]:
for summary in summaries:
    print(len(summary), summary, end="\n\n" + "-" * 100 + "\n\n")

1542 The document is a personal narrative recounting the author's journey through early interests in writing and programming, leading to a career in computer science and a shift towards art. Before college, the author dabbled in writing short stories and programming on an IBM 1401, although these early efforts were limited by the technology and his own knowledge. The advent of microcomputers, like the TRS-80, allowed for more interactive programming and sparked a deeper engagement, leading him to create simple applications and games.

In college, the author initially planned to study philosophy but found it unfulfilling and switched to artificial intelligence (AI), inspired by literature and documentaries. Lacking formal AI courses, he taught himself Lisp and became deeply involved in AI research, culminating in an undergraduate thesis on reverse-engineering SHRDLU.

The author attended Harvard for graduate school, where he became disillusioned with the prevailing AI methodologies, fin

In [95]:
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="summaries",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in docs]

  warn_deprecated(


In [96]:
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries)
]

In [97]:
for doc in summary_docs:
    print(doc, end="\n\n" + "-" * 100 + "\n\n")

page_content="The document is a personal narrative recounting the author's journey through early interests in writing and programming, leading to a career in computer science and a shift towards art. Before college, the author dabbled in writing short stories and programming on an IBM 1401, although these early efforts were limited by the technology and his own knowledge. The advent of microcomputers, like the TRS-80, allowed for more interactive programming and sparked a deeper engagement, leading him to create simple applications and games.\n\nIn college, the author initially planned to study philosophy but found it unfulfilling and switched to artificial intelligence (AI), inspired by literature and documentaries. Lacking formal AI courses, he taught himself Lisp and became deeply involved in AI research, culminating in an undergraduate thesis on reverse-engineering SHRDLU.\n\nThe author attended Harvard for graduate school, where he became disillusioned with the prevailing AI metho

In [98]:
num_embeddings = retriever.vectorstore.add_documents(summary_docs)

In [99]:
len(num_embeddings)

12

In [100]:
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [101]:
# We can also add the original chunks to the vectorstore if we so want
for i, doc in enumerate(docs):
    doc.metadata[id_key] = doc_ids[i]
retriever.vectorstore.add_documents(docs)

['054a532f-9a20-42e6-933c-27f75928d899',
 'b64e4ba7-3a80-41fb-997c-040d75e8e7e6',
 'a96ad9b0-acf2-4f23-b980-1b0585c70ed8',
 'f69b5769-d814-4e0a-927f-817c7003922e',
 'b3607f41-4e9d-4d93-ab10-2a8ad0276d9d',
 'e3d57a04-737a-4f78-97b3-5c23bb011cfc',
 'e76e38f9-0334-4341-bc87-186bf8714dc7',
 'c163e30b-03a2-4300-b815-f388afedb3ca',
 '752f4e20-725e-4eea-a49c-639eafed7b89',
 '5071daa9-8e2c-460f-93d3-9e1b51bd12bb',
 '4e36658a-bdf0-4caf-aff7-5fd1ff699d79',
 'ea72f57f-8b88-4e40-8395-19228c34964f']

In [102]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [103]:
for doc in sub_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

9875

----------------------------------------------------------------------------------------------------

863

----------------------------------------------------------------------------------------------------

1894

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------



In [104]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

2


In [105]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

9875

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------



## Hypothetical Queries

In [106]:
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "items": {"type": "string"},
                },
            },
            "required": ["questions"],
        },
    }
]

In [107]:
chain = (
    {"doc": lambda x: x.page_content}
    # Only asking for 3 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        "Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
    )
    | ChatOpenAI(max_retries=0, model="gpt-4o").bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [108]:
len(docs)

12

In [109]:
chain.invoke(docs[0])

["What were the author's initial interests before college?",
 'Why did the author switch from studying philosophy to AI?',
 'What realization did the author have about AI during their first year of grad school?']

In [110]:
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})

In [111]:
len(hypothetical_questions)

12

In [112]:
for questions in hypothetical_questions:
    print(questions, end="\n\n" + "-" * 100 + "\n\n")

["What early experiences influenced the author's interest in programming?", "How did the author's perception of artificial intelligence change over time?", 'What motivated the author to pursue a career in art instead of computer science?']

----------------------------------------------------------------------------------------------------

['How can an individual transition from a technical field like computer science to a creative field like art?', 'What kind of challenges and sacrifices might one face when deciding to leave a PhD program to pursue a different passion?', "How can a person's early experiences and decisions shape their approach to learning and career changes?"]

----------------------------------------------------------------------------------------------------

['What are some reasons why a technology company might perform better when led by product people rather than sales people?', 'How can the choice of office space impact the productivity and morale of employees i

In [113]:
# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="hypo-questions",
    distance_strategy=DistanceStrategy.COSINE,
)
# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in docs]

  warn_deprecated(


In [114]:
question_docs = []
for i, question_list in enumerate(hypothetical_questions):
    question_docs.extend(
        [Document(page_content=s, metadata={id_key: doc_ids[i]}) for s in question_list]
    )

In [115]:
len(question_docs)

36

In [116]:
for doc in question_docs:
    print(doc, end="\n\n" + "-" * 100 + "\n\n")

page_content="What early experiences influenced the author's interest in programming?" metadata={'doc_id': 'd03cad85-fe6e-4571-9780-b9997d018382'}

----------------------------------------------------------------------------------------------------

page_content="How did the author's perception of artificial intelligence change over time?" metadata={'doc_id': 'd03cad85-fe6e-4571-9780-b9997d018382'}

----------------------------------------------------------------------------------------------------

page_content='What motivated the author to pursue a career in art instead of computer science?' metadata={'doc_id': 'd03cad85-fe6e-4571-9780-b9997d018382'}

----------------------------------------------------------------------------------------------------

page_content='How can an individual transition from a technical field like computer science to a creative field like art?' metadata={'doc_id': 'b946e2a7-7378-4fc6-b005-fdc4587a3301'}

----------------------------------------------------

In [117]:
num_embeddings = retriever.vectorstore.add_documents(question_docs)

In [118]:
len(num_embeddings)

36

In [119]:
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [120]:
for i in list(zip(doc_ids, docs)):
    print(type(i))
    print(i, end="\n\n" + "-" * 100 + "\n\n")

<class 'tuple'>
('d03cad85-fe6e-4571-9780-b9997d018382', Document(page_content='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was a

In [121]:
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [122]:
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

71 How does the document describe the role of Judge Ketanji Brown Jackson?

----------------------------------------------------------------------------------------------------

165 How does the administration plan to ensure fair competition in the marketplace and what specific actions are being taken against corporations that exploit consumers?

----------------------------------------------------------------------------------------------------

79 How does the Bipartisan Infrastructure Law aim to address environmental issues?

----------------------------------------------------------------------------------------------------

96 What are the expected economic impacts of the Bipartisan Infrastructure Law on the middle class?

----------------------------------------------------------------------------------------------------



In [123]:
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

3


In [124]:
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n" + "-" * 100 + "\n\n")
    # print(doc.page_content, end="\n\n" + "-"*100 + "\n\n")

9194

----------------------------------------------------------------------------------------------------

9875

----------------------------------------------------------------------------------------------------

9902

----------------------------------------------------------------------------------------------------



In [125]:
for doc in retrieved_docs:
    # print(len(doc.page_content), end="\n\n" + "-"*100 + "\n\n")
    print(doc.page_content, end="\n\n" + "-" * 100 + "\n\n")

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. 

And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. 

We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  

We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  

We’re putting i