In [3]:
import os
import uuid

from dotenv import load_dotenv
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.retrievers import ParentDocumentRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever, SearchType
from langchain.storage import InMemoryByteStore, InMemoryStore
from langchain.vectorstores.pgvector import DistanceStrategy, PGVector
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [4]:
# Load variables from a local .env file into the process environment.
load_dotenv()

True

In [5]:
# Postgres connection string for the pgvector-backed stores, read from the
# environment. os.getenv returns None when the variable is unset, which would
# only surface later as an opaque connection error, so fail here instead.
database_uri: str = os.getenv("DATABASE_URL", "")
if not database_uri:
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in the .env file"
    )

In [6]:
# Load the two source essays; each loader contributes its documents in order.
loaders = [
    TextLoader(path)
    for path in ("paul_graham_essay.txt", "state_of_the_union.txt")
]
docs = []
for loader in loaders:
    docs += loader.load()
print(len(docs))

2


In [7]:
# Dump every loaded document, leaving one blank line after each.
for doc in docs:
    print(doc)
    print()

page_content='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an early version of Fortran. You had to type programs on punch card

In [8]:
# Embedding model passed to every vector store created in this notebook.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Retrieving full documents

In [9]:
# Splitter that produces the small child chunks that get embedded and searched.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# Postgres-backed vector index for the child chunks.
# pre_delete_collection=True clears rows left in this collection by an earlier
# run. The parent docstore below lives only in memory, so vectors persisted by
# a previous run point at parent keys that no longer exist and would take up
# result slots without ever resolving to a document.
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="full_documents",
    distance_strategy=DistanceStrategy.COSINE,
    pre_delete_collection=True,
)
# In-memory storage layer for the parent documents.
store = InMemoryStore()
# No parent_splitter is given, so each loaded document is stored whole.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

  warn_deprecated(
  warn_deprecated(


In [10]:
# Index the documents; ids=None leaves key generation to the retriever.
retriever.add_documents(docs, ids=None)

In [11]:
# List the keys currently held in the parent docstore.
list(store.yield_keys())

['0f1486e2-b240-4063-80fe-b85135e4ee67',
 '1e911c67-ea5e-4f4b-bcb1-96f42538b857']

In [12]:
# Query the vector store directly (bypassing the retriever) and count the hits.
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [13]:
# Show the text of the first hit.
print(sub_docs[0].page_content)

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.


In [16]:
# For every hit print "<length> <text>", then a 100-dash rule framed by blank lines.
for doc in sub_docs:
    print(f"{len(doc.page_content)} {doc.page_content}", "", "-" * 100, "", sep="\n")

390 Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.

----------------------------------------------------------------------------------------------------

332 One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.

----------------------------------------------------------------------------------------------------

324 A former top litigator in private practice. A former feder

In [17]:
# Run the same query through the retriever and count the documents it returns.
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

1


In [18]:
# Character length of the first retrieved document.
len(retrieved_docs[0].page_content)

38540

# Retrieving larger chunks

In [114]:
# Splitter that cuts each source document into the parent chunks.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# Splitter that cuts the child chunks that get embedded; its chunk size
# should stay smaller than the parent's.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# Postgres-backed vector index for the child chunks.
# pre_delete_collection=True clears rows left in this collection by an earlier
# run. The parent docstore below lives only in memory, so vectors persisted by
# a previous run point at parent keys that no longer exist and would take up
# result slots without ever resolving to a document.
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="split_parents",
    distance_strategy=DistanceStrategy.COSINE,
    pre_delete_collection=True,
)
# In-memory storage layer for the parent chunks.
store = InMemoryStore()

  warn_deprecated(


In [115]:
# Two-level retriever: parent chunks go to the docstore, their child chunks
# go to the vector store.
retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [116]:
# Index the documents; ids=None leaves key generation to the retriever.
retriever.add_documents(docs, ids=None)

In [117]:
# Number of entries now held in the parent docstore.
len(list(store.yield_keys()))

66

In [118]:
# Query the vector store directly (bypassing the retriever) and count the hits.
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [119]:
# Show the text of the first hit.
print(sub_docs[0].page_content)

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.


In [120]:
# Print each hit's character count, then a 100-dash rule framed by blank lines.
for doc in sub_docs:
    print(len(doc.page_content), "", "-" * 100, "", sep="\n")

390

----------------------------------------------------------------------------------------------------

332

----------------------------------------------------------------------------------------------------

324

----------------------------------------------------------------------------------------------------

309

----------------------------------------------------------------------------------------------------



In [121]:
# Print each hit's text, then a 100-dash rule framed by blank lines.
for doc in sub_docs:
    print(doc.page_content, "", "-" * 100, "", sep="\n")

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.

----------------------------------------------------------------------------------------------------

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.

----------------------------------------------------------------------------------------------------

A former top litigator in private practice. A former federal public de

In [123]:
# Run the same query through the retriever and count the documents it returns.
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

2


In [125]:
# Print each retrieved document's character count, then a 100-dash rule
# framed by blank lines.
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n")
    print("-" * 100, end="\n\n")

1849

----------------------------------------------------------------------------------------------------

1897

----------------------------------------------------------------------------------------------------



In [126]:
# Print each retrieved document's text, then a 100-dash rule framed by blank lines.
for doc in retrieved_docs:
    print(doc.page_content, end="\n\n")
    print("-" * 100, end="\n\n")

In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. 

We cannot let this happen. 

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal publi

# MultiVector Retriever

In [127]:
# Reload the source essays from disk, then cut them into chunks of at most
# 10000 characters; these chunks are the "parent" documents for this section.
loaders = [
    TextLoader(path)
    for path in ("paul_graham_essay.txt", "state_of_the_union.txt")
]
docs = []
for loader in loaders:
    docs += loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)
print(len(docs))

12


In [128]:
# Dump every chunk, leaving one blank line after each.
for doc in docs:
    print(doc)
    print()

page_content='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an early version of Fortran. You had to type programs on punch card

## Smaller chunks

In [46]:
# Postgres-backed vector index for the child chunks.
# pre_delete_collection=True clears rows left in this collection by an earlier
# run. doc_ids below are regenerated and the parent store lives only in
# memory, so vectors persisted by a previous run carry ids that no longer
# resolve to a parent and would take up result slots for nothing.
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="full_documents2",
    distance_strategy=DistanceStrategy.COSINE,
    pre_delete_collection=True,
)
# In-memory storage layer for the parent documents.
store = InMemoryByteStore()
# Metadata key under which each indexed chunk records its parent's id.
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)

# One fresh random id per parent document, in the same order as `docs`.
doc_ids = [str(uuid.uuid4()) for _ in docs]

  warn_deprecated(


In [48]:
# Splitter used to cut each parent document into smaller chunks for indexing.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [49]:
# Split every parent into child chunks and stamp each child with its parent's
# id, so a vector hit can be traced back to the parent it came from.
sub_docs = []
for position, parent in enumerate(docs):
    parent_id = doc_ids[position]
    children = child_text_splitter.split_documents([parent])
    for child in children:
        child.metadata[id_key] = parent_id
    sub_docs += children

In [50]:
# Embed and index the child chunks; the cell output is the list of ids returned.
retriever.vectorstore.add_documents(sub_docs)

['56cb6f36-14cd-492b-b0ab-4f1a02b5b73a',
 '5edbd97e-8fe2-48fc-8efe-d3cb79557d40',
 'ec51f25b-7da6-49a5-b0f9-0f3334069c40',
 '3802fa8c-e486-426c-8636-b3eb009f5dcd',
 'deeaa05a-0863-4616-80ca-2d5c460f96ca',
 '1861e038-4fc0-4932-b0e8-011c7a94e692',
 'e3ec7540-f888-4d7f-8977-6b59d6516c44',
 'ad1b909f-b784-4ecd-aca7-881a22b01c4d',
 '1499134f-1554-41b1-9eb6-34fa2a36d47a',
 '77f6f1b6-a183-41c0-a6ee-7d5d5f0253c0',
 '5ae3fb23-b254-4028-a121-c27dc5e7a92a',
 'c8fdc195-97e3-4b1f-8e7a-50a53c71a066',
 '064b9b31-58d4-4136-8fcf-f14a64a0907d',
 '9ce091c7-5497-4cb2-9fc9-ad80b95d5b36',
 '97edb638-0c18-4af6-be42-d991eed51dbb',
 '22816b66-ec66-4d5d-b674-461f6eb2aa52',
 '3906f7de-5895-4368-b784-dcde5d53e100',
 '1d0057b6-a3c4-4187-88d4-a214f67359b2',
 'ac42ff0a-48b9-4b9f-bccc-02421f3c5ea9',
 '9377866e-7124-4c9b-bc1c-c8ca00f07624',
 '43e43dea-692a-431d-ba50-c4bf934f6035',
 'c22b01c9-0618-468d-abb0-fab3588776ca',
 'cb81963a-22cc-40c3-96ff-d213f6f5390e',
 '0413d292-128a-4484-81f0-6bcec1b53ec0',
 '725733bb-7571-

In [51]:
# Store each parent document in the docstore under its generated id.
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [52]:
# Querying the vector store on its own returns the small indexed chunks;
# show the first hit.
retriever.vectorstore.similarity_search("justice breyer")[0]

Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'source': 'state_of_the_union.txt', 'doc_id': '6a85cc91-e32a-4d26-a362-81ba97c30c27'})

In [53]:
# The retriever returns the larger parent documents instead; show the
# character length of the first one.
len(retriever.invoke("justice breyer")[0].page_content)

9875

In [66]:
# Switch the existing retriever to the MMR search type and repeat the query.
# Note: this changes the retriever object in place.
retriever.search_type = SearchType.mmr
len(retriever.invoke("justice breyer")[0].page_content)

9875

## Summary

In [69]:
# Summarisation pipeline: Document -> prompt -> chat model -> plain string.
chain = (
    # Pull the raw text out of the incoming Document for the prompt's {doc} slot.
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    # `model=` is the same keyword the hypothetical-questions chain uses.
    | ChatOpenAI(model="gpt-4-turbo", max_retries=0)
    | StrOutputParser()
)

In [70]:
# Summarise every chunk, with at most 5 calls in flight at once.
summaries = chain.batch(docs, {"max_concurrency": 5})

In [73]:
# Number of summaries produced.
print(len(summaries))

12


In [71]:
# Print every generated summary, one after another.
for summary in summaries:
    print(summary)

The document is a reflective essay by the author about their experiences and evolution in programming, writing, and their academic and professional pursuits.

The author begins by discussing their early interests in writing short stories and programming during high school, using an IBM 1401. They describe their initial struggles with programming, mentioning how limited interactions with the computer were, which led to a lack of memorable programs from that time. The arrival of microcomputers significantly changed the computing landscape, allowing for more interactive and immediate programming experiences.

In college, the author initially intended to study philosophy but found it unsatisfactory and switched to artificial intelligence (AI), inspired by science fiction and advances in the field. They taught themselves Lisp, a programming language associated with AI, and worked on an undergraduate thesis that involved reverse-engineering an AI program called SHRDLU.

The author's exciteme

In [72]:
# Postgres-backed vector index for the summaries.
# pre_delete_collection=True clears rows left in this collection by an earlier
# run. doc_ids below are regenerated and the parent store lives only in
# memory, so vectors persisted by a previous run carry ids that no longer
# resolve to a parent and would take up result slots for nothing.
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="summaries",
    distance_strategy=DistanceStrategy.COSINE,
    pre_delete_collection=True,
)
# In-memory storage layer for the parent documents.
store = InMemoryByteStore()
# Metadata key under which each indexed entry records its parent's id.
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
# One fresh random id per parent document, in the same order as `docs`.
doc_ids = [str(uuid.uuid4()) for _ in docs]

  warn_deprecated(


In [74]:
# Wrap each summary in a Document tagged with the id of the chunk it
# summarises (summaries and doc_ids are matched by position).
summary_docs = []
for position, summary_text in enumerate(summaries):
    parent_link = {id_key: doc_ids[position]}
    summary_docs.append(Document(page_content=summary_text, metadata=parent_link))

In [75]:
# Dump every summary Document, leaving one blank line after each.
for doc in summary_docs:
    print(doc)
    print()

page_content="The document is a reflective essay by the author about their experiences and evolution in programming, writing, and their academic and professional pursuits.\n\nThe author begins by discussing their early interests in writing short stories and programming during high school, using an IBM 1401. They describe their initial struggles with programming, mentioning how limited interactions with the computer were, which led to a lack of memorable programs from that time. The arrival of microcomputers significantly changed the computing landscape, allowing for more interactive and immediate programming experiences.\n\nIn college, the author initially intended to study philosophy but found it unsatisfactory and switched to artificial intelligence (AI), inspired by science fiction and advances in the field. They taught themselves Lisp, a programming language associated with AI, and worked on an undergraduate thesis that involved reverse-engineering an AI program called SHRDLU.\n\nT

In [76]:
# Embed and index the summaries; the cell output is the list of ids returned.
retriever.vectorstore.add_documents(summary_docs)

['5c8d5d0a-ddf5-431b-bff0-97b5b48d7999',
 '458eee57-31a3-4da6-b001-94130d4d7502',
 '1ab85b80-1fdb-4a18-b6a3-efda4eaa4d6a',
 'f92c97fe-642b-434f-b07a-ca86affc087d',
 'bb1a1c0a-e8ae-4030-ade8-090c70483d4e',
 '65cf8d86-2a50-4a00-b113-ceeb22636e41',
 'd0be3239-3e67-43e4-9f1c-ab9bb9ea3d4b',
 'd78cc5ba-2479-4839-8f2b-6b334d927aab',
 '52915008-f46f-46bc-b98f-1f80bb4d27ab',
 '0d01433b-7a38-4ee1-8c4d-be19e4456f84',
 '9d975c62-f155-462a-885e-c01ad4e0dfce',
 'a9d96309-3f28-46fe-bc8b-d4ccae0da639']

In [77]:
# Store each original chunk in the docstore under its generated id.
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [78]:
# We can also add the original chunks to the vectorstore if we so want.
# Index tagged copies instead of writing the id into `docs` itself: `docs` is
# reused by later cells, and mutating it here would make their input depend on
# whether this cell has already been run.
docs_with_ids = [
    Document(page_content=doc.page_content, metadata={**doc.metadata, id_key: doc_id})
    for doc_id, doc in zip(doc_ids, docs)
]
retriever.vectorstore.add_documents(docs_with_ids)

['dec27cde-057a-4dbc-8a1d-3a6b327c0940',
 'c4b0e63a-00e8-4f3f-8e6c-d0abcd1577b3',
 '78153c1a-1cb1-4f37-b3ee-aa03a923cc87',
 '5d0932ed-b690-4f20-9147-02a21586b9a2',
 '9161de84-477c-44c6-aa19-304c2f88a745',
 '3f06cd33-234e-417f-b059-2cd593a04164',
 '47d84c5b-0d3a-46b6-a26a-22c5c5a20d17',
 '27573f2a-17a9-46a9-bb5e-f162b1a9ba24',
 '7cb61ee6-36a4-4c93-8318-cd1d54b267d1',
 '5e650e6d-13cc-4cdc-a5a3-625e4721bbf4',
 '85c267dd-f930-4509-9c47-47e8b9f4d9c4',
 '96e8315d-01e7-4e7c-bb3e-84cb414ac8ad']

In [85]:
# Query the vector store directly (bypassing the retriever) and count the hits.
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [88]:
# Print each hit's character count, then a 100-dash rule framed by blank lines.
for doc in sub_docs:
    print(len(doc.page_content), "", "-" * 100, "", sep="\n")

9875

----------------------------------------------------------------------------------------------------

1358

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------

1563

----------------------------------------------------------------------------------------------------



In [92]:
# Run the same query through the retriever and count the documents it returns.
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

3


In [93]:
# Print each retrieved document's character count, then a 100-dash rule
# framed by blank lines.
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n")
    print("-" * 100, end="\n\n")

9875

----------------------------------------------------------------------------------------------------

9194

----------------------------------------------------------------------------------------------------

9902

----------------------------------------------------------------------------------------------------



## Hypothetical Queries

In [130]:
# Function-calling schema: a single function whose arguments are a JSON object
# with one required field, "questions", holding an array of strings.
functions = [
    dict(
        name="hypothetical_questions",
        description="Generate hypothetical questions",
        parameters=dict(
            type="object",
            properties=dict(
                questions=dict(type="array", items=dict(type="string")),
            ),
            required=["questions"],
        ),
    )
]

In [132]:
# Question-generation pipeline: Document -> prompt -> chat model bound to the
# "hypothetical_questions" function -> value of the "questions" key.
chain = (
    {"doc": lambda document: document.page_content}
    | ChatPromptTemplate.from_template(
        # The number of questions (3) is fixed by the prompt wording.
        "Generate a list of exactly 3 hypothetical questions "
        "that the below document could be used to answer:\n\n{doc}"
    )
    | ChatOpenAI(model="gpt-4-turbo", max_retries=0).bind(
        function_call={"name": "hypothetical_questions"},
        functions=functions,
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [133]:
# Number of chunks that will be sent through the chain.
len(docs)

12

In [134]:
# Try the chain on the first chunk only.
chain.invoke(docs[0])

['How does early exposure to programming influence career choices in technology fields?',
 'What role does self-directed learning play in the development of expertise in emerging technologies?',
 'How can the evolution of computing technology from the 1980s to now impact the way we approach problem-solving in programming?']

In [135]:
# Generate questions for every chunk, with at most 5 calls in flight at once.
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})

In [139]:
# Number of question lists produced.
len(hypothetical_questions)

12

In [141]:
# Show each chunk's list of generated questions, leaving one blank line after each.
for questions in hypothetical_questions:
    print(questions)
    print()

['What might have happened if the author had continued to focus on philosophy instead of switching to AI during college?', "How might the author's career have differed if he had access to more advanced technology, like modern computers, during his initial programming experiences in high school?", "What could have been the impact on the author's perception of AI if his early experiences with programming had been more successful and engaging?"]

['Can pursuing art alongside a technical career provide a balanced and fulfilling life?', 'How effective are unconventional educational paths in nurturing personal growth and professional success?', 'Does the integration of arts into a technical education enhance creative problem-solving skills?']

['What can be learned from the leadership and management of technology companies based on experiences at Interleaf?', "How can art school experiences impact an artist's career trajectory and development of a signature style?", 'What are the implication

In [136]:
# Postgres-backed vector index for the hypothetical questions.
# pre_delete_collection=True clears rows left in this collection by an earlier
# run. doc_ids below are regenerated and the parent store lives only in
# memory, so vectors persisted by a previous run carry ids that no longer
# resolve to a parent and would take up result slots for nothing.
vectorstore = PGVector(
    connection_string=database_uri,
    embedding_function=embeddings,
    collection_name="hypo-questions",
    distance_strategy=DistanceStrategy.COSINE,
    pre_delete_collection=True,
)
# In-memory storage layer for the parent documents.
store = InMemoryByteStore()
# Metadata key under which each indexed entry records its parent's id.
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
# One fresh random id per parent document, in the same order as `docs`.
doc_ids = [str(uuid.uuid4()) for _ in docs]

  warn_deprecated(


In [142]:
# Flatten the per-chunk question lists into one list of Documents, each tagged
# with the id of the chunk its question was generated from.
question_docs = [
    Document(page_content=question, metadata={id_key: doc_ids[position]})
    for position, chunk_questions in enumerate(hypothetical_questions)
    for question in chunk_questions
]

In [143]:
# Total number of question Documents across all chunks.
len(question_docs)

36

In [144]:
# Dump every question Document, leaving one blank line after each.
for doc in question_docs:
    print(doc)
    print()

page_content='What might have happened if the author had continued to focus on philosophy instead of switching to AI during college?' metadata={'doc_id': 'f9bd568a-506b-4ff5-a041-127a348c1c71'}

page_content="How might the author's career have differed if he had access to more advanced technology, like modern computers, during his initial programming experiences in high school?" metadata={'doc_id': 'f9bd568a-506b-4ff5-a041-127a348c1c71'}

page_content="What could have been the impact on the author's perception of AI if his early experiences with programming had been more successful and engaging?" metadata={'doc_id': 'f9bd568a-506b-4ff5-a041-127a348c1c71'}

page_content='Can pursuing art alongside a technical career provide a balanced and fulfilling life?' metadata={'doc_id': '6f1c731a-1c46-4817-9913-d0fa4de145cf'}

page_content='How effective are unconventional educational paths in nurturing personal growth and professional success?' metadata={'doc_id': '6f1c731a-1c46-4817-9913-d0fa4de

In [145]:
# Embed and index the questions; the cell output is the list of ids returned.
retriever.vectorstore.add_documents(question_docs)

['047d9fb8-02e1-4a4e-b240-afb501dbdd9e',
 '841fb744-7515-40e9-961e-ae65829586f8',
 '3e1e5795-255e-4586-a50b-141fc42a25b8',
 '7577870c-13c0-4588-a4b5-574c0a955ad1',
 'dcf5c65c-cba5-4269-a64d-c595d1c9b23c',
 '65455aa1-0a77-4547-96da-91043939692b',
 'fea85472-007a-4a27-a7c2-dffb88352657',
 'fe2fa1a5-7711-46fe-ae03-3270df74fb4c',
 'db4ab6d5-0e5c-42e9-85c4-5a978532abea',
 '691573b8-06dc-43c1-83eb-46b68563aacc',
 'fbdb27df-bfa1-45b1-9618-0d76c8c1e233',
 '767dde1d-49f5-4272-aa1f-976090b2433a',
 'e33c9d22-6f42-4b85-97ed-9be63656a97d',
 '38c00540-965d-4eb5-b885-f8a87545708e',
 'a952f52a-0dd5-46ba-9789-f9149b05ff92',
 'a3b2b9d6-4cd4-4149-9bf5-de8be265e6d4',
 'b4858254-4fbb-42e6-be4c-27e489aac788',
 '8b8684ca-65b6-49cc-bf7d-b055a85eec63',
 'fdee6754-a594-4b02-af7d-443f12262e74',
 'f949b1dc-4f4b-4bde-a81e-c43e8125ad54',
 '8be614d9-01cb-42b3-b213-f6c0ef1e155f',
 'aaccd3fb-0e36-4c29-9b72-a6f4ba8b8bc1',
 'eeab37e6-0f42-4cb9-b2af-dbbded2e07f3',
 '0f8e5527-14ae-4920-82f3-8a675d58eaa7',
 '6727b474-c26f-

In [146]:
# Store each original chunk in the docstore under its generated id.
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [151]:
# Inspect the (id, document) pairs handed to the docstore: print each pair's
# type, then the pair itself followed by a blank line.
for i in zip(doc_ids, docs):
    print(type(i))
    print(i)
    print()

<class 'tuple'>
('f9bd568a-506b-4ff5-a041-127a348c1c71', Document(page_content='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was a

In [153]:
# Query the vector store directly (bypassing the retriever) and count the hits.
sub_docs = vectorstore.similarity_search("justice breyer")
print(len(sub_docs))

4


In [161]:
# For every hit print "<length> <text>", then a 100-dash rule framed by blank lines.
for doc in sub_docs:
    print(f"{len(doc.page_content)} {doc.page_content}", "", "-" * 100, "", sep="\n")

94 What are the qualifications and background of President Biden's nominee for the Supreme Court?

----------------------------------------------------------------------------------------------------

152 How might the appointment of a chief prosecutor for pandemic fraud affect the enforcement of laws against those who committed fraud during the pandemic?

----------------------------------------------------------------------------------------------------

90 How is the current administration addressing the issue of immigration and border security?

----------------------------------------------------------------------------------------------------

89 What life event prompted the protagonist to consider changing leadership at Y Combinator?

----------------------------------------------------------------------------------------------------



In [162]:
# Run the same query through the retriever and count the documents it returns.
retrieved_docs = retriever.invoke("justice breyer")
print(len(retrieved_docs))

3


In [164]:
# Print each retrieved document's character count, then a 100-dash rule
# framed by blank lines.
for doc in retrieved_docs:
    print(len(doc.page_content), end="\n\n")
    print("-" * 100, end="\n\n")

9194

----------------------------------------------------------------------------------------------------

9875

----------------------------------------------------------------------------------------------------

9844

----------------------------------------------------------------------------------------------------



In [165]:
# Print each retrieved document's text, then a 100-dash rule framed by blank lines.
for doc in retrieved_docs:
    print(doc.page_content, end="\n\n")
    print("-" * 100, end="\n\n")

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 

A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. 

And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. 

We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  

We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  

We’re putting i