In [28]:
%pip install -qU pypdf

Note: you may need to restart the kernel to use updated packages.


In [29]:
%pip install -qU langchain-ollama

Note: you may need to restart the kernel to use updated packages.


In [30]:
from langchain_community.document_loaders import PyPDFLoader

from langchain_ollama import OllamaEmbeddings

from langchain_core.vectorstores import InMemoryVectorStore
from langchain.schema import Document

In [31]:


loader = PyPDFLoader(
    "../../00-example_data/layout-parser-paper.pdf",
)

In [32]:
docs = loader.load()
docs[0]

Document(metadata={'source': '../../00-example_data/layout-parser-paper.pdf', 'page': 0}, page_content='LayoutParser: A Uniﬁed Toolkit for Deep\nLearning Based Document Image Analysis\nZejiang Shen1 (\x00 ), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\nLee4, Jacob Carlson3, and Weining Li5\n1 Allen Institute for AI\nshannons@allenai.org\n2 Brown University\nruochen zhang@brown.edu\n3 Harvard University\n{melissadell,jacob carlson}@fas.harvard.edu\n4 University of Washington\nbcgl@cs.washington.edu\n5 University of Waterloo\nw422li@uwaterloo.ca\nAbstract. Recent advances in document image analysis (DIA) have been\nprimarily driven by the application of neural networks. Ideally, research\noutcomes could be easily deployed in production and extended for further\ninvestigation. However, various factors like loosely organized codebases\nand sophisticated model conﬁgurations complicate the easy reuse of im-\nportant innovations by a wide audience. Though there have been on-going\

In [33]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.split_documents(docs)

In [34]:
texts

[Document(metadata={'source': '../../00-example_data/layout-parser-paper.pdf', 'page': 0}, page_content='LayoutParser: A Uniﬁed Toolkit for Deep\nLearning Based Document Image Analysis'),
 Document(metadata={'source': '../../00-example_data/layout-parser-paper.pdf', 'page': 0}, page_content='Zejiang Shen1 (\x00 ), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain'),
 Document(metadata={'source': '../../00-example_data/layout-parser-paper.pdf', 'page': 0}, page_content='Lee4, Jacob Carlson3, and Weining Li5\n1 Allen Institute for AI\nshannons@allenai.org'),
 Document(metadata={'source': '../../00-example_data/layout-parser-paper.pdf', 'page': 0}, page_content='2 Brown University\nruochen zhang@brown.edu\n3 Harvard University'),
 Document(metadata={'source': '../../00-example_data/layout-parser-paper.pdf', 'page': 0}, page_content='{melissadell,jacob carlson}@fas.harvard.edu\n4 University of Washington\nbcgl@cs.washington.edu'),
 Document(metadata={'source': '../../00-example_data/

In [35]:
len(texts)

575

In [36]:
# Step 3: Configure the Ollama model for embeddings
embedding_model = OllamaEmbeddings(
    model="nomic-embed-text:latest",
    base_url="http://localhost:11434",  # Replace with your Ollama base URL
)

In [37]:
# Step 4: Generate embeddings and store them using FAISS
from langchain.docstore.document import Document

# Convert split texts to LangChain Document objects
documents = [Document(page_content=text.page_content) for text in texts]


In [38]:
documents

[Document(metadata={}, page_content='LayoutParser: A Uniﬁed Toolkit for Deep\nLearning Based Document Image Analysis'),
 Document(metadata={}, page_content='Zejiang Shen1 (\x00 ), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain'),
 Document(metadata={}, page_content='Lee4, Jacob Carlson3, and Weining Li5\n1 Allen Institute for AI\nshannons@allenai.org'),
 Document(metadata={}, page_content='2 Brown University\nruochen zhang@brown.edu\n3 Harvard University'),
 Document(metadata={}, page_content='{melissadell,jacob carlson}@fas.harvard.edu\n4 University of Washington\nbcgl@cs.washington.edu'),
 Document(metadata={}, page_content='5 University of Waterloo\nw422li@uwaterloo.ca'),
 Document(metadata={}, page_content='w422li@uwaterloo.ca\nAbstract. Recent advances in document image analysis (DIA) have been'),
 Document(metadata={}, page_content='primarily driven by the application of neural networks. Ideally, research'),
 Document(metadata={}, page_content='outcomes could be easily d

In [39]:
text_contents = [text.page_content for text in texts]

vector_store = InMemoryVectorStore.from_texts(
    texts=text_contents,
    embedding=embedding_model,
)

In [40]:
# Assuming vector_store is already created and populated
print(f"Number of documents in the vector store: {len(vector_store.store)}")


Number of documents in the vector store: 575


In [41]:
# List all document IDs
print("Document IDs in the vector store:")
for doc_id in vector_store.store.keys():
    print(doc_id)

Document IDs in the vector store:
69dda608-ed00-47c8-b5ce-4df41485b7b3
a49449ee-56a3-43c5-a9d0-5434c4857e7e
02e5c806-65b5-4647-9cd2-a569f8fceaa3
942e45c2-1637-498a-be93-d95ab0a834be
cf77af3f-e92a-46c7-a5ff-ffcb84781ffa
4dbf4762-6d0e-4eed-a0bd-54f732077285
253212c2-8500-4494-981d-edcd55636af9
7e0ea278-deab-4da6-9677-81b56ca1afe0
a3739785-7d8e-49a6-8ade-35a29706cc11
0c479acd-0a98-4a45-8dfb-61b8c625bbee
786854fe-d737-4ae5-8557-ca627d23bdf1
ce95831d-ca09-475c-8981-6150bbb184fa
419d66d5-6082-4451-8ec0-d94656b3e929
002a2b57-79dd-41f2-9fa2-28f16bbd0f37
dfb5fd12-912b-4b21-a3e8-023f457343d7
13b89379-a8db-4ed7-bcdc-b04cadce126d
8b6e27d2-ec7a-4334-8b43-ef598f168247
e64e6c25-650a-407a-b3e5-2b3cbc3e4436
9681145a-157c-4a40-b950-761a1e079de2
2d32b162-666d-42f9-bf7d-9611a1719ca6
cddf9e3e-1cb8-4abd-8a4f-95bf07bdd0a4
5874c52c-3c41-45a5-92f6-36d7b2a29937
adb50e2a-54a1-41d7-8ece-3fb29c39b2f3
ed720504-aad4-4ec5-94c3-cc236c689898
85f81864-53b9-410c-8041-ba141159469b
539cfb3d-85d0-4856-8182-b7fe76b774e9
e18d

In [42]:
# Print all document contents
print("Documents in the vector store:")
for doc_id, data in vector_store.store.items():
    print(f"Document ID: {doc_id}")
    #print(f"Content: {data['doc'].page_content[:200]}...")  # Print first 200 characters
    # Safely access 'Content' within 'data'
    print("Extracting all keys and values from Content:")
    for key, value in data.items():
        if isinstance(value, list) and len(value) > 10:  # Handle long lists (e.g., vector)
            print(f"{key}: {value[:10]}... (showing first 10 items)")
        else:
            print(f"{key}: {value}")
    #print(data)
    print("-" * 50)

Documents in the vector store:
Document ID: 69dda608-ed00-47c8-b5ce-4df41485b7b3
Extracting all keys and values from Content:
id: 69dda608-ed00-47c8-b5ce-4df41485b7b3
vector: [-0.0048948154, 0.034870207, -0.13520098, -0.11868732, -0.013242056, -0.059019495, 0.045409497, 0.024133539, -0.05943936, -0.04321857]... (showing first 10 items)
text: LayoutParser: A Uniﬁed Toolkit for Deep
Learning Based Document Image Analysis
metadata: {}
--------------------------------------------------
Document ID: a49449ee-56a3-43c5-a9d0-5434c4857e7e
Extracting all keys and values from Content:
id: a49449ee-56a3-43c5-a9d0-5434c4857e7e
vector: [-0.04287939, 0.0014060503, -0.15967117, 0.0065604122, 0.018431421, 0.041369267, 0.0139672505, -0.05127808, -0.01953128, 0.024859302]... (showing first 10 items)
text: Zejiang Shen1 (  ), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain
metadata: {}
--------------------------------------------------
Document ID: 02e5c806-65b5-4647-9cd2-a569f8fceaa3
Extracting 