In [None]:
# Install (or upgrade, -U) pypdf quietly (-q) into the kernel's environment.
# Presumably the PDF backend used by PyPDFLoader below; confirm against the loader docs.
%pip install -qU pypdf

In [None]:
# Install (or upgrade, -U) the langchain-ollama integration quietly (-q);
# the notebook imports OllamaEmbeddings from `langchain_ollama` below.
%pip install -qU langchain-ollama

In [14]:
from langchain_community.document_loaders import PyPDFLoader

from langchain_ollama import OllamaEmbeddings

from langchain_core.vectorstores import InMemoryVectorStore
from langchain.schema import Document

In [15]:


# Location of the sample paper, relative to the notebook's working directory.
pdf_path = "../../00-example_data/layout-parser-paper.pdf"
loader = PyPDFLoader(pdf_path)

In [None]:
# Load the PDF configured on `loader` into a collection of documents.
docs = loader.load()
# Show the first loaded document as the cell output.
docs[0]

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Deliberately tiny chunks (100 characters, 20 of overlap) so the effect of
# splitting is easy to see; lengths are measured with plain `len`.
splitter_options = dict(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents into chunk-sized documents.
texts = text_splitter.split_documents(docs)

In [None]:
# Preview only the first few chunks; with 100-character chunks the full list
# is far too long to be a useful cell output.
texts[:5]

In [None]:
# Number of chunks produced by the text splitter.
len(texts)

In [None]:
# Step 3: Configure the Ollama model for embeddings
EMBEDDING_MODEL_NAME = "nomic-embed-text:latest"
OLLAMA_BASE_URL = "http://localhost:11434"  # Replace with your Ollama base URL

embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME, base_url=OLLAMA_BASE_URL)

In [10]:
# Build plain Document objects from the split chunks.
# (No embedding or indexing happens in this cell.)
# Import Document from langchain-core rather than the legacy
# `langchain.docstore.document` path.
from langchain_core.documents import Document

# Only `page_content` is copied; each chunk's metadata is not carried over.
documents = [Document(page_content=text.page_content) for text in texts]


In [None]:
# Preview only the first few documents instead of dumping the whole list.
documents[:5]

In [25]:
# Embed every chunk and index it in an in-memory vector store.
# `metadatas` is passed alongside the raw strings so each stored entry keeps
# its chunk's metadata instead of being reduced to bare text.
text_contents = [text.page_content for text in texts]
text_metadatas = [text.metadata for text in texts]

vector_store = InMemoryVectorStore.from_texts(
    texts=text_contents,
    embedding=embedding_model,
    metadatas=text_metadatas,
)

In [None]:
# Requires `vector_store` to have been built already; its `store` mapping
# holds one entry per stored item.
stored_count = len(vector_store.store)
print(f"Number of documents in the vector store: {stored_count}")


In [None]:
# The keys of the `store` mapping are the document IDs; print one per line.
print("Document IDs in the vector store:")
for stored_id in vector_store.store:
    print(stored_id)

In [None]:
# Print all document contents
# Each entry in InMemoryVectorStore.store is a plain dict with the keys
# "id", "vector", "text" and "metadata"; there is no "doc" key, so the chunk
# text is read from "text".
print("Documents in the vector store:")
for doc_id, data in vector_store.store.items():
    print(f"Document ID: {doc_id}")
    print(f"Content: {data['text'][:200]}...")  # Print first 200 characters
    print("-" * 50)

In [None]:
# Print document metadata
# Each entry in InMemoryVectorStore.store is a plain dict; the metadata lives
# directly under the "metadata" key (there is no "doc" key).
print("Document metadata:")
for doc_id, data in vector_store.store.items():
    print(f"Document ID: {doc_id}")
    print(f"Metadata: {data['metadata']}")
    print("-" * 50)

In [None]:
# Generalized exploration of vector_store
print(f"Number of documents in vector store: {len(vector_store.store)}")

# Inspect the structure of the first item. `next(..., None)` avoids the
# IndexError that indexing an empty list would raise for an empty store.
first_item = next(iter(vector_store.store.values()), None)
if isinstance(first_item, dict):
    # Summarize keys and value types rather than dumping the raw embedding vector.
    first_item_structure = {key: type(value).__name__ for key, value in first_item.items()}
    print("First item structure:", first_item_structure)
else:
    print("First item structure:", first_item)

# Iterate and adapt based on structure
for doc_id, data in vector_store.store.items():
    print(f"Document ID: {doc_id}")
    if isinstance(data, Document):  # Entry is a Document object
        print(f"Content: {data.page_content[:200]}...")
        print(f"Metadata: {data.metadata}")
    elif isinstance(data, dict):  # Entry is a dictionary (InMemoryVectorStore's layout)
        # The chunk text is stored under "text"; a "content" key does not exist,
        # so looking it up would always print the fallback string.
        print(f"Content: {data.get('text', 'No content available')[:200]}...")
        print(f"Metadata: {data.get('metadata', 'No metadata available')}")
    print("-" * 50)


In [None]:
# Rebuild `vector_store` as an in-memory store, this time directly from the
# split chunk documents rather than from raw strings.
vector_store = InMemoryVectorStore.from_documents(
    documents=texts,
    embedding=embedding_model,
)

In [None]:




# Step 4: Create a FAISS vector store from the chunk documents.
# FAISS was never imported in this notebook, so import it here.
# NOTE: the FAISS integration needs the `faiss-cpu` (or `faiss-gpu`) package
# installed in the kernel's environment.
from langchain_community.vectorstores import FAISS

# This rebinds `vector_store` to a FAISS store; the `.store`-based inspection
# cells only apply to the in-memory store.
vector_store = FAISS.from_documents(documents, embedding_model)

print("Generated and stored embeddings for all chunks.")

# Step 5: Explore the generated embeddings
# `index_to_docstore_id` maps each FAISS index position to a docstore ID, so
# its length is the number of stored chunks.
print(f"Number of documents in vector store: {len(vector_store.index_to_docstore_id)}")

# Step 6: Perform a similarity search
query = "What is layout parsing?"
similar_docs = vector_store.similarity_search(query, k=3)

print("\nMost similar chunks to the query:")
for i, doc in enumerate(similar_docs, start=1):
    print(f"Chunk {i}:")
    print(doc.page_content[:200], "...")  # First 200 characters of the chunk
    print("-" * 50)
