In [None]:
%pip install -qU pypdf

In [None]:
# Provides the langchain_ollama package that OllamaEmbeddings is imported from.
%pip install -qU langchain-ollama

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore


In [2]:
# Loader for the example paper; the path is relative to the directory the
# notebook is run from.
loader = PyPDFLoader(file_path="../../00-example_data/layout-parser-paper.pdf")

In [None]:
# Run the loader and keep its result for the following cells
# (presumably one entry per PDF page -- TODO confirm).
docs = loader.load()
# Display the first loaded entry as a quick sanity check.
docs[0]

In [None]:
# Number of entries returned by the loader.
len(docs)

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Break the loaded documents into smaller, slightly overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    # Upper bound on the size of a chunk, as measured by length_function.
    chunk_size=1000,
    # Amount of overlap kept between neighbouring chunks.
    chunk_overlap=20,
    # len() on a string counts characters, so the sizes above are in characters.
    length_function=len,
    # Separators are treated as literal strings, not regular expressions.
    is_separator_regex=False,
)
# Apply the splitter to the loaded documents and keep the resulting chunks.
texts = text_splitter.split_documents(docs)

In [None]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
texts[:3]

In [None]:
# Number of chunks produced by the splitter.
len(texts)

In [8]:
# Step 3: Configure the Ollama model for embeddings.
# The client is pointed at the Ollama server given by base_url; the named
# model is presumably expected to be available on that server already
# (TODO confirm).
embedding_model = OllamaEmbeddings(
    model="nomic-embed-text:latest",
    base_url="http://localhost:11434",  # Replace with your Ollama base URL
)

In [9]:
# Step 4: Re-wrap each split chunk as a plain LangChain Document.
# No embeddings are computed in this cell. Document is imported from
# langchain_core (already used by this notebook for the vector store) rather
# than the legacy `langchain.docstore.document` path, which is not available
# in every langchain release.
from langchain_core.documents import Document

# Only the chunk text is copied; the chunk metadata is not carried over.
documents = [Document(page_content=chunk.page_content) for chunk in texts]

In [None]:
# Preview only the first few re-wrapped documents instead of dumping the
# entire list into the notebook output.
documents[:3]

In [11]:
# Plain chunk text, in the same order as `texts`.
text_contents = [text.page_content for text in texts]

# Embed every chunk with the configured embedding model and index it in
# memory. The per-chunk metadata is passed alongside the text so each stored
# entry keeps whatever the loader/splitter attached to it, instead of ending
# up with empty metadata.
vector_store = InMemoryVectorStore.from_texts(
    texts=text_contents,
    embedding=embedding_model,
    metadatas=[text.metadata for text in texts],
)

In [None]:
# Report how many entries the in-memory store holds. This relies on the cell
# that builds vector_store having been run first.
print(f"Number of documents in the vector store: {len(vector_store.store)}")

In [None]:
# Print the key of every entry held in the vector store, one per line.
print("Document IDs in the vector store:")
for doc_id in vector_store.store:
    print(doc_id)

In [None]:
# Dump every entry of the vector store: its ID followed by each field of the
# stored record, with a separator line between entries.
print("Documents in the vector store:")
for doc_id, data in vector_store.store.items():
    print(f"Document ID: {doc_id}")
    print("Extracting all keys and values from Content:")
    for key, value in data.items():
        # Short values (and anything that is not a list) are printed whole.
        if not isinstance(value, list) or len(value) <= 10:
            print(f"{key}: {value}")
            continue
        # Long lists (e.g. the embedding vector) are cut to their first 10 items.
        print(f"{key}: {value[:10]}... (showing first 10 items)")
    print("-" * 50)