In [1]:
# Install the LangChain packages this notebook imports, using the %pip magic.
# NOTE(review): versions are not pinned, so a fresh run may pull newer releases
# than the ones that produced the saved outputs — consider pinning.
# NOTE(review): `langchain_core` is imported in a later cell but not listed here;
# presumably it is installed as a dependency of these packages — confirm.
%pip install langchain langchain_community langchain_ollama langchain_text_splitters

Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain_ollama import OllamaEmbeddings

# Embedding client used by the rest of the notebook. It talks to an Ollama
# server, so Ollama must already be running at the address below.
# NOTE(review): "llama3.2:latest" looks like a general-purpose model rather
# than a dedicated embedding model — confirm this is the intended choice.
embedding_model = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="llama3.2:latest",
)

In [3]:
from langchain_community.document_loaders import TextLoader

# Load the example speech from disk as LangChain document(s).
file_path = "../../00-example_data/state_of_the_union.txt"

# Pass the encoding explicitly: without it the file is decoded with the
# platform's default encoding, and the text contains non-ASCII characters
# (e.g. curly apostrophes) that can fail to decode where that default is
# not UTF-8.
document_loader = TextLoader(file_path, encoding="utf-8")
raw_documents = document_loader.load()

# Quick sanity check: how many documents came back and how the first one starts.
print("Number of documents loaded:", len(raw_documents))
print("Sample Document:", raw_documents[0].page_content[:200])

Number of documents loaded: 1
Sample Document: Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  

Last year COVID-19 kept us apart. 


In [4]:
from langchain_text_splitters import CharacterTextSplitter

# Cut the loaded document(s) into chunks, configured with a chunk size of
# 1000 and an overlap of 200 between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(raw_documents)

# Report the chunk count and preview the first 200 characters of the first chunk.
print("Number of chunks created:", len(chunks))
print("Sample Chunk:", chunks[0].page_content[:200])

Number of chunks created: 49
Sample Chunk: Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  

Last year COVID-19 kept us apart. 


In [5]:
# Generate one embedding per chunk.
# `embed_documents` is the document-side API of the embeddings interface and
# takes the whole list at once; `embed_query` is meant for single search
# queries, and calling it in a loop issues one embedding call per chunk.
chunk_embeddings = embedding_model.embed_documents(
    [chunk.page_content for chunk in chunks]
)

# Sanity check: one vector per chunk, plus a peek at the first few components.
print("Number of embeddings generated:", len(chunk_embeddings))
print("First Chunk Embedding (first 5 values):", chunk_embeddings[0][:5])

Number of embeddings generated: 49
First Chunk Embedding (first 5 values): [0.003959221, 0.012483264, -0.011752846, -0.010141397, -0.0100551145]


In [6]:
# `Document` is imported from langchain_core, its canonical location;
# `langchain.schema` is a legacy import path that is not guaranteed to exist
# in the unpinned `langchain` release installed by the first cell.
from langchain_core.documents import Document
from langchain_core.vectorstores import InMemoryVectorStore

# Build the Documents to index from the chunks. Each chunk's metadata is
# copied over so search results keep whatever metadata the loader/splitter
# attached, instead of silently dropping it.
documents = [
    Document(page_content=chunk.page_content, metadata=dict(chunk.metadata))
    for chunk in chunks
]

# Create an in-memory vector store; the documents are embedded with
# `embedding_model` as part of this call.
vector_store = InMemoryVectorStore.from_documents(documents, embedding_model)

print("Number of documents in vector store:", len(vector_store.store))

Number of documents in vector store: 49


In [7]:
# Natural-language question to match against the stored chunks.
query = "What is the state of the union?"

# Ask the vector store for the three chunks most similar to the query.
similar_docs = vector_store.similarity_search(query, k=3)

print("Most similar documents:")
for i, doc in enumerate(similar_docs, start=1):
    # Heading, a 200-character preview, then a divider — each on its own line.
    print(f"Result {i}:", doc.page_content[:200], "-" * 50, sep="\n")

Most similar documents:
Result 1:
To all Americans, I will be honest with you, as I’ve always promised. A Russian dictator, invading a foreign country, has costs around the world. 

And I’m taking robust action to make sure the pain o
--------------------------------------------------
Result 2:
We are choking off Russia’s access to technology that will sap its economic strength and weaken its military for years to come.  

Tonight I say to the Russian oligarchs and corrupt leaders who have b
--------------------------------------------------
Result 3:
And built the strongest, freest, and most prosperous nation the world has ever known. 

Now is the hour. 

Our moment of responsibility. 

Our test of resolve and conscience, of history itself. 

It i
--------------------------------------------------
