### Setup

In [1]:
# Load libraries and dependencies
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
# Presumably this supplies the Google API key used by the Gemini clients
# constructed later in the notebook — TODO confirm.
load_dotenv()

True

In [2]:
# Initialize constant variables
# NOTE(review): none of these constants are referenced by the cells visible in
# this part of the notebook. In particular, the embedding client is created with
# "models/embedding-001" rather than BASE_MODEL — confirm which model is intended
# and whether PROJECT_ID / REGION are still needed.
PROJECT_ID = "langchain-practice-459204"
REGION = "asia-southeast1"
BASE_MODEL = "text-embedding-004"

In [3]:
# Build the two model clients shared by the rest of the notebook.
from langchain.chat_models import init_chat_model
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Chat model that produces the final answer.
llm = init_chat_model(model="gemini-2.0-flash", model_provider="google_genai")

# Embedding model handed to the vector store for indexing and querying.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

### Loading Documents
Construct the loader and use it to load the documents.

In [4]:
# Load the files matching *.txt in the "data" directory as Document objects.
loader = DirectoryLoader(path="data", glob="*.txt")
docs = loader.load()

# The loader is expected to return a list holding exactly one Document.
assert len(docs) == 1
source_doc = docs[0]
print(f"Total characters: {len(source_doc.page_content)}\n")
print(f"Type: {type(source_doc)}")

Total characters: 162909

Type: <class 'langchain_core.documents.base.Document'>


### Splitting Documents
Construct the splitter and use it to split `docs` into small chunks.

In [5]:
# Splitter settings; sizes are measured in characters.
splitter_options = {
    "chunk_size": 1000,  # chunk size (characters)
    "chunk_overlap": 200,  # chunk overlap (characters)
    "add_start_index": True,  # track each chunk's index in the original document
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into chunks and report how many were produced.
all_splits = text_splitter.split_documents(docs)
print(len(all_splits))
print(f"Type: {type(all_splits[0])}")

217
Type: <class 'langchain_core.documents.base.Document'>


### Storing Documents
- Construct the embedding model (done in the "Construct components" step of Setup).
- Integrate the embedding model into the VectorStore.
- Use the VectorStore to embed each chunk in `all_splits` and store the resulting vectors.

In [6]:
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory store that uses `embeddings` to vectorize documents and queries.
vector_store = InMemoryVectorStore(embedding=embeddings)

In [None]:
# Embed and store the chunks in small batches rather than in one request
# (presumably to stay within the embedding API's request limits — TODO confirm).
batch_size = 20
left = 0
while left < len(all_splits):
    right = min(left + batch_size, len(all_splits))
    # Give every chunk a deterministic id based on its position in `all_splits`.
    # Without explicit ids the store assigns a fresh random id on each call, so
    # re-running this cell would insert every chunk again and similarity search
    # would start returning duplicates. With stable ids a re-run overwrites the
    # existing entries instead.
    batch_ids = [f"chunk-{index}" for index in range(left, right)]
    vector_store.add_documents(documents=all_splits[left:right], ids=batch_ids)
    print(f"The chunk {right}th success")
    left = right

The chunk 20th success
The chunk 40th success
The chunk 60th success
The chunk 80th success
The chunk 100th success
The chunk 120th success
The chunk 140th success
The chunk 160th success
The chunk 180th success
The chunk 200th success
The chunk 217th success


Run an example query to test the similarity search.

In [8]:
# Sample question used to exercise retrieval.
userQuestion = "What is the use of this ebook?"

# Ask the store for the 3 chunks most similar to the question.
mostRelatedChunks = vector_store.similarity_search(query=userQuestion, k=3)

# Show the text of the first returned chunk.
top_chunk = mostRelatedChunks[0]
retrievedDoc = top_chunk.page_content
print(retrievedDoc)

easy. You may use this eBook for nearly any purpose such as creation of derivative works, reports, performances and research. Project Gutenberg eBooks may be modified and printed and given away—you may do practically ANYTHING in the United States with eBooks not protected by U.S. copyright law. Redistribution is subject to the trademark license, especially commercial redistribution.


### Wrap them into the LLM
Combine the user's question (`userQuestion`) and the retrieved documents (`mostRelatedChunks`) into a single prompt, then pass that prompt to the LLM.

In [9]:
from langchain import hub

# Pull the shared RAG prompt template; it is filled in below with a
# "question" and a "context" value.
rag_prompt_template = hub.pull("rlm/rag-prompt")

# Use every retrieved chunk as context, not just the first one: the search
# above asks for the top 3 chunks, and the answer may depend on any of them.
context = "\n\n".join(chunk.page_content for chunk in mostRelatedChunks)

# Keep the template and the rendered prompt under separate names so the cell
# does not rebind one name to two different kinds of object.
prompt = rag_prompt_template.invoke({"question": userQuestion, "context": context})
response = llm.invoke(prompt)
print(response.content)