# Notebook 2 - Creating and Testing a vector database using LangChain Chroma.

## Part 1 - Creating a vector database using Chroma.

Initial imports

In [1]:
# imports:
import pandas as pd
from dotenv import load_dotenv
from langchain.document_loaders.csv_loader import CSVLoader
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
# from langchain import  LLMChain
# from pathlib import Path

# notebook-wide paths:
CSV_FILEPATH = "../data/real_estate_listings_formatted.csv"  # listings CSV read by pandas and CSVLoader
VDB_PATH = "../vdb"  # directory where the Chroma database is persisted
# load variables from a local .env file (presumably the OpenAI API key - confirm):
load_dotenv()

True

Initializing the embeddings model and the chat model.

In [2]:
# Embedding model (library defaults) used to vectorise the listings and the queries.
embeddings = OpenAIEmbeddings()

# Chat model that writes the final answer from the retrieved listings.
chat_llm = ChatOpenAI(
    model="gpt-4.1",
    temperature=0.0,
    max_tokens=1000,
    max_retries=1,
)

Reading the CSV file to understand the data context.

In [3]:
# Load the raw listings into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=CSV_FILEPATH)
df.head(n=5)

Unnamed: 0,text
0,Neighborhood: Riverside Heights\nPrice: 320000...
1,Neighborhood: Old Town\nPrice: 185000 euro\nBe...
2,Neighborhood: Sunnydale\nPrice: 270000 euro\nB...
3,Neighborhood: City Center\nPrice: 450000 euro\...
4,Neighborhood: Maple Grove\nPrice: 210000 euro\...


Loading the CSV file using the CSVLoader and creating the Chroma vector database.

In [4]:
# Parse the CSV into LangChain documents.
loader = CSVLoader(file_path=CSV_FILEPATH)
docs = loader.load()

# Report how many documents were produced and show the first one
# (the full Document repr, i.e. page_content plus metadata).
doc_count = len(docs)
first_doc = docs[0]
print('Length of docs: ', doc_count, '\n---')
print('1st doc text value:\n---\n', first_doc)

Length of docs:  50 
---
1st doc text value:
---
 page_content='text: Neighborhood: Riverside Heights
Price: 320000 euro
Bedrooms: 4
House Size: 210 square meters

Description: Spacious 4-bedroom family home with a modern kitchen, open-plan living area, and a sun-drenched patio overlooking a private garden. The master suite features a walk-in closet and en-suite bathroom. Perfect for entertaining guests or relaxing with family.

Neighborhood Description: Riverside Heights is known for its scenic river views, tree-lined streets, and family-friendly parks. Residents enjoy weekend farmers' markets and easy access to top-rated schools and local cafes.' metadata={'source': '../data/real_estate_listings_formatted.csv', 'row': 0}


In [5]:
# Create (or refresh) the persisted Chroma vector database.
#
# Each document gets a deterministic id derived from the CSV row number that
# CSVLoader stores in `metadata['row']`. Without explicit ids, every run of this
# cell would add the documents again under new random ids, so a plain
# "Restart & Run All" would keep growing the collection with duplicate listings
# and the retriever would start returning the same listing several times.
# With stable ids, re-running the cell overwrites the existing entries instead.
# NOTE: copies already written by earlier runs (random ids) are not removed by
# this change - delete the VDB_PATH directory once to start from a clean store.
doc_ids = [f"listing-{doc.metadata['row']}" for doc in docs]
db = Chroma.from_documents(
    docs,
    embeddings,
    ids=doc_ids,
    collection_name='real_estate_listings',
    persist_directory=VDB_PATH,
)

## Part 2 - Checking results from Chroma using a RAG approach.

Providing a query and running the RAG chain.

In [12]:
# Alternative queries to experiment with (uncomment one to replace the active query below):
#query = "I would like to buy a luxury house nearby a lake. Recommend based on the provided context."
#query = "I would like to buy a small house in town, I am single so I do not want something big. Recommend based on the provided context."

# Active query passed to the RAG chain in the next cell.
query = "I want to buy a house in the old city. What is the best price?"

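Running the RAG process using one of two approaches: the `RetrievalQA` helper with MMR retrieval (used by default), or a manual similarity search followed by a question-answering chain.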

In [13]:
# Toggle between the two RAG implementations:
#   True  -> RetrievalQA helper: retrieval and generation wired into one chain.
#   False -> manual flow: similarity search first, then a "stuff" QA chain
#            driven by a custom prompt.
use_chain_helper = True

if use_chain_helper:
    rag = RetrievalQA.from_chain_type(
        llm=chat_llm,
        chain_type="stuff",  # all retrieved documents are placed into a single prompt
        retriever=db.as_retriever(
            search_type="mmr",  # maximal marginal relevance: trades relevance against diversity
            search_kwargs={
                'k': 3,              # number of documents handed to the LLM
                'fetch_k': 10,       # candidate pool fetched before MMR re-ranking
                'lambda_mult': 0.5,  # 0 = maximum diversity, 1 = pure relevance
            },
        ),
    )
    gen_output = rag.invoke(query)
    print(gen_output['result'])
else:
    similar_docs = db.similarity_search(query, k=3)
    prompt = PromptTemplate(
        template="{query}\nContext:\n{context}",
        input_variables=["query", "context"],
    )
    chain = load_qa_chain(llm=chat_llm, prompt=prompt, chain_type="stuff")  # deprecated in newer versions
    # `Chain.run` is deprecated; use `invoke`, as the helper branch above already does.
    # The stuff chain takes the documents under "input_documents" and returns
    # the generated answer under "output_text".
    qa_output = chain.invoke({"query": query, "input_documents": similar_docs})
    print(qa_output["output_text"])
    print('\n---Retrieved Docs Log---\n', similar_docs)

Based on the available information, the best price for a house in the Old Town (old city) is **185,000 euro** for a charming 2-bedroom townhouse with 95 square meters of space.
