# Step 1: Setting Up the Python Application

In [30]:
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.llms import OpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_core.prompts.prompt import PromptTemplate
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.chains.question_answering.chain import load_qa_chain

In [9]:
import os

os.environ["OPENAI_API_KEY"] = "API_KEY_HERE"
os.environ["OPENAI_API_BASE"] = "https://openai.vocareum.com/v1"

In [10]:
# Initialise LLM

llm = OpenAI(
    model="gpt-3.5-turbo-instruct",
    temperature=0,
    max_tokens=2000
    # api_key="...",
    # base_url="...",
    # organization="...",
    # other params...
)
llm

OpenAI(client=<class 'openai.api_resources.completion.Completion'>, temperature=0.0, max_tokens=2000, model_kwargs={}, openai_api_key='voc-589459551126677348884666dc80e0ebfee7.97941068', openai_api_base='https://openai.vocareum.com/v1', openai_proxy='', logit_bias={})

# Step 2: Generating Real Estate Listings

#### Note: This prompt is mentioned for information on how I generated it using ChatGPT app. From there, the listings were saved in a csv file and used ahead in the project work.

In [None]:
# Do not need to run this cell

listing_prompt = """
Generate 12-15 real estate listings for home, which can help a home buyer to understand various parameters, home description,  locality description, etc..  Listing should be diverse and have diverse descriptions and neighborhoods. 
Make the descriptions in listings long and more descriptive as provided in the sample listing. Ensure that they remain diverse.

A sample listing is given below:

Neighborhood: Green Oaks
Price: $800,000
Bedrooms: 3
Bathrooms: 2
House Size: 2,000 sqft

Description: Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without compromising on style in this Green Oaks gem.

Neighborhood Description: Green Oaks is a close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze.
"""

out_listings = llm.invoke(listing_prompt)
print(out_listings)

# Step 3: Storing Listings in a Vector Database

## Vector Database Setup: 
Initialize and configure ChromaDB or a similar vector database to store real estate listings.
Generating and Storing Embeddings: Convert the LLM-generated listings into suitable embeddings that capture the semantic content of each listing, and store these embeddings in the vector database.

### Load data

In [11]:
csvLoader = CSVLoader(file_path="./Listings.csv")
listings = csvLoader.load()
listings

# Scope of improvements:
# 1. Create list of documents with custom metadata colums, like price, bedrooms, etc. can go in there. 
# 2. OR Use latest version for putting numeric columns in metadata_columns
# 3. Can use a LLM to summarise a row in more verbose description matching a human description of the house and use that as page_content for Document

[Document(metadata={'source': './Listings.csv', 'row': 0}, page_content='Neighborhood: Green Oaks\nPrice: $800,000\nBedrooms: 3\nBathrooms: 2\nHouse Size: 2,000 sqft\nDescription: Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without compromising on style in this Green Oaks gem.\nNeighborhood Description: Green Oaks is a close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public tran

### Initialise embeddings

In [12]:
embed = OpenAIEmbeddings()
embed

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, async_client=None, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='https://openai.vocareum.com/v1', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='voc-589459551126677348884666dc80e0ebfee7.97941068', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

### Create Vector store from the data

In [13]:
vectorStore = Chroma.from_documents(listings, embed)
vectorStore

<langchain_community.vectorstores.chroma.Chroma at 0x11c7db100>

In [14]:
#Test: Do not need to run this cell

# Sample query semantic similarity search

sample_query = """
Houses which are 3 bedrooms, 2 baths and are in the neighbourhood with lots of trees and near the ocean. 
"""

# find top 3 semantically similar documents to the query
result = vectorStore.similarity_search(query=sample_query, k=3)
result

[Document(metadata={'source': './Listings.csv', 'row': 3}, page_content="Neighborhood: Cedar Grove\nPrice: $720,000\nBedrooms: 3\nBathrooms: 2\nHouse Size: 2,100 sqft\nDescription: This beautifully updated single-story home in Cedar Grove blends modern comfort with natural serenity. Boasting 3 bedrooms and 2 full bathrooms, the home welcomes you with its airy open layout, vaulted ceilings, and large windows that frame picturesque views of the surrounding trees. The kitchen is designed for culinary enthusiasts, featuring quartz countertops, a walk-in pantry, and a large island. The master suite offers private backyard access and an ensuite bathroom with a soaking tub, dual vanities, and a walk-in shower.\nNeighborhood Description: Cedar Grove is known for its lush greenbelts, quiet walking trails, and friendly community events. It's an ideal location for families and retirees alike, with local parks, dog trails, and charming coffee shops that promote a laid-back lifestyle."),
 Document(

# Test: Use a basic query and RAG approach to get LLM output for user query. 

#### Test: Do not need to run this section

In [20]:

query = """
Based on the houses listing in the context, suggest houses which are at least 3 bedrooms and 2 baths and are in the neighbourhood with lots of trees. 
A balance between suburban tranquility and access to urban amenities like restaurants and theaters. 
Try to provide the description as close to the context and in similar format. If no such listing of house is found, say nothing matching found.
"""

# No result test case
'''
query = """
Based on the houses listing in the context, suggest houses which are at least 6 bedrooms and 4 baths and are in the neighbourhood with lots of trees. 
A balance between suburban tranquility and access to urban amenities like restaurants and theaters. 
Try to provide the description as close to the context and in similar format. If no such listing of house is found, say nothing matching found.
"""
'''

use_chain_helper = True
if use_chain_helper:
    rag = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorStore.as_retriever())
    print(rag.run(query))
else:
    similar_docs = vectorStore.similarity_search(query=query, k=5)
    print(similar_docs[0])
    prompt = PromptTemplate(
        template="{query}\nContext: {context}",
        input_variables=["query", "context"],
    )
    chain = load_qa_chain(llm, prompt = prompt, chain_type="stuff")
    print(chain.run(input_documents=similar_docs, query = query))


Neighborhood: Cedar Grove
Price: $720,000
Bedrooms: 3
Bathrooms: 2
House Size: 2,100 sqft
Description: This beautifully updated single-story home in Cedar Grove blends modern comfort with natural serenity. Boasting 3 bedrooms and 2 full bathrooms, the home welcomes you with its airy open layout, vaulted ceilings, and large windows that frame picturesque views of the surrounding trees. The kitchen is designed for culinary enthusiasts, featuring quartz countertops, a walk-in pantry, and a large island. The master suite offers private backyard access and an ensuite bathroom with a soaking tub, dual vanities, and a walk-in shower.
Neighborhood Description: Cedar Grove is known for its lush greenbelts, quiet walking trails, and friendly community events. It's an ideal location for families and retirees alike, with local parks, dog trails, and charming coffee shops that promote a laid-back lifestyle.


# Step 4: Building the User Preference Interface

### Collect buyer preferences, such as the number of bedrooms, bathrooms, location, and other specific requirements from a set of questions or telling the buyer to enter their preferences in natural language.

In [101]:
# Sample question answer hardcoded

questions = [
    "How big do you want your house to be?", 
    "What are 3 most important things for you in choosing this property?", 
    "Which amenities would you like?", 
    "Which transportation options are important to you?", 
    "How urban do you want your neighborhood to be?"
]

answers = [ 
    "A comfortable three-bedroom house with a spacious kitchen and a cozy living room.",
    "A quiet neighborhood, good local schools, and convenient shopping options.",
    "A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.",
    "Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
    "A balance between suburban tranquility and access to urban amenities like restaurants and theaters."
]

### Buyer Preference Parsing: Implement logic to interpret and structure these preferences for querying the vector database.

In [63]:
#Test: Do not need to run this cell

house_plain_query = "Looking for a house as descirbed here. "
for i in range(len(answers)):
    house_plain_query += f" {answers[i]} "

print(house_plain_query)

Looking for a house as descirbed here.  A comfortable three-bedroom house with a spacious kitchen and a cozy living room.  A quiet neighborhood, good local schools, and convenient shopping options.  A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.  Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.  A balance between suburban tranquility and access to urban amenities like restaurants and theaters. 


In [102]:
question_and_answer = ""
for i in range(len(questions)):
    question_and_answer += f"\nQuestion: {questions[i]}"
    question_and_answer += f"\nAnswer: {answers[i]}"

print(question_and_answer)


Question: How big do you want your house to be?
Answer: A comfortable three-bedroom house with a spacious kitchen and a cozy living room.
Question: What are 3 most important things for you in choosing this property?
Answer: A quiet neighborhood, good local schools, and convenient shopping options.
Question: Which amenities would you like?
Answer: A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.
Question: Which transportation options are important to you?
Answer: Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.
Question: How urban do you want your neighborhood to be?
Answer: A balance between suburban tranquility and access to urban amenities like restaurants and theaters.


# Step 5: Searching Based on Preferences

### Semantic Search Implementation: Use the structured buyer preferences to perform a semantic search on the vector database, retrieving listings that most closely match the user's requirements.

In [44]:
def get_matching_listings(query, num=3) :

    # find top 3 semantically similar documents to the query
    result = vectorStore.similarity_search(query=query, k=num)
    return result

In [64]:
# Test: Do not need to run this cell

buyer_query = house_plain_query
output = get_matching_listings(buyer_query)
print(output)

[Document(page_content='Neighborhood: Brookline\nPrice: $1,775,000\nBedrooms: 4\nBathrooms: 3\nHouse Size: 2,900 sqft\nDescription: This elegant 4-bedroom, 3-bathroom colonial in Brookline combines historical details with modern luxury. Step through the grand foyer into spacious living and dining rooms, adorned with crown molding and large bay windows. The chef’s kitchen opens up to a sunny breakfast nook and family room. Upstairs, the primary suite features a walk-in closet and spa bath. A fenced-in backyard and two-car garage round out this ideal suburban retreat.\nNeighborhood Description: Brookline offers a suburban feel with urban conveniences. Known for its excellent schools, leafy streets, and community-focused atmosphere, Brookline has easy access to downtown Boston via the Green Line, making it popular with families and professionals alike.', metadata={'row': 10, 'source': './final_real_estate_listing.csv'}), Document(page_content='Neighborhood: Green Oaks\nPrice: $800,000\nBe

### Listing Retrieval Logic: Fine-tune the retrieval algorithm to ensure that the most relevant listings are selected based on the semantic closeness to the buyer’s preferences.

#### TODO Improvement: Use Pydantic output parser for structured response rather than using example in the prompt

In [103]:

qa_query = """
Use the conversation below for my preferences for the house I am searching for. 

Conversation:
{question_and_answer}

Return details in structured format strictly as per the response example below. 
Extract bedrooms, bathrooms number and house size. Structure all other information as a summary in a description field. 
Do not paraphrase the answers from the user and ensure that the summary matches as close to the answers.
If some data is not specified for a field, keep it empty.

Response example: 
{response_format}

"""

prompt = PromptTemplate(
    template=qa_query,
    input_variables=["question_and_answer", "response_format"],
    #partial_variables={"format_instructions": parser.get_format_instructions},
)

response_format = """
Bedrooms: 3
Bathrooms: 2
House Size: 2,000 sqft
Description: I am looking for a charming 3-bedroom, 2-bathroom home with energy-efficient features such as solar panels and a well-insulated structure. Natural light should floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area with a spacious backyard. I embrace sustainable living without compromising on style.
"""

llm_query = prompt.format(question_and_answer=question_and_answer, response_format=response_format)
print(llm_query)


Use the conversation below for my preferences for the house I am searching for. 

Conversation:

Question: How big do you want your house to be?
Answer: A comfortable three-bedroom house with a spacious kitchen and a cozy living room.
Question: What are 3 most important things for you in choosing this property?
Answer: A quiet neighborhood, good local schools, and convenient shopping options.
Question: Which amenities would you like?
Answer: A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.
Question: Which transportation options are important to you?
Answer: Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.
Question: How urban do you want your neighborhood to be?
Answer: A balance between suburban tranquility and access to urban amenities like restaurants and theaters.

Return details in structured format strictly as per the response example below. 
Extract bedrooms, bathrooms number and house size. Stru

In [104]:
llm_output = llm.invoke(llm_query)
print(llm_output)

Bedrooms: 3
Bathrooms: 
House Size: 
Description: I am searching for a comfortable three-bedroom house with a spacious kitchen and a cozy living room. My top priorities are a quiet neighborhood, good local schools, and convenient shopping options. I also desire a backyard for gardening, a two-car garage, and a modern, energy-efficient heating system. In terms of transportation, I am looking for easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads. I prefer a balance between suburban tranquility and access to urban amenities like restaurants and theaters.


In [20]:
# Test: Do not need to run this cell

buyer_llm_query = llm_output
output = get_matching_listings(buyer_llm_query)
print(output)

[Document(metadata={'row': 10, 'source': './Listings.csv'}, page_content='Neighborhood: Brookline\nPrice: $1,775,000\nBedrooms: 4\nBathrooms: 3\nHouse Size: 2,900 sqft\nDescription: This elegant 4-bedroom, 3-bathroom colonial in Brookline combines historical details with modern luxury. Step through the grand foyer into spacious living and dining rooms, adorned with crown molding and large bay windows. The chef’s kitchen opens up to a sunny breakfast nook and family room. Upstairs, the primary suite features a walk-in closet and spa bath. A fenced-in backyard and two-car garage round out this ideal suburban retreat.\nNeighborhood Description: Brookline offers a suburban feel with urban conveniences. Known for its excellent schools, leafy streets, and community-focused atmosphere, Brookline has easy access to downtown Boston via the Green Line, making it popular with families and professionals alike.'), Document(metadata={'row': 9, 'source': './Listings.csv'}, page_content='Neighborhood:

# Step 6: Personalizing Listing Descriptions

### LLM Augmentation
For each retrieved listing, use the LLM to augment the description, tailoring it to resonate with the buyer’s specific preferences. 
This involves subtly emphasizing aspects of the property that align with what the buyer is looking for.

#### Maintaining Factual Integrity: Ensure that the augmentation process enhances the appeal of the listing without altering factual information.

In [125]:
#Test: Do not need to run this cell

#Test preference

test_preference = """
Bedrooms: 3
Bathrooms:
House Size: 
Description: I am searching for a comfortable three-bedroom house with a spacious kitchen and a cozy living room. My top priorities are a quiet neighborhood, good local schools, and convenient shopping options. I would also like a backyard for gardening, a two-car garage, and a modern, energy-efficient heating system. In terms of transportation, I am looking for easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads. I prefer a neighborhood that strikes a balance between suburban tranquility and access to urban amenities like restaurants and theaters.
"""

In [139]:
query = """
You are a real estate agent. 
Based on the houses listings provided in the context, suggest at least 1-2 houses which closely match the user preference given.
If no such listing of house is found, output "No matching listing found".

If matching listings are found, genereate an appealing summary for each matching listing from the context. Map the user preference with the house description and neighbourhood descriptions.
Do not add any new factual information in your summary. 

"""

In [113]:
prompt = PromptTemplate(
    template="{query}\nUser Preference: {user_preference}\nContext: {context}",
    input_variables=["query", "user_preference", "context"],
)

chain = load_qa_chain(llm, prompt = prompt, chain_type="stuff")

In [140]:
preference = llm_output
print(preference)

similar_listings = get_matching_listings(preference, num=5)
#print(similar_listings)

final_output = chain.run(input_documents=similar_listings, query=query, user_preference=preference)
print(final_output)

Bedrooms: 3
Bathrooms: 
House Size: 
Description: I am searching for a comfortable three-bedroom house with a spacious kitchen and a cozy living room. My top priorities are a quiet neighborhood, good local schools, and convenient shopping options. I also desire a backyard for gardening, a two-car garage, and a modern, energy-efficient heating system. In terms of transportation, I am looking for easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads. I prefer a balance between suburban tranquility and access to urban amenities like restaurants and theaters.


Matching Listings:
1. Neighborhood: Brookline
Price: $1,775,000
Bedrooms: 4
Bathrooms: 3
House Size: 2,900 sqft
Description: This elegant 4-bedroom, 3-bathroom colonial in Brookline combines historical details with modern luxury. Step through the grand foyer into spacious living and dining rooms, adorned with crown molding and large bay windows. The chef’s kitchen opens up to a sunny breakfast noo