## Install Libraries

In [4]:
pip install langchain-community langchain-core

Collecting langchain-community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

In [2]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ----------------- ---------------------- 5.0/11.5 MB 28.6 MB/s eta 0:00:01
   ---------------------------------------  11.3/11.5 MB 29.9 MB/s eta 0:00:01
   ---------------------------------------- 11.5/11.5 MB 28.1 MB/s eta 0:00:00
Downloading pytz-2025.1-py2.py3-none-any.whl (507 kB)
Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.1 tzdata-2025.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install lancedb

Collecting lancedb
  Downloading lancedb-0.19.0-cp39-abi3-win_amd64.whl.metadata (4.1 kB)
Collecting deprecation (from lancedb)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting pylance==0.23.0 (from lancedb)
  Downloading pylance-0.23.0-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting pyarrow>=14 (from pylance==0.23.0->lancedb)
  Downloading pyarrow-19.0.0-cp313-cp313-win_amd64.whl.metadata (3.4 kB)
Downloading lancedb-0.19.0-cp39-abi3-win_amd64.whl (30.0 MB)
   ---------------------------------------- 0.0/30.0 MB ? eta -:--:--
   ------ --------------------------------- 5.2/30.0 MB 27.7 MB/s eta 0:00:01
   -------------- ------------------------- 11.0/30.0 MB 27.3 MB/s eta 0:00:01
   ---------------------- ----------------- 16.5/30.0 MB 27.4 MB/s eta 0:00:01
   ----------------------------- ---------- 22.0/30.0 MB 27.2 MB/s eta 0:00:01
   ------------------------------------- -- 28.3/30.0 MB 27.6 MB/s eta 0:00:01
   --------------------------------

## Imports

In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage,HumanMessage,SystemMessage,FunctionMessage


import json
import pandas as pd
import lancedb
from lancedb.pydantic import Vector,LanceModel
from lancedb.embeddings import get_registry
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.prompts import ChatPromptTemplate
import openai

In [4]:
import os

os.environ["OPENAI_API_KEY"] = "xxx"
openai.api_key = os.getenv("OPENAI_API_KEY")


## Generate Real Estate listings using Langchain

In [5]:
chat_llm = ChatOpenAI(temperature=1)

In [6]:
system_prompt = "You are a real estate agent in the city of Los Angeles in USA"
human_prompt = """Generate at least 7 real estate listings across different counties using your imagination. The listings must be
in a JSON array of dictionaries with each item in the following format:
{
 \"bds\":3,
 \"ba\":2,
 \"sqft\":1500,
 \"price":1000000,
 \"location\":\"North Hollywood\",
 \"school_rating\":3.8,
 \"description\":\"The spacious living area features floor-to-ceiling windows light\" 
}"""

In [7]:
messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=human_prompt)
]
json_llm = chat_llm.bind(response_format={"type":"json_object"})

In [10]:
realestate_listings = chat_llm.invoke(messages)
realestate_listings

AIMessage(content='```json\n[\n    {\n        "bds": 3,\n        "ba": 2,\n        "sqft": 1500,\n        "price": 1000000,\n        "location": "North Hollywood",\n        "school_rating": 3.8,\n        "description": "The spacious living area features floor-to-ceiling windows, bringing in ample natural light."\n    },\n    {\n        "bds": 4,\n        "ba": 3,\n        "sqft": 2200,\n        "price": 1500000,\n        "location": "Beverly Hills",\n        "school_rating": 4.5,\n        "description": "Luxury living in the heart of Beverly Hills, with modern amenities and stunning views."\n    },\n    {\n        "bds": 2,\n        "ba": 1,\n        "sqft": 1200,\n        "price": 750000,\n        "location": "Santa Monica",\n        "school_rating": 4.0,\n        "description": "Charming beachside condo in Santa Monica, perfect for those seeking a relaxed lifestyle."\n    },\n    {\n        "bds": 5,\n        "ba": 4,\n        "sqft": 3500,\n        "price": 2500000,\n        "locati

In [11]:
realestates_json = json.loads(realestate_listings.json())["content"]
print(realestates_json)

```json
[
    {
        "bds": 3,
        "ba": 2,
        "sqft": 1500,
        "price": 1000000,
        "location": "North Hollywood",
        "school_rating": 3.8,
        "description": "The spacious living area features floor-to-ceiling windows, bringing in ample natural light."
    },
    {
        "bds": 4,
        "ba": 3,
        "sqft": 2200,
        "price": 1500000,
        "location": "Beverly Hills",
        "school_rating": 4.5,
        "description": "Luxury living in the heart of Beverly Hills, with modern amenities and stunning views."
    },
    {
        "bds": 2,
        "ba": 1,
        "sqft": 1200,
        "price": 750000,
        "location": "Santa Monica",
        "school_rating": 4.0,
        "description": "Charming beachside condo in Santa Monica, perfect for those seeking a relaxed lifestyle."
    },
    {
        "bds": 5,
        "ba": 4,
        "sqft": 3500,
        "price": 2500000,
        "location": "Malibu",
        "school_rating": 4.2,
        

C:\Users\sum_c\AppData\Local\Temp\ipykernel_2912\2702594770.py:1: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  realestates_json = json.loads(realestate_listings.json())["content"]


In [13]:
realestates_json_processed = realestates_json.replace('json','')
realestates_json_processed1 = realestates_json_processed.replace("```","")
print(realestates_json_processed1)


[
    {
        "bds": 3,
        "ba": 2,
        "sqft": 1500,
        "price": 1000000,
        "location": "North Hollywood",
        "school_rating": 3.8,
        "description": "The spacious living area features floor-to-ceiling windows, bringing in ample natural light."
    },
    {
        "bds": 4,
        "ba": 3,
        "sqft": 2200,
        "price": 1500000,
        "location": "Beverly Hills",
        "school_rating": 4.5,
        "description": "Luxury living in the heart of Beverly Hills, with modern amenities and stunning views."
    },
    {
        "bds": 2,
        "ba": 1,
        "sqft": 1200,
        "price": 750000,
        "location": "Santa Monica",
        "school_rating": 4.0,
        "description": "Charming beachside condo in Santa Monica, perfect for those seeking a relaxed lifestyle."
    },
    {
        "bds": 5,
        "ba": 4,
        "sqft": 3500,
        "price": 2500000,
        "location": "Malibu",
        "school_rating": 4.2,
        "descri

## Save Listings

In [14]:
f= open('data_json',"w")
f.write(realestates_json_processed1)
f.close()

In [15]:
df = pd.read_json('data_json')
df

Unnamed: 0,bds,ba,sqft,price,location,school_rating,description
0,3,2,1500,1000000,North Hollywood,3.8,The spacious living area features floor-to-cei...
1,4,3,2200,1500000,Beverly Hills,4.5,"Luxury living in the heart of Beverly Hills, w..."
2,2,1,1200,750000,Santa Monica,4.0,"Charming beachside condo in Santa Monica, perf..."
3,5,4,3500,2500000,Malibu,4.2,Experience the epitome of coastal living in th...
4,3,2,1800,980000,Pasadena,4.1,Classic Pasadena home with a beautiful backyar...
5,4,3,2400,1350000,Downtown Los Angeles,3.7,Modern urban living in a stylish loft in Downt...
6,5,5,5000,3800000,Pacific Palisades,4.8,Elegant Mediterranean villa in Pacific Palisad...


## Convert listing description to embedding

In [16]:
db = lancedb.connect("realestatedb")
func = get_registry().get("openai").create(name="text-embedding-ada-002")

In [17]:
class RealEstateListings(LanceModel):
    bedrooms:int
    bath:int
    area:int
    price:int
    location:str
    school_rating:float
    description:str = func.SourceField()
    description_vector:Vector(func.ndims()) = func.VectorField()

## Save listings in LanceDb

In [18]:
table = db.create_table("realestatelistings",schema=RealEstateListings,mode="overwrite")


In [19]:
data = df.apply(
lambda row:{
    "bedrooms":row["bds"],
    "bath":row["ba"],
    "area":row["sqft"],
    "price":row["price"],
    "location":row["location"],
    "school_rating":row["school_rating"],
    "description":row["description"]    
}, axis=1).values.tolist()

In [20]:
data

[{'bedrooms': 3,
  'bath': 2,
  'area': 1500,
  'price': 1000000,
  'location': 'North Hollywood',
  'school_rating': 3.8,
  'description': 'The spacious living area features floor-to-ceiling windows, bringing in ample natural light.'},
 {'bedrooms': 4,
  'bath': 3,
  'area': 2200,
  'price': 1500000,
  'location': 'Beverly Hills',
  'school_rating': 4.5,
  'description': 'Luxury living in the heart of Beverly Hills, with modern amenities and stunning views.'},
 {'bedrooms': 2,
  'bath': 1,
  'area': 1200,
  'price': 750000,
  'location': 'Santa Monica',
  'school_rating': 4.0,
  'description': 'Charming beachside condo in Santa Monica, perfect for those seeking a relaxed lifestyle.'},
 {'bedrooms': 5,
  'bath': 4,
  'area': 3500,
  'price': 2500000,
  'location': 'Malibu',
  'school_rating': 4.2,
  'description': 'Experience the epitome of coastal living in this luxurious Malibu estate with private beach access.'},
 {'bedrooms': 3,
  'bath': 2,
  'area': 1800,
  'price': 980000,
  'lo

In [21]:
#create a pandas dataframe
pdf = pd.DataFrame(data)


In [23]:
table.add(pdf)

In [26]:
table.head(5)

pyarrow.Table
bedrooms: int64 not null
bath: int64 not null
area: int64 not null
price: int64 not null
location: string not null
school_rating: double not null
description: string not null
description_vector: fixed_size_list<item: float>[1536]
  child 0, item: float
----
bedrooms: [[3,4,2,5,3]]
bath: [[2,3,1,4,2]]
area: [[1500,2200,1200,3500,1800]]
price: [[1000000,1500000,750000,2500000,980000]]
location: [["North Hollywood","Beverly Hills","Santa Monica","Malibu","Pasadena"]]
school_rating: [[3.8,4.5,4,4.2,4.1]]
description: [["The spacious living area features floor-to-ceiling windows, bringing in ample natural light.","Luxury living in the heart of Beverly Hills, with modern amenities and stunning views.","Charming beachside condo in Santa Monica, perfect for those seeking a relaxed lifestyle.","Experience the epitome of coastal living in this luxurious Malibu estate with private beach access.","Classic Pasadena home with a beautiful backyard garden, perfect for outdoor entertainin

## Capture User Preferences from user query using output parser

In [24]:
price_schema = ResponseSchema(name="price",description="This refers to the maximum listing price of the property the customer is looking for.Provide the number value if specified otherwise use 0")
bedrooms_schema = ResponseSchema(name="bedrooms",description="This refers to the number of bedrooms the customer requires for the property. Provide the number value if specified otherwise use 0 ")
bathroom_schema = ResponseSchema(name="bath", description="This refers to the number of bathrooms the customer requires for the property.Provide the number value if specified otherwise use 0")
size_schema = ResponseSchema(name="area", description="This refers to the size of the property the customer requires.Provide the float value if specified otherwise use 0.0")
location_schema = ResponseSchema(name="location", description ="This refers to the city where the customer is looking to buy the property")
school_schema = ResponseSchema(name="school_rating", description ="This refers to the rating of the school district  the where the property is located.Provide the float value if specified otherwise use 0.0")
preferences_schema = ResponseSchema(name="description", description = "This refers to the generalised description for the type of property the customer is looking for")
response_schemas = [price_schema, bedrooms_schema,bathroom_schema,size_schema,location_schema,school_schema, preferences_schema]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

format_instructions = output_parser.get_format_instructions()
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"price": string  // This refers to the maximum listing price of the property the customer is looking for.Provide the number value if specified otherwise use 0
	"bedrooms": string  // This refers to the number of bedrooms the customer requires for the property. Provide the number value if specified otherwise use 0if the 
	"bath": string  // This refers to the number of bathrooms the customer requires for the property.Provide the number value if specified otherwise use 0
	"area": string  // This refers to the size of the property the customer requires.Provide the float value if specified otherwise use 0.0
	"location": string  // This refers to the city where the customer is looking to buy the property
	"school_rating": string  // This refers to the rating of the school district  the where the property is located.Provide the float value if spe

In [35]:
def build_where_expression(metadata_dict):
    expression = ""
    
    if "price" in metadata_dict:
        listmaxprice = int(metadata_dict["price"])
        if listmaxprice > 0:
            expression = f"price < {listmaxprice}"
            
    if "bedrooms" in metadata_dict:
        beds = int(metadata_dict["bedrooms"])
        if beds > 0:
            if expression:
                expression = expression + " and "
            expression = expression + f"bedrooms >= {beds}"
            
    if "bath" in metadata_dict:
        bathrooms = int(metadata_dict["bath"])
        if bathrooms > 0:
            if expression:
                expression = expression + " and "
            expression = expression + f"bath >= {bathrooms}"
        
    if "area" in metadata_dict:
        size = int(metadata_dict["area"])
        if size > 0:
            if expression:
                expression = expression + " and "
            expression = expression + f"area >= {size}"
        
    if "school_rating" in metadata_dict:
        schoolrating = float(metadata_dict["school_rating"])
        if schoolrating > 0:
            if expression:
                expression = expression + " and "
            expression = expression + f"school_rating >= {schoolrating}"
        
    return expression
     

In [28]:
def extract_preferences(user_input):
    template = """
    From the following text message, extract the following information:

    text message:{text}
    {format_instructions}
    """
    prompt_template = ChatPromptTemplate.from_template(template)
    messages = prompt_template.format_messages(text=user_input, format_instructions=format_instructions)

    response = chat_llm(messages)
    output_dict = output_parser.parse(response.content)
    return output_dict

In [29]:
# if starting a new session open the lance Db table
try:
    table
except NameError:
    lance_db = lancedb.connect("realestatedb")
    table = lance_db.open_table("realestatelistings") 

## Filter based on user preferences and do a vector search

In [42]:
def find_best_matches(user_input,top_n_listings):
    preferences = {}    
    metadata_dict = extract_preferences(user_input)
    print(metadata_dict)
    exprfilter = build_where_expression(metadata_dict)
    if "description" in metadata_dict:
        preferences = metadata_dict["description"]
    filteredDf = table.search(preferences,vector_column_name = "description_vector").where(exprfilter,prefilter=True).limit(top_n_listings).to_pandas()
    return filteredDf

In [43]:
user_input="I am looking for a home with a backyard with at least 3 bedrooms and 2 bath"
filteredDf = find_best_matches(user_input,3)
filteredDf.head()

{'price': 0, 'bedrooms': '3', 'bath': '2', 'area': 0.0, 'location': '', 'school_rating': 0.0, 'description': 'Looking for a home with a backyard with at least 3 bedrooms and 2 bathrooms'}


Unnamed: 0,bedrooms,bath,area,price,location,school_rating,description,description_vector,_distance
0,3,2,1800,980000,Pasadena,4.1,Classic Pasadena home with a beautiful backyar...,"[0.020166242, 0.010955387, 0.005145712, 0.0029...",0.297557
1,5,4,3500,2500000,Malibu,4.2,Experience the epitome of coastal living in th...,"[-0.0038384548, -0.0037837117, 0.008179258, -0...",0.422237
2,4,3,2200,1500000,Beverly Hills,4.5,"Luxury living in the heart of Beverly Hills, w...","[-0.004943543, -0.0077806246, 0.0027595914, -0...",0.429478


### Augment the output

In [74]:
import inflect
def augment_output(query,df):
    context=""
   # p = inflect.engine()
    for index,row in df.iterrows():
       # context = context + "The property having an area of " + p.number_to_words(int(row["area"])) + " is located in "+ row["location"] + " with a price of "+ p.number_to_words(int(row["price"])) + " and has "+ p.number_to_words(int(row["bedrooms"])) + " bedrooms and near to a school district with a rating of "+ p.number_to_words(int(row["school_rating"])) + "." + row["description"] + ".\n\n"
        context = context + "The property having an area of " + f"{row["area"]} sqft" + " is located in "+ f"{row["location"]}" + " with a price of "+ f"{row["price"]}" + " and has "+ f"{row["bedrooms"]}" + " bedrooms and near to a school district with a rating of "+ f"{row["school_rating"]}" + "." + row["description"] + ".\n\n"
               
        response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role":"system",
                "content":"You are an expert real estate agent that answers questions based on context provided.Augment the description to resonate with user preferences. \n If you do not "
                + " know the answer say you do not have enough information"
            },
            {
               "role":"user", 
                "content":f"context:{context}\n QUERY:{query}"
            }
                        
        ],
     )
    response = response.choices[0].message.content
    return response

In [75]:
augment_output("I am looking for a home with a backyard with at least 3 bedrooms and 2 bath",filteredDf)

'Based on your preferences for a home with a backyard, at least 3 bedrooms, and 2 bathrooms, I would recommend exploring the Classic Pasadena home. It offers a comfortable living space with 3 bedrooms and is situated in a desirable location with a school district rating of 4.1. The beautiful backyard garden makes it perfect for outdoor entertaining, fulfilling your desire for a home with outdoor space. Plus, the price of $980,000 makes it an attractive option for those looking for a cozy and inviting property.'