# This project showcases an AI real estate agent built with state-of-the-art LLM tools

## Step 1: Use OpenAI GPT-3.5-Turbo to generate listings of properties

In [1]:
import json
import random
import re

import numpy as np
import openai
import pandas as pd
from openai import OpenAI

In [34]:
# Read the OpenAI API key from a local text file and build the shared client.
# The key is stripped because a text file usually ends with a newline, and
# surrounding whitespace would make the key invalid.
with open('openai_api_key.txt', 'r') as file:
    openai_api_key = file.read().strip()
client = OpenAI(api_key=openai_api_key)

In [35]:
# Prompt template for generating a single listing. The "{}" placeholder on the
# last line is filled with a property type via str.format, and the field labels
# requested here are the ones generate_listings() looks for when parsing a reply.
prompt = """
You are a real estate agent, there are many properties on the market for sale, including condo, apartment, house, ranch, and mansion.
The listing must include: neighborhood name, property type, price, size, number of bedrooms, number of bathrooms, description, neighborhood description.
Use the following format:

Neighborhood name:
Property type:
Price:
Size:
Number of bedrooms:
Number of bathrooms:
Description:
Neighborhood Description:

Generate a description for a {} listed for sale:
"""

# Property types that generate_listings() picks from at random, one per listing.
property_types = ["condo", "apartment", "house", "ranch", "mansion"]

def generate_listing_description(prompt, model="gpt-3.5-turbo", temperature=1.0):
    """
    Send a single user prompt to an OpenAI chat model and return its reply.

    Args:
        prompt: Full prompt text, sent as one "user" message.
        model: Chat model name; defaults to "gpt-3.5-turbo".
        temperature: Sampling temperature; defaults to 1.0.

    Returns:
        The reply text with surrounding whitespace removed, or "" when the
        request fails or the reply carries no text. Failures are printed
        rather than raised, so a batch run keeps going.
    """
    try:
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
            temperature=temperature,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best effort: report the problem and let the caller handle "".
        print(e)
        return ""
    # The message content can be None; treat that as an empty reply.
    return content.strip() if content else ""

def generate_listings(num_listings, openai_api_key, prompt, property_types):
    """
    Generate `num_listings` listings and return them as a pandas DataFrame.

    For each listing a property type is picked at random, the prompt template
    is formatted with it, and the lower-cased model reply is parsed into one
    dictionary keyed by the expected field labels.

    Args:
        num_listings: Number of listings to generate.
        openai_api_key: Unused; kept so existing callers keep working.
        prompt: Template containing one "{}" placeholder for the property type.
        property_types: Sequence of property types to sample from.

    Returns:
        A DataFrame with one row per listing and one column per field. Fields
        missing from a reply are left as empty strings.
    """
    keys = ["neighborhood name",
            "property type",
            "price",
            "size",
            "number of bedrooms",
            "number of bathrooms",
            "description",
            "neighborhood description"]
    listings = []
    for _ in range(num_listings):
        property_type = random.choice(property_types)
        # Format into a new name: rebinding `prompt` would consume the "{}"
        # placeholder on the first pass and pin every listing to one type.
        listing_prompt = prompt.format(property_type)
        listing_str = generate_listing_description(listing_prompt).lower()
        listing_dict = {key: "" for key in keys}
        current_key = None
        for raw_line in listing_str.split('\n'):
            line = raw_line.strip()
            line_key = next((key for key in keys if line.startswith(key + ":")), None)
            if line_key:
                current_key = line_key
                # Slice past "<label>:" rather than splitting on ": ", which
                # fails when the value starts on the following line.
                listing_dict[current_key] = line[len(line_key) + 1:].strip()
            elif current_key and line:
                # Continuation line: append it to the field being filled.
                listing_dict[current_key] = (listing_dict[current_key] + " " + line).strip()
        listings.append(listing_dict)
    # Passing the columns keeps the schema stable even when no rows exist.
    listings_df = pd.DataFrame(listings, columns=keys)
    return listings_df

In [36]:
# Test generate_listing_description(prompt) on a single property type.
# The formatted text goes into its own variable so the `prompt` template keeps
# its "{}" placeholder for generate_listings() in the next cell.
test_prompt = prompt.format("apartment")
response = generate_listing_description(test_prompt)
response

'Neighborhood name: Parkside Heights\nProperty type: Apartment\nPrice: $300,000\nSize: 900 sq ft\nNumber of bedrooms: 2\nNumber of bathrooms: 2\nDescription: This modern and spacious apartment features an open concept living area, updated kitchen with stainless steel appliances, and a private balcony with stunning views. The master bedroom has a walk-in closet and en-suite bathroom. The building amenities include a gym, pool, and rooftop terrace.\nNeighborhood Description: Parkside Heights is a vibrant and walkable neighborhood with tree-lined streets, trendy cafes, and easy access to parks and public transportation. Residents enjoy a sense of community and close proximity to shopping and dining options.'

In [3]:
# Generate a batch of listings with
# generate_listings(num_listings, openai_api_key, prompt, property_types).
# NOTE(review): `prompt` must still contain its "{}" placeholder at this point.
num_listings = 20
listings_df = generate_listings(num_listings, openai_api_key, prompt, property_types)

In [4]:
# Preview the first rows of the generated listings
listings_df.head()

Unnamed: 0,neighborhood name,property type,price,size,number of bedrooms,number of bathrooms,description,neighborhood description
0,willow creek,ranch,"$750,000",10 acres,3,2,this charming ranch features a renovated farmh...,willow creek is a peaceful rural community kno...
1,midtown,condo,"$500,000","1,200 sqft",2,2,this modern and sleek condo in the heart of mi...,"midtown is known for its vibrant nightlife, tr..."
2,maplewood,apartment,"$300,000",900 sqft,2,1,this charming apartment in the heart of maplew...,maplewood is a vibrant and family-friendly nei...
3,rolling hills estates,ranch,"$1,500,000",20 acres,4,3,this stunning ranch property offers 20 acres o...,rolling hills estates is known for its picture...
4,oakridge heights,condo,"$300,000",900 sqft,2,1,this cozy condo in oakridge heights offers a c...,oakridge heights is a bustling neighborhood wi...


In [5]:
# Save the generated listings to CSV so Step 2 can reload them
listings_df.to_csv("listings.csv")

## Step 2: Create embeddings for the listings

In [3]:
# Load the listings saved in Step 1; index_col=0 reads the first CSV column
# (the index written by to_csv) back as the DataFrame index.
listings_df = pd.read_csv("./listings.csv", index_col=0)

In [4]:
# Concatenate description and neighborhood description into one column.
# Empty fields come back from the CSV as NaN; fill them with "" first, because
# str.cat would otherwise yield NaN for the row and the embedding step would
# fail on a non-string value.
listings_df["combined_description"] = (
    listings_df["description"].fillna("")
    .str.cat(listings_df["neighborhood description"].fillna(""), sep=" ")
)

In [5]:
# Inspect the combined text of the listing with index label 0
listings_df["combined_description"][0]

'this charming ranch features a renovated farmhouse with a spacious living room, cozy fireplace, and a beautiful wrap-around porch. the property includes a barn with horse stables, a riding arena, and plenty of open pastures for livestock. perfect for those looking for a peaceful country retreat. willow creek is a peaceful rural community known for its large ranch properties and picturesque landscapes. residents enjoy the tranquility of country living while still being within a short drive to nearby towns for shopping and dining.'

In [6]:
# Embeddings use the "text-embedding-ada-002" model (see get_embedding below).
# (Re)create the OpenAI client from the API key loaded in Step 1.
client = OpenAI(api_key=openai_api_key)
def get_embedding(text, model="text-embedding-ada-002"):
    """
    Return the embedding vector for `text` from the OpenAI embeddings API.

    Args:
        text: String to embed; newlines are replaced with spaces first.
        model: Embedding model name; defaults to "text-embedding-ada-002".
            NOTE(review): the LanceDB schema in this notebook declares a
            1536-dimensional vector, so another model must match that size.

    Returns:
        The embedding of the first (and only) input, as returned by the API.
    """
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

# Embed every combined description (get_embedding is called once per row)
# and store the resulting vectors in a new column.
listings_df["ada_embeddings"] = listings_df["combined_description"].apply(get_embedding)

In [7]:
# Price stays a string here: the float conversion below is commented out,
# and the PropertyListings schema declares `price: str`.
# listings_df["price"] = listings_df["price"].str.replace("$", "").str.replace(",", "").astype(float)
listings_df.head()

Unnamed: 0,neighborhood name,property type,price,size,number of bedrooms,number of bathrooms,description,neighborhood description,combined_description,ada_embeddings
0,willow creek,ranch,"$750,000",10 acres,3,2.0,this charming ranch features a renovated farmh...,willow creek is a peaceful rural community kno...,this charming ranch features a renovated farmh...,"[-0.00754887517541647, 0.003767798189073801, -..."
1,midtown,condo,"$500,000","1,200 sqft",2,2.0,this modern and sleek condo in the heart of mi...,"midtown is known for its vibrant nightlife, tr...",this modern and sleek condo in the heart of mi...,"[0.005034128203988075, 0.007064504083245993, -..."
2,maplewood,apartment,"$300,000",900 sqft,2,1.0,this charming apartment in the heart of maplew...,maplewood is a vibrant and family-friendly nei...,this charming apartment in the heart of maplew...,"[-0.0019746043253690004, -0.005034757778048515..."
3,rolling hills estates,ranch,"$1,500,000",20 acres,4,3.0,this stunning ranch property offers 20 acres o...,rolling hills estates is known for its picture...,this stunning ranch property offers 20 acres o...,"[-0.009719268418848515, 0.0010025061201304197,..."
4,oakridge heights,condo,"$300,000",900 sqft,2,1.0,this cozy condo in oakridge heights offers a c...,oakridge heights is a bustling neighborhood wi...,this cozy condo in oakridge heights offers a c...,"[0.01581690087914467, -0.00694337347522378, 0...."


In [8]:
# Save listings_df, including the embedding column, to CSV
listings_df.to_csv("./listings_with_embedding.csv")

## Step 3: Store listings in a vector database

In [9]:
import lancedb
from lancedb.pydantic import vector, LanceModel

class PropertyListings(LanceModel):
    """LanceDB table schema for one property listing and its embedding."""
    # Field names match the DataFrame columns after the rename in the next cell.
    neighborhood_name: str
    property_type: str
    price: str  # kept as text, e.g. "$750,000"
    size: str  # free text, e.g. "10 acres" or "1,200 sqft"
    num_bedrooms: int
    num_bathrooms: float  # float because values such as 2.5 occur
    description: str
    neighborhood_description: str
    combined_description: str  # description + neighborhood description
    ada_embeddings: vector(1536)  # 1536-dimensional embedding of combined_description

# Open (or create) the local LanceDB database and (re)create the listings table.
db = lancedb.connect("./.lancedb")
table_name = "property_listings"
# mode="overwrite" replaces an existing table and also works on a fresh
# database, where an unconditional drop_table() fails because nothing exists yet.
table = db.create_table(table_name, schema=PropertyListings, mode="overwrite")

In [10]:
# Map the space-separated CSV column names onto the PropertyListings field
# names, then insert every listing into the LanceDB table.
schema_column_names = {
    "neighborhood name": "neighborhood_name",
    "property type": "property_type",
    "number of bedrooms": "num_bedrooms",
    "number of bathrooms": "num_bathrooms",
    "neighborhood description": "neighborhood_description",
}
listings_df = listings_df.rename(columns=schema_column_names)
table.add(listings_df)

In [11]:
# Test database search with a random 1536-dimensional query vector
# (same length as the ada_embeddings column in the schema)
query = np.random.randn(1536)

In [12]:
# Fetch the single closest listing to the random query vector
table.search(query).limit(1).to_pandas()

Unnamed: 0,neighborhood_name,property_type,price,size,num_bedrooms,num_bathrooms,description,neighborhood_description,combined_description,ada_embeddings,_distance
0,willow creek,ranch,"$750,000",10 acres,3,2.0,this charming ranch features a renovated farmh...,willow creek is a peaceful rural community kno...,this charming ranch features a renovated farmh...,"[-0.007548875, 0.0037677982, -0.0014067, 0.003...",1541.103882


## Step 4: Collect buyer preferences

In [13]:
# Prompts for simulating buyer preferences with gpt-3.5-turbo.
# Both personas answer the same questionnaire; only the opening line differs.
_buyer_questionnaire = """A real estate agent is helping you choosing a property listing that best suits your preferences.

Answer the following questions:

Questions:
What kind of property are you interested in buying?
What kind of neighborhood would you like to live in?
How many bedrooms and bathrooms do you need?
What amenities would you like?

Answers:
"""

average_buyer_prompt = (
    "\nYou are a real estate buyer, you are interetsed in buying either a house, a condo, or an apartment, but you can only choose one.\n"
    + _buyer_questionnaire
)

high_net_value_buyer_prompt = (
    "\nYou are a high net value real estate buyer, you are interested in buying a mansion or a ranch, but you can only choose one.\n"
    + _buyer_questionnaire
)

# List of the buyer prompts to sample from
buyer_prompt = [average_buyer_prompt, high_net_value_buyer_prompt]

# (Re)create the OpenAI client; this repeats the client setup from Step 1
client = OpenAI(api_key=openai_api_key)

def generate_buyer_preference(prompt, model="gpt-3.5-turbo", temperature=1.0):
    """
    Ask an OpenAI chat model to answer a buyer-preference questionnaire.

    Args:
        prompt: Buyer persona prompt, sent as one "user" message.
        model: Chat model name; defaults to "gpt-3.5-turbo".
        temperature: Sampling temperature; defaults to 1.0.

    Returns:
        The reply text with surrounding whitespace removed, or "" when the
        request fails or the reply carries no text. Failures are printed
        rather than raised.
    """
    try:
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
            temperature=temperature,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best effort: report the problem and let the caller handle "".
        print(e)
        return ""
    # The message content can be None; treat that as an empty reply.
    return content.strip() if content else ""
def clean_buyer_preference(response):
    """
    Condense a raw buyer-preference reply into one line of preference text.

    The reply is split into lines, blank lines and lines containing "?"
    (questions echoed back by the model) are dropped, and the rest is joined
    with single spaces. Numbered bullets ("1. ") and leading "I <word> "
    phrases such as "I am " or "I would " are then removed.

    Args:
        response: Raw model reply; may be empty.

    Returns:
        The cleaned single-line text ("" for an empty reply).
    """
    if not response:
        return ""
    # Split once into lines and keep working on that list. Testing `"?" in`
    # against a list checks for an element equal to "?", not for the character.
    answer_lines = [line.strip() for line in response.splitlines()]
    answer_lines = [line for line in answer_lines if line and "?" not in line]
    result_response = " ".join(answer_lines)
    # remove numerical bullet points in result_response
    result_response = re.sub(r'\d+\. ', '', result_response)
    # remove "I [text] " pattern; \b keeps it from matching a word ending in "I"
    cleaned_response = re.sub(r'\bI \w+ ', '', result_response)
    return cleaned_response

In [14]:
# Sample one raw reply for the average-buyer persona and display it
response = generate_buyer_preference(average_buyer_prompt)
response

'I am interested in buying a house.\nI would like to live in a quiet and safe neighborhood with good schools and access to parks.\nI would like to have at least 3 bedrooms and 2 bathrooms.\nI would like amenities such as a backyard, a garage, and a spacious kitchen.'

In [15]:
# Flatten the raw reply into one line of preference text and display it
cleaned_response = clean_buyer_preference(response)
cleaned_response

'interested in buying a house. like to live in a quiet and safe neighborhood with good schools and access to parks. like to have at least 3 bedrooms and 2 bathrooms. like amenities such as a backyard, a garage, and a spacious kitchen.'

In [21]:
# Generate 10 buyer preferences and store them with their embeddings.
# Records are collected in a list and turned into a DataFrame once, instead of
# filling the frame cell by cell. Loop-local names are distinct so the loop no
# longer overwrites the global `prompt` template used for listing generation.
num_of_samples = 10
buyer_preference_records = []
for _ in range(num_of_samples):
    sampled_buyer_prompt = random.choice(buyer_prompt)
    raw_preference = generate_buyer_preference(sampled_buyer_prompt)
    cleaned_preference = clean_buyer_preference(raw_preference)
    buyer_preference_records.append({
        "cleaned_buyer_preference": cleaned_preference,
        "embeddings": get_embedding(cleaned_preference),
    })
buyer_preference_df = pd.DataFrame(
    buyer_preference_records,
    columns=["cleaned_buyer_preference", "embeddings"],
)

In [22]:
# Preview the generated buyer preferences and their embeddings
buyer_preference_df.head()

Unnamed: 0,cleaned_buyer_preference,embeddings
0,interested in buying a mansion. like to live i...,"[-0.026847463101148605, 0.0042910329066216946,..."
1,interested in buying a mansion. like to live i...,"[-0.02824532985687256, 0.004247883800417185, -..."
2,interested in buying a mansion. like to live i...,"[-0.024769064038991928, -0.0010675755329430103..."
3,interested in buying a mansion. like to live i...,"[-0.02311307191848755, 0.0027584778144955635, ..."
4,interested in buying a house. like to live in ...,"[-0.010938787832856178, 0.021456116810441017, ..."


In [23]:
# Save the buyer preferences and their embeddings to CSV
buyer_preference_df.to_csv("./buyer_preference_embeddings.csv")

## Step 5: Search listings based on buyer's preference

### 5.1 Test an average buyer

The top two matches are houses and the third is an apartment, all moderately priced and in family-friendly neighborhoods.

In [16]:
# Reload the saved buyer preferences. CSV stores each embedding as the text
# "[0.1, 0.2, ...]", so parse it back into a list of floats; otherwise the
# vector search below would receive a string instead of a vector.
buyer_preference_df = pd.read_csv(
    "./buyer_preference_embeddings.csv",
    index_col=0,
    converters={"embeddings": json.loads},
)

In [25]:
# Print the cleaned preference text of the buyer at position 4, then use that
# buyer's embedding to retrieve the 3 closest listings from the table.
print(buyer_preference_df.iloc[4]['cleaned_buyer_preference'])
query = buyer_preference_df.iloc[4]['embeddings']
table.search(query).limit(3).to_pandas()

interested in buying a house. like to live in a family-friendly neighborhood with good schools and amenities nearby. need at least 3 bedrooms and 2 bathrooms. like amenities such as a backyard, a garage, and possibly a pool.


Unnamed: 0,neighborhood_name,property_type,price,size,num_bedrooms,num_bathrooms,description,neighborhood_description,combined_description,ada_embeddings,_distance
0,oakwood heights,house,"$500,000","2,500 sqft",4,3.0,"this charming 4-bedroom, 3-bathroom house is p...",oakwood heights is a quiet and family-friendly...,"this charming 4-bedroom, 3-bathroom house is p...","[0.008564916, 0.014116251, -0.0142616425, -0.0...",0.334512
1,willow creek,house,"$350,000","2,000 sq ft",4,2.5,"this charming 4 bedroom, 2.5 bathroom house in...",willow creek is a family-friendly neighborhood...,"this charming 4 bedroom, 2.5 bathroom house in...","[0.0067040585, 0.006078434, -0.0076392023, -0....",0.341342
2,maplewood,apartment,"$300,000",900 sqft,2,1.0,this charming apartment in the heart of maplew...,maplewood is a vibrant and family-friendly nei...,this charming apartment in the heart of maplew...,"[-0.0019746043, -0.005034758, 0.0060494402, -0...",0.386352


### 5.2 Test a high net value buyer

All three top matches are mansions.

In [26]:
# Print the cleaned preference text of the buyer at position 1, then use that
# buyer's embedding to retrieve the 3 closest listings from the table.
print(buyer_preference_df.iloc[1]['cleaned_buyer_preference'])
query = buyer_preference_df.iloc[1]['embeddings']
table.search(query).limit(3).to_pandas()

interested in buying a mansion. like to live in a luxurious and exclusive neighborhood. need at least 5 bedrooms and 5 bathrooms. like amenities such as a swimming pool, tennis court, home gym, theater room, and expansive outdoor space.


Unnamed: 0,neighborhood_name,property_type,price,size,num_bedrooms,num_bathrooms,description,neighborhood_description,combined_description,ada_embeddings,_distance
0,bel air,mansion,"$15,000,000","10,000 sqft",6,8.0,this luxurious mansion in the exclusive neighb...,"bel air is known for its affluent residents, u...",this luxurious mansion in the exclusive neighb...,"[0.006617324, 0.0033311045, 0.0016222703, -0.0...",0.273501
1,bel air,mansion,"$20,000,000","15,000 sqft",8,10.0,this luxurious mansion in bel air offers unpar...,bel air is one of the most prestigious and exc...,this luxurious mansion in bel air offers unpar...,"[0.0076307715, 0.0014425404, -0.008280199, -0....",0.294902
2,willow creek estates,mansion,"$3,500,000","10,000 sqft",6,8.0,"this stunning mansion is a true masterpiece, b...",willow creek estates is a prestigious and excl...,"this stunning mansion is a true masterpiece, b...","[0.0046979873, 0.0049256873, -0.004929036, -0....",0.309169


## Step 6: Alter the retrieved listing's description with the buyer's preferences

## Use the Retrieval-Augmented Generation (RAG) technique to alter the description of the matched property

In [38]:
# RAG prompt template with two positional "{}" placeholders: the first takes
# the buyer's preferences (context), the second the matched listing description.
rag_prompt = """
You are a real estate agent.
Genearte a tailored description based on the context below, highlight the specific preferences in the context.
Do not change factual information including name, neighborhood, amenities, and location.

Context: 

{}

---

Description: 

{}

Tailored description:
"""

In [39]:
# Build a RAG test case from the buyer at position 1: the cleaned preference
# text is the context, and the combined description of the single closest
# listing is the text to tailor.
test_context = buyer_preference_df.iloc[1]['cleaned_buyer_preference']
test_query = buyer_preference_df.iloc[1]['embeddings']
test_matchup = table.search(test_query).limit(1).to_pandas()
test_matchup_description = test_matchup["combined_description"][0]

In [40]:
# Fill the template with the buyer context and the matched description.
# NOTE(review): this overwrites the `rag_prompt` template with the formatted
# text, so the cell defining the template must be re-run before formatting again.
rag_prompt=rag_prompt.format(test_context, test_matchup_description)
print(rag_prompt)


You are a real estate agent.
Genearte a tailored description based on the context below, highlight the specific preferences in the context.
Do not change factual information including name, neighborhood, amenities, and location.

Context: 

interested in buying a mansion. like to live in a luxurious and exclusive neighborhood. need at least 5 bedrooms and 5 bathrooms. like amenities such as a swimming pool, tennis court, home gym, theater room, and expansive outdoor space.

---

Description: 

this luxurious mansion in the exclusive neighborhood of bel air boasts 10,000 sqft of living space, featuring 6 spacious bedrooms, 8 bathrooms, a grand foyer, gourmet kitchen, home theater, wine cellar, and a sprawling backyard with a pool and tennis court. the architectural design is breathtaking, with high ceilings, marble floors, and panoramic views of the city. bel air is known for its affluent residents, upscale shopping, and stunning hillside mansions. located in the heart of los angeles, 

In [41]:
# Test the RAG prompt. generate_listing_description is reused here as a
# generic single-prompt chat helper.
response = generate_listing_description(rag_prompt)
print(response)

Welcome to your dream home in the exclusive neighborhood of Bel Air! This luxurious mansion meets all of your criteria with 6 spacious bedrooms, 8 bathrooms, a grand foyer, gourmet kitchen, home theater, and a wine cellar. The expansive outdoor space includes a pool, tennis court, and breathtaking views of the city. Located in the heart of Los Angeles, Bel Air offers privacy and tranquility while still being close to all the excitement the city has to offer. Don't miss this opportunity to live in luxury and style in one of the most sought-after neighborhoods in the city.
