# Objective

Predict if a property is a good investment based on financial and physical attributes.

<b>Target:</b>

A binary variable indicating whether a property is a good (1) or bad (0) investment. Define it using criteria such as a high rent-to-price ratio or a market estimate that compares favorably with the listed price.


In [255]:
import pandas as pd

# Data exploration


In [256]:
# Load the raw listings; describe() gives a quick numeric summary of the columns.
real_estate_data = pd.read_csv("./data/real_estate_data.csv")
real_estate_data.describe()

# Count the missing entries in every column.
missing_values = real_estate_data.isna().sum()

missing_values

State                0
City                 0
Street               0
Zipcode              0
Bedroom             14
Bathroom            34
Area                 0
PPSq                 0
LotArea            902
MarketEstimate    7236
RentEstimate      5976
Latitude             0
Longitude            0
ListedPrice          0
dtype: int64

In [257]:
# Count the rows that have no missing value in any column (and confirm the
# remaining per-column null counts) to judge whether simply dropping every
# incomplete row is a viable option.

real_estate_data.dropna().pipe(
    lambda complete_rows: (len(complete_rows), complete_rows.isnull().sum())
)

(14853,
 State             0
 City              0
 Street            0
 Zipcode           0
 Bedroom           0
 Bathroom          0
 Area              0
 PPSq              0
 LotArea           0
 MarketEstimate    0
 RentEstimate      0
 Latitude          0
 Longitude         0
 ListedPrice       0
 dtype: int64)

In [258]:
# Typical gap between the market estimate and the listed price: the median of
# (MarketEstimate - ListedPrice) over every listing that has a market
# estimate. This is a single dataset-wide value, not a per-state figure.

no_null_market_estimate = real_estate_data[real_estate_data["MarketEstimate"].notnull()]

market_minus_list = (
    no_null_market_estimate["MarketEstimate"] - no_null_market_estimate["ListedPrice"]
)
avg_market_minus_list = market_minus_list.median()
avg_market_minus_list

-1600.0

# Data preprocessing


In [259]:
# Remove all rows without a Rent Estimate as this value is vital to my analysis.
# An explicit copy is taken because the filtered frame is modified in place
# afterwards (imputed values, new feature columns); writing to a boolean-mask
# slice of the original frame would raise SettingWithCopyWarning.

real_estate_data = real_estate_data[real_estate_data["RentEstimate"].notnull()].copy()

len(real_estate_data), real_estate_data.isnull().sum()

(16705,
 State                0
 City                 0
 Street               0
 Zipcode              0
 Bedroom              9
 Bathroom            24
 Area                 0
 PPSq                 0
 LotArea            611
 MarketEstimate    1308
 RentEstimate         0
 Latitude             0
 Longitude            0
 ListedPrice          0
 dtype: int64)

In [260]:
# Fill missing market estimates with the listed price plus the median
# market-minus-list difference (avg_market_minus_list) computed during
# exploration. Done as one vectorised assignment instead of a row-by-row loop.
missing_market_estimate = real_estate_data["MarketEstimate"].isna()
real_estate_data.loc[missing_market_estimate, "MarketEstimate"] = (
    real_estate_data.loc[missing_market_estimate, "ListedPrice"]
    + avg_market_minus_list
)

real_estate_data.isnull().sum()

State               0
City                0
Street              0
Zipcode             0
Bedroom             9
Bathroom           24
Area                0
PPSq                0
LotArea           611
MarketEstimate      0
RentEstimate        0
Latitude            0
Longitude           0
ListedPrice         0
dtype: int64

In [261]:
# Populate missing bed, bath, and lot area values with the median of same-state properties of similar area.


def get_median_for_row(row: pd.Series, df: pd.DataFrame, area_offset=200):
    """Return median bedroom, bathroom and lot-area values of comparable homes.

    Comparable homes are the rows of ``df`` in the same ``State`` as ``row``
    whose ``Area`` lies within ``area_offset`` of ``row["Area"]`` (bounds
    inclusive).

    Args:
        row: The property being imputed; must provide ``State`` and ``Area``.
        df: Frame with ``State``, ``Area``, ``Bedroom``, ``Bathroom`` and
            ``LotArea`` columns.
        area_offset: Half-width of the area window around ``row["Area"]``.

    Returns:
        Tuple ``(median_bed, median_bath, median_lot)``. When no comparable
        home has a value for a column, that entry falls back to the median
        over the whole state; it is NaN only if the state has no value at all.
    """
    area = row["Area"]
    same_state = df[df["State"].eq(row["State"])]
    # Both bounds have to hold at once. The earlier "<= upper or >= lower"
    # condition was true for every row, so the area window was never applied.
    comparable = same_state[
        same_state["Area"].between(area - area_offset, area + area_offset)
    ]

    def _median(column):
        # Prefer the similar-area median; use the state median when the
        # window contains no usable value for this column.
        value = comparable[column].median()
        if pd.isna(value):
            value = same_state[column].median()
        return value

    median_bed = _median("Bedroom")
    median_bath = _median("Bathroom")
    median_lot = _median("LotArea")
    return median_bed, median_bath, median_lot


# Fill missing Bedroom / Bathroom / LotArea values from get_median_for_row.
# Only rows with at least one missing value are visited, and the medians are
# looked up once per row: filling one of these columns for the current row
# does not change the medians of the other two, so repeating the lookup for
# each missing column is unnecessary.
imputed_columns = ["Bedroom", "Bathroom", "LotArea"]
incomplete_rows = real_estate_data[
    real_estate_data[imputed_columns].isna().any(axis=1)
]

for index, row in incomplete_rows.iterrows():
    medians = dict(
        zip(imputed_columns, get_median_for_row(row, real_estate_data))
    )
    for column in imputed_columns:
        if pd.isna(row[column]):
            real_estate_data.at[index, column] = medians[column]

real_estate_data.isnull().sum()

State             0
City              0
Street            0
Zipcode           0
Bedroom           0
Bathroom          0
Area              0
PPSq              0
LotArea           0
MarketEstimate    0
RentEstimate      0
Latitude          0
Longitude         0
ListedPrice       0
dtype: int64

In [263]:
from src.const import property_tax_rates

# Derived investment features.

# Rent estimate relative to the asking price.
real_estate_data["RentToPriceRatio"] = real_estate_data["RentEstimate"].div(
    real_estate_data["ListedPrice"]
)

# Market estimate relative to the asking price.
real_estate_data["MarketToPriceRatio"] = real_estate_data["MarketEstimate"].div(
    real_estate_data["ListedPrice"]
)

# Yearly property tax: market estimate times the rate that property_tax_rates
# holds for the listing's state.
real_estate_data["AnnualPropertyTaxEstimate"] = real_estate_data["MarketEstimate"].mul(
    real_estate_data["State"].map(property_tax_rates)
)


# Estimated monthly mortgage payment - assuming a 30-year fixed mortgage at a
# 6% annual interest rate, with the full listed price used as the principal.
interest_rate = 0.06 / 12  # monthly rate
loan_term = 30 * 12  # number of monthly payments
real_estate_data["MonthlyMortgageEstimate"] = (
    real_estate_data["ListedPrice"]
    .mul(interest_rate)
    .mul((1 + interest_rate) ** loan_term)
    .div((1 + interest_rate) ** loan_term - 1)
)

# Preview the price inputs next to the newly derived columns.
real_estate_data.head()[
    [
        "ListedPrice",
        "MarketEstimate",
        "RentEstimate",
        "RentToPriceRatio",
        "MarketToPriceRatio",
        "AnnualPropertyTaxEstimate",
        "MonthlyMortgageEstimate",
    ]
]

Unnamed: 0,ListedPrice,MarketEstimate,RentEstimate,RentToPriceRatio,MarketToPriceRatio,AnnualPropertyTaxEstimate,MonthlyMortgageEstimate
0,239900.0,240600.0,1599.0,0.006665,1.002918,986.46,1438.32171
3,335000.0,336200.0,1932.0,0.005767,1.003582,1378.42,2008.494259
4,250000.0,222700.0,1679.0,0.006716,0.8908,913.07,1498.876313
5,151000.0,150500.0,1385.0,0.009172,0.996689,617.05,905.321293
6,239000.0,238400.0,2125.0,0.008891,0.99749,977.44,1432.925755


# Decision Tree
