# Objective

Predict whether a property is a good investment based on its financial and physical attributes.

<b>Target:</b>

A binary variable indicating a good (1) or bad (0) investment. It is defined from criteria such as a high rent-to-price ratio or a market estimate that is favorable compared to the listed price.


In [145]:
import pandas as pd

# Data exploration


In [186]:
# Load the raw listings; the path is relative to the notebook's working directory.
real_estate_data = pd.read_csv("./data/real_estate_data.csv")
# NOTE(review): this summary is not the cell's last expression, so a notebook
# will not render it — move it to its own cell if the statistics are wanted.
real_estate_data.describe()

# Count the missing entries in each column.
missing_values = real_estate_data.isna().sum()
missing_values

State                0
City                 0
Street               0
Zipcode              0
Bedroom             14
Bathroom            34
Area                 0
PPSq                 0
LotArea            902
MarketEstimate    7236
RentEstimate      5976
Latitude             0
Longitude            0
ListedPrice          0
dtype: int64

In [187]:
# Count the rows that have no missing values at all. This helps decide whether
# simply dropping every incomplete row would still leave enough data.

# Drop the incomplete rows once and reuse the result for both outputs.
complete_rows = real_estate_data.dropna()
len(complete_rows), complete_rows.isnull().sum()

(14853,
 State             0
 City              0
 Street            0
 Zipcode           0
 Bedroom           0
 Bathroom          0
 Area              0
 PPSq              0
 LotArea           0
 MarketEstimate    0
 RentEstimate      0
 Latitude          0
 Longitude         0
 ListedPrice       0
 dtype: int64)

In [190]:
# Median gap between the market estimate and the listed price, taken over every
# listing that has a market estimate (overall, not broken down by state).

no_null_market_estimate = real_estate_data[real_estate_data["MarketEstimate"].notnull()]
# Despite its name, this holds the per-listing difference
# (MarketEstimate - ListedPrice); the aggregation happens in the median below.
avg_market_minus_list = (
    no_null_market_estimate["MarketEstimate"] - no_null_market_estimate["ListedPrice"]
)

avg_market_minus_list.median()

1600.0

# Data preprocessing


In [148]:
# Remove all rows without a Rent Estimate, as this value is vital to the analysis.
# The .copy() makes the filtered result an independent DataFrame, so assigning
# new columns to it afterwards does not raise pandas' SettingWithCopyWarning.

real_estate_data = real_estate_data[real_estate_data["RentEstimate"].notnull()].copy()

# Re-count the missing entries per column after the filter.
missing_values = real_estate_data.isnull().sum()
missing_values

State                0
City                 0
Street               0
Zipcode              0
Bedroom              9
Bathroom            24
Area                 0
PPSq                 0
LotArea            611
MarketEstimate    1308
RentEstimate         0
Latitude             0
Longitude            0
ListedPrice          0
dtype: int64

In [137]:
from src.const import property_tax_rates

# Derived investment features, all computed column-wise.
listed_price = real_estate_data["ListedPrice"]
market_estimate = real_estate_data["MarketEstimate"]

# Rent estimate relative to the asking price.
real_estate_data["RentToPriceRatio"] = real_estate_data["RentEstimate"] / listed_price

# Market estimate relative to the asking price.
real_estate_data["MarketToPriceRatio"] = market_estimate / listed_price

# Property tax: market estimate times the rate looked up by state.
# NOTE(review): presumably property_tax_rates maps state codes to annual rates;
# states missing from it would produce NaN here — confirm against src.const.
state_tax_rate = real_estate_data["State"].map(property_tax_rates)
real_estate_data["AnnualPropertyTaxEstimate"] = market_estimate * state_tax_rate

# Estimated monthly mortgage payment - assuming a 30-year fixed mortgage at 6%
# interest, with the full listed price treated as the loan principal.
interest_rate = 0.06 / 12  # monthly rate
loan_term = 30 * 12  # number of monthly payments
# Fixed-rate amortization: P * r * (1 + r)^n / ((1 + r)^n - 1)
compound_growth = (1 + interest_rate) ** loan_term
real_estate_data["MonthlyMortgageEstimate"] = (
    listed_price * interest_rate * compound_growth / (compound_growth - 1)
)

real_estate_data.head()

(State                           0
 City                            0
 Street                          0
 Zipcode                         0
 Bedroom                        14
 Bathroom                       34
 Area                            0
 PPSq                            0
 LotArea                       902
 MarketEstimate               7236
 RentEstimate                 5976
 Latitude                        0
 Longitude                       0
 ListedPrice                     0
 RentToPriceRatio             5976
 MarketToPriceRatio           7236
 AnnualPropertyTaxEstimate    7236
 MonthlyMortgageEstimate         0
 dtype: int64,
   State         City             Street  Zipcode  Bedroom  Bathroom    Area  \
 0    AL     Saraland           Scott Dr  36571.0      4.0       2.0  1614.0   
 1    AL  Robertsdale    Cowpen Creek Rd  36567.0      3.0       2.0  1800.0   
 2    AL  Gulf Shores  Spinnaker Dr #201  36542.0      2.0       2.0  1250.0   
 3    AL      Chelsea         Malle

# Decision Tree
