In [1]:
from unittest.mock import inplace

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

manatee_housing = pd.read_csv('../../.venv/lib/Datasets/residential_property.csv', low_memory=False)

In [2]:
manatee_housing.head()

Unnamed: 0,PARID,SITUS_ADDRESS,SITUS_POSTAL_CITY,SITUS_POSTAL_ZIP,LAND_USE_CODE,LAND_USE_DESC,HOMESTEAD_INDICATOR,TAX_DISTRICT,LAST_SALE_DATE,LAST_SALE_VORI,...,COUNTY_EXEMPT_VALUE,COUNTY_TAXABLE_VALUE,OWNER_NAME_LINE1,OWNER_NAME_LINE2,MAILING_ADDRESS_LINE1,MAILING_ADDRESS_LINE2,MAILING_CITY,MAILING_STATE,MAILING_POSTAL_CODE,MAILING_COUNTRY
0,1000010056,1101 GARY AVE,ELLENTON,34222,100,Single Family Residential (1554),N,307,01/26/2004,I,...,0.0,211179.0,"SEYBOLD, ROBERT F JR","SEYBOLD, LINDA",315 CRESCENT CT E,,BRADENTON,FL,34208-1735,USA
1,1000310050,1305 24TH AVE E,PALMETTO,34221,100,Single Family Residential (1554),N,307,04/15/2003,I,...,0.0,141940.0,"KAHL, HENRY V","KAHL, MARGARET A",519 PALM AVE,,ELLENTON,FL,34222-2231,USA
2,10010007,43160 BRADLEY RD,DUETTE,34219,10,Vac Unplatted <10 Ac (1554),N,313,03/09/1998,V,...,0.0,19965.0,"REISS, JOHN H JR","GRAVES, EVELYN GULLETT",PO BOX 63,,WEST DANVILLE,VT,05873,USA
3,10010056,43164 BRADLEY RD,DUETTE,34219,100,Single Family Residential (1554),N,313,02/18/2021,I,...,0.0,145429.0,"YOUMANS, DENNIS",,43164 BRADLEY RD,,DUETTE,FL,34219,USA
4,10011005,43002 BRADLEY RD,PARRISH,34219,725,Res Related Amenities (1554),N,313,04/19/2021,I,...,0.0,63142.0,"GULLETT, DOUGLAS JAMES",,43200 BRADLEY RD,,DUETTE,FL,34219,USA


In [3]:
manatee_housing['SITUS_POSTAL_ZIP'].unique()

array([34222, 34221, 34219, 34251, 34208, 34212, 34202, 34266, 34211,
       34203, 34243, 34201, 33598, 34209, 34205, 34210, 34207, 34216,
       34217, 34218, 34215, 34228])

In [4]:
manatee_housing.columns

Index(['PARID', 'SITUS_ADDRESS', 'SITUS_POSTAL_CITY', 'SITUS_POSTAL_ZIP',
       'LAND_USE_CODE', 'LAND_USE_DESC', 'HOMESTEAD_INDICATOR', 'TAX_DISTRICT',
       'LAST_SALE_DATE', 'LAST_SALE_VORI', 'LAST_SALE_QUALIFIED',
       'LAST_SALE_PRICE', 'LAND_ACREAGE', 'BLDG1_YEAR_BUILT',
       'BLDGS_SQFT_LIVING', 'BLDGS_SQFT_UNROOF', 'SWIMMING_POOL',
       'COMMUNITY_DEV_DIST', 'CRA_NAME', 'NEIGHBORHOOD_CODE',
       'NEIGHBORHOOD_NAME', 'SUBDIVISION_CODE', 'SUBDIVISION_NAME',
       'JUST_VALUE', 'COUNTY_ASSESSED_VALUE', 'COUNTY_EXEMPT_VALUE',
       'COUNTY_TAXABLE_VALUE', 'OWNER_NAME_LINE1', 'OWNER_NAME_LINE2',
       'MAILING_ADDRESS_LINE1', 'MAILING_ADDRESS_LINE2', 'MAILING_CITY',
       'MAILING_STATE', 'MAILING_POSTAL_CODE', 'MAILING_COUNTRY'],
      dtype='object')

In [5]:
#Targeted data frame
selected_columns = [                        #dataframe for Relevant predictive variables
    'SITUS_POSTAL_ZIP', 'LAST_SALE_VORI', 'LAST_SALE_PRICE', 'LAND_ACREAGE', 'BLDG1_YEAR_BUILT',  'BLDGS_SQFT_LIVING', 'BLDGS_SQFT_UNROOF', 'SWIMMING_POOL',  'NEIGHBORHOOD_NAME', 'SUBDIVISION_NAME',  'JUST_VALUE', 'COUNTY_ASSESSED_VALUE'
]
df_model = manatee_housing[selected_columns].copy()

In [6]:
##Drop missing values of outcome variables
df_model = df_model.dropna(subset=['JUST_VALUE'])

In [7]:
##Preprocessing

df_model['SWIMMING_POOL'] = df_model['SWIMMING_POOL'].map({'Y': 1, 'N': 0}).fillna(0) #SWIMMING POOL CAT., CATEGORICAL, CHANGE TO NUMERICAL WITHOUT DUMMY VARIABLES (I.E. BOOLEAN CONVERSION)

df_model['LAST_SALE_VORI'] = df_model['LAST_SALE_VORI'].map({'V': 1, 'I': 0}).fillna(0) #Change invalid or valid sales to represent sales that are passed down, or transfered from those sold in work place. Valid purchases (i.e. market sales) are represented by 1. Invalid ones represented by 0

In [8]:
#Segmented analysis, analysis by neighborhood, zipcode, subdivision
df_model_zipcode = df_model.drop(columns=['NEIGHBORHOOD_NAME', 'SUBDIVISION_NAME'])
df_model_neighborhood = df_model.drop(columns=['SITUS_POSTAL_ZIP', 'SUBDIVISION_NAME'])
df_model_subdivision = df_model.drop(columns=['SITUS_POSTAL_ZIP', 'NEIGHBORHOOD_NAME'])

In [9]:
df_model.head()

Unnamed: 0,SITUS_POSTAL_ZIP,LAST_SALE_VORI,LAST_SALE_PRICE,LAND_ACREAGE,BLDG1_YEAR_BUILT,BLDGS_SQFT_LIVING,BLDGS_SQFT_UNROOF,SWIMMING_POOL,NEIGHBORHOOD_NAME,SUBDIVISION_NAME,JUST_VALUE,COUNTY_ASSESSED_VALUE
0,34222,0.0,1.0,0.83,1974.0,1355,2189,0,16 & 17/34/18,NOT IN SUBDIVISION 0/0,270465.0,211179.0
1,34221,0.0,137500.0,1.0,1937.0,847,922,0,"PALMETTO-NORTH OF RIVER, WEST OF I-75",NOT IN SUBDIVISION 0/0,141940.0,141940.0
2,34219,1.0,4000.0,1.0,,0,0,0,RURAL NORTHEAST AREA OF COUNTY,NOT IN SUBDIVISION 0/0,20400.0,19965.0
3,34219,0.0,129900.0,1.0,1981.0,988,1572,0,RURAL NORTHEAST AREA OF COUNTY,NOT IN SUBDIVISION 0/0,145429.0,145429.0
4,34219,0.0,10.0,5.0,,0,0,0,RURAL NORTHEAST AREA OF COUNTY,NOT IN SUBDIVISION 0/0,63376.0,63142.0


In [10]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 206120 entries, 0 to 210699
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   SITUS_POSTAL_ZIP       206120 non-null  int64  
 1   LAST_SALE_VORI         206120 non-null  float64
 2   LAST_SALE_PRICE        197049 non-null  float64
 3   LAND_ACREAGE           202449 non-null  float64
 4   BLDG1_YEAR_BUILT       180900 non-null  float64
 5   BLDGS_SQFT_LIVING      206120 non-null  int64  
 6   BLDGS_SQFT_UNROOF      206120 non-null  int64  
 7   SWIMMING_POOL          206120 non-null  int64  
 8   NEIGHBORHOOD_NAME      206120 non-null  object 
 9   SUBDIVISION_NAME       206117 non-null  object 
 10  JUST_VALUE             206120 non-null  float64
 11  COUNTY_ASSESSED_VALUE  206120 non-null  float64
dtypes: float64(6), int64(4), object(2)
memory usage: 20.4+ MB


In [11]:
df_model_zipcode = pd.get_dummies(df_model_zipcode, columns=['SITUS_POSTAL_ZIP'], drop_first=True) ##create dummy variables for categorical variable
df_model_neighborhood = pd.get_dummies(df_model_neighborhood, columns=['NEIGHBORHOOD_NAME'], drop_first=True) ##create dummy variables for categorical variable
df_model_subdivision = pd.get_dummies(df_model_subdivision, columns=['SUBDIVISION_NAME'], drop_first=True) ##create dummy variables for categorical variable

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define X and y
X = df_model_subdivision.drop(columns=['JUST_VALUE'])
y = df_model_subdivision['JUST_VALUE']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("R^2 Score:", r2_score(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))

R^2 Score: 0.9732920936021138


TypeError: got an unexpected keyword argument 'squared'