## Random Forest ##

In [21]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error

In [2]:
file_path = Path("../../fintech/PROJECT_2/Assesor_C.csv")
ma_df = pd.read_csv(file_path)
ma_df['Pool'].replace({True: 1, False: 0}, inplace=True)
ma_df = ma_df[ma_df.FullCashValue != 0]
ma_df.head()

Unnamed: 0,OwnerName,APN,SitusAddress,SitusCity,SitusZip,SubdivisionID,SubdivisionName,Mcr,SectionTownshipRange,IsRental,...,ConstructionYear,WeightedAge,PoolArea,Foreclosed,Floor1SqFt,Floor2SqFt,Floor3SqFt,BasementSqFt,Pool,Garage
2,IGLESIA DE DIOS 7 MO DIA PHX,"{'Book': '106', 'Map': '08', 'Item': '139A', '...","3646 W LATHAM ST PHOENIX, AZ 85009",PHOENIX,85009.0,1333,WEST PHOENIX NO 4,3030.0,03-1N-2E,False,...,1927,1927,0,False,1028,0,0,0,0,False
3,IGLESIA DE DIOS 7 MO DIA PHX,"{'Book': '106', 'Map': '08', 'Item': '139B', '...","3646 W LATHAM ST PHOENIX, AZ 85009",PHOENIX,85009.0,1333,WEST PHOENIX NO 4,3030.0,03-1N-2E,False,...,1927,1927,0,False,680,0,0,0,0,False
13,PHOENIX CITY OF,"{'Book': '105', 'Map': '55', 'Item': '128', 'c...","2068 W WAYLAND RD PHOENIX, AZ 85041",PHOENIX,85041.0,1186,PARK PHOENIX 2,9636.0,25-1N-2E,True,...,1961,1961,0,False,1180,0,0,0,0,False
14,PHOENIX CITY OF,"{'Book': '105', 'Map': '75', 'Item': '304A', '...","2294 W PECAN RD PHOENIX, AZ 85041",PHOENIX,85041.0,1226,PARK PHOENIX 3,9720.0,25-1N-2E,True,...,1972,1972,0,False,1904,0,0,0,0,False
76,PHOENIX CITY OF,"{'Book': '115', 'Map': '05', 'Item': '096', 'c...","1734 E MADISON ST PHOENIX, AZ 85034",PHOENIX,85034.0,2275,COLLINS ADDITION PHOENIX BLOCKS 55-56,111.0,10-1N-3E,False,...,1979,1979,0,False,1248,0,0,0,0,False


In [3]:
#Applying Get Dummies to Zip Codes
rf_df = ma_df[['SitusZip','LandSize', 'LivableSqFootage', 'ConstructionYear', 'Pool', 'FullCashValue']].copy()
rf_df = pd.get_dummies(rf_df, columns=["SitusZip", 'ConstructionYear'])
rf_df = rf_df[rf_df.FullCashValue != 0]

In [4]:
cols = ['FullCashValue']

Q1 = rf_df[cols].quantile(0.25)
Q3 = rf_df[cols].quantile(0.75)
IQR = Q3 - Q1

rf_df = rf_df[~((ma_df[cols] < (Q1 - 1.5 * IQR)) |(ma_df[cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

In [5]:
X = rf_df.copy()
X.drop("FullCashValue", axis=1, inplace=True)
X.head()

Unnamed: 0,LandSize,LivableSqFootage,Pool,SitusZip_85003.0,SitusZip_85004.0,SitusZip_85006.0,SitusZip_85007.0,SitusZip_85008.0,SitusZip_85009.0,SitusZip_85012.0,...,ConstructionYear_2009,ConstructionYear_2010,ConstructionYear_2012,ConstructionYear_2013,ConstructionYear_2014,ConstructionYear_2015,ConstructionYear_2016,ConstructionYear_2017,ConstructionYear_2018,ConstructionYear_2019
2,3237,1028,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
13,6504,1180,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,6399,1904,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76,7000,1248,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
79,8400,784,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
y = rf_df["FullCashValue"].values.reshape(-1, 1)
y[:5]

array([[108100],
       [138800],
       [178100],
       [101200],
       [ 73700]], dtype=int64)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [11]:
scaler = StandardScaler()

In [12]:
X_scaler = scaler.fit(X_train)

In [13]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [30]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_fit = rf_model.fit(X_train_scaled, y_train.ravel())

In [31]:
predictions = rf_fit.predict(X_test_scaled)

In [32]:
importances = rf_model.feature_importances_
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.4647436496635477, 'LivableSqFootage'),
 (0.2464154409298598, 'LandSize'),
 (0.033440283386282375, 'ConstructionYear_2002'),
 (0.03322693374371636, 'SitusZip_85032.0'),
 (0.01982897673119421, 'SitusZip_85006.0'),
 (0.0148564424240925, 'SitusZip_85016.0'),
 (0.009557882758952273, 'SitusZip_85040.0'),
 (0.00904575126139606, 'ConstructionYear_2019'),
 (0.008255666873345231, 'SitusZip_85008.0'),
 (0.007005156176119154, 'SitusZip_85035.0')]

In [33]:
print(f'R^2 score: {r2_score(y_true=y_test, y_pred=predictions):20,.2f}')
print(f'RMSE score: {mean_squared_error(y_true=y_test, y_pred=predictions, squared=True):20,.2f}')

R^2 score:                 0.77
RMSE score:       502,073,729.14
