## Random Forest Regression
#### Resources: https://www.geeksforgeeks.org/random-forest-regression-in-python/

In [1]:
# import libraries
import warnings

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sqlalchemy import create_engine #, func
from sqlalchemy import text
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

warnings.filterwarnings('ignore')

In [2]:
# Read the pre-filtered listings from CSV into a DataFrame.
filtered_df = pd.read_csv("Resources/filtered_df.csv")

# Create a SQLite database engine (file-backed, relative to the working directory).
engine = create_engine('sqlite:///filtered_df.sqlite')

# Reflect the tables that already exist in the database file.
# NOTE(review): this runs before the table is (re)written below, and `Base` is
# not used by any other cell visible here -- confirm whether it is still needed.
Base = automap_base()
Base.prepare(autoload_with=engine)

# Write the DataFrame to SQLite, replacing any previous copy of the table.
filtered_df.to_sql('realtor_filtered', con=engine, if_exists='replace', index=False)

# Confirm the data has been written by querying it back.
query = "SELECT * FROM realtor_filtered;"
with engine.connect() as conn:
    # Fetch every row while the connection is still open: the `with` block
    # closes the connection on exit, so a lazily-consumed result object must
    # not be handed to a later cell.
    result = conn.execute(text(query)).fetchall()

In [3]:
# Turn the rows returned by the SQL query into the working DataFrame.
df = pd.DataFrame(result)
df

Unnamed: 0,status,price,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date
0,for_sale,180000.0,2.0,1.0,0.34,Agawam,Massachusetts,1001.0,676.0,
1,for_sale,239900.0,3.0,1.0,0.46,Agawam,Massachusetts,1001.0,1196.0,
2,for_sale,525000.0,3.0,3.0,0.45,Agawam,Massachusetts,1001.0,2314.0,2014-06-25
3,for_sale,289900.0,3.0,2.0,0.36,Agawam,Massachusetts,1001.0,1276.0,2012-10-12
4,for_sale,275000.0,4.0,2.0,0.11,Agawam,Massachusetts,1001.0,1732.0,
...,...,...,...,...,...,...,...,...,...,...
1353428,sold,359900.0,4.0,2.0,0.33,Richland,Washington,99354.0,3600.0,2022-03-25
1353429,sold,350000.0,3.0,2.0,0.10,Richland,Washington,99354.0,1616.0,2022-03-25
1353430,sold,440000.0,6.0,3.0,0.50,Richland,Washington,99354.0,3200.0,2022-03-24
1353431,sold,179900.0,2.0,1.0,0.09,Richland,Washington,99354.0,933.0,2022-03-24


In [4]:
# Keep only the modelling columns, ordered so that the target (price) is last.
df = df.loc[
    :,
    ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'price'],
]
df

Unnamed: 0,bed,bath,acre_lot,zip_code,house_size,price
0,2.0,1.0,0.34,1001.0,676.0,180000.0
1,3.0,1.0,0.46,1001.0,1196.0,239900.0
2,3.0,3.0,0.45,1001.0,2314.0,525000.0
3,3.0,2.0,0.36,1001.0,1276.0,289900.0
4,4.0,2.0,0.11,1001.0,1732.0,275000.0
...,...,...,...,...,...,...
1353428,4.0,2.0,0.33,99354.0,3600.0,359900.0
1353429,3.0,2.0,0.10,99354.0,1616.0,350000.0
1353430,6.0,3.0,0.50,99354.0,3200.0,440000.0
1353431,2.0,1.0,0.09,99354.0,933.0,179900.0


In [5]:
# Features are the first five columns; the target is the sixth column
# (price, given the column order set in the previous cell).
X, y = df.iloc[:, :5], df.iloc[:, 5]

In [6]:
# Summarise column dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1353433 entries, 0 to 1353432
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   bed         1353433 non-null  float64
 1   bath        1353433 non-null  float64
 2   acre_lot    1353433 non-null  float64
 3   zip_code    1353433 non-null  float64
 4   house_size  1353433 non-null  float64
 5   price       1353433 non-null  float64
dtypes: float64(6)
memory usage: 62.0 MB


In [7]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [8]:
# Display the training features to check the split.
X_train

Unnamed: 0,bed,bath,acre_lot,zip_code,house_size
1018607,2.0,2.0,0.06,55110.0,1360.0
1060674,2.0,1.0,0.11,63121.0,884.0
520744,5.0,4.0,0.25,76063.0,3269.0
744247,5.0,2.0,0.07,1009.0,1924.0
255833,3.0,2.0,0.23,34286.0,1831.0
...,...,...,...,...,...
110268,4.0,5.0,13.31,20117.0,4820.0
259178,3.0,4.0,0.45,33919.0,3800.0
131932,3.0,2.0,0.86,27537.0,1782.0
671155,4.0,4.0,0.17,92563.0,2944.0


In [9]:
# Define and fit the model.
# The recorded run of this cell ended in a KeyboardInterrupt: fitting 110 trees
# one after another on the full training set is slow. n_jobs=-1 lets
# scikit-learn build the trees in parallel on all available cores; the
# hyperparameters and the seed are unchanged.
regressor = RandomForestRegressor(
    n_estimators=110,
    random_state=0,
    oob_score=True,  # needed so that `oob_score_` is available after fitting
    n_jobs=-1,
)
regressor.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Evaluating the model
# (mean_squared_error and r2_score are imported in the imports cell at the top.)

# Access the Out of Bag Score (available because the model was built with
# oob_score=True).
oob_score = regressor.oob_score_
print(f'Out-of-Bag Score: {oob_score}')

# Make predictions on the held-out test features
predictions = regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# The square root of the MSE is in the same units as the target (price),
# which makes the error magnitude easier to read than the squared value.
rmse = mse ** 0.5
print(f'Root Mean Squared Error: {rmse}')

# Determine the R-Squared Value
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')


Out-of-Bag Score: 0.6356523908623477


Mean Squared Error: 2430513248073.7993
R-squared: 0.23015230161480837
