# Main Model

In [47]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [49]:
# Build the feature matrix X and the target vector y from df.
# Every column listed here is left out of the features; "Price" is the target itself.
NON_FEATURE_COLUMNS = [
    "Unnamed: 0",
    "Unnamed: 0.1",
    "Price",
    "House",
    "State",
    "Area",
    "Microlocation",
    "Price_log",
    "location_append",
    "Longitude",
    "Latitude",
    "geometry",
]
X = df.drop(columns=NON_FEATURE_COLUMNS)
y = df["Price"]

In [50]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

# Baseline: 10-fold cross-validation of a default random forest, scored with R².
# KFold only defines the (shuffled, seeded) splits; cross_validate does the
# fitting and scoring on each split.
folds = KFold(n_splits=10, shuffle=True, random_state=11)
cv_results = cross_validate(
    estimator=RandomForestRegressor(),
    X=X,
    y=y,
    scoring="r2",
    cv=folds,
)

In [51]:
# Average R² over all cross-validation folds
np.mean(cv_results["test_score"])

0.53914420575889

# Grid search optimization

In [52]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows as a test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
# NOTE: a float max_features is interpreted by scikit-learn as a *fraction* of the
# features and must lie in (0, 1]. The previous values 2.0 and 3.0 were therefore
# invalid and made every candidate using them fail to fit; the integers 2 and 3
# ("consider exactly 2 / 3 features per split") are what was intended.
grid = {
    'n_estimators': list(range(100, 751, 50)),   # 100, 150, ..., 750
    'max_features': ['sqrt', 'log2', 1.0, 2, 3],
    'max_depth': [None, *range(1, 12)],          # None (unlimited), 1, ..., 11
}

# Exhaustive 5-fold grid search scored with R².
# The forest is seeded so that the selected best_params_ are reproducible across runs.
rf_GridSearchCV = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=grid,
    cv=5,
    scoring="r2",
)
rf_GridSearchCV.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination selected by the grid search
best_grid_params = rf_GridSearchCV.best_params_
print(best_grid_params)

In [None]:
from sklearn import metrics

# R² of the grid-search-tuned model on the held-out test set
prediction_rf_GridSearchCV = rf_GridSearchCV.predict(X_test)
grid_search_test_r2 = metrics.r2_score(y_true=y_test, y_pred=prediction_rf_GridSearchCV)
print(grid_search_test_r2)

# Bayesian Optimization

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.space.transformers import Pipeline
from sklearn.svm import SVC

# Search space for the Bayesian optimisation of the random forest.
# max_features may not exceed the number of available columns, so the upper
# bound of 12 is capped at the width of the training data.
search_space = {
    'n_estimators': Integer(1, 1000),
    'max_features': Integer(1, min(12, X_train.shape[1])),
    'max_depth': Integer(1, 100),
    'min_samples_split': Integer(2, 100),
    'min_samples_leaf': Integer(1, 100),
}

# Bayesian hyperparameter search with 5-fold cross-validation.
# Both the forest and the optimiser are seeded so that best_params_ and the
# reported scores are reproducible; scoring is set explicitly to R² to match
# the grid search above (it is the same metric a regressor's default score uses).
boptimize = BayesSearchCV(
    RandomForestRegressor(random_state=42),
    search_space,
    scoring="r2",
    cv=5,
    random_state=42,
)

# Run the search on the training data
boptimize.fit(X_train, y_train)

In [None]:
# Show the hyperparameters selected by the Bayesian search
best_bayes_params = boptimize.best_params_
print(best_bayes_params)

In [None]:
# Score of the Bayesian-tuned model on the held-out test set
bayes_test_score = boptimize.score(X_test, y_test)
print(bayes_test_score)

In [None]:
from skopt.plots import plot_objective, plot_histogram

# Partial dependence plots of the search objective, one panel per hyperparameter.
# The labels are given in the same order as the search space was declared.
objective_dimensions = [
    "n_estimators",
    "max_features",
    "max_depth",
    "min_samples_split",
    "min_samples_leaf",
]
first_search_result = boptimize.optimizer_results_[0]
_ = plot_objective(first_search_result, dimensions=objective_dimensions, size=3.2)
plt.show()

In [None]:
from skopt.plots import plot_convergence

# Convergence trace of the Bayesian search
search_results = boptimize.optimizer_results_
plot_convergence(search_results)

# Feature Importance

In [None]:
# Rebuild a plain random forest from the hyperparameters found by the Bayesian
# search, so that its feature_importances_ can be read directly.
tuned_params = boptimize.best_params_
rf_feature_model = RandomForestRegressor(
    max_depth=tuned_params['max_depth'],
    max_features=tuned_params['max_features'],
    min_samples_leaf=tuned_params['min_samples_leaf'],
    min_samples_split=tuned_params['min_samples_split'],
    n_estimators=tuned_params['n_estimators'],
)

# Train on the same training split used for the search
rf_feature_model.fit(X_train, y_train)

In [None]:
# Horizontal bar chart of rf_feature_model.feature_importances_, one bar per feature column.
from matplotlib.ticker import AutoMinorLocator

# The seaborn theme must be set BEFORE the figure is created; setting it after
# plotting has no effect on axes that already exist.
sns.set(style='darkgrid')

fig = plt.figure(figsize=(9, 6))
fig.suptitle('Random Forest Regression Feature Importance', x=0.5, y=0.92, ha='center', size=18, fontweight='bold', font='Arial Nova')

axes = sns.barplot(x=rf_feature_model.feature_importances_, y=X.columns)

# The y axis lists the feature names (it was previously mislabelled 'Correlation').
axes.set_xlabel('Feature Importance', font='Arial Nova', size=15)
axes.set_ylabel('Feature', font='Arial Nova', size=15)

# Minor grid lines are only drawn where minor ticks exist, so enable them on the
# numeric x axis (the y axis is categorical and keeps its major ticks only).
axes.xaxis.set_minor_locator(AutoMinorLocator())

axes.grid(color="black", which="major", linestyle="--", linewidth=1, alpha=0.5)
axes.grid(color="gray", which="minor", axis="x", linestyle=":", linewidth=0.5, alpha=1);

# Residuals Plot

In [None]:
from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import ResidualsPlot

# Residuals plot for the Bayesian-tuned model: residuals on the training split
# versus residuals on the held-out test split.

# Create the visualizer around the fitted search object
# NOTE(review): depending on the yellowbrick version, fit() below may re-run the
# whole Bayesian search; passing boptimize.best_estimator_ might avoid that — confirm.
rf_visualizer = ResidualsPlot(boptimize)

# Fit the training data to the visualizer
rf_visualizer.fit(X_train, y_train)

# Evaluate the model on the test data
rf_visualizer.score(X_test, y_test)  

# Finalize and render the figure
rf_visualizer.show()