In [51]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

In [52]:
data = pd.read_csv("../data/train_set_scaled.csv", sep = "\t")
data = data.rename(columns={"Log price (1 billion VND)": "Log Price", 'Log area (square meters)': "Log Area", '(Log) Bedrooms' : "Log Bedrooms", '(Log) WC': "Log WC", '(Log) Number of floors': "Log Floors"})
data.shape

(4764, 15)

In [53]:
X = data.drop(columns=["Log Price"])
y = data['Log Price']

In [54]:
RandomForestRegressor?

[0;31mInit signature:[0m
[0mRandomForestRegressor[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'squared_error'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbootstrap[0m[0;34m=[0m[0;32mTrue[0m[0;34m,

In [55]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]

# Number of features to consider at every split
max_features = ['sqrt', "log2"] # there's no auto

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [56]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [57]:
# First create the base model to tune 
regressor = RandomForestRegressor(random_state = 42)

# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations, and use all available cores
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = regressor,
                               param_distributions = random_grid,
                               n_iter = 100, 
                               cv = 5, 
                               random_state=42,
                               n_jobs = -1,
                               scoring='neg_mean_squared_error'
                               )
rf_random.fit(X, y)



In [58]:
cv_res = pd.DataFrame(rf_random.cv_results_)
cv_res = cv_res[[
    'param_n_estimators', 
    'param_min_samples_split', 
    'param_min_samples_leaf', 
    'param_max_features', 
    'param_max_depth', 
    'param_bootstrap',
    'split0_test_score', 
    'split1_test_score', 
    'split2_test_score', 
    'mean_test_score',
]]
score_cols = ["split0", "split1", "split2", "mean_test"]
cv_res.columns = ["n_estimators", "min_samples_split", "min_samples_leaf", "max_features", "max_depth", "bootstrap"] + score_cols
cv_res[score_cols] = np.sqrt(-cv_res[score_cols])

cv_res.sort_values(by="mean_test", ascending=True).head(20)

Unnamed: 0,n_estimators,min_samples_split,min_samples_leaf,max_features,max_depth,bootstrap,split0,split1,split2,mean_test
49,377,2,2,log2,110.0,False,0.619598,0.618379,0.614491,0.62149
31,466,2,2,log2,50.0,False,0.619356,0.618573,0.614428,0.621521
54,733,5,2,log2,,False,0.621405,0.618503,0.614597,0.62288
24,733,5,1,log2,80.0,False,0.621449,0.619461,0.615218,0.623007
44,200,5,2,sqrt,90.0,False,0.620363,0.618304,0.614926,0.623061
21,822,5,1,log2,70.0,False,0.621531,0.619449,0.615567,0.623119
32,466,5,1,log2,100.0,False,0.620773,0.620141,0.614997,0.623206
11,466,5,1,log2,90.0,False,0.620773,0.620141,0.614997,0.623206
50,911,2,1,sqrt,20.0,False,0.624141,0.618995,0.61435,0.623261
6,555,5,2,log2,100.0,False,0.621331,0.618873,0.615556,0.623268


In [59]:
rf_random.best_params_

{'n_estimators': 377,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': 110,
 'bootstrap': False}

In [60]:
rf_random.best_score_

-0.38624926487589717

In [61]:
cross_val_score(rf_random.best_estimator_, X, y, cv = 5, scoring = 'r2').mean()

0.6124203693557659

In [62]:
model = rf_random.best_estimator_

# Using grid search to fine tune more thoroughly

In [76]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV

# Number of trees in random forest
n_estimators = range(300, 600, 100)
# Number of features to consider at every split
max_features = ['log2']
# Maximum number of levels in tree
max_depth = [50, 60, 70, 80, 90, 110]
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [False]
# Create the random grid
parameters = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

regressor = RandomForestRegressor()

grid_search = GridSearchCV(estimator = regressor,
                           param_grid = parameters,
                           cv = 5, n_jobs = -1,
                           scoring='neg_mean_squared_error')

grid_search = grid_search.fit(X, y)

In [77]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res = cv_res[[
    'param_n_estimators', 
    'param_min_samples_split', 
    'param_min_samples_leaf', 
    'param_max_features', 
    'param_max_depth', 
    'param_bootstrap',
    'split0_test_score', 
    'split1_test_score', 
    'split2_test_score', 
    'mean_test_score',
]]
score_cols = ["split0", "split1", "split2", "mean_test"]
cv_res.columns = ["n_estimators", "min_samples_split", "min_samples_leaf", "max_features", "max_depth", "bootstrap"] + score_cols
cv_res[score_cols] = np.sqrt(-cv_res[score_cols])

cv_res.sort_values(by="mean_test", ascending=True).head(20)

Unnamed: 0,n_estimators,min_samples_split,min_samples_leaf,max_features,max_depth,bootstrap,split0,split1,split2,mean_test
135,300,3,2,log2,110,False,0.62083,0.615948,0.611714,0.621671
85,400,2,2,log2,80,False,0.619846,0.617704,0.613583,0.621674
107,500,5,1,log2,90,False,0.619937,0.618798,0.61439,0.621761
41,500,3,2,log2,60,False,0.620588,0.616507,0.613758,0.622027
33,300,5,1,log2,60,False,0.620976,0.616362,0.612769,0.622071
128,500,4,1,log2,110,False,0.620593,0.619201,0.612926,0.622115
23,500,5,2,log2,50,False,0.619783,0.618777,0.615241,0.622205
40,400,3,2,log2,60,False,0.620988,0.616473,0.614195,0.622235
139,400,4,2,log2,110,False,0.621107,0.618254,0.613559,0.622236
116,500,4,2,log2,90,False,0.619876,0.618257,0.61369,0.622272


In [78]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 110,
 'max_features': 'log2',
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 300}

In [79]:
model2 = grid_search.best_estimator_

cross_val_score(model2, X, y, cv = 5, scoring = 'r2').mean()

0.6113696636806861

# Test

In [80]:
test = pd.read_csv("../data/test_set_scaled.csv", sep = "\t")
test = test.rename(columns={"Log price (1 billion VND)": "Log Price", 'Log area (square meters)': "Log Area", '(Log) Bedrooms' : "Log Bedrooms", '(Log) WC': "Log WC", '(Log) Number of floors': "Log Floors"})

In [81]:
X_test = test.drop(columns=["Log Price"])
y_test = test['Log Price']

In [82]:
from sklearn.metrics import r2_score

y_pred = model.predict(X_test)

r2_score(y_test, y_pred)

0.613203137753001

In [83]:
y_pred2 = model2.predict(X_test)
r2_score(y_test, y_pred2)

0.6116388243801701

In [84]:
from joblib import dump

dump(model, "random_grid_random_forest.joblib")
dump(model2, "grid_search_random_forest.joblib")

['grid_search_random_forest.joblib']