In [18]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from scipy.sparse import hstack
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

In [8]:
hdb = pd.read_csv('data/resale-flat-prices-based-on-registration-date-from-march-2012-onwards.csv')

In [9]:
hdb = hdb.rename(columns={'month': 'year-month'})
hdb['year'] = hdb['year-month'].apply(lambda x: int(x.split("-")[0]))
hdb['month'] = hdb['year-month'].apply(lambda x: int(x.split("-")[1]))
hdb['lower_storey_bound'] = hdb['storey_range'].apply(lambda x: int(x.split()[0]))
hdb['upper_storey_bound'] = hdb['storey_range'].apply(lambda x: int(x.split()[2]))
hdb['flat_age'] = hdb['year'] - hdb['lease_commence_date']
town_dummies = pd.get_dummies(hdb.town, drop_first=True, prefix='TOWN')
area_dummies = town_dummies.multiply(hdb["floor_area_sqm"], axis="index").add_prefix('AREA_')

hdb_rich = pd.concat([hdb, town_dummies, area_dummies], axis=1)
new_factors = np.concatenate((town_dummies.columns, area_dummies.columns, ["upper_storey_bound", "floor_area_sqm", "flat_age"]), axis=0)

In [10]:
vectorizer = CountVectorizer(max_features = 500, 
                             ngram_range=(1, 2), 
                             binary=True)

In [11]:
street_words = vectorizer.fit_transform(hdb_rich.street_name)

In [12]:
X = hstack((street_words, hdb_rich[new_factors]))

In [16]:
y = hdb_rich.resale_price
kf = KFold(10, shuffle=True)

In [74]:
params = {'n_estimators': [20, 30, 40], 'max_depth': [10, 20, 30]}
gs = GridSearchCV(RandomForestRegressor(), param_grid = params, scoring='neg_mean_squared_error', cv=kf, n_jobs=4)

In [75]:
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [20, 30, 40], 'max_depth': [10, 20, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [77]:
np.sqrt(-gs.best_score_)

30135.188332408048

In [78]:
gs.best_params_

{'max_depth': 30, 'n_estimators': 40}

In [93]:
feature_importances = pd.DataFrame({'Features' : feature_names, 'Importance Score': gs.best_estimator_.feature_importances_})
feature_importances.sort_values('Importance Score', ascending=False)[:5]

Unnamed: 0,Features,Importance Score
551,floor_area_sqm,0.543067
552,flat_age,0.085306
550,upper_storey_bound,0.072125
528,AREA_TOWN_BUKIT MERAH,0.027475
526,AREA_TOWN_BISHAN,0.017142


In [None]:
rmses = []
for train_index, test_index in kf.split(X):
    model = RandomForestRegressor(max_depth=30, n_estimators=40)
    X_train = X.toarray()[train_index]
    y_train = y[train_index]
    X_test = X.toarray()[test_index]
    y_test = y[test_index]
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(metrics.mean_squared_error(preds, y_test))
    rmses.append(rmse)
    print("RMSE {}").format(rmse)
print("Mean RMSE {}").format(np.mean(rmse))

RMSE 29821.834446
RMSE 30062.5600221
RMSE 30064.6283888
RMSE 29905.790824
RMSE 29942.5009915
RMSE 29626.5112284
RMSE 30206.8181761
RMSE 30931.8706824
