In [22]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import nest_asyncio

In [2]:
df = pd.read_csv('./data/from-jan-2015-onward-scaled.csv')
df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,scaled_floor_area_sqm,scaled_lease_commence_date,scaled_remaining_lease,scaled_resale_price,scaled_storey_range,scaled_latitude,scaled_longitude,minDistanceFromMall,minDistanceFromMrt,minDistFromSch
0,2015-01,ANG MO KIO,3 ROOM,174,ANG MO KIO AVE 4,8,60.0,Improved,1986,70.0,...,-1.542079,-0.569175,-0.363679,-1.237364,-0.081788,0.193734,-0.017583,893.278432,988.960998,210.89667
1,2015-01,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,2,68.0,New Generation,1981,65.0,...,-1.212248,-0.977903,-0.775289,-1.102024,-1.151769,0.166323,0.231787,813.149646,718.272789,420.358229
2,2015-01,ANG MO KIO,3 ROOM,163,ANG MO KIO AVE 4,2,69.0,New Generation,1980,64.0,...,-1.171019,-1.059649,-0.857611,-1.034354,-1.151769,0.157602,-0.00986,796.220278,1063.134486,350.034254
3,2015-01,ANG MO KIO,3 ROOM,446,ANG MO KIO AVE 10,2,68.0,New Generation,1979,63.0,...,-1.212248,-1.141395,-0.939933,-1.000519,-1.151769,0.022595,0.228126,706.917955,619.745373,344.590308
4,2015-01,ANG MO KIO,3 ROOM,557,ANG MO KIO AVE 10,8,68.0,New Generation,1980,64.0,...,-1.212248,-1.059649,-0.857611,-1.000519,-0.081788,0.112748,0.26108,952.440214,830.384131,598.219907


In [3]:
y = df['scaled_resale_price']
X = df.drop(df.iloc[:,:16], axis =1, inplace = True)
X = df.loc[:, df.columns != 'scaled_resale_price']
# label_X = df.drop(['SalePrice', 'Unnamed: 0'], axis =1)
X.head()

Unnamed: 0,scaled_lease_commence_date,scaled_remaining_lease,scaled_storey_range,scaled_latitude,scaled_longitude,minDistanceFromMall,minDistanceFromMrt,minDistFromSch
0,-0.569175,-0.363679,-0.081788,0.193734,-0.017583,893.278432,988.960998,210.89667
1,-0.977903,-0.775289,-1.151769,0.166323,0.231787,813.149646,718.272789,420.358229
2,-1.059649,-0.857611,-1.151769,0.157602,-0.00986,796.220278,1063.134486,350.034254
3,-1.141395,-0.939933,-1.151769,0.022595,0.228126,706.917955,619.745373,344.590308
4,-1.059649,-0.857611,-0.081788,0.112748,0.26108,952.440214,830.384131,598.219907


## Lasso Regression

In [4]:
param_list = []
start = 0.0001
for i in range(30):
    param_list.append(start)
    start = start + 0.0001
parameters2 = {'alpha': param_list}
lasso = Lasso()
lasso.fit(X,y)
metrics = cross_validate(lasso, X,y, cv = 5, scoring = ('neg_root_mean_squared_error'))
print('SCORE FOR UNTUNED LASSO')
print(-metrics['test_score'].mean())

SCORE FOR UNTUNED LASSO
0.9877081538756263


In [5]:
tuned_lasso = GridSearchCV(lasso, parameters2, scoring = 'neg_root_mean_squared_error', cv = 5)
tuned_lasso.fit(X,y)
best_param = tuned_lasso.best_params_['alpha']
lasso_tuned = Lasso(alpha = best_param)
lasso_tuned.fit(X,y)
metrics = cross_validate(lasso_tuned, X,y, cv = 5, scoring = ('neg_root_mean_squared_error'))

In [6]:
print('BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:')
print(best_param)
print(-tuned_lasso.best_score_)
print()
model_coefs = list(lasso_tuned.coef_)

good_features = []
for i in range(len(model_coefs)):
    if model_coefs[i]> 0:
        good_features.append(list(X.columns)[i])

print('\n'+'FEATURES WITH NON ZERO COEFFICIENTS FOR LASSO')
print(good_features)

BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:
0.0015000000000000005
0.8205916383738886


FEATURES WITH NON ZERO COEFFICIENTS FOR LASSO
['scaled_remaining_lease', 'scaled_storey_range', 'scaled_longitude', 'minDistFromSch']


In [7]:
for i in good_features:
    if '_' in i:
        print(i, end = ', ')

scaled_remaining_lease, scaled_storey_range, scaled_longitude, 

In [8]:
new_feature_set = X[good_features]

## Reworking based on feature selection

In [9]:
new_feature_set = X[good_features]
lr_model_selected = LinearRegression()
metrics = cross_validate(lr_model_selected, new_feature_set, y, cv = 5,scoring = ('neg_root_mean_squared_error'))
print('CV PREDICTION ERROR FOR NEW FEATURE SET MULTIVARIATE LR')
-metrics['test_score'].mean()

CV PREDICTION ERROR FOR NEW FEATURE SET MULTIVARIATE LR


0.8826540253971384

In [10]:
param_list = []
start = 4
for i in range(30):
    param_list.append(start)
    start = start + 0.1
#     start = start +

parameters = {'alpha': param_list} 
rr = Ridge()
metrics = cross_validate(rr, X[good_features], y, cv = 5, scoring = ('neg_root_mean_squared_error'))
print('CV PREDICTION ERROR FOR UNTUNED RR')
print(-metrics['test_score'].mean())

tuned_rr = GridSearchCV(rr, parameters, scoring = 'neg_root_mean_squared_error', cv = 5)
tuned_rr.fit(X[good_features],y)
print('CV PREDICTION ERROR FOR TUNED RR')
print(-tuned_rr.best_score_)
print(tuned_rr.best_params_)

CV PREDICTION ERROR FOR UNTUNED RR
0.8826540021581277
CV PREDICTION ERROR FOR TUNED RR
0.8826538654917853
{'alpha': 6.89999999999999}


In [11]:
param_list = []
start = 0.0001
for i in range(30):
    param_list.append(start)
    start = start + 0.0001
parameters2 = {'alpha': param_list}
lasso = Lasso()
lasso.fit(new_feature_set,y)
metrics = cross_validate(lasso, new_feature_set,y, cv = 5, scoring = ('neg_root_mean_squared_error'))
print('SCORE FOR UNTUNED LASSO')
print(-metrics['test_score'].mean())

SCORE FOR UNTUNED LASSO
0.9977461008873755


In [12]:
tuned_lasso = GridSearchCV(lasso, parameters2, scoring = 'neg_root_mean_squared_error', cv = 5)
tuned_lasso.fit(new_feature_set,y)
best_param = tuned_lasso.best_params_['alpha']
lasso_tuned = Lasso(alpha = best_param)
lasso_tuned.fit(X,y)
metrics = cross_validate(lasso_tuned, new_feature_set,y, cv = 5, scoring = ('neg_root_mean_squared_error'))



In [13]:
print('BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:')
print(best_param)
print(-tuned_lasso.best_score_)
print()
model_coefs = list(lasso_tuned.coef_)

BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:
0.0020000000000000005
0.8826478840617877



## Random Forest

In [14]:
rf = RandomForestRegressor(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1, oob_score = True, bootstrap = True)
rf.fit(new_feature_set, y)
print(rf.get_params())
print('\n This is the oob score: ', rf.oob_score_)
rf_features = []

for name, score in zip(list(X.columns),rf.feature_importances_):
    if score > 0:
#         print(name,score)
        rf_features.append((name,score))



{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': 16, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': True, 'random_state': None, 'verbose': 0, 'warm_start': False}

 This is the oob score:  0.33670044686937184


In [15]:
print('LASSO REGRESSION FEATURES SELECTED: \n', good_features)
print()
print('FEATURES USED TO SPLIT RF: \n', rf_features)

LASSO REGRESSION FEATURES SELECTED: 
 ['scaled_remaining_lease', 'scaled_storey_range', 'scaled_longitude', 'minDistFromSch']

FEATURES USED TO SPLIT RF: 
 [('scaled_lease_commence_date', 0.3767313106833139), ('scaled_remaining_lease', 0.35362411996518434), ('scaled_storey_range', 0.2333865138651979), ('scaled_latitude', 0.036258055486303925)]


In [16]:
import nest_asyncio
nest_asyncio.apply()

## Tuning of RF model

Using the OOB score to evaluate the random forest model gives us an extremely high OOB r^2 score. Based on the model, we also see that many features have very low importance, and this incentivizes us to revisit the variable selection methodology that we adopted earlier.
We try hyperparameter search using random hyperparameter grid, so that we can find optimum hyperparamters for our random forest model

In [17]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num =10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10,110,num =11)]
max_depth.append(None)
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,4]
bootstrap = [True]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True]}


In [18]:
from dask_ml.model_selection import RandomizedSearchCV

In [19]:
rf = RandomForestRegressor()
#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 25, cv =5, verbose = 2, random_state = 42, n_jobs = -1, scoring = 'neg_root_mean_squared_error')
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 25, cv =5, random_state = 42, n_jobs = -1, scoring = 'neg_root_mean_squared_error')
rf_random.fit(new_feature_set, y)
print('THIS IS THE BEST SCORE')
print(-rf_random.best_score_)
print('THIS IS THE BEST PARAMS')
print(rf_random.best_params_)

THIS IS THE BEST SCORE
0.5302358976356846
THIS IS THE BEST PARAMS
{'n_estimators': 1200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}


In [20]:

param_grid = {
    'max_depth': [20,25,15],
    'min_samples_leaf': [1],
    'min_samples_split': [2],
    'max_features' : ['sqrt'],
    'n_estimators': [800,850, 750]
#     'bootstrap': [False]
}



In [23]:
from dask_ml.model_selection import GridSearchCV

In [25]:
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 5, n_jobs= -1, scoring = 'neg_root_mean_squared_error')
grid_search.fit(X,y)
# print(grid_search.best_params_)
print('SCORE FOR RF')
print(-grid_search.best_score_)

  kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})


SCORE FOR RF
0.41588484718538693
