In [104]:
%matplotlib notebook
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [105]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Directory (relative to this notebook) that holds train.csv, test.csv and
# example-submission.csv; the loading cell concatenates file names onto it,
# so the trailing slash is required.
DATA_DIR = '../data/'

In [106]:
import sys
sys.path.append("..") 
from EDA_v1 import EDA

In [107]:
# Resolve the three input files relative to DATA_DIR.
filepath_train = DATA_DIR + 'train.csv'
filepath_test = DATA_DIR + 'test.csv'
filepath_test_y = DATA_DIR + 'example-submission.csv'

# Raw frames: training data, test features, and the example submission.
df_train = pd.read_csv(filepath_train)
df_test = pd.read_csv(filepath_test)
df_test_y = pd.read_csv(filepath_test_y)

# Test features plus a `price` column taken from the example submission's
# `Predicted` values, so the frame has the same layout as the training frame.
df_test_xy = df_test.assign(price=df_test_y['Predicted'])
print(df_train.shape, df_test.shape, df_test_xy.shape)

(20254, 21) (6966, 20) (6966, 21)


In [108]:
%%time
# Hand copies to the project's EDA helper so the frames loaded above are not
# the objects it works on.
eda = EDA(df_train.copy(), df_test.copy())
# Presumably prepares `eda.df` / `eda.df_test` (read in the next cell) for the
# ablation study -- confirm against EDA_v1. Feature groups involved:
eda.setup_ablation() # type + tenure + year + size_sqft + furnishing + lng + lat + plan_area_one_hot

CPU times: total: 2.52 s
Wall time: 2.52 s


In [109]:
# From here on `df_train` / `df_test` refer to the frames held by `eda`,
# not to the raw CSV frames loaded earlier.
df_train, df_test = eda.df, eda.df_test

# Separate the target column from the features.
df_train_y = df_train['price']
df_train_X = df_train.drop(columns=['price'])

# Plain NumPy matrices for scikit-learn.
X_train, y_train = df_train_X.to_numpy(), df_train_y.to_numpy()
X_test = df_test.to_numpy()
print(X_train.shape, X_test.shape)
# Column index map as originally noted:
# 0 -- tenure, 1 -- built_year, 2 -- num_beds, 3 -- num_baths, 4 -- size_sqft, 5,6 -- lat, lng, 7-11 -- property_type, 12-15 -- furnish
# NOTE(review): this map lists 16 columns while the printed shape shows 73, so it
# looks stale -- the ablation cells use eda.get_col_idx(...) for the lookup instead.

(20032, 73) (6966, 73)


### Data Scaling

In [110]:
# Scale every feature to [0, 1].
#
# MinMaxScaler already computes its min / max per column, so a single scaler
# fitted on the whole training matrix yields the same values as fitting one
# scaler per column. The statistics are learned from the training split only
# and then reused for the test split, so no test information leaks into them.
#
# The scaled matrices are rebound to new arrays rather than written back
# column by column: in-place writes would truncate the scaled values if the
# matrix had an integer dtype, and would modify the source frame if
# `to_numpy()` had returned a view.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

# The target stays in raw price units; only make sure it is one-dimensional.
y_train = y_train.reshape(-1)

In [111]:
# Sanity check: the scaling cell only reshapes y_train, so the targets are
# still raw prices here.
print(y_train)

[ 514500.  995400. 8485000. ... 4193700.  754800. 4178000.]


In [115]:
%%time

# Grids tried earlier, kept as an experiment log (trailing notes are the best
# setting found; "171w" presumably means a validation RMSE of about 1.71e6 -- confirm):
# param_test = {'max_depth':[100, 200, 300], 'min_samples_split':[2]}
# param_test = {'n_estimators':[50, 100, 200, 400], 'max_depth':[50, 100, 150], 'min_samples_split':[2, 4, 6]} # 400, 100, 2 -- 171w
# param_test = {'n_estimators':[200, 400, 600], 'max_depth':[50, 100, 150], 'min_samples_split':[2, 4]} # # 400, 100, 2 -- 171w
# param_test = {'n_estimators':[64, 128, 256], 'max_depth':[16, 32, 64], 'min_samples_split':[2, 8, 32, 128]}
# param_test = {'n_estimators':[64, 128, 256], 'max_depth':[16, 32, 64], 'min_samples_split':[2]}
# Grid currently in use: a single candidate.
param_test = {'n_estimators':[128], 'max_depth':[32], 'min_samples_split':[2]}

random_state = 42

# Base estimator; n_estimators is overridden by every candidate in the grid.
base_forest = RandomForestRegressor(n_estimators=100, random_state=random_state)

# 5-fold cross-validated search, scored by negated RMSE (higher is better).
gsearch2 = GridSearchCV(
    estimator=base_forest,
    param_grid=param_test,
    scoring='neg_root_mean_squared_error',
    n_jobs=8,
    cv=5,
    verbose=3,
)
gsearch2.fit(X_train, y_train)

# `model` is the handle the test-prediction cell uses.
model = gsearch2
y_train_pred = model.predict(X_train)

# RMSE of the refit best estimator on the data it was trained on.
print('rmse on training set: ', np.sqrt(mean_squared_error(y_train, y_train_pred)))
print('random_state: ', random_state)
print(gsearch2.best_params_, gsearch2.best_score_)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
rmse on training set:  689040.8512582438
random_state:  42
{'max_depth': 32, 'min_samples_split': 2, 'n_estimators': 128} -1728202.88424947
CPU times: total: 11.7 s
Wall time: 26.8 s


In [102]:
# Predict test prices with the full-feature grid search (`model = gsearch2`).
# NOTE: the ablation loops further down rebind `model` to searches fitted on
# column subsets, so re-run the grid-search cell before re-running this one.
y_test_pred = model.predict(X_test)


In [103]:
# Submission layout: one row per test sample, indexed by a 0-based `Id`,
# with the predicted price in `Predicted`.
submission_ids = np.arange(len(y_test_pred))
y_test_pred_dict = {'Id': submission_ids, 'Predicted': y_test_pred.flatten()}
df_y_test_pred = pd.DataFrame(y_test_pred_dict).set_index('Id')
print(df_y_test_pred)
# df_y_test_pred.to_csv(DATA_DIR+"submission_RF_172.9w.csv")

         Predicted
Id                
0     1.176839e+06
1     1.598417e+06
2     1.236259e+06
3     6.911619e+05
4     5.785035e+05
...            ...
6961  1.916550e+07
6962  1.270702e+07
6963  3.563160e+06
6964  5.336461e+05
6965  4.517913e+06

[6966 rows x 1 columns]


## Ablation

In [112]:
%%time

# Single-group ablation: fit the random forest on one feature group at a time
# and record its training RMSE and 5-fold CV RMSE (both stored in units of 1e6).
col_idx = eda.get_col_idx(eda.df.drop(columns=['price']))

# Search settings are the same for every group, so define them once.
param_test = {'n_estimators':[32, 64, 128], 'max_depth':[16, 32], 'min_samples_split':[2]}
random_state = 42

# Row labels for the results table, followed by one entry per feature group.
mse_dict = {'split': ['Train', 'Val']}
for key in col_idx.keys():
    ids = col_idx[key]
    # Keep only the columns that belong to this group.
    X_train_sub, X_test_sub = X_train[:, ids], X_test[:, ids]
    print(key, X_train_sub.shape, X_test_sub.shape)

    # n_estimators on the base estimator is overridden by the grid.
    forest = RandomForestRegressor(n_estimators=100, random_state=random_state)
    model = GridSearchCV(
        estimator=forest,
        param_grid=param_test,
        scoring='neg_root_mean_squared_error',
        n_jobs=8,
        cv=5,
        verbose=3,
    )
    model.fit(X_train_sub, y_train)

    y_train_pred = model.predict(X_train_sub)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    print('rmse on training set: ', rmse_train)
    print('random_state: ', random_state)
    print(model.best_params_, model.best_score_)

    # best_score_ is negated RMSE, hence the sign flip.
    mse_dict[key] = [rmse_train / 1e6, -model.best_score_ / 1e6]
    print('-'*55)

tenure (20032, 1) (6966, 1)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
rmse on training set:  4486768.631669887
random_state:  42
{'max_depth': 16, 'min_samples_split': 2, 'n_estimators': 64} -4480501.392387135
-------------------------------------------------------
built_year (20032, 1) (6966, 1)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
rmse on training set:  4474683.333130866
random_state:  42
{'max_depth': 16, 'min_samples_split': 2, 'n_estimators': 128} -4469446.586762218
-------------------------------------------------------
size_sqft (20032, 1) (6966, 1)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
rmse on training set:  2316557.172482086
random_state:  42
{'max_depth': 16, 'min_samples_split': 2, 'n_estimators': 64} -2709736.404229038
-------------------------------------------------------
lat (20032, 1) (6966, 1)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
rmse on training set:  2185600.4373027165
random_state:

In [113]:
# One row per entry of `mse_dict` (the first row, 'split', carries the column
# labels); show the first rows of the table.
df = pd.DataFrame(mse_dict).T
df.head()
# df.to_csv('../experiment/ablation_RF.csv')

In [116]:
%%time

# Leave-one-group-out ablation: refit the random forest with one feature group
# removed and record how far training RMSE and 5-fold CV RMSE move away from
# the full-feature model. All table values are stored in units of 1e6.
col_idx_0 = eda.get_col_idx(eda.df.drop(columns=['price']))
all_idx = np.array(range(X_train.shape[1]))

# Baselines are taken from the full-feature grid search (`gsearch2`) instead of
# numbers pasted from its printed output, so they cannot go stale when the
# data, grid or seed change. Despite the `mse` in the names, both are RMSE.
train_mse_full = np.sqrt(mean_squared_error(y_train, gsearch2.predict(X_train)))
val_mse_full = -gsearch2.best_score_  # the scorer is negated RMSE

# Search settings are the same for every group, so define them once.
param_test = {'n_estimators':[32, 64, 128], 'max_depth':[16, 32], 'min_samples_split':[2]}
random_state = 42

# Row labels for the results table, followed by one entry per removed group.
mse_dict = {'split': ['Train', 'delta_train', 'Val', 'delta_val']}
# Iterate over the mapping computed in this cell, so the loop does not depend
# on a `col_idx` variable left behind by another cell.
for key in col_idx_0.keys():
    # Sorted complement of the group's columns. np.setdiff1d returns a sorted
    # array, which keeps the column order (and therefore the seeded forest)
    # reproducible; set arithmetic gives no ordering guarantee.
    ids = np.setdiff1d(all_idx, col_idx_0[key])
    X_train_sub = X_train[:, ids]
    X_test_sub = X_test[:, ids]
    print(key, X_train_sub.shape, X_test_sub.shape)

    # n_estimators on the base estimator is overridden by the grid.
    model = GridSearchCV(estimator=RandomForestRegressor(n_estimators=100,
                                                         random_state=random_state),
                         param_grid=param_test,
                         scoring='neg_root_mean_squared_error',
                         n_jobs=8,
                         cv=5, verbose=3)
    model.fit(X_train_sub, y_train)

    y_train_pred = model.predict(X_train_sub)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    print('rmse on training set: ', rmse_train)
    print('random_state: ', random_state)
    print(model.best_params_, model.best_score_)

    # Positive deltas mean the model got worse without this feature group.
    mse_dict['w/o '+ key] = [rmse_train / 1e6, (rmse_train - train_mse_full) / 1e6,
                             -model.best_score_ / 1e6, (-model.best_score_ - val_mse_full) / 1e6]
    print('-'*55)

tenure (20032, 72) (6966, 72)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
rmse on training set:  694721.4069274782
random_state:  42
{'max_depth': 32, 'min_samples_split': 2, 'n_estimators': 128} -1730772.5752973321
-------------------------------------------------------
built_year (20032, 72) (6966, 72)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
rmse on training set:  691572.8027442286
random_state:  42
{'max_depth': 32, 'min_samples_split': 2, 'n_estimators': 64} -1716742.6118333158
-------------------------------------------------------
size_sqft (20032, 72) (6966, 72)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
rmse on training set:  1754494.7289211988
random_state:  42
{'max_depth': 16, 'min_samples_split': 2, 'n_estimators': 128} -2377112.6283360953
-------------------------------------------------------
lat (20032, 72) (6966, 72)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
rmse on training set:  683911.5906543623
r

In [117]:
# One row per entry of `mse_dict` (the first row, 'split', carries the column
# labels); show the first rows of the leave-one-group-out table.
df = pd.DataFrame(mse_dict).T
df.head()
# df.to_csv('../experiment/ablation_wo_RF.csv')