In [1]:
# Use matplotlib's interactive "notebook" backend for figures in this notebook.
%matplotlib notebook
# Reload imported modules automatically before each cell runs, so edits to
# external python modules are picked up without restarting the kernel;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, f1_score

DATA_DIR = '../data/'

In [3]:
# EDA_v1 is looked up via the parent directory, so that directory has to be on
# sys.path before the import below runs.
import sys
sys.path.append("..")  # make modules in the parent directory importable
from EDA_v1 import EDA

In [4]:
# Read the raw train / test tables from DATA_DIR.
filepath_test = DATA_DIR + 'test.csv'
filepath_train = DATA_DIR + 'train.csv'
df_test, df_train = pd.read_csv(filepath_test), pd.read_csv(filepath_train)

In [5]:
# df_train.head()

In [6]:
%%time
# Copies of the raw frames are handed to the project's EDA helper, so whatever
# it does internally cannot modify df_train / df_test loaded above.
eda = EDA(df_train.copy(), df_test.copy())
# Feature set used for the ablation study (exact processing lives in EDA_v1):
eda.setup_ablation() # type + tenure + year + size_sqft + furnishing + lng + lat + plan_area_one_hot

CPU times: total: 1.94 s
Wall time: 1.93 s


In [7]:
# eda.df.info()

In [8]:
#  eda.df_test.info()

In [9]:
# eda.df.head()

In [10]:
# eda.df_test.head()

In [11]:
# eda.df['num_beds'].unique()

In [12]:
# From here on df_train / df_test refer to the frames held by the EDA object.
df_train, df_test = eda.df, eda.df_test

# Separate the 'price' target from the remaining feature columns.
df_train_X = df_train.drop(columns=['price'])
df_train_y = df_train['price']

# Convert everything to float32 NumPy arrays for the estimators below.
X_train, y_train, X_test = (
    frame.to_numpy(dtype='float32')
    for frame in (df_train_X, df_train_y, df_test)
)
print(X_train.shape, X_test.shape)

(20029, 72) (6966, 72)


In [13]:
X_train.dtype

dtype('float32')

### Data Scaling

In [14]:
# Scale every feature to [0, 1].
# MinMaxScaler already treats each column independently, so a single scaler
# fitted on the whole training matrix does the job of one scaler per column.
# The min/max statistics come from the training data only and are re-used
# unchanged for the test set.
min_max_scaler = MinMaxScaler()
# Slice assignment writes the scaled values into the existing arrays, i.e. the
# arrays are updated in place rather than rebound to new objects.
X_train[:] = min_max_scaler.fit_transform(X_train)
X_test[:] = min_max_scaler.transform(X_test)

# Make sure the target is a flat 1-D vector; it is left unscaled.
y_train = y_train.reshape(-1)
# y_train = y_train / 100

In [15]:
# Sanity check after scaling: shapes of features / target and a glimpse of the
# (unscaled) target values.
print(X_train.shape, y_train.shape)
print(y_train)

(20029, 72) (20029,)
[ 514500.  995400. 8485000. ... 4193700.  754800. 4178000.]


In [16]:
%%time
random_state = 10
param_test = {'max_depth':[16, 64, 256, 1024], 'min_samples_split':[2, 16, 64], 'min_samples_leaf':[1, 2, 4]}
# param_test = {'min_samples_split':[2, 16, 64, 256, 1024]}
gsearch5 = GridSearchCV(estimator = DecisionTreeRegressor(#min_samples_split=2,
#                                                               max_depth=3,
                                                              random_state=random_state),
                                                              param_grid = param_test, 
                                                              scoring='neg_root_mean_squared_error',
                                                              n_jobs=4,
                                                              cv=5, verbose=3)
gsearch5.fit(X_train, y_train)
y_train_pred = gsearch5.predict(X_train)
print('rmse on training set: ', np.sqrt(mean_squared_error(y_train_pred, y_train)))
print('random_state: ', random_state)
model = gsearch5
print(gsearch5.best_params_, gsearch5.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
rmse on training set:  376692.5900181266
random_state:  10
{'max_depth': 16, 'min_samples_leaf': 1, 'min_samples_split': 2} -1664250.833274186
CPU times: total: 578 ms
Wall time: 7.66 s


In [17]:
# Raw cross-validation results of the grid search (a dict of arrays, e.g. the
# mean / std fit time of each parameter combination).
gsearch5.cv_results_

{'mean_fit_time': array([0.1270277 , 0.10822353, 0.10022235, 0.11522608, 0.1084239 ,
        0.10942469, 0.12682891, 0.11942639, 0.11502523, 0.14223189,
        0.1276288 , 0.11742611, 0.15103393, 0.12942905, 0.11782603,
        0.12782927, 0.12082686, 0.13142872, 0.15783563, 0.12002692,
        0.10662322, 0.14023123, 0.12502809, 0.11962681, 0.12202697,
        0.11562514, 0.10502372, 0.14803324, 0.12162848, 0.10022235,
        0.14022908, 0.11802664, 0.10582376, 0.12522874, 0.1220274 ,
        0.10662389]),
 'std_fit_time': array([0.00729563, 0.00407097, 0.00376403, 0.00947552, 0.004759  ,
        0.01392485, 0.01222327, 0.01217819, 0.00498047, 0.01205927,
        0.0073926 , 0.00287165, 0.01115605, 0.01317262, 0.01049787,
        0.00840154, 0.00541952, 0.0117264 , 0.01501457, 0.00525538,
        0.00527781, 0.00503704, 0.00678393, 0.00653181, 0.01014148,
        0.0059539 , 0.00260784, 0.01029785, 0.00866358, 0.01010942,
        0.01269005, 0.00865047, 0.00435544, 0.00479224, 0.007

In [18]:
# %%time
# num_data = len(X_train)

# sub_feat_idx_s = [0,1,2,3,4,5,7,12]
# sub_feat_idx_e = [1,2,3,4,5,7,12,16]
# feat_name = ['tenure', 'built_year', 'num_beds', 'num_baths', 'size_sqft', 'lat_lng', 'prop_type', 'furnish']
# # 0 -- tenure, 1 -- built_year, 2 -- num_beds, 3 -- num_baths, 4 -- size_sqft, 5,6 -- lat, lng, 7-11 -- property_type, 12-15 -- furnish
# for i in range(len(sub_feat_idx_s)):
#     X_train_sub = X_train[:, sub_feat_idx_s[i]:sub_feat_idx_e[i]]
#     print('-----------{}-----------'.format(feat_name[i]))
#     param_test = {'min_samples_split':[2, 16, 64, 256, 1024]}
#     gsearch5 = GridSearchCV(estimator = DecisionTreeRegressor(#min_samples_split=2,
#     #                                                               max_depth=3,
#                                                                   random_state=10),
#                                                                   param_grid = param_test, 
#                                                                   scoring='neg_root_mean_squared_error',
#                                                                   n_jobs=4,
#                                                                   cv=5, verbose=3)
#     gsearch5.fit(X_train_sub, y_train)
#     y_train_pred = gsearch5.predict(X_train_sub)
#     print(gsearch5.best_params_, gsearch5.best_score_)
#     print('rmse on training set: ', np.sqrt(mean_squared_error(y_train_pred, y_train)))

In [19]:
# Predict the test-set targets with whatever estimator `model` currently refers
# to (the full-feature grid search when the notebook is run top to bottom).
y_test_pred = model.predict(X_test)

In [20]:
# Submission table: consecutive row numbers as the 'Id' index and the
# flattened predictions in the 'Predicted' column.
y_test_pred_dict = {
    'Id': np.arange(len(y_test_pred)),
    'Predicted': y_test_pred.flatten(),
}
df_y_test_pred = pd.DataFrame(y_test_pred_dict).set_index(['Id'])
print(df_y_test_pred)
# df_y_test_pred.to_csv(DATA_DIR+"submission_tree_196w.csv")

         Predicted
Id                
0     1.145887e+06
1     1.080450e+06
2     1.130500e+06
3     6.829283e+05
4     5.931658e+05
...            ...
6961  1.680000e+07
6962  1.265092e+07
6963  3.552600e+06
6964  5.112933e+05
6965  4.531800e+06

[6966 rows x 1 columns]


## Ablation

In [21]:
# Disable NumPy's print truncation. This is a global setting and stays in
# effect for every later cell.
np.set_printoptions(threshold=np.inf)
# First five values of feature column 2 after scaling.
X_train[:5, 2]

array([0.01220717, 0.01755508, 0.03493576, 0.01038191, 0.00775446],
      dtype=float32)

In [22]:
%%time

col_idx = eda.get_col_idx(eda.df.drop(columns=['price']))
mse_dict = {'split': ['Train', 'Val']}
for key in col_idx.keys():
    ids = col_idx[key]
    X_train_sub = X_train[:, ids]
    X_test_sub = X_test[:, ids]
    print(key, X_train_sub.shape, X_test_sub.shape)
    random_state = 10
    param_test = {'max_depth':[16, 32, 64, 128], 'min_samples_split':[2, 4, 6], 'min_samples_leaf':[1, 2, 4]}
    model = GridSearchCV(estimator = DecisionTreeRegressor(#min_samples_split=2,
    #                                                               max_depth=3,
                                                                  random_state=random_state),
                                                                  param_grid = param_test, 
                                                                  scoring='neg_root_mean_squared_error',
                                                                  n_jobs=4,
                                                                  cv=5, verbose=3)
    model.fit(X_train_sub, y_train)
    y_train_pred = model.predict(X_train_sub)
    rmse_train = np.sqrt(mean_squared_error(y_train_pred, y_train))
    print('rmse on training set: ', rmse_train)
    print('random_state: ', random_state)
    print(model.best_params_, model.best_score_)
    mse_dict[key] = [rmse_train / 1e6, -model.best_score_ / 1e6]
    print('-'*55)

tenure (20029, 1) (6966, 1)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
rmse on training set:  4486807.247057997
random_state:  10
{'max_depth': 16, 'min_samples_leaf': 1, 'min_samples_split': 2} -4480640.5739611145
-------------------------------------------------------
built_year (20029, 1) (6966, 1)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
rmse on training set:  4474653.93521629
random_state:  10
{'max_depth': 16, 'min_samples_leaf': 1, 'min_samples_split': 2} -4469368.048760654
-------------------------------------------------------
size_sqft (20029, 1) (6966, 1)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
rmse on training set:  2317063.067114128
random_state:  10
{'max_depth': 32, 'min_samples_leaf': 1, 'min_samples_split': 6} -2756658.494004461
-------------------------------------------------------
lat (20029, 1) (6966, 1)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
rmse on training set:  2343966.18147702

In [23]:
# One row per key of mse_dict (the 'split' row carries the column labels);
# only the first five rows are shown.
df = pd.DataFrame(mse_dict).T
df.head()
# df.to_csv('../experiment/ablation_DT.csv')

Unnamed: 0,0,1
split,Train,Val
tenure,4.486807,4.480641
built_year,4.474654,4.469368
size_sqft,2.317063,2.756658
lat,2.343966,3.198181


In [24]:
%%time

col_idx_0 = eda.get_col_idx(eda.df.drop(columns=['price']))
all_idx = np.array(range(X_train.shape[1]))
train_mse_full = 436395.04923679214
val_mse_full = 2000172.1511837984

mse_dict = {'split': ['Train', 'delta_train', 'Val', 'delta_val']}
for key in col_idx.keys():
    ids = col_idx[key]
    ids = list(set(all_idx) - set(ids))
    X_train_sub = X_train[:, ids]
    X_test_sub = X_test[:, ids]
    print(key, X_train_sub.shape, X_test_sub.shape)
    random_state = 10
    param_test = {'max_depth':[16, 32, 64, 128], 'min_samples_split':[2, 4, 6], 'min_samples_leaf':[1, 2, 4]}
    model = GridSearchCV(estimator = DecisionTreeRegressor(#min_samples_split=2,
    #                                                               max_depth=3,
                                                                  random_state=random_state),
                                                                  param_grid = param_test, 
                                                                  scoring='neg_root_mean_squared_error',
                                                                  n_jobs=4,
                                                                  cv=5, verbose=3)
    model.fit(X_train_sub, y_train)
    y_train_pred = model.predict(X_train_sub)
    rmse_train = np.sqrt(mean_squared_error(y_train_pred, y_train))
    print('rmse on training set: ', rmse_train)
    print('random_state: ', random_state)
    print(model.best_params_, model.best_score_)
    mse_dict['w/o '+ key] = [rmse_train / 1e6, (rmse_train - train_mse_full) / 1e6, 
                             -model.best_score_ / 1e6, (-model.best_score_ - val_mse_full) / 1e6]
    print('-'*55)

tenure (20029, 71) (6966, 71)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
rmse on training set:  695741.106232261
random_state:  10
{'max_depth': 16, 'min_samples_leaf': 1, 'min_samples_split': 4} -1673118.0916657709
-------------------------------------------------------
built_year (20029, 71) (6966, 71)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
rmse on training set:  791323.7212613238
random_state:  10
{'max_depth': 64, 'min_samples_leaf': 2, 'min_samples_split': 2} -1730163.4352907985
-------------------------------------------------------
size_sqft (20029, 71) (6966, 71)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
rmse on training set:  1735153.786764865
random_state:  10
{'max_depth': 16, 'min_samples_leaf': 1, 'min_samples_split': 4} -2398692.308150918
-------------------------------------------------------
lat (20029, 71) (6966, 71)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
rmse on training set:  773569.

In [25]:
# One row per key of mse_dict (the 'split' row carries the column labels);
# only the first five rows are shown.
df = pd.DataFrame(mse_dict).T
df.head()
# df.to_csv('../experiment/ablation_wo_DT.csv')

Unnamed: 0,0,1,2,3
split,Train,delta_train,Val,delta_val
w/o tenure,0.695741,0.259346,1.673118,-0.327054
w/o built_year,0.791324,0.354929,1.730163,-0.270009
w/o size_sqft,1.735154,1.298759,2.398692,0.39852
w/o lat,0.77357,0.337175,1.795242,-0.20493
