In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sps
import seaborn as sns
import time
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from scipy.special import boxcox, inv_boxcox
from sklearn.metrics import r2_score

In [2]:
# alternative methods of loading keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.layers import Dropout 
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import regularizers 

In [3]:
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [4]:
# Retrieve stored variables from preprocessing notebook

%store -r Xlog_train_ss
%store -r Xlog_val_ss
%store -r Xlog_test_ss 

%store -r Xlog_train
%store -r Xlog_val
%store -r Xlog_test

%store -r ylog_train
%store -r ylog_val
%store -r ylog_test 

%store -r X_train
%store -r X_val
%store -r X_test

%store -r y_train
%store -r y_val
%store -r y_test

%store -r X_train_ss
%store -r X_val_ss
%store -r X_test_ss

%store -r X
%store -r y

## Keras

In [8]:
n_cols = Xlog_train.shape[1]
input_shape = (n_cols, )

# Creates a model given an activation and learning rate
def create_model(learning_rate = 0.01, activation = 'relu'):
  
    # Create an Adam optimizer with the given learning rate
    opt = Adam(lr=learning_rate)
  
    # Create a model with 2 hidden layers
    model = Sequential()
    model.add(Dense(128, 
                    activation = activation,
                    input_shape = input_shape,
                    activity_regularizer = regularizers.l2(1e-5)))
    model.add(Dropout(0.50))
    model.add(Dense(128,
                    activation = activation, 
                    activity_regularizer = regularizers.l2(1e-5)))
    model.add(Dropout(0.50))
    model.add(Dense(1, activation = activation))

    # Compile the model
    model.compile(optimizer = opt,
                  loss = "mse",
                  metrics=['mse'])
    return model

In [6]:
# Create a Keras Regressor
model_1 = KerasRegressor(build_fn = create_model,
                       verbose = 0)

# Define the hyperparameter space
params = {'batch_size': [16, 32], 
          'epochs': [100],
          'learning_rate': [0.02, 0.01]}

# Create a randomize search cv object 
random_search = RandomizedSearchCV(model_1,
                                   param_distributions = params,
                                   cv = KFold(10))
random_search_results = random_search.fit(Xlog_train, ylog_train)



In [7]:
random_search.cv_results_

{'mean_fit_time': array([669.52495759, 455.71868176, 239.12452421, 230.65378642]),
 'std_fit_time': array([80.94071162, 89.76019963,  7.21527384, 11.27991699]),
 'mean_score_time': array([0.94492939, 0.44517431, 0.26185436, 0.24453919]),
 'std_score_time': array([0.17769696, 0.16413604, 0.03521401, 0.02159727]),
 'param_learning_rate': masked_array(data=[0.02, 0.01, 0.02, 0.01],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_epochs': masked_array(data=[100, 100, 100, 100],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_batch_size': masked_array(data=[16, 16, 32, 32],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'learning_rate': 0.02, 'epochs': 100, 'batch_size': 16},
  {'learning_rate': 0.01, 'epochs': 100, 'batch_size': 16},
  {'learning_rate': 0.02, 'epochs': 100, 'batch_size': 32},
  {'learnin

In [22]:
log_train_pred = random_search.predict(Xlog_train)
log_val_pred = random_search.predict(Xlog_val)

train_pred = inv_boxcox(log_train_pred, -0.6)
val_pred = inv_boxcox(log_val_pred, -0.6)

# get the mean of results and square-root it to get RMSE
print('Training RMSE : %.7f' % mean_squared_error(y_train, train_pred, squared=False))
print('Val RMSE : %.7f' % mean_squared_error(y_val, val_pred, squared=False))
print('R2 Score : %.7f' % r2_score(y_val,val_pred))

Training RMSE : 197724.3988144
Val RMSE : 198126.0148006
R2 Score : -0.8638293


The result is really bad so I will try log transformed rather than boxcox

In [None]:
ylog_train = np.log(y_train)

In [6]:
from scipy.stats import skew
# Select features to be considered for normalization (dummy variables are not included)
numeric_cols = pd.Index(['floor_area_sqm', 'remaining_lease', 'floor_range',
                 'pmi', 'cpi', 'gdp_growth','hdb_index','cli',
                 'unemployed_rate', 
                 'dist_attraction', 
                  'dist_mrt', 'dist_mall', 'dist_market',
                 'dist_park', 'dist_sport'])

skewed_cols = Xlog_train[numeric_cols].apply(lambda x: skew(x.dropna())) 
skewed_cols = skewed_cols[skewed_cols > 0.7]
skewed_cols = skewed_cols.index

# Between -0.5 and 0.5 are considered sysmterical
# Between -1 and -0.5 or between 0.5 and 1 are considered moderately skewed
# Below -1 or above 1 is considered highly skewed
# I decided to use 0.7 as the threshold to determine whether to apply normalization

for s in skewed_cols:
    Xlog_train[s] = np.log(X_train[s])
    Xlog_val[s] = np.log(X_val[s])

In [15]:
# Create a Keras Regressor
model_2 = KerasRegressor(build_fn = create_model,
                       verbose = 0)

model_2.fit(x=Xlog_train, y=ylog_train, epochs=50, batch_size=16)

<tensorflow.python.keras.callbacks.History at 0x231b3eb2a08>

In [16]:
log_train_pred = model_2.predict(Xlog_train)
log_val_pred = model_2.predict(Xlog_val)

train_pred = np.exp(log_train_pred)
val_pred = np.exp(log_val_pred)

# get the mean of results and square-root it to get RMSE
print('Training RMSE : %.7f' % mean_squared_error(y_train, train_pred, squared=False))
print('Val RMSE : %.7f' % mean_squared_error(y_val, val_pred, squared=False))
print('R2 Score : %.7f' % r2_score(y_val,val_pred))

Training RMSE : 460654.8394156
Val RMSE : 460302.3409795
R2 Score : -9.1163112


### It is still very bad, so will go without normalizing data

In [17]:
# Create a Keras Regressor
model_3 = KerasRegressor(build_fn = create_model,
                       verbose = 0)

model_3.fit(x=X_train, y=y_train, epochs=50, batch_size=16)

<tensorflow.python.keras.callbacks.History at 0x231acfa0848>

In [14]:
train_pred = model_3.predict(X_train)
val_pred = model_3.predict(X_val)

# get the mean of results and square-root it to get RMSE
print('Training RMSE : %.7f' % mean_squared_error(y_train, train_pred, squared=False))
print('Val RMSE : %.7f' % mean_squared_error(y_val, val_pred, squared=False))
print('R2 Score : %.7f' % r2_score(y_val,val_pred))

Training RMSE : 45412.3960336
Val RMSE : 45589.5989104
R2 Score : 0.9007644


Will use the above result for comparison with other XGBoost models