In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Main training table and the population demographics table.
train = pd.read_csv('train.csv')
population_demo = pd.read_csv('auxiliary-data/sg-population-demographics.csv')

# Pre-computed auxiliary feature tables.
station = pd.read_csv('train_df_fe_station.csv')
malls = pd.read_csv('train_df_fe_malls.csv')
hawker = pd.read_csv('train_df_fe_hawker.csv')
commercial = pd.read_csv('train_df_fe_commercial.csv')

# Place the auxiliary tables side by side (column-wise) in a single frame.
aux = [station, malls, hawker, commercial]
aux_df = pd.concat(aux, axis=1)

# Auxiliary columns kept for modelling, listed per name prefix.
commercial_cols = [
    'commercial_CBD', 'commercial_type_CR', 'commercial_type_IEBP',
    'commercial_type_IEPB', 'commercial_type_BN', 'commercial_type_IHL',
]
hawker_cols = [
    'hawker_ECLFV', 'hawker_NFC', 'hawker_CRB89', 'hawker_OARB51OARFCSM',
    'hawker_CRB', 'hawker_HVMFC', 'hawker_BFC', 'hawker_CCFC', 'hawker_TBM',
    'hawker_BPHC', 'hawker_GMFC', 'hawker_YPHC', 'hawker_OTH', 'hawker_KAHC',
    'hawker__', 'hawker_highrating_', 'hawker_established_',
]
malls_cols = [
    'malls_GWC', 'malls_IO', 'malls_TSMBS', 'malls_NAC', 'malls_PS',
    'malls_SC', 'malls_OTH', 'malls_CA', 'malls_JCA', 'malls_VivoCity',
    'malls_JP', 'malls__',
    'malls_ratingsbin_4.1', 'malls_ratingsbin_4.3', 'malls_ratingsbin_>4.0',
    'malls_ratingsbin_4.2', 'malls_ratingsbin_4.0', 'malls_ratingsbin_>=4.5',
    'malls_ratingsbin_4.4', 'malls_established_',
]
station_cols = [
    'station_type_mrt', 'station_type_other', 'station_interchange_',
    'station_EW_', 'station_NS_', 'station_NE_', 'station_CC_', 'station_DT_',
]
aux_chosen = aux_df[commercial_cols + hawker_cols + malls_cols + station_cols]

In [None]:
# Harmonise the two spellings of the room-count flat types
# ("N-room" -> "N room"); every other value is left unchanged.
flat_type_aliases = {
    "1-room": "1 room",
    "2-room": "2 room",
    "3-room": "3 room",
    "4-room": "4 room",
    "5-room": "5 room",
}
train['flat_type'] = train['flat_type'].replace(flat_type_aliases)

# Replace the block label with a binary flag: 1 if the label contains the
# digit 4, otherwise 0.
# The mask is computed once, before the column is overwritten, so the result
# no longer depends on how `.str.contains` treats the integers written by a
# previous assignment. Missing labels count as "no 4", and the column ends up
# with an integer dtype instead of a mixed object dtype.
block_has_four = train['block'].str.contains('4', regex=False, na=False)
train['block'] = block_has_four.astype(int)

# The raw storey ranges overlap (3-storey and 5-storey bands are mixed), so
# they are collapsed into a coarser, more systematic set of bands:
# 01 to 06, 06 to 10, 10 to 15, 16 to 21, 21 to 25, 25 to 30,
# 31 to 36, 36 to 40, 40 to 45, 46 to 51.
# Labels that are not listed below are left as they are.
storey_bands = {
    # 01 to 06
    "01 to 03": "01 to 06",
    "01 to 05": "01 to 06",
    "04 to 06": "01 to 06",
    # 06 to 10
    "07 to 09": "06 to 10",
    # 10 to 15
    "10 to 12": "10 to 15",
    "11 to 15": "10 to 15",
    "13 to 15": "10 to 15",
    # 16 to 21
    "16 to 18": "16 to 21",
    "16 to 20": "16 to 21",
    "19 to 21": "16 to 21",
    # 21 to 25
    "22 to 24": "21 to 25",
    # 25 to 30
    "25 to 27": "25 to 30",
    "26 to 30": "25 to 30",
    "28 to 30": "25 to 30",
    # 31 to 36
    "31 to 33": "31 to 36",
    "31 to 35": "31 to 36",
    "34 to 36": "31 to 36",
    # 36 to 40
    "37 to 39": "36 to 40",
    # 40 to 45
    "40 to 42": "40 to 45",
    "43 to 45": "40 to 45",
    # 46 to 51
    "46 to 48": "46 to 51",
    "49 to 51": "46 to 51",
}
train['storey_range'] = train['storey_range'].replace(storey_bands)

# Total population per subzone: sum of `count` over all demographic rows of
# that subzone. A single groupby replaces the per-subzone rescans of the frame.
dicts = population_demo.groupby('subzone')['count'].sum().to_dict()
train['popcount_subzone'] = train['subzone'].map(dicts)

# The demographics data has a 'central subzone' entry that does not exist in
# the main dataset. After verification it was inferred to correspond to
# 'city hall' in the main dataset (beach road area), so its population count
# (490 according to the original note) is assigned to those rows. The value
# is looked up rather than hard-coded so it stays in sync with the data file.
central_popcount = dicts['central subzone']
print(central_popcount)
train.loc[train['subzone'] == "city hall", 'popcount_subzone'] = central_popcount

# Split the 'month' column on its first '-' into two string columns.
# `n` is passed by keyword because pandas 2.0 made every argument of
# `str.split` after the pattern keyword-only.
train[['resale_year', 'resale_month']] = train['month'].str.split('-', n=1, expand=True)

# Age of the flat at the time of sale: resale year minus lease commencement year.
train['age_at_sales'] = pd.to_numeric(train['resale_year']) - pd.to_numeric(train['lease_commence_date'])

In [None]:
# One-hot encode the categorical columns. With `columns=` given,
# pd.get_dummies replaces each listed column by its indicator columns.
# "block" is not listed because it is already a binary variable.
categorical_cols = [
    'flat_type', 'street_name', 'resale_year', 'resale_month',
    'storey_range', 'flat_model', 'subzone',
    'planning_area', 'region', 'lease_commence_date',
]
train_dummies = pd.get_dummies(train, columns=categorical_cols)

# Target variable.
train_y_all = train_dummies['resale_price']

# Feature matrix: remove the target and the columns that are not used as
# features, then append the chosen auxiliary columns on the right.
excluded_cols = ['town', 'eco_category', 'elevation', 'month', 'resale_price']
train_final_all = train_dummies.drop(excluded_cols, axis=1)
combined = [train_final_all, aux_chosen]
train_final_all = pd.concat(combined, axis=1)

# Hold out 25% of the rows as a validation set (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train_final_all,
    train_y_all,
    test_size=0.25,
    random_state=0,
)





In [None]:
%%time

# Rescale the features with scikit-learn's MinMaxScaler. The scaler is fitted
# on the training split only and then applied to both splits.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Adaboost

In [None]:
%%time

# Base model: AdaBoost over decision trees. The tree is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# newer scikit-learn releases (the old name was later removed). A fixed seed
# makes the search reproducible.
adaboost = AdaBoostRegressor(DecisionTreeRegressor(), random_state=0)

# Prefix used to reach the wrapped tree's hyper-parameters: use whichever
# parameter name the installed scikit-learn version exposes.
tree_param = ('estimator' if 'estimator' in adaboost.get_params(deep=False)
              else 'base_estimator')

# Create the parameter grid
param_grid = {
    tree_param + '__max_depth': [3,20,50],
    tree_param + '__min_samples_leaf': [2,20,50],
    'n_estimators': [1500],
    'learning_rate':[0.1,0.01,0.001],
    'loss': ['linear','square']
}

# Instantiate the grid search model (5-fold CV, scored by negative RMSE)
grid_search_ab = GridSearchCV(estimator = adaboost, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, 
                           scoring = 'neg_root_mean_squared_error')

# Fit the grid search to the training data
grid_search_ab.fit(X_train, y_train)


# Store the parameters of the best model
best_params = grid_search_ab.best_params_

# Predict the target for the validation set with the best model found
y_pred = grid_search_ab.predict(X_val)

# Validation RMSE. Taking the square root of the MSE avoids the `squared`
# keyword, which newer scikit-learn releases no longer accept.
best_RMSE = np.sqrt(mean_squared_error(y_val, y_pred))

print('Adaboost regressor: {} (RMSE: {:.3f})'.format(best_params, best_RMSE))

# Random Forest

In [None]:
%%time

# Create the parameter grid
param_grid = {
    'bootstrap': [True],
    'max_depth': [70,150,None],
    'max_features': ['sqrt',100,500],# might change this to a much smaller value
    'min_samples_split': [2,20,40],
    'min_samples_leaf': [1,20,40],
    'n_estimators': [1500]
}

# Create a base model (fixed seed so the search is reproducible)
rf = RandomForestRegressor(random_state=0)

# Instantiate the grid search model (5-fold CV, scored by negative RMSE)
grid_search_rf = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, 
                           scoring = 'neg_root_mean_squared_error')

# Fit the grid search to the training data
grid_search_rf.fit(X_train, y_train)


# Store the parameters of the best model
best_params = grid_search_rf.best_params_

# Predict the target for the validation set with the best model found
y_pred = grid_search_rf.predict(X_val)

# Validation RMSE. Taking the square root of the MSE avoids the `squared`
# keyword, which newer scikit-learn releases no longer accept.
best_RMSE = np.sqrt(mean_squared_error(y_val, y_pred))

print('randomforest regressor: {} (RMSE: {:.3f})'.format(best_params, best_RMSE))

# MLP

In [None]:
%%time

# Create the parameter grid
param_grid = {
    'hidden_layer_sizes': [(50,50,50,50),(100,100,100,100), (200,200,200,200),
                          (50,50,50,50,50,50),(100,100,100,100,100,100),(200,200,200,200,200,200)],
    'alpha': [0.00005,0.0005, 0.005],
    'activation':['relu'],
    'solver':['adam'],
    'max_iter':[1000],
    'learning_rate':['adaptive']
}

# Create a base model (fixed seed so the search is reproducible)
mlp = MLPRegressor(random_state=0)

# Instantiate the grid search model (5-fold CV, scored by negative RMSE)
grid_search_mlp = GridSearchCV(estimator = mlp, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, 
                           scoring = 'neg_root_mean_squared_error')

# Fit the grid search to the training data
grid_search_mlp.fit(X_train, y_train)


# Store the parameters of the best model
best_params = grid_search_mlp.best_params_

# Predict the target for the validation set with the best model found
y_pred = grid_search_mlp.predict(X_val)

# Validation RMSE. Taking the square root of the MSE avoids the `squared`
# keyword, which newer scikit-learn releases no longer accept.
best_RMSE = np.sqrt(mean_squared_error(y_val, y_pred))

print('MLP regressor: {} (RMSE: {:.3f})'.format(best_params, best_RMSE))

# KNN

In [None]:
%%time

# Create the parameter grid
param_grid = {
    'n_neighbors': [5,15,45,100,200]
}

# Create a base model
knn = KNeighborsRegressor()

# Instantiate the grid search model (5-fold CV, scored by negative RMSE)
grid_search_knn = GridSearchCV(estimator = knn, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, 
                           scoring = 'neg_root_mean_squared_error')

# Fit the grid search to the training data
grid_search_knn.fit(X_train, y_train)


# Store the parameters of the best model
best_params = grid_search_knn.best_params_

# Predict the target for the validation set with the best model found
y_pred = grid_search_knn.predict(X_val)

# Validation RMSE. Taking the square root of the MSE avoids the `squared`
# keyword, which newer scikit-learn releases no longer accept.
best_RMSE = np.sqrt(mean_squared_error(y_val, y_pred))

print('KNN regressor: {} (RMSE: {:.3f})'.format(best_params, best_RMSE))

# SVM

In [None]:
%%time

# Grid search.
# NOTE(review): the original comment says these ranges are based on the
# results of an earlier random search, which is not shown in this notebook.
# Settings shared by both kernels.
shared_grid = {
    'C': [0.1,0.5,1,10],
    'gamma': ['scale'],
    'max_iter':[-1]
}
# `degree` only affects the polynomial kernel, so it is searched for 'poly'
# alone; crossing it with 'rbf' would just repeat identical rbf fits.
param_grid = [
    dict(shared_grid, kernel=['rbf']),
    dict(shared_grid, kernel=['poly'], degree=[3,6]),
]

# Create a base model
svm = SVR()

# Instantiate the grid search model (5-fold CV, scored by negative RMSE)
grid_search_svm = GridSearchCV(estimator = svm, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2,
                          scoring = 'neg_root_mean_squared_error')

# Fit the grid search to the training data
grid_search_svm.fit(X_train, y_train)

# Store the parameters of the best model
best_params = grid_search_svm.best_params_

# Predict the target for the validation set with the best model found
y_pred = grid_search_svm.predict(X_val)

# Validation RMSE. Taking the square root of the MSE avoids the `squared`
# keyword, which newer scikit-learn releases no longer accept.
best_RMSE = np.sqrt(mean_squared_error(y_val, y_pred))

print('SVM regressor: {} (RMSE: {:.3f})'.format(best_params, best_RMSE))

# Gradient boosting

In [None]:
%%time

# Create the parameter grid
# (the comma after the 'min_samples_split' entry was missing, which made
# the whole cell a syntax error)
param_grid = {
    'max_depth': [3, 20,40],
    'min_samples_split': [2,20,40],
    'min_samples_leaf': [1,20,40],
    'n_estimators': [1500],
    'learning_rate':[0.1]
}

# Create a base model (fixed seed so the search is reproducible)
gbr = GradientBoostingRegressor(random_state=0)

# Instantiate the grid search model (5-fold CV, scored by negative RMSE)
grid_search_gbr = GridSearchCV(estimator = gbr, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, 
                           scoring = 'neg_root_mean_squared_error')

# Fit the grid search to the training data
grid_search_gbr.fit(X_train, y_train)


# Store the parameters of the best model
best_params = grid_search_gbr.best_params_

# Predict the target for the validation set with the best model found
y_pred = grid_search_gbr.predict(X_val)

# Validation RMSE. Taking the square root of the MSE avoids the `squared`
# keyword, which newer scikit-learn releases no longer accept.
best_RMSE = np.sqrt(mean_squared_error(y_val, y_pred))

print('GB regressor: {} (RMSE: {:.3f})'.format(best_params, best_RMSE))