In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from math import radians, cos, sin, asin, sqrt
from sklearn.metrics import mean_squared_error as mse
import random
from EDA_v1 import *

In [None]:
# Dataset locations (relative paths under ../data).

# --- original train & test dataset ---
filepath_test = '../data/test.csv'
filepath_train = '../data/train.csv'
# filepath_train = '../data/train_cheat2.csv'  # alternative training file, currently disabled

# --- auxiliary dataset ---
filepath_commercial = '../data/auxiliary-data/sg-commerical-centres.csv'
filepath_mrt = '../data/auxiliary-data/sg-mrt-stations.csv'
filepath_pri_school = '../data/auxiliary-data/sg-primary-schools.csv'
filepath_sec_school = '../data/auxiliary-data/sg-secondary-schools.csv'
filepath_mall = '../data/auxiliary-data/sg-shopping-malls.csv'
filepath_subzone = '../data/auxiliary-data/sg-subzones.csv'

# Read each CSV into its own DataFrame: the two main splits first,
# then the six auxiliary tables in the order their paths are listed above.
df_test = pd.read_csv(filepath_test)
df_train = pd.read_csv(filepath_train)

df_commercial, df_mrt, df_pri_school, df_sec_school, df_mall, df_subzone = (
    pd.read_csv(csv_path)
    for csv_path in (
        filepath_commercial,
        filepath_mrt,
        filepath_pri_school,
        filepath_sec_school,
        filepath_mall,
        filepath_subzone,
    )
)

## EDA and Preprocessing

In [None]:
def setup(eda):
    # string value formalization
    eda.str_clean_up()
    # remove abnormal data
    eda.handle_train_abnormal()


    # ------ Original Features -------------
    # one-hot property type for both train & test
    eda.property_type_method()

    # processing tenure feature for train data
    eda.tenure_method()
    # processing tenure feature for test data
    eda.tenure_method(for_test=True)

    # processing num of beds & baths feature for train data
    eda.num_bed_bath_method()
    # processing num of beds & baths feature for test data
    eda.num_bed_bath_method(for_test=True)

    # processing built year feature for train data using method 2
    eda.built_year_method2()
    # processing built year feature for test data using method 2
    eda.built_year_method2(for_test=True)

    # one-hot furnishing for both train & test data
    eda.furnishing_method()

    # one-hot planning area for both train & test data
    eda.planning_area_method()


    # ------ Auxiliary Features -------------
    # calculate shorest distance to different commerical type for train data
    eda.cal_min_dis_to_diff_commercial(df_commercial)
    # calculate shorest distance to different commerical type for test data
    eda.cal_min_dis_to_diff_commercial(df_commercial, for_test=True)

    # calculate shorest distance to different MRT lines for train data
    eda.cal_min_dis_to_diff_mrt(df_mrt)
    # calculate shorest distance to different MRT lines for test data
    eda.cal_min_dis_to_diff_mrt(df_mrt, for_test=True)

    # calculate shortest distance to primary school for train data
    eda.cal_min_dis_to_school_or_mall(df_pri_school, cal_type='pc')
    # calculate shortest distance to primary school for test data
    eda.cal_min_dis_to_school_or_mall(df_pri_school, cal_type='pc', for_test=True)

    # calculate shortest distance to second school for train data
    eda.cal_min_dis_to_school_or_mall(df_sec_school, cal_type='sc')
    # calculate shortest distance to second school for test data
    eda.cal_min_dis_to_school_or_mall(df_sec_school, cal_type='sc', for_test=True)

    # calculate shortest distance to shopping mall for train data
    eda.cal_min_dis_to_school_or_mall(df_mall, cal_type='sm')
    # calculate shortest distance to shopping mall for test data
    eda.cal_min_dis_to_school_or_mall(df_mall, cal_type='sm', for_test=True)

    # attach with subzone size & population for train data
    eda.attach_subzone_auxiliary_info(df_subzone)
    # attach with subzone size & population for test data
    eda.attach_subzone_auxiliary_info(df_subzone, for_test=True)

    # calculate population density for train data
    eda.cal_subzone_population_density(df_subzone)
    # calculate population density for test data
    eda.cal_subzone_population_density(df_subzone, for_test=True)
    
    
    drop_cols = ['listing_id', 'title', 'address', 'property_name', 'floor_level', 'available_unit_types',
            'total_num_units', 'property_details_url', 'elevation','subzone', 'planning_area', 'furnishing',
            'property_type',]

    eda.df.drop(columns=drop_cols, inplace=True)
    eda.df_test.drop(columns=drop_cols,inplace=True)  

In [None]:
# Hand copies of the loaded frames to EDA, then run the whole pipeline.
# setup() drops columns in place on the frames held by `eda`.
df_train_copy, df_test_copy = df_train.copy(), df_test.copy()
eda = EDA(df_train_copy, df_test_copy)
setup(eda)

# Split the processed training frame into features and the 'price' target.
df_train_X = eda.df.drop(columns=['price'])
df_train_y = eda.df['price']

# Convert everything to float32 NumPy arrays for scikit-learn.
# NOTE(review): the later per-column scaling pairs X_train and X_test columns
# by position - assumes eda.df (minus 'price') and eda.df_test share the same
# column order; confirm against EDA.
X_train, y_train = (
    frame.to_numpy(dtype='float32') for frame in (df_train_X, df_train_y)
)
X_test = eda.df_test.to_numpy(dtype='float32')

## Min-Max Scaling

In [None]:
# Divisor applied to the target so prices are expressed in units of 10,000.
PRICE_SCALE = 10000

# MinMaxScaler scales every feature (column) independently, so a single
# scaler fitted on the whole training matrix gives the same result as fitting
# one scaler per column - and the fitted scaler stays available afterwards.
# It is fitted on the training data only and then applied to the test data.
#
# The arrays are rebuilt from the processed DataFrames rather than updated in
# place, so re-running this cell is idempotent (previously each re-run divided
# the already-scaled y_train by 10000 again).
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(df_train_X.to_numpy(dtype='float32'))
X_test = min_max_scaler.transform(eda.df_test.to_numpy(dtype='float32'))

# 1-D float32 target vector, rescaled by PRICE_SCALE.
y_train = df_train_y.to_numpy(dtype='float32') / PRICE_SCALE

## Hyper-parameter Tuning 

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
parameters = {'n_estimators':[20,30,50,80,100,150, 200, 250, 300, 400, 500], 
              'min_samples_split': [2, 3, 4, 6, 8, 10],
              'min_samples_leaf': [2, 3, 4, 6, 8, 10],
              'max_features':[1.0, 0.8, 0.6],
              'max_depth':[None, 3, 4, 5, 7, 9, 10, 12, 18, 20, 25, 30, 45, 50, 55, 60, 65, 70, 85, 80]

             }
model = GridSearchCV(estimator=RandomForestRegressor(), param_grid=parameters, verbose=4, cv=5, scoring='neg_root_mean_squared_error')
model.fit(X_train, y_train)

In [None]:
# cv_results_ is a dict of arrays with one entry per candidate; dumping it raw
# is unreadable for a grid this large. Show it as a table instead, best
# candidates (rank 1 = best mean test score) first.
cv_results = pd.DataFrame(model.cv_results_)
cv_results.sort_values('rank_test_score').head(10)

In [None]:
# Hyper-parameter combination that achieved the best mean cross-validated
# score in the grid search (scored with 'neg_root_mean_squared_error', so the
# best score corresponds to the lowest RMSE).
best_params = model.best_params_
best_params