In [None]:
import random
import re
import sys
import warnings
from math import radians, cos, sin, asin, sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

# NOTE(review): star import kept last, as in the original, so that any name it
# exports still wins; the visible cells only reference `EDA` from it.
from EDA_v1 import *

In [None]:
# Dataset locations, all relative to the notebook's working directory.

# --- original train & test dataset ----
filepath_test = '../data/test.csv'
filepath_train = '../data/train.csv'
# filepath_train = '../data/train_cheat2.csv'

# --- auxiliary dataset ----
# NOTE(review): the 'commerical' spelling is kept because it has to match the
# file name on disk -- confirm before "fixing" it.
filepath_commercial = '../data/auxiliary-data/sg-commerical-centres.csv'
filepath_mrt = '../data/auxiliary-data/sg-mrt-stations.csv'
filepath_pri_school = '../data/auxiliary-data/sg-primary-schools.csv'
filepath_sec_school = '../data/auxiliary-data/sg-secondary-schools.csv'
filepath_mall = '../data/auxiliary-data/sg-shopping-malls.csv'
filepath_subzone = '../data/auxiliary-data/sg-subzones.csv'

# Read every CSV into its own DataFrame, in the same order as declared above.
df_test, df_train = (
    pd.read_csv(csv_path) for csv_path in (filepath_test, filepath_train)
)
(
    df_commercial,
    df_mrt,
    df_pri_school,
    df_sec_school,
    df_mall,
    df_subzone,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        filepath_commercial,
        filepath_mrt,
        filepath_pri_school,
        filepath_sec_school,
        filepath_mall,
        filepath_subzone,
    )
)

In [None]:
def setup(eda):
    eda.str_clean_up()
    eda.handle_train_abnormal()

    # one-hot property type
    eda.property_type_method()

    eda.tenure_method()
    eda.tenure_method(for_test=True)

    eda.num_bed_bath_method()
    eda.num_bed_bath_method(for_test=True)

    eda.built_year_method2()
    eda.built_year_method2(for_test=True)
    
    eda.attach_subzone_auxiliary_info(df_subzone)
    eda.attach_subzone_auxiliary_info(df_subzone, for_test=True) 
    
    eda.cal_subzone_population_density(df_subzone)
    eda.cal_subzone_population_density(df_subzone, for_test=True)

    drop_cols = ['listing_id', 'title', 'address', 'property_name', 'floor_level', 'available_unit_types',
                'total_num_units', 'property_details_url', 'elevation','subzone', 'planning_area', 'furnishing',
                'property_type', 'lng', 'lat','num_baths']

    eda.df.drop(columns=drop_cols, inplace=True)
    eda.df_test.drop(columns=drop_cols,inplace=True)  

In [None]:
# Hand copies to EDA -- presumably so the frames loaded from disk are not
# touched by the in-place column drops done in setup().
df_train_copy, df_test_copy = df_train.copy(), df_test.copy()

eda = EDA(df_train_copy, df_test_copy)
setup(eda)

# Split the engineered training frame into target and features.
df_train_y = eda.df['price']
df_train_X = eda.df.drop(columns=['price'])

# float32 arrays for the estimators.
X_train, y_train = (
    frame.to_numpy(dtype='float32') for frame in (df_train_X, df_train_y)
)
X_test = eda.df_test.to_numpy(dtype='float32')

In [None]:
# Divisor applied to the target; presumably chosen to keep the regression
# target in a small numeric range -- predictions must be multiplied back by it.
PRICE_SCALE = 100000

# MinMaxScaler scales every feature (column) independently, so one scaler
# fitted on the whole training matrix does what a column-by-column loop would
# do, and stays available afterwards for all features. The test matrix is
# transformed with the training min/max only.
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train).astype('float32', copy=False)
X_test = min_max_scaler.transform(X_test).astype('float32', copy=False)

# Rebuild the target from the untouched price column instead of rescaling
# y_train itself, so re-running this cell does not divide the prices twice.
y_train = df_train_y.to_numpy(dtype='float32').reshape(-1) / PRICE_SCALE

## Hyper-parameter Tuning 

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
parameters = {'max_iter':[7000], 
              'learning_rate_init': [0.5, 0.1, 0.01, 0.001, 0.005, 0.0001, 0.005],
              'solver': ['sgd','adam'],
              'early_stopping': [True],
              'hidden_layer_sizes':[(100,), (500,), (1500,),(2000, ), (3000, ), (100, 100), (200, 200),(200, 100), (100, 100, 100), (100, 500, 100),
                                   (5000,),  (30, 200, 30), (300, 100), (200, 300), (200, 200, 200)]

             }
model = GridSearchCV(estimator=MLPRegressor(), param_grid=parameters, verbose=4, cv=5, scoring='neg_root_mean_squared_error')
model.fit(X_train, y_train)

In [None]:
# cv_results_ is a dict of parallel arrays with one entry per candidate; show
# it as a table ranked by test score instead of dumping the raw dict.
# mean_test_score is the negative RMSE measured on the scaled target.
summary_cols = ['rank_test_score', 'mean_test_score', 'std_test_score', 'mean_fit_time', 'params']
cv_results = pd.DataFrame(model.cv_results_).sort_values('rank_test_score')
cv_results[summary_cols].head(10)

In [None]:
# Keep the hyper-parameter combination that the fitted grid search selected as
# best in its own variable, and display it as the cell output.
best_params = model.best_params_
best_params