In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import randint as sp_randint
from sklearn import preprocessing
from xgboost import XGBRegressor

In [2]:
# Read the tab-separated training set into `df`.
df = pd.read_csv(
    'data/train_with_location.csv',
    delimiter='\t',  # `delimiter` is pandas' alias for `sep`
)

In [3]:
# Print the frame summary: row count plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431732 entries, 0 to 431731
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           431732 non-null  int64  
 1   year                 431732 non-null  int64  
 2   month                431732 non-null  int64  
 3   num_rooms            431732 non-null  int64  
 4   is_executive         431732 non-null  bool   
 5   is_multi_gen         431732 non-null  bool   
 6   storey_range_avg     431732 non-null  float64
 7   is_low_floor         431732 non-null  int64  
 8   floor_area_sqm       431732 non-null  float64
 9   lease_commence_date  431732 non-null  int64  
 10  latitude             431732 non-null  float64
 11  longitude            431732 non-null  float64
 12  elevation            431732 non-null  float64
 13  town                 431732 non-null  object 
 14  block                431732 non-null  object 
 15  street_name      

In [4]:
# Show the first rows of the training frame as a quick sanity check of the load.
df.head()

Unnamed: 0.1,Unnamed: 0,year,month,num_rooms,is_executive,is_multi_gen,storey_range_avg,is_low_floor,floor_area_sqm,lease_commence_date,...,block,street_name,flat_model,subzone,planning_area,region,resale_price,cbd_dist,nearest_center,n_center_dist
0,0,2001,8,4,False,False,2.0,1,118.0,1989,...,440,pasir ris drive 4,model a,pasir ris drive,pasir ris,east region,209700.0,14.75526,3,2.314
1,1,2014,10,5,False,False,11.0,0,110.0,2003,...,196B,punggol field,improved,punggol field,punggol,north-east region,402300.0,13.775751,16,1.462
2,2,2020,9,5,False,False,2.0,1,112.0,2004,...,404A,fernvale lane,premium apartment,fernvale,sengkang,north-east region,351000.0,11.496847,18,1.445
3,3,2000,10,3,False,False,8.0,0,67.0,1980,...,375,clementi avenue 4,new generation,clementi north,clementi,west region,151200.0,10.38114,9,2.387
4,4,2013,1,3,False,False,8.0,0,73.0,1985,...,163,bukit batok street 11,model a,bukit batok west,bukit batok,west region,318600.0,14.185992,1,1.807


In [5]:
# Cast the label-like columns of the training frame to pandas' categorical
# dtype, one column at a time and in place on `df`.
for _name in (
    'town',
    'block',
    'street_name',
    'flat_model',
    'subzone',
    'planning_area',  # 32 planning_areas
    'region',         # 5 regions
    'nearest_center',
):
    df[_name] = df[_name].astype('category')

In [6]:
# Columns fed to the models as-is (numeric / boolean). The order here is the
# column order of the design matrix.
x_num_cols = [
    'year',
    'month',
    'num_rooms',
    'is_executive',
    'is_multi_gen',
    'storey_range_avg',
    'is_low_floor',
    'floor_area_sqm',
    'lease_commence_date',
    'latitude',
    'cbd_dist',
    'longitude',
    'n_center_dist',
]

# Columns that are one-hot encoded before modelling.
x_cat_cols = [
    'flat_model',
    'planning_area',
    'nearest_center',
]

# Regression target.
y_col = 'resale_price'

In [7]:
# One-hot encode the categorical features, dropping the first level of every
# feature (drop='first') so each feature contributes k-1 indicator columns.
#
# handle_unknown='ignore': when transform() later meets a category that was
# not present at fit time (e.g. in the scoring file), the row is encoded as
# all zeros for that feature -- the same encoding as the dropped baseline
# level -- and scikit-learn emits a warning, instead of raising and aborting
# the notebook.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(df[x_cat_cols])

# Carry df's index over to the encoded frame so that a column-wise concat with
# df[x_num_cols] lines up row-for-row even if `df` does not have a plain
# 0..n-1 RangeIndex (e.g. after filtering rows).
X_cat_encoded = pd.DataFrame(
    encoder.transform(df[x_cat_cols]).toarray(),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

In [8]:
# Design matrix: the raw numeric columns followed by the one-hot columns,
# glued together column-wise.
X_encoded = pd.concat(
    [df[x_num_cols], X_cat_encoded],
    axis='columns',
)

In [9]:
# Standardise every column of the design matrix. `scaler` is kept so the same
# transformation can be re-applied to other data later.
scaler = StandardScaler()
scaler.fit(X_encoded)
X_scaled = pd.DataFrame(
    data=scaler.transform(X_encoded),
    columns=X_encoded.columns,
)

In [10]:
# Hold out 33% of the rows for evaluation. random_state pins the shuffle so
# the same rows land in train/test on every Restart & Run All; without it the
# split (and therefore every score computed from it) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    df[y_col],
    test_size=0.33,
    random_state=42,
)

In [11]:
# Accumulator for model scores: model name -> [MSE, r2].
results = dict()

# Linear Regression

In [12]:
# Baseline model: ordinary least squares on the scaled features, scored on the
# held-out split.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# [MSE, r2] in that order -- the order the format string below expects.
res = [metric(y_test, y_pred) for metric in (mean_squared_error, r2_score)]
results['linear_regression'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(res[0], res[1]))

MSE: 3057788547.74, r2: 0.82


# Random Forest

In [13]:
# Random forest: 100 trees, each capped at depth 10.
# random_state fixes the per-tree bootstrap and feature sampling so the scores
# printed below are reproducible across runs. n_jobs=-1 fits the trees on all
# available cores; with a fixed random_state this does not change the model.
rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

# Store [MSE, r2] for the comparison table and echo them.
res = [mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)]
results['random_forest'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(*res))

MSE: 1000603716.68, r2: 0.94


# Gradient Boosting

In [14]:
# TODO: no parameter tuning yet -- all hyper-parameters are library defaults.
# random_state is set because scikit-learn permutes the candidate features at
# every split, so equally good splits can otherwise be picked differently from
# run to run and the reported scores would not be exactly reproducible.
gbr_model = GradientBoostingRegressor(random_state=42)
gbr_model.fit(X_train, y_train)
y_pred = gbr_model.predict(X_test)

# Store [MSE, r2] for the comparison table and echo them.
res = [mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)]
results['gradient_boosting'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(*res))

MSE: 1107010653.68, r2: 0.93


# AdaBoost

In [15]:
# AdaBoost with default hyper-parameters.
# AdaBoostRegressor resamples the training set at every boosting round, so the
# fit is stochastic; random_state makes the reported scores reproducible.
ada_model = AdaBoostRegressor(random_state=42)
ada_model.fit(X_train, y_train)
y_pred = ada_model.predict(X_test)

# Store [MSE, r2] for the comparison table and echo them.
res = [mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)]
results['ada_boost'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(*res))

MSE: 5639020951.59, r2: 0.67


# XGBoost

In [16]:
# XGBoost regressor (included because it is reportedly popular in industry),
# trained with the squared-error objective and otherwise default settings.
xgb_model = XGBRegressor(objective='reg:squarederror')
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# [MSE, r2] in that order -- the order the format string below expects.
res = [metric(y_test, y_pred) for metric in (mean_squared_error, r2_score)]
results['xgboost'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(res[0], res[1]))

MSE: 405678305.71, r2: 0.98


In [17]:
# Comparison table: one row per model, columns MSE and r2, rounded to 2 decimals.
pd.DataFrame.from_dict(results, orient='index', columns=['MSE', 'r2']).round(2)

Unnamed: 0,MSE,r2
linear_regression,3057789000.0,0.82
random_forest,1000604000.0,0.94
gradient_boosting,1107011000.0,0.93
ada_boost,5639021000.0,0.67
xgboost,405678300.0,0.98


# Prediction

In [18]:
# Read the tab-separated scoring set into `test`.
test = pd.read_csv(
    'data/test_with_location.csv',
    delimiter='\t',  # `delimiter` is pandas' alias for `sep`
)

In [19]:
# Show the first rows of the scoring frame as a quick sanity check of the load.
test.head()

Unnamed: 0.1,Unnamed: 0,year,month,num_rooms,is_executive,is_multi_gen,storey_range_avg,is_low_floor,floor_area_sqm,lease_commence_date,...,town,block,street_name,flat_model,subzone,planning_area,region,cbd_dist,nearest_center,n_center_dist
0,0,2004,1,4,False,False,5.0,1,94.0,1989,...,bukit batok,186,bukit batok west avenue 6,new generation,bukit batok west,bukit batok,west region,13.96319,1,1.75
1,1,2001,11,5,False,False,5.0,1,122.0,1997,...,tampines,366,tampines street 34,improved,tampines east,tampines,east region,14.253167,3,1.866
2,2,2002,7,3,False,False,2.0,1,67.0,1982,...,jurong east,206,jurong east street 21,new generation,toh guan,jurong east,west region,13.7366,1,0.929
3,3,2015,4,3,False,False,5.0,1,82.0,1981,...,ang mo kio,180,Ang Mo Kio Avenue 5,new generation,yio chu kang east,ang mo kio,north-east region,10.390801,18,2.027
4,4,2004,4,5,False,False,2.0,1,117.0,1978,...,clementi,356,clementi avenue 2,standard,clementi north,clementi,west region,9.889045,28,2.4


In [20]:
# Cast the label-like columns of the scoring frame to pandas' categorical
# dtype, one column at a time and in place on `test`.
for _name in (
    'town',
    'block',
    'street_name',
    'flat_model',
    'subzone',
    'planning_area',
    'region',
    'nearest_center',
):
    test[_name] = test[_name].astype('category')

In [21]:
# Build the scoring design matrix with the already-fitted `encoder` and
# `scaler` (transform only -- nothing is re-fitted on the scoring data).

# 1) one-hot columns
X_cat_encoded_final = pd.DataFrame(
    data=encoder.transform(test[x_cat_cols]).toarray(),
    columns=encoder.get_feature_names_out(),
)

# 2) numeric columns followed by the one-hot columns
X_encoded_final = pd.concat(
    [test[x_num_cols], X_cat_encoded_final],
    axis='columns',
)

# 3) standardise
X_scaled_final = pd.DataFrame(
    data=scaler.transform(X_encoded_final),
    columns=X_encoded_final.columns,
)

In [22]:
# Predict resale prices for the scoring set with the fitted XGBoost model
# (the model with the lowest MSE / highest r2 in the comparison table).
y_final = xgb_model.predict(X_scaled_final)

In [23]:
# Display the prediction array as a quick sanity check of its values.
y_final

array([195461.11, 313295.22, 114859.64, ..., 173928.77, 241307.78,
       321784.2 ], dtype=float32)

In [24]:
# Write the submission file: one row per scoring-set row, with the row's index
# as `Id` and the model output as `Predicted`. The DataFrame's own index is
# not written.
pd.DataFrame(
    {
        'Id': test.index,
        'Predicted': y_final,
    }
).to_csv('prediction.csv', index=False)