In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import randint as sp_randint
from sklearn import preprocessing
from xgboost import XGBRegressor

In [2]:
# Load the preprocessed training set (feature columns plus resale_price).
train_csv_path = 'data/train_preprocessed.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431732 entries, 0 to 431731
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   year                 431732 non-null  int64  
 1   month                431732 non-null  int64  
 2   num_rooms            431732 non-null  int64  
 3   is_executive         431732 non-null  bool   
 4   is_multi_gen         431732 non-null  bool   
 5   storey_range_avg     431732 non-null  float64
 6   is_low_floor         431732 non-null  int64  
 7   floor_area_sqm       431732 non-null  float64
 8   lease_commence_date  431732 non-null  int64  
 9   latitude             431732 non-null  float64
 10  longitude            431732 non-null  float64
 11  elevation            431732 non-null  float64
 12  town                 431732 non-null  object 
 13  block                431732 non-null  object 
 14  street_name          431732 non-null  object 
 15  flat_model       

In [4]:
# Preview the first five rows of the training frame.
df.head()

Unnamed: 0,year,month,num_rooms,is_executive,is_multi_gen,storey_range_avg,is_low_floor,floor_area_sqm,lease_commence_date,latitude,longitude,elevation,town,block,street_name,flat_model,subzone,planning_area,region,resale_price
0,2001,8,4,False,False,2.0,1,118.0,1989,1.369008,103.958697,0.0,pasir ris,440,pasir ris drive 4,model a,pasir ris drive,pasir ris,east region,209700.0
1,2014,10,5,False,False,11.0,0,110.0,2003,1.399007,103.906991,0.0,punggol,196B,punggol field,improved,punggol field,punggol,north-east region,402300.0
2,2020,9,5,False,False,2.0,1,112.0,2004,1.388348,103.873815,0.0,sengkang,404A,fernvale lane,premium apartment,fernvale,sengkang,north-east region,351000.0
3,2000,10,3,False,False,8.0,0,67.0,1980,1.318493,103.766702,0.0,clementi,375,clementi avenue 4,new generation,clementi north,clementi,west region,151200.0
4,2013,1,3,False,False,8.0,0,73.0,1985,1.348149,103.742658,0.0,bukit batok,163,bukit batok street 11,model a,bukit batok west,bukit batok,west region,318600.0


In [5]:
# Cast the string-valued columns to pandas' categorical dtype.
# (planning_area: 32 planning areas, region: 5 regions.)
for text_col in ['town', 'block', 'street_name', 'flat_model',
                 'subzone', 'planning_area', 'region']:
    df[text_col] = df[text_col].astype('category')

In [6]:
# Numeric / boolean predictors fed to the models as-is (before scaling).
# Note: 'elevation' is present in the data but is not included here.
x_num_cols = [
    'year',
    'month',
    'num_rooms',
    'is_executive',
    'is_multi_gen',
    'storey_range_avg',
    'is_low_floor',
    'floor_area_sqm',
    'lease_commence_date',
    'latitude',
    'longitude',
]

# Categorical predictors that get one-hot encoded.
x_cat_cols = [
    'flat_model',
    'planning_area',
]

# Regression target.
y_col = 'resale_price'

In [7]:
# One-hot encode the categorical predictors. drop='first' omits one level per
# feature so the resulting dummy columns are not perfectly collinear.
encoder = OneHotEncoder(drop='first').fit(df[x_cat_cols])

# Reuse df's index for the encoded frame: it is joined to the numeric columns
# with pd.concat(axis=1), which aligns rows by index label. With a default
# RangeIndex here, any filtering or re-indexing of df would silently produce
# misaligned / NaN rows.
X_cat_encoded = pd.DataFrame(
    encoder.transform(df[x_cat_cols]).toarray(),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

In [8]:
# Build the design matrix: numeric columns first, one-hot columns after.
numeric_part = df[x_num_cols]
X_encoded = pd.concat([numeric_part, X_cat_encoded], axis='columns')

In [9]:
# Standardise every column (zero mean, unit variance). The fitted scaler is
# kept so the same transform can be applied to the test file later.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so held-out rows contribute to the scaling statistics. Consider fitting on
# the training partition only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_encoded)
X_scaled = pd.DataFrame(scaled_values, columns=X_encoded.columns)

In [10]:
# Hold out 33% of the rows for evaluation. The split is seeded so that the
# metrics reported for each model are reproducible across kernel restarts
# (without random_state a different partition is drawn on every run).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, df[y_col], test_size=0.33, random_state=42
)

In [11]:
# Maps model name -> [MSE, r2] on the held-out split; each model cell adds one entry.
results = {}

# Linear Regression

In [12]:
# Baseline: ordinary least squares on the scaled features.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Score on the held-out split and record [MSE, r2] under the model's name.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
res = [mse, r2]
results['linear_regression'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(mse, r2))

MSE: 3146655114.97, r2: 0.81


# Random Forest

In [13]:
# Random forest: 100 trees, each limited to depth 10.
# random_state pins the bootstrap sampling and feature selection so the
# reported score is reproducible from run to run.
rfr = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

# Score on the held-out split and record [MSE, r2] under the model's name.
res = [mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)]
results['random_forest'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(*res))

MSE: 1193757847.65, r2: 0.93


# Gradient Boosting

In [14]:
# Gradient boosting with scikit-learn's default hyperparameters.
# TODO: no parameter tuning yet.
# random_state is fixed so repeated runs give the same trees and score.
gbr_model = GradientBoostingRegressor(random_state=42)
gbr_model.fit(X_train, y_train)
y_pred = gbr_model.predict(X_test)

# Score on the held-out split and record [MSE, r2] under the model's name.
res = [mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)]
results['gradient_boosting'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(*res))

MSE: 1157108119.11, r2: 0.93


# AdaBoost

In [15]:
# AdaBoost with scikit-learn's default hyperparameters.
# random_state is fixed because AdaBoost resamples the training data at each
# boosting round; without a seed the score changes on every run.
# NOTE(review): the saved notebook shows no printed metrics for this cell and
# the summary table has no 'ada_boost' row. Re-run to confirm it completes.
ada_model = AdaBoostRegressor(random_state=42)
ada_model.fit(X_train, y_train)
y_pred = ada_model.predict(X_test)

# Score on the held-out split and record [MSE, r2] under the model's name.
res = [mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)]
results['ada_boost'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(*res))

# XGBoost

In [16]:
# XGBoost regressor (reportedly a popular choice in industry), trained with a
# squared-error objective and otherwise default settings.
xgb_model = XGBRegressor(objective='reg:squarederror')
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Score on the held-out split and record [MSE, r2] under the model's name.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
res = [mse, r2]
results['xgboost'] = res
print("MSE: {:.2f}, r2: {:.2f}".format(mse, r2))

MSE: 419319249.54, r2: 0.98


In [17]:
# Side-by-side comparison of every recorded model, rounded to two decimals.
results_table = pd.DataFrame.from_dict(results, orient='index', columns=['MSE', 'r2'])
results_table.round(2)

Unnamed: 0,MSE,r2
linear_regression,3146655000.0,0.81
random_forest,1193758000.0,0.93
gradient_boosting,1157108000.0,0.93
xgboost,419319200.0,0.98


# Prediction

In [18]:
# Load the preprocessed test set.
test_csv_path = 'data/test_preprocessed.csv'
test = pd.read_csv(test_csv_path)

In [19]:
# Preview the first five rows of the test frame.
test.head()

Unnamed: 0,year,month,num_rooms,is_executive,is_multi_gen,storey_range_avg,is_low_floor,floor_area_sqm,lease_commence_date,latitude,longitude,elevation,town,block,street_name,flat_model,subzone,planning_area,region
0,2004,1,4,False,False,5.0,1,94.0,1989,1.346581,103.744085,0.0,bukit batok,186,bukit batok west avenue 6,new generation,bukit batok west,bukit batok,west region
1,2001,11,5,False,False,5.0,1,122.0,1997,1.357618,103.961379,0.0,tampines,366,tampines street 34,improved,tampines east,tampines,east region
2,2002,7,3,False,False,2.0,1,67.0,1982,1.337804,103.741998,0.0,jurong east,206,jurong east street 21,new generation,toh guan,jurong east,west region
3,2015,4,3,False,False,5.0,1,82.0,1981,1.380084,103.849574,0.0,ang mo kio,180,Ang Mo Kio Avenue 5,new generation,yio chu kang east,ang mo kio,north-east region
4,2004,4,5,False,False,2.0,1,117.0,1978,1.31396,103.769831,0.0,clementi,356,clementi avenue 2,standard,clementi north,clementi,west region


In [20]:
# Apply the same categorical casts to the test frame as were used for training.
for text_col in ['town', 'block', 'street_name', 'flat_model',
                 'subzone', 'planning_area', 'region']:
    test[text_col] = test[text_col].astype('category')

In [21]:
# Push the test rows through the encoder and scaler that were already fitted
# on the training data (transform only, nothing is re-fitted here).
# Every intermediate frame keeps test's index so that pd.concat(axis=1), which
# aligns on index labels, cannot misalign rows, and so that predictions stay
# keyed to the same index used for the submission Id.
X_cat_encoded_final = pd.DataFrame(
    encoder.transform(test[x_cat_cols]).toarray(),
    columns=encoder.get_feature_names_out(),
    index=test.index,
)
X_encoded_final = pd.concat([test[x_num_cols], X_cat_encoded_final], axis=1)
X_scaled_final = pd.DataFrame(
    scaler.transform(X_encoded_final),
    columns=X_encoded_final.columns,
    index=X_encoded_final.index,
)

In [22]:
# Predict the target for the test rows with the fitted XGBoost model.
y_final = xgb_model.predict(X_scaled_final)

In [23]:
# Display the predictions as a quick sanity check.
y_final

array([183728.94, 307496.22, 116881.37, ..., 164817.02, 247110.67,
       320873.5 ], dtype=float32)

In [24]:
# Write the submission file: one row per test record, with Id taken from the
# test frame's index and Predicted holding the model output.
submission = pd.DataFrame({'Id': test.index, 'Predicted': y_final})
submission.to_csv('prediction.csv', index=False)