In [3]:
import pandas as pd
import numpy as np

In [4]:
def data_cleaning(train):
    # split 'MONTH' column into 'YEAR' and 'MONTH' columns
    train[['YEAR', 'MONTH']] = train['MONTH'].str.split('-', expand=True).astype(int)
    # ensure consistent formatting in 'FLAT_TYPE' column
    train['FLAT_TYPE'] = train['FLAT_TYPE'].str.replace('-', ' ', regex=False)
    # drop 'ECO_CATEGORY' column as it has the same value for all rows
    train.drop(columns=['ECO_CATEGORY'], inplace=True)
    # create 'FLAT_AGE' column and drop 'LEASE_COMMENCE_DATA' column
    train['FLAT_AGE'] = train['YEAR'] - train['LEASE_COMMENCE_DATA']
    train.drop(columns=['LEASE_COMMENCE_DATA'], inplace=True)
    return train

def merge_hdb_info(train, hdb_info):
    train[['BLOCK', 'STREET']] = train[['BLOCK', 'STREET']].apply(lambda x: x.str.lower())
    hdb_info[['BLOCK', 'ADDRESS']] = hdb_info[['BLOCK', 'ADDRESS']].apply(lambda x: x.str.lower())
    train_merge = train.merge(hdb_info, 
                          how='left', 
                          left_on=['BLOCK', 'STREET'], 
                          right_on=['BLOCK', 'ADDRESS'],
                          indicator=True,
                          suffixes=('', '_HDB'))
    train_merge.drop(columns=['BLOCK', 'STREET', 'TOWN_HDB', 'ADDRESS', 'POSTAL_CODE', '_merge'], inplace=True)
    return train_merge


In [None]:
folder_path = "./dataset"
train = pd.read_csv(folder_path + 'train.csv', index_col= None)
train.drop_duplicates(keep='first', inplace=True) # only drop duplicates in training set
test = pd.read_csv(folder_path + 'test.csv', index_col= None)
hdb_info = pd.read_csv('./dataset/auxiliary-data/sg-hdb-block-details.csv', index_col=None)
cleaned_train = data_cleaning(train)
cleaned_test = data_cleaning(test)
merged_train = merge_hdb_info(cleaned_train, hdb_info)
merged_test = merge_hdb_info(cleaned_test, hdb_info)
# merged_train.to_csv('merged_train.csv', index=False)
# merged_test.to_csv('merged_test.csv', index=False)


In [6]:
merged_train.shape,merged_test.shape

((162570, 15), (50000, 14))

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import KFold

In [8]:
train = merged_train
test = merged_test

In [9]:
# reduce number of categories in FLAT_MODEL
flat_model_map = {
    '3gen': 'multi generation',
    'model a maisonette': 'maisonette',
    'improved maisonette': 'maisonette',
    'premium maisonette': 'maisonette',
    'premium apartment loft': 'premium apartment',
    'type s1': 'new generation',
    'type s2': 'new generation',
    'model a2': 'model a'
}
train['FLAT_MODEL'] = train['FLAT_MODEL'].map(flat_model_map).fillna(train['FLAT_MODEL'])
test['FLAT_MODEL'] = test['FLAT_MODEL'].map(flat_model_map).fillna(test['FLAT_MODEL'])
train['FLAT_MODEL'].value_counts()

FLAT_MODEL
model a              59733
improved             39773
new generation       20299
premium apartment    18186
simplified            6174
apartment             5891
maisonette            4838
standard              4363
dbss                  2604
adjoined flat          290
2 room                 225
multi generation       109
terrace                 85
Name: count, dtype: int64

In [10]:
def mean_floor_range(floor_range):
    if pd.isna(floor_range):
        return np.nan
    parts = floor_range.split(' to ')
    if len(parts) != 2:
        return np.nan
    try:
        low = int(parts[0])
        high = int(parts[1])
        return (low + high) / 2
    except:
        return np.nan
train['FLOOR_RANGE'] = train['FLOOR_RANGE'].apply(mean_floor_range)
train['FLOOR_RATIO'] = train['FLOOR_RANGE']/train['MAX_FLOOR']
test['FLOOR_RANGE'] = test['FLOOR_RANGE'].apply(mean_floor_range)
test['FLOOR_RATIO'] = test['FLOOR_RANGE']/test['MAX_FLOOR']

In [12]:
def target_encode_cv(df, feature, target, n_splits=5, random_state=42):
    df = df.copy()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    encoded = np.zeros(len(df))
    for train_index, val_index in kf.split(df):
        train_fold, val_fold = df.iloc[train_index], df.iloc[val_index]
        means = train_fold.groupby(feature)[target].mean()
        encoded[val_index] = val_fold[feature].map(means)
    # fill unseen categories with overall mean
    overall_mean = df[target].mean()
    encoded[np.isnan(encoded)] = overall_mean
    return encoded

# basic target encoding without CV
def target_encode(df, feature, target):
    df = df.copy()
    means = df.groupby(feature)[target].mean()
    encoded = df[feature].map(means)
    overall_mean = df[target].mean()
    encoded[np.isnan(encoded)] = overall_mean
    return encoded
# target encoding for test set using train set means
def target_encode_test(train_df, test_df, feature, target):
    train_df = train_df.copy()
    test_df = test_df.copy()
    means = train_df.groupby(feature)[target].mean()
    test_encoded = test_df[feature].map(means)
    overall_mean = train_df[target].mean()
    test_encoded[np.isnan(test_encoded)] = overall_mean
    return test_encoded

In [13]:
def feat_encoding(data):
    df = data.copy()
    scaler = StandardScaler()
    df[['FLOOR_AREA_SQM', 'FLAT_AGE']] = scaler.fit_transform(df[['FLOOR_AREA_SQM', 'FLAT_AGE']])
    flat_type_order = ['1 room', '2 room', '3 room', '4 room', '5 room', 'executive', 'multi generation']
    encoder = OrdinalEncoder(categories=[flat_type_order])
    df['FLAT_TYPE'] = encoder.fit_transform(df[['FLAT_TYPE']])
    df = pd.get_dummies(df, columns=['REGION', 'FLAT_MODEL'], prefix=['region', 'model'])
    if 'RESALE_PRICE' in df.columns:
        df['TOWN'] = target_encode_cv(df, 'TOWN', 'LOG_RESALE_PRICE')
    else:
        df['TOWN'] = target_encode_test(train, df, 'TOWN', 'LOG_RESALE_PRICE')
    return df

In [14]:
train['LOG_RESALE_PRICE'] = np.log(train['RESALE_PRICE'])
train_encoded = feat_encoding(train)
test_encoded = feat_encoding(test)
train_encoded.shape, test_encoded.shape

((162570, 33), (50000, 31))

In [15]:
train_encoded.columns

Index(['MONTH', 'TOWN', 'FLAT_TYPE', 'FLOOR_RANGE', 'FLOOR_AREA_SQM',
       'RESALE_PRICE', 'YEAR', 'FLAT_AGE', 'LATITUDE', 'LONGITUDE',
       'MAX_FLOOR', 'SUBZONE', 'PLANNING_AREA', 'FLOOR_RATIO',
       'LOG_RESALE_PRICE', 'region_central region', 'region_east region',
       'region_north region', 'region_north-east region', 'region_west region',
       'model_2 room', 'model_adjoined flat', 'model_apartment', 'model_dbss',
       'model_improved', 'model_maisonette', 'model_model a',
       'model_multi generation', 'model_new generation',
       'model_premium apartment', 'model_simplified', 'model_standard',
       'model_terrace'],
      dtype='object')

In [16]:
train_encoded.drop(columns=['FLOOR_RANGE', 'SUBZONE', 'PLANNING_AREA'], inplace=True)
test_encoded.drop(columns=['FLOOR_RANGE', 'SUBZONE', 'PLANNING_AREA'], inplace=True)

train_encoded.to_csv("train_encoded.csv", index = False)
test_encoded.to_csv("test_encoded.csv", index = False)


In [18]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

y = train_encoded['LOG_RESALE_PRICE']
X = train_encoded.drop(columns=['LOG_RESALE_PRICE'])

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="rmse",
        callbacks=[early_stopping(100), log_evaluation(0)],
    )
    
    preds = model.predict(X_valid)

    y_valid_exp = np.expm1(y_valid)
    preds_exp = np.expm1(preds)
    
    rmse = mean_squared_error(y_valid_exp, preds_exp, squared=False)
    rmse_scores.append(rmse)
    print(f"Fold {fold+1} RMSE: {rmse:.4f}")

print("Average RMSE:", np.mean(rmse_scores))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003904 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1359
[LightGBM] [Info] Number of data points in the train set: 130056, number of used features: 29
[LightGBM] [Info] Start training from score 13.100523
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[149]	valid_0's rmse: 0.00378124	valid_0's l2: 1.42978e-05
Fold 1 RMSE: 2089.1052
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002692 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1357
[LightGBM] [Info] Number of data points in the train set: 130056, number of used features: 29
[LightGBM] [Info] Start training from score 13.099515
Train

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

y_raw = np.expm1(y)   # Real price
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_model(model, X, y, model_name="Model"):
    log_rmse_list, real_rmse_list, mape_list = [], [], []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), 1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr_log, y_va_log = y.iloc[tr_idx], y.iloc[va_idx]
        y_va_raw = y_raw.iloc[va_idx]

        model.fit(X_tr, y_tr_log)
        preds_log = model.predict(X_va)

        rmse_log = mean_squared_error(y_va_log, preds_log, squared=False)
        preds_raw = np.expm1(preds_log)
        rmse_real = mean_squared_error(y_va_raw, preds_raw, squared=False)
        mape = mean_absolute_percentage_error(y_va_raw, preds_raw) * 100

        log_rmse_list.append(rmse_log)
        real_rmse_list.append(rmse_real)
        mape_list.append(mape)

        print(f"Fold {fold}: logRMSE={rmse_log:.5f}, realRMSE={rmse_real:.2f}, MAPE={mape:.2f}%")

    print("\n=== Summary for", model_name, "===")
    print(f"Average logRMSE : {np.mean(log_rmse_list):.5f}")
    print(f"Average realRMSE: {np.mean(real_rmse_list):.2f}")
    print(f"Average MAPE    : {np.mean(mape_list):.2f}%")


In [23]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
evaluate_model(model, X, y, "Linear Regression")


Fold 1: logRMSE=0.05947, realRMSE=46249.80, MAPE=4.37%
Fold 2: logRMSE=0.05994, realRMSE=47373.51, MAPE=4.39%
Fold 3: logRMSE=0.05913, realRMSE=47286.75, MAPE=4.33%
Fold 4: logRMSE=0.05970, realRMSE=48226.86, MAPE=4.38%
Fold 5: logRMSE=0.05887, realRMSE=45347.94, MAPE=4.36%

=== Summary for Linear Regression ===
Average logRMSE : 0.05942
Average realRMSE: 46896.97
Average MAPE    : 4.37%


In [24]:
from sklearn.linear_model import Ridge

model = Ridge(alpha=1.0, random_state=42)
evaluate_model(model, X, y, "Ridge Regression")


Fold 1: logRMSE=0.05947, realRMSE=46282.76, MAPE=4.36%
Fold 2: logRMSE=0.05994, realRMSE=47419.21, MAPE=4.39%
Fold 3: logRMSE=0.05913, realRMSE=47323.12, MAPE=4.33%
Fold 4: logRMSE=0.05971, realRMSE=48263.28, MAPE=4.38%
Fold 5: logRMSE=0.05886, realRMSE=45355.00, MAPE=4.36%

=== Summary for Ridge Regression ===
Average logRMSE : 0.05942
Average realRMSE: 46928.67
Average MAPE    : 4.36%


In [25]:
from sklearn.linear_model import Lasso

model = Lasso(alpha=0.0005, max_iter=20000, random_state=42)
evaluate_model(model, X, y, "Lasso Regression")


Fold 1: logRMSE=0.06136, realRMSE=48458.04, MAPE=4.42%
Fold 2: logRMSE=0.06179, realRMSE=50059.09, MAPE=4.44%
Fold 3: logRMSE=0.06108, realRMSE=49723.54, MAPE=4.40%
Fold 4: logRMSE=0.06154, realRMSE=50526.99, MAPE=4.43%
Fold 5: logRMSE=0.06081, realRMSE=46953.70, MAPE=4.42%

=== Summary for Lasso Regression ===
Average logRMSE : 0.06132
Average realRMSE: 49144.27
Average MAPE    : 4.42%


In [27]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=1500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method="hist",
    n_jobs=-1,
    random_state=42
)
evaluate_model(model, X, y, "XGBoost")


Fold 1: logRMSE=0.00608, realRMSE=4450.35, MAPE=0.32%
Fold 2: logRMSE=0.00628, realRMSE=5186.94, MAPE=0.31%
Fold 3: logRMSE=0.00576, realRMSE=4351.43, MAPE=0.31%
Fold 4: logRMSE=0.00582, realRMSE=4767.50, MAPE=0.31%
Fold 5: logRMSE=0.00584, realRMSE=4453.47, MAPE=0.31%

=== Summary for XGBoost ===
Average logRMSE : 0.00596
Average realRMSE: 4641.94
Average MAPE    : 0.31%


In [29]:
from sklearn.svm import SVR

model = Pipeline([
    ("scaler", StandardScaler()),
    ("svr", SVR(C=5.0, epsilon=0.1, kernel="rbf"))
])
evaluate_model(model, X, y, "Support Vector Regression (RBF)")


Fold 1: logRMSE=0.03766, realRMSE=19876.82, MAPE=3.00%
Fold 2: logRMSE=0.03625, realRMSE=19444.71, MAPE=2.84%
Fold 3: logRMSE=0.03855, realRMSE=20045.16, MAPE=3.08%
Fold 4: logRMSE=0.03825, realRMSE=20117.67, MAPE=3.05%
Fold 5: logRMSE=0.03857, realRMSE=20837.48, MAPE=3.03%

=== Summary for Support Vector Regression (RBF) ===
Average logRMSE : 0.03786
Average realRMSE: 20064.37
Average MAPE    : 3.00%
