# CS5228 Project
Baseline pipeline for HDB resale price prediction

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder


## 1. Data Preprocess

In [2]:
DATA_DIR = Path("./dataset")
TRAIN = DATA_DIR / "train.csv"
TEST = DATA_DIR / "test.csv"
OUT = DATA_DIR / "submission.csv"

train_save_path = DATA_DIR / "train_clean.csv"
test_save_path = DATA_DIR / "test_clean.csv"

train_df = pd.read_csv(TRAIN)
test_df = pd.read_csv(TEST)

# 统一flat type格式
train_df["FLAT_TYPE"] = train_df["FLAT_TYPE"].str.replace(r"[-]+", " ", regex=True)
train_df["FLAT_TYPE"] = train_df["FLAT_TYPE"].str.replace(r"\s+", " ", regex=True).str.strip()

test_df["FLAT_TYPE"] = test_df["FLAT_TYPE"].str.replace(r"[-]+", " ", regex=True)
test_df["FLAT_TYPE"] = test_df["FLAT_TYPE"].str.replace(r"\s+", " ", regex=True).str.strip()

#FLOOR Range
train_df[["FLOOR_LOW", "FLOOR_HIGH"]] = train_df["FLOOR_RANGE"].str.split(" to ", expand=True)
train_df["FLOOR_LOW"] = train_df["FLOOR_LOW"].astype(int)
train_df["FLOOR_HIGH"] = train_df["FLOOR_HIGH"].astype(int)

test_df[["FLOOR_LOW", "FLOOR_HIGH"]] = test_df["FLOOR_RANGE"].str.split(" to ", expand=True)
test_df["FLOOR_LOW"] = test_df["FLOOR_LOW"].astype(int)
test_df["FLOOR_HIGH"] = test_df["FLOOR_HIGH"].astype(int)

# train_df.to_csv(train_save_path, index=False)
# test_df.to_csv(test_save_path, index=False)
train_df.head()


Unnamed: 0,MONTH,TOWN,FLAT_TYPE,BLOCK,STREET,FLOOR_RANGE,FLOOR_AREA_SQM,FLAT_MODEL,ECO_CATEGORY,LEASE_COMMENCE_DATA,RESALE_PRICE,FLOOR_LOW,FLOOR_HIGH
0,2020-10,woodlands,4 room,681B,woodlands drive 62,07 to 09,102.0,premium apartment,uncategorized,2000,420000.0,7,9
1,2021-07,bishan,4 room,264,bishan street 24,07 to 09,104.0,model a,uncategorized,1992,585000.0,7,9
2,2021-05,bukit panjang,4 room,520,jelapang road,19 to 21,102.0,model a,uncategorized,1998,450000.0,19,21
3,2021-08,punggol,4 room,121B,edgedale plains,16 to 18,93.0,model a,uncategorized,2017,465000.0,16,18
4,2023-05,hougang,5 room,997B,Buangkok Crescent,10 to 12,113.0,improved,uncategorized,2018,710000.0,10,12


## 2. Build Features

In [3]:
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # 计算楼层平均区间
    out["FLOOR_AVE"] = (out["FLOOR_LOW"] + out["FLOOR_HIGH"]) / 2
    # 计算楼层年龄
    out["MONTH"] = pd.to_datetime(out["MONTH"], errors="coerce")
    sale_year = out["MONTH"].dt.year
    out["HDB_AGE"] = sale_year - out["LEASE_COMMENCE_DATA"]
    return out

train_df = build_features(train_df)
test_df = build_features(test_df)

# 计算remaining lease
#drop column：ECO_CATEGORY,BLOCK
train_df.drop(columns=["ECO_CATEGORY","BLOCK","FLOOR_RANGE","FLOOR_LOW","FLOOR_HIGH","TOWN","STREET"], inplace=True)
test_df.drop(columns=["ECO_CATEGORY","BLOCK","FLOOR_RANGE","FLOOR_LOW","FLOOR_HIGH","TOWN","STREET"], inplace=True)
train_df.head()

Unnamed: 0,MONTH,FLAT_TYPE,FLOOR_AREA_SQM,FLAT_MODEL,LEASE_COMMENCE_DATA,RESALE_PRICE,FLOOR_AVE,HDB_AGE
0,2020-10-01,4 room,102.0,premium apartment,2000,420000.0,8.0,20
1,2021-07-01,4 room,104.0,model a,1992,585000.0,8.0,29
2,2021-05-01,4 room,102.0,model a,1998,450000.0,20.0,23
3,2021-08-01,4 room,93.0,model a,2017,465000.0,17.0,4
4,2023-05-01,5 room,113.0,improved,2018,710000.0,11.0,5


## 3. Encode

In [4]:
# flat type encoding
def encode(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    mapping = {
        "1 room": 1,
        "2 room": 2,
        "3 room": 3,
        "4 room": 4,
        "5 room": 5,
        "executive": 6,
        "multi generation": 7
    }
    out["FLAT_TYPE_ENC"] = out["FLAT_TYPE"].map(mapping)
    out.drop(columns=["FLAT_TYPE"], inplace=True)

    le = LabelEncoder()
    out["FLAT_MODEL_ENC"] = le.fit_transform(out["FLAT_MODEL"])
    out.drop(columns=["FLAT_MODEL"], inplace=True)

    def to_month_index(s):
        return s.dt.year * 12 + s.dt.month
    out["MONTH"] = pd.to_datetime(out["MONTH"], errors="coerce")
    out["TIME_INDEX"] = to_month_index(out["MONTH"])
    month=out["MONTH"].dt.month
    out["M_sin"]=np.sin(month*2*np.pi/12)
    out["M_cos"]=np.cos(month*2*np.pi/12)
    out["QTR"]=out["MONTH"].dt.quarter
    out.drop(columns=["MONTH"], inplace=True)

    return out

train_df = encode(train_df)
test_df = encode(test_df)
train_df.head()



Unnamed: 0,FLOOR_AREA_SQM,LEASE_COMMENCE_DATA,RESALE_PRICE,FLOOR_AVE,HDB_AGE,FLAT_TYPE_ENC,FLAT_MODEL_ENC,TIME_INDEX,M_sin,M_cos,QTR
0,102.0,2000,420000.0,8.0,20,4,13,24250,-0.866025,0.5,4
1,104.0,1992,585000.0,8.0,29,4,8,24259,-0.5,-0.866025,3
2,102.0,1998,450000.0,20.0,23,4,8,24257,0.5,-0.866025,2
3,93.0,2017,465000.0,17.0,4,4,8,24260,-0.866025,-0.5,3
4,113.0,2018,710000.0,11.0,5,5,5,24281,0.5,-0.866025,2


## 4. Train

ridge model:
- score 0922: 77014.34495

In [5]:
# 1) 特征/标签
X = train_df.drop(columns=["RESALE_PRICE"])
y = train_df["RESALE_PRICE"].astype(float)

# ---- 目标 log 变换 ----
y_log = np.log1p(y)  # log(1 + price)，对右偏分布更稳

# 2) 划分
X_train, X_valid, y_train_log, y_valid = train_test_split(
    X, y_log,    # 注意这里用 y_log
    test_size=0.2, random_state=42
)

# 3) 训练（在 log 空间评估）
model = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method="hist",
    random_state=42,
)

model.fit(X_train, y_train_log,)

# 4) 验证集评估（把预测从对数域还原再算 RMSE, 单位=SGD）
pred_valid_log = model.predict(X_valid)
pred_valid = np.expm1(pred_valid_log)         # 还原到价格
rmse = mean_squared_error(np.expm1(y_valid),  # y_valid 也要先还原
                          pred_valid) ** 0.5
print(f"Valid RMSE (SGD): {rmse:.2f}")

# 也可顺便看 RMSLE（对数域 RMSE，本质上就是 eval 的度量）
rmsle = mean_squared_error(y_valid, pred_valid_log) ** 0.5
print(f"Valid RMSLE (log space RMSE): {rmsle:.4f}")

# 5) 用最佳迭代数在全量数据上重训（同样用 log 目标）
best_n = getattr(model, "best_iteration", None)
if best_n is not None:
    best_n += 1  # best_iteration 是 0-based
    final_model = XGBRegressor(
        n_estimators=best_n,
        learning_rate=0.03,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        tree_method="hist",
        random_state=42,
    )
else:
    final_model = model

final_model.fit(X, y_log, verbose=False)

# 6) 生成测试集预测与提交（先预测 log，再还原）
pred_test_log = final_model.predict(test_df)
pred_test = np.expm1(pred_test_log)

sub = pd.DataFrame({
    "Id": np.arange(len(test_df), dtype=int),
    "Predicted": pred_test
})
sub_path = Path("./submission.csv")
sub.to_csv(sub_path, index=False)
print("Saved:", sub_path)
sub.head()

Valid RMSE (SGD): 74617.61
Valid RMSLE (log space RMSE): 0.1293
Saved: submission.csv


Unnamed: 0,Id,Predicted
0,0,532199.375
1,1,371686.78125
2,2,392740.5625
3,3,450487.71875
4,4,556720.9375
