# CS5228 Project
Baseline pipeline for HDB resale price prediction

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.compose import TransformedTargetRegressor

# Every input / output file lives under a single dataset directory.
DATA_DIR = Path("./dataset")
TRAIN, TEST, OUT = (
    DATA_DIR / filename
    for filename in ("train.csv", "test.csv", "submission.csv")
)

## 1. Load and Inspect Data

In [None]:
# Load the raw train / test splits from the configured CSV paths.
train_df, test_df = pd.read_csv(TRAIN), pd.read_csv(TEST)

# Row counts as a quick sanity check on both splits.
n_train, n_eval = train_df.shape[0], test_df.shape[0]
print('Number of train data: {}'.format(n_train))
print('Number of eval data: {}'.format(n_eval))

# Preview the first training rows as the cell's rich output.
train_df.head()

Number of train data: 162691
Number of eval data: 50000


Unnamed: 0,MONTH,TOWN,FLAT_TYPE,BLOCK,STREET,FLOOR_RANGE,FLOOR_AREA_SQM,FLAT_MODEL,ECO_CATEGORY,LEASE_COMMENCE_DATA,RESALE_PRICE
0,2020-10,woodlands,4 room,681B,woodlands drive 62,07 to 09,102.0,premium apartment,uncategorized,2000,420000.0
1,2021-07,bishan,4 room,264,bishan street 24,07 to 09,104.0,model a,uncategorized,1992,585000.0
2,2021-05,bukit panjang,4 room,520,jelapang road,19 to 21,102.0,model a,uncategorized,1998,450000.0
3,2021-08,punggol,4 room,121B,edgedale plains,16 to 18,93.0,model a,uncategorized,2017,465000.0
4,2023-05,hougang,5 room,997B,Buangkok Crescent,10 to 12,113.0,improved,uncategorized,2018,710000.0


## 2. Build Features

In [4]:
def _parse_floor_range(value):
    """Parse a ``"LOW to HIGH"`` floor label into ``(low, high, midpoint)``.

    Returns a triple of NaNs when the value is missing, does not split into
    exactly two parts on ``"to"``, or either part is not an integer.
    """
    missing = (np.nan, np.nan, np.nan)
    if pd.isna(value):
        return missing
    parts = str(value).split("to")
    if len(parts) != 2:
        return missing
    try:
        low, high = int(parts[0].strip()), int(parts[1].strip())
    except ValueError:
        # int() on a string only raises ValueError; anything else is a real bug
        # and should surface instead of being silently turned into NaN.
        return missing
    return low, high, (low + high) / 2


def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive model features from a raw resale-transaction frame.

    The input is not modified; a copy with extra columns is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``MONTH``, ``FLOOR_RANGE``,
        ``LEASE_COMMENCE_DATA``, ``BLOCK`` and ``STREET``. A zero-row frame
        is accepted and yields a zero-row result with all feature columns.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` without ``MONTH`` and with these columns appended:
        ``sale_year``, ``sale_month``, ``floor_low``, ``floor_high``,
        ``floor_mid``, ``flat_age``, ``remaining_lease``, ``BLOCK_PREFIX``
        and ``STREET_TOKEN``.
    """
    out = df.copy()

    # Unparseable MONTH values become NaT (errors="coerce") and therefore
    # NaN in sale_year / sale_month rather than raising.
    sale_date = pd.to_datetime(out["MONTH"], errors="coerce")
    out["sale_year"] = sale_date.dt.year
    out["sale_month"] = sale_date.dt.month

    # Parse the floor range. Building each column from the parsed list
    # (instead of unpacking zip(*...)) keeps this working on a zero-row frame.
    parsed = [_parse_floor_range(v) for v in out["FLOOR_RANGE"].astype(str)]
    out["floor_low"] = [triple[0] for triple in parsed]
    out["floor_high"] = [triple[1] for triple in parsed]
    out["floor_mid"] = [triple[2] for triple in parsed]

    out["flat_age"] = out["sale_year"] - out["LEASE_COMMENCE_DATA"]
    # Assumes a 99-year lease term -- TODO confirm against the data source.
    out["remaining_lease"] = 99 - out["flat_age"]

    # Leading digits of the block label; labels without any get "OTHER".
    # expand=False returns a Series, so a single column is assigned.
    out["BLOCK_PREFIX"] = (
        out["BLOCK"].astype(str).str.extract(r"^(\d+)", expand=False).fillna("OTHER")
    )
    # First two whitespace-separated words of the street name.
    # NOTE(review): the data preview shows mixed-case street names next to
    # lowercase ones; tokens are case-sensitive here -- confirm that is intended.
    out["STREET_TOKEN"] = out["STREET"].astype(str).str.split().str[:2].str.join(" ")

    return out.drop(columns=["MONTH"])
    
# Run the same feature engineering on both splits so their columns line up.
train_f, test_f = (build_features(frame) for frame in (train_df, test_df))
print(train_f.keys())

Index(['TOWN', 'FLAT_TYPE', 'BLOCK', 'STREET', 'FLOOR_RANGE', 'FLOOR_AREA_SQM',
       'FLAT_MODEL', 'ECO_CATEGORY', 'LEASE_COMMENCE_DATA', 'RESALE_PRICE',
       'sale_year', 'sale_month', 'floor_low', 'floor_high', 'floor_mid',
       'flat_age', 'remaining_lease', 'BLOCK_PREFIX', 'STREET_TOKEN'],
      dtype='object')


## 3. Encode

In [5]:
def target_mean_encode(train_df, test_df, cols, target, smoothing=0.0):
    """Add per-category target-mean columns to copies of the train and test frames.

    For every column ``c`` in ``cols`` a new column ``f"{c}_mean"`` is added to
    both frames. Its value is the mean of ``target`` over the training rows
    sharing that category (missing categories form their own group).
    Categories with no usable training mean, including categories that only
    occur in the test frame, fall back to the global training mean.

    Note that training rows are encoded with means that include their own
    target value, so the encoded training features are optimistic for rare
    categories. ``smoothing`` can be used to dampen that effect.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Input frames; neither is modified. ``train_df`` must contain ``target``.
    cols : iterable of str
        Categorical columns present in both frames.
    target : str
        Name of the target column in ``train_df``.
    smoothing : float, default 0.0
        Pseudo-count ``m`` that shrinks each category mean toward the global
        mean: ``(n * category_mean + m * global_mean) / (n + m)``, where ``n``
        is the number of non-missing targets in the category. ``0`` keeps the
        plain category mean.

    Returns
    -------
    (train, test, enc_cols)
        The two augmented copies and the list of added column names, in the
        order of ``cols``.

    Raises
    ------
    ValueError
        If ``smoothing`` is negative.
    """
    if smoothing < 0:
        raise ValueError("smoothing must be non-negative")

    train, test = train_df.copy(), test_df.copy()
    global_mean = train[target].mean()
    enc_cols = []
    for c in cols:
        grouped = train.groupby(c, dropna=False)[target]
        means = grouped.mean()
        if smoothing > 0:
            # Algebraically (n*mean + m*global) / (n + m), written as a
            # shrinkage step from the category mean toward the global mean.
            counts = grouped.count()
            means = means + (global_mean - means) * (smoothing / (counts + smoothing))
        enc_name = f"{c}_mean"
        train[enc_name] = train[c].map(means).fillna(global_mean)
        test[enc_name] = test[c].map(means).fillna(global_mean)
        enc_cols.append(enc_name)
    return train, test, enc_cols

# Categorical columns that get replaced by per-category target means.
cat_cols = [
    "TOWN",
    "FLAT_TYPE",
    "FLAT_MODEL",
    "ECO_CATEGORY",
    "BLOCK_PREFIX",
    "STREET_TOKEN",
]
train_f, test_f, enc_cols = target_mean_encode(
    train_f, test_f, cat_cols, "RESALE_PRICE"
)

# Numeric columns that are used as they are.
base_numeric = [
    "FLOOR_AREA_SQM",
    "sale_year",
    "sale_month",
    "floor_low",
    "floor_high",
    "floor_mid",
    "flat_age",
    "remaining_lease",
]
# Final feature order: raw numerics first, then the encoded categoricals.
feat_cols = [*base_numeric, *enc_cols]

y = train_f["RESALE_PRICE"]
X, X_test = train_f[feat_cols], test_f[feat_cols]

## 4. Train

Ridge regression model (log-transformed target):
- score 0922: 77014.34495

In [6]:
# Ridge fitted on log1p(price); predictions are mapped back with expm1.
log_target_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=5.0, random_state=42),
    func=np.log1p,
    inverse_func=np.expm1,
)
# Median-impute missing feature values before they reach the regressor.
model = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("regressor", log_target_ridge),
])

# Hold out 20% of the training rows for validation.
# NOTE(review): the *_mean features in X were computed from all training rows
# before this split, so the hold-out score may be optimistic -- confirm.
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_tr, y_tr)
va_pred = model.predict(X_va)

mse = mean_squared_error(y_va, va_pred)
rmse = np.sqrt(mse)
print(f"Hold-out RMSE: {rmse:.2f} SGD")

# Refit on every training row before predicting the test set.
model.fit(X, y)
test_pred = model.predict(X_test)

# Submission rows are numbered 0..n-1 in test-set order.
submission_ids = np.arange(len(test_pred), dtype=int)
submission = pd.DataFrame({"Id": submission_ids, "Predicted": test_pred})
submission.to_csv("submission_ridge.csv", index=False)
submission.head()

Hold-out RMSE: 76139.63 SGD


Unnamed: 0,Id,Predicted
0,0,520601.875287
1,1,570418.117116
2,2,543395.068194
3,3,440195.859863
4,4,570404.392821
