In [8]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import nbformat

In [2]:
# Load the cleaned resale dataset. 'postal_code' is forced to str so pandas
# does not parse it as a number (presumably to keep leading zeros - confirm).
train_df = pd.read_csv('../data/cleaned_train_with_amenities.csv', dtype={'postal_code': str})
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162691 entries, 0 to 162690
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   town                       162691 non-null  object 
 1   flat_type                  162691 non-null  object 
 2   block                      162691 non-null  object 
 3   street                     162691 non-null  object 
 4   floor_area_sqm             162691 non-null  float64
 5   flat_model                 162691 non-null  object 
 6   lease_commence_data        162691 non-null  int64  
 7   resale_price               162691 non-null  float64
 8   postal_code                162691 non-null  object 
 9   latitude                   162691 non-null  float64
 10  longitude                  162691 non-null  float64
 11  max_floor                  162691 non-null  int64  
 12  subzone                    162691 non-null  object 
 13  height                     16

In [4]:
# Distance-to-amenity columns, kept as their own list.
distance_features = [
    'mrt_distance',
    'mall_distance',
    'primary_school_distance',
    'secondary_school_distance',
    'hawker_distance',
]

# Numeric columns used downstream (this list includes 'resale_price'),
# followed by the distance columns above.
selected_numeric_features = [
    'resale_price',
    'floor_area_sqm',
    'max_floor',
    'height',
    'index',
    'mrt_status',
    'age_at_sale',
    'flat_type_num',
    'flat_model_num',
    'longitude',
    'latitude',
] + distance_features

# Treat flats larger than 200 sqm as outliers: keep rows with floor_area_sqm <= 200.
is_regular_size = train_df['floor_area_sqm'] <= 200
train_df = train_df.loc[is_regular_size]

In [5]:
# Baseline CatBoost model: 80/20 random hold-out split, RMSE on the hold-out.
from catboost import CatBoostRegressor

# Column roles.
target_variable = 'resale_price'
selected_categorical_features = ['subzone','flat_model']

# Keep only the modelling columns (numeric columns first, then categorical).
train_df = train_df[selected_numeric_features + selected_categorical_features]

# Random 80% / 20% train / validation split with a fixed seed.
training_set, validation_set = train_test_split(train_df, test_size=0.2, random_state=42)

# Split each part into feature matrix and target vector.
X_train, y_train = training_set.drop(columns=[target_variable]), training_set[target_variable]
X_val, y_val = validation_set.drop(columns=[target_variable]), validation_set[target_variable]

# Cast the categorical columns that are present to the pandas 'category' dtype.
for frame in (X_train, X_val):
    for col in selected_categorical_features:
        if col in frame.columns:
            frame[col] = frame[col].astype('category')

# Positions of the categorical columns, looked up in X_train (the frame that is
# actually passed to fit) rather than in train_df.
present_categoricals = [name for name in selected_categorical_features if name in X_train.columns]
cat_features_indices = [X_train.columns.get_loc(name) for name in present_categoricals]
print('cat_features_indices used for CatBoost:', cat_features_indices)

catboost_settings = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    eval_metric='RMSE',
    random_seed=42,
    logging_level='Silent',
)
model = CatBoostRegressor(**catboost_settings)
# NOTE(review): the validation set is passed as eval_set with
# use_best_model=True and is also the set the RMSE below is computed on, so
# that RMSE may be somewhat optimistic - confirm whether a separate test set
# is evaluated elsewhere.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features_indices,
    eval_set=(X_val, y_val),
    use_best_model=True,
)

# Score the hold-out set. RMSE is taken as sqrt(MSE) so that scikit-learn
# versions without the `squared` keyword are supported.
y_val_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation RMSE: {rmse:.2f}")

cat_features_indices used for CatBoost: [15, 16]
Validation RMSE: 26640.29


In [20]:
# --- 1) Prepare data ---
#get relative_height
def get_relative_height(input_df):
    """Return a copy of ``input_df`` with floor-position features added.

    The numeric ``height`` code is decoded back to its floor-range label
    (e.g. ``5`` -> ``'04 to 06'``), the midpoint of that range is stored as
    ``floor_avg`` and ``relative_height`` is ``floor_avg / max_floor``.
    ``flat_type`` and ``flat_model`` are upper-cased.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the columns ``height``, ``max_floor``, ``flat_type`` and
        ``flat_model``. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the columns ``floor_range``, ``floor_avg`` and
        ``relative_height`` added. Rows whose ``height`` is not a known code
        get NaN in all three new columns.
    """
    df = input_df.copy()

    # Floor-range label -> numeric code; every code is the midpoint of its range.
    floor_range_mapping = {
        '01 to 03': 2,
        '04 to 06': 5,
        '07 to 09': 8,
        '10 to 12': 11,
        '13 to 15': 14,
        '16 to 18': 17,
        '19 to 21': 20,
        '22 to 24': 23,
        '25 to 27': 26,
        '28 to 30': 29,
        '31 to 33': 32,
        '34 to 36': 35,
        '37 to 39': 38,
        '40 to 42': 41,
        '43 to 45': 44,
        '46 to 48': 47,
        '49 to 51': 50,
    }
    reversed_mapping = {code: label for label, code in floor_range_mapping.items()}
    # This table used to encode '04 to 06' as 7, which breaks the midpoint
    # pattern and left height == 5 undecodable. 7 is not the code of any other
    # range, so it is still accepted as an alias for data written with the old
    # value. NOTE(review): confirm which code the upstream cleaning step emits.
    reversed_mapping.setdefault(7, '04 to 06')
    df['floor_range'] = df['height'].map(reversed_mapping)

    # Midpoint of every range, derived from the label text. Mapping labels to
    # midpoints (instead of using the .str accessor on the column) also works
    # when no height could be decoded or the frame is empty.
    floor_avg_by_label = {}
    for label in floor_range_mapping:
        low, high = (float(part) for part in label.split(' to '))
        floor_avg_by_label[label] = (low + high) / 2
    df['floor_avg'] = df['floor_range'].map(floor_avg_by_label).astype(float)
    df['relative_height'] = df['floor_avg'] / df['max_floor']

    # Normalise the casing of the two text columns.
    df["flat_type"] = df["flat_type"].str.upper()
    df["flat_model"] = df["flat_model"].str.upper()

    return df

In [None]:
import random

# Columns kept for the ensemble; 'resale_price' is the target, the rest are features.
use_cols = [
    'resale_price',
    'floor_area_sqm',
    'age_at_sale',
    'month_from2017',
    'flat_type',
    'flat_model',
    'hawker_distance',
    'mall_distance',
    'secondary_school_distance',
    'primary_school_distance',
    'mrt_distance',
    'relative_height',
    'subzone',
    'town'
]
train = pd.read_csv('../data/cleaned_train_with_amenities.csv', dtype={'postal_code': str})
train = get_relative_height(train)
# Columns from use_cols that are absent from the file are skipped silently;
# rows with a missing value in any kept column are dropped.
train = train.reindex(columns=[c for c in use_cols if c in train.columns]).dropna().copy()


X = train.drop(columns=["resale_price"])
y = train["resale_price"]

cat_features = ["flat_type", "flat_model","subzone", "town"]

# Settings shared by every ensemble member ("verbose": 100 logs every 100th iteration).
# NOTE(review): fit() below is called without an eval_set; CatBoost's
# overfitting detector works on an evaluation set, so "early_stopping_rounds"
# presumably has no effect here - confirm.
shared_params = {"iterations": 5000, "verbose": 100, "early_stopping_rounds": 100}

# Tuned hyper-parameter sets, one per ensemble member.
tuned_params = [
    {'learning_rate': 0.047513996053010106, 'depth': 10, 'l2_leaf_reg': 7.046020927148463, 'border_count': 208,
     'bagging_temperature': 0.013140315717808437, 'random_strength': 4.8957988602597755,
     'min_data_in_leaf': 10, 'subsample': 0.7023925659862831},
    {'learning_rate': 0.049416402725337184, 'depth': 10, 'l2_leaf_reg': 9.238143103709053, 'border_count': 253,
     'bagging_temperature': 0.04644939580506229, 'random_strength': 6.590286048089739,
     'min_data_in_leaf': 10, 'subsample': 0.7013741968685885},
    {'learning_rate': 0.04609436031415763, 'depth': 10, 'l2_leaf_reg': 7.21179644796382, 'border_count': 207,
     'bagging_temperature': 0.20712048748259987, 'random_strength': 6.050530410716024,
     'min_data_in_leaf': 13, 'subsample': 0.7247923252458766},
    {'learning_rate': 0.05134716194137222, 'depth': 10, 'l2_leaf_reg': 9.482716463630304, 'border_count': 225,
     'bagging_temperature': 0.002896495651214713, 'random_strength': 3.6021018636975657,
     'min_data_in_leaf': 10, 'subsample': 0.7026345127588408},
    {'learning_rate': 0.05004780310885287, 'depth': 10, 'l2_leaf_reg': 9.971286095901855, 'border_count': 235,
     'bagging_temperature': 0.21484066598183904, 'random_strength': 5.1869956019173635,
     'min_data_in_leaf': 11, 'subsample': 0.7296987846486419},
]

# Each member gets its own seed. The seeds come from a dedicated, fixed-seed
# generator so that a re-run reproduces the same ensemble without touching the
# global `random` state. The upper bound is an int: random.randint() rejects
# float arguments such as 1e6 with a TypeError on Python 3.12+.
seed_rng = random.Random(42)
best_params = [
    {**shared_params, **tuned, "random_state": seed_rng.randint(0, 1_000_000)}
    for tuned in tuned_params
]

# Fit one CatBoost model per parameter set on the full training data and keep
# every fitted model (previously the list was created but never filled).
models = []
for param in best_params:
    model = CatBoostRegressor(cat_features=cat_features, **param)
    model.fit(X, y)
    models.append(model)

0:	learn: 176736.8441831	total: 85.9ms	remaining: 7m 9s
100:	learn: 42974.7548630	total: 8.41s	remaining: 6m 47s
200:	learn: 35626.3738795	total: 17.5s	remaining: 6m 57s
300:	learn: 31716.3625723	total: 26.6s	remaining: 6m 55s
400:	learn: 29645.3820128	total: 35.2s	remaining: 6m 44s
500:	learn: 28216.8180789	total: 44s	remaining: 6m 35s
600:	learn: 27157.7100359	total: 52.9s	remaining: 6m 27s
700:	learn: 26328.4909724	total: 1m 1s	remaining: 6m 19s
800:	learn: 25731.4567649	total: 1m 10s	remaining: 6m 9s
900:	learn: 25231.1435446	total: 1m 19s	remaining: 6m 3s
1000:	learn: 24776.4482973	total: 1m 29s	remaining: 5m 55s
