Feature: Price Estimate Using CatBoost with the Best Parameters

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime

# ========================================================
# 1. Load data
# ========================================================
df = pd.read_csv('raw_data_main.csv')

# Exclude outliers (keep only rows where IS_OUTLIERS != 1).
# .copy() detaches the filtered frame from the one read from disk, so the
# column assignments below act on an independent DataFrame and do not raise
# SettingWithCopyWarning.
df = df[df['IS_OUTLIERS'] != 1].copy()

# Running month index: YEAR * 12 + MONTH_NUM
df['DATE_IDX'] = df['YEAR'] * 12 + df['MONTH_NUM']

# Log-transform target (log1p); the column now holds log-scale prices
df['RESALE_PRICE'] = np.log1p(df['RESALE_PRICE'])

# --------------------------------------------------------
# 2. Drop unwanted columns BEFORE preparing features
# --------------------------------------------------------
# errors='ignore' skips any listed column that is not present in the file.
drop_cols = ['IS_OUTLIERS', 'STOREY_RANGE', 'PRICE_PER_SQM', 'MONTH_NUM','PRICE_TIER','SEASON','AGE_GROUP']
df = df.drop(columns=drop_cols, errors='ignore')

# Define categorical variables to encode (only those actually present)
categorical_cols = ['TOWN', 'FLAT_TYPE']
categorical_cols = [col for col in categorical_cols if col in df.columns]

print(f"📌 Applying one-hot encoding on: {categorical_cols}")
df = pd.get_dummies(df, columns=categorical_cols, dtype=int)
print("✅ One-hot encoding complete.")
print("📌 Encoded columns preview:", df.columns.tolist()[:20])
print(df.head())

# Optional: sample smaller subset for quick experiments
#df = df.sample(20000, random_state=42)

# Quartile bin of the log price; used only to stratify the splits and
# removed again once the splits have been made.
df['price_bin'] = pd.qcut(df['RESALE_PRICE'], q=4, labels=False)

# --------------------------------------------------------
# 3. Train / Validation / Test split
# --------------------------------------------------------
# First hold out 20% of all rows as the test set, stratified on the price bin.
df_trainval, df_test = train_test_split(
    df, test_size=0.2, stratify=df['price_bin'], random_state=42
)

# Then split the remaining rows 75/25 into train and validation, again
# stratified on the price bin.
df_train, df_valid = train_test_split(
    df_trainval, test_size=0.25, stratify=df_trainval['price_bin'], random_state=42
)

# The bin column was only a stratification helper; strip it from every split.
df_train, df_valid, df_test = (
    part.drop(columns=['price_bin']) for part in (df_train, df_valid, df_test)
)

# ========================================================
# 4. Prepare features and target (no further dropping needed)
# ========================================================
# Every column except the (log-scale) RESALE_PRICE target is a feature.
X_train = df_train.drop(columns=['RESALE_PRICE'])
y_train = df_train['RESALE_PRICE']

X_valid = df_valid.drop(columns=['RESALE_PRICE'])
y_valid = df_valid['RESALE_PRICE']

X_test  = df_test.drop(columns=['RESALE_PRICE'])
y_test  = df_test['RESALE_PRICE']

# Ensure all numeric: values that cannot be parsed become NaN (errors='coerce')
# and every NaN - coerced or already present - is then replaced by 0.
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_valid = X_valid.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test  = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

# Optional: sanity check - count of columns per dtype in each split
for name, dfX in [("train", X_train), ("valid", X_valid), ("test", X_test)]:
    print(f"✅ {name} set numeric dtypes:", dfX.dtypes.value_counts())

# Single status line with one timestamp (previously the timestamp was printed twice).
timestamp = datetime.now().strftime("%H:%M:%S")
print(f"✅ Data ready for training (train/valid/test) - {timestamp}")


📌 Applying one-hot encoding on: ['TOWN', 'FLAT_TYPE']
✅ One-hot encoding complete.
📌 Encoded columns preview: ['FLOOR_AREA_SQM', 'RESALE_PRICE', 'AGE', 'YEAR', 'STOREY_NUMERIC', 'DATE_IDX', 'TOWN_ANG MO KIO', 'TOWN_BEDOK', 'TOWN_BISHAN', 'TOWN_BUKIT BATOK', 'TOWN_BUKIT MERAH', 'TOWN_BUKIT PANJANG', 'TOWN_BUKIT TIMAH', 'TOWN_CENTRAL AREA', 'TOWN_CHOA CHU KANG', 'TOWN_CLEMENTI', 'TOWN_GEYLANG', 'TOWN_HOUGANG', 'TOWN_JURONG EAST', 'TOWN_JURONG WEST']
   FLOOR_AREA_SQM  RESALE_PRICE  AGE  YEAR  STOREY_NUMERIC  DATE_IDX  \
0              59     12.842652   50  2012               8     24150   
1              65     12.906694   50  2012               8     24150   
2              65     12.945629   50  2012               8     24150   
3              65     12.985400   50  2012               8     24150   
4              68     12.994532   49  2012               8     24150   

   TOWN_ANG MO KIO  TOWN_BEDOK  TOWN_BISHAN  TOWN_BUKIT BATOK  ...  \
0                0           0       

In [4]:
from datetime import datetime  # Make sure you have this imported

def evaluate(model, X, y, label="Model"):
    """Score a model in the original price scale and print a summary.

    Both the predictions and the targets are mapped back with ``np.expm1``
    before the metrics are computed, so the reported errors are in price units
    rather than log units.

    Relies on ``np``, ``datetime`` and the ``sklearn.metrics`` functions
    ``mean_squared_error``, ``mean_absolute_error`` and ``r2_score`` being
    available in the notebook namespace at call time.

    Args:
        model: Object exposing ``predict(X)``; its output is assumed to be in
            log1p scale (it is passed through ``np.expm1``).
        X: Features handed to ``model.predict``.
        y: True targets in log1p scale.
        label: Name of the model / data split, used in the printed header.

    Returns:
        tuple: ``(rmse, mae, r2)`` computed on the back-transformed values.
    """
    preds = np.expm1(model.predict(X))
    y_true = np.expm1(y)
    rmse = np.sqrt(mean_squared_error(y_true, preds))
    mae = mean_absolute_error(y_true, preds)
    r2 = r2_score(y_true, preds)

    # The header uses only the caller's label: a hard-coded "Test" here made a
    # validation run print as "... Validation Test Metrics".
    print(f"\n📊 {label} Metrics at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:")
    print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R²: {r2:.4f}")

    return rmse, mae, r2

In [5]:
# ========================================================
# 🧠 Load best CatBoost parameters and train model
# ========================================================

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import json
import numpy as np

# 1. Load best parameters from JSON
with open("best_catboost_params.json", "r") as f:
    best_params = json.load(f)

# Keep only real CatBoost hyper-parameters. Besides the tuning R² score, the
# file also holds the 'features_used' list that step 6 below writes into it,
# so on any run after the first that key would otherwise be forwarded to
# CatBoostRegressor as an unknown keyword argument.
for meta_key in ('r2', 'features_used'):
    best_params.pop(meta_key, None)

# 2. Train the model on X_train
model = CatBoostRegressor(
    verbose=1000,
    random_state=42,
    loss_function='RMSE',
    **best_params
)

# The validation split drives early stopping (50 rounds).
model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=50
)

# 3. Predict on validation set (same scale as y_valid)
y_pred = model.predict(X_valid)

# 4. Evaluate model
rmse, mae, r2 = evaluate(model, X_valid, y_valid, label="CatBoost Validation")

print(f"\n📊 Validation Results")
print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R²: {r2:.4f}")

# 5. (Optional) Save the trained model
joblib.dump(model, "catboost_model_valid.pkl")
print("✅ Trained CatBoost model saved to 'catboost_model_valid.pkl'")

# 6. Save the feature names used during training
features_used = X_train.columns.tolist()
# Write a merged copy so `best_params` itself stays a pure hyper-parameter
# dict that can still be unpacked into CatBoostRegressor.
params_with_features = {**best_params, 'features_used': features_used}
with open("best_catboost_params.json", "w") as f:
    json.dump(params_with_features, f)
print("✅ Updated 'best_catboost_params.json' with feature names")



0:	learn: 0.2925791	test: 0.2936916	best: 0.2936916 (0)	total: 168ms	remaining: 4m 27s
1000:	learn: 0.0658374	test: 0.0681861	best: 0.0681861 (1000)	total: 17.3s	remaining: 10.2s
1593:	learn: 0.0629804	test: 0.0662806	best: 0.0662806 (1593)	total: 30.9s	remaining: 0us

bestTest = 0.06628056504
bestIteration = 1593


📊 CatBoost Validation Test Metrics at 2025-09-23 11:32:50:
RMSE: 35,249.68 | MAE: 24,942.93 | R²: 0.9541

📊 Validation Results
RMSE: 35,249.68 | MAE: 24,942.93 | R²: 0.9541
✅ Trained CatBoost model saved to 'catboost_model_valid.pkl'
✅ Updated 'best_catboost_params.json' with feature names


In [44]:
# Persist the list of feature names the model was trained on.
joblib.dump(value=features_used, filename="CatBoost_features_used.pkl")

['CatBoost_features_used.pkl']

In [11]:
# ========================================================
# 🧠 Train on different year ranges and data sizes with the best CatBoost parameters
# ========================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import json

# ========================================================
# 1. Load data & Filter by year
# ========================================================
df = pd.read_csv('raw_data_main.csv')

# NOTE(review): unlike the first training cell, rows flagged IS_OUTLIERS == 1
# are NOT removed here - only the flag column is dropped below. Confirm that
# training on the outliers is intentional for this experiment.

# --- Optional year filter (currently disabled, so all years are used) ---
# Adjust the bounds to train on a different year range.
# df = df[(df['YEAR'] >= 2015) & (df['YEAR'] <= 2025)].copy()
# The .copy() ensures we're working on a new DataFrame to avoid a SettingWithCopyWarning.

# Running month index: YEAR * 12 + MONTH_NUM (computed before YEAR is dropped)
df['DATE_IDX'] = df['YEAR'] * 12 + df['MONTH_NUM']

# Log-transform target (log1p); the column now holds log-scale prices
df['RESALE_PRICE'] = np.log1p(df['RESALE_PRICE'])

# --------------------------------------------------------
# 2. Drop unwanted columns BEFORE preparing features
# --------------------------------------------------------
# errors='ignore' skips any listed column that is not present in the file.
drop_cols = ['IS_OUTLIERS', 'STOREY_RANGE', 'PRICE_PER_SQM', 'YEAR', 'MONTH_NUM','PRICE_TIER','SEASON','AGE_GROUP']
df = df.drop(columns=drop_cols, errors='ignore')

# Define categorical variables to encode (only those actually present)
categorical_cols = ['TOWN', 'FLAT_TYPE']
categorical_cols = [col for col in categorical_cols if col in df.columns]

print(f"📌 Applying one-hot encoding on: {categorical_cols}")
df = pd.get_dummies(df, columns=categorical_cols, dtype=int)
print("✅ One-hot encoding complete.")
print("📌 Encoded columns preview:", df.columns.tolist()[:20])
print(df.head())

# Optional: sample smaller subset for quick experiments
#df = df.sample(min(100000, len(df)), random_state=42)
# Updated to handle datasets smaller than 100k

# Quartile bin of the log price; used only to stratify the splits and
# removed again once the splits have been made.
df['price_bin'] = pd.qcut(df['RESALE_PRICE'], q=4, labels=False)

# --------------------------------------------------------
# 3. Train / Validation / Test split
# --------------------------------------------------------
# First hold out 20% of all rows as the test set, stratified on the price bin.
df_trainval, df_test = train_test_split(
    df, test_size=0.2, stratify=df['price_bin'], random_state=42
)

# Then split the remaining rows 75/25 into train and validation, again
# stratified on the price bin.
df_train, df_valid = train_test_split(
    df_trainval, test_size=0.25, stratify=df_trainval['price_bin'], random_state=42
)

# The bin column was only a stratification helper; strip it from every split.
df_train, df_valid, df_test = (
    part.drop(columns=['price_bin']) for part in (df_train, df_valid, df_test)
)

# ========================================================
# 4. Prepare features and target (no further dropping needed)
# ========================================================
# Every column except the (log-scale) RESALE_PRICE target is a feature.
X_train = df_train.drop(columns=['RESALE_PRICE'])
y_train = df_train['RESALE_PRICE']

X_valid = df_valid.drop(columns=['RESALE_PRICE'])
y_valid = df_valid['RESALE_PRICE']

X_test = df_test.drop(columns=['RESALE_PRICE'])
y_test = df_test['RESALE_PRICE']

# Ensure all numeric: values that cannot be parsed become NaN (errors='coerce')
# and every NaN - coerced or already present - is then replaced by 0.
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_valid = X_valid.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

# Optional: sanity check - count of columns per dtype in each split
for name, dfX in [("train", X_train), ("valid", X_valid), ("test", X_test)]:
    print(f"✅ {name} set numeric dtypes:", dfX.dtypes.value_counts())

timestamp = datetime.now().strftime("%H:%M:%S")
print(f"✅ Data ready for training (train/valid/test) - {timestamp}")


# ========================================================
# 5. Load best CatBoost parameters and train model
# ========================================================
def evaluate_actual_scale(model, X, y_log, label=""):
    """Evaluate model predictions in the original price scale.

    Both the predictions and the targets are mapped back with ``np.expm1``
    before the metrics are computed, then a two-line summary is printed.

    Args:
        model: Object exposing ``predict(X)``; its output is assumed to be in
            log1p scale (it is passed through ``np.expm1``).
        X: Features handed to ``model.predict``.
        y_log: True targets in log1p scale.
        label: Name of the model / data split, used in the printed header.

    Returns:
        tuple: ``(rmse, mae, r2)`` computed on the back-transformed values.
    """
    # Inverse transform (log1p -> price)
    y_pred = np.expm1(model.predict(X))
    y_true = np.expm1(y_log)

    # Metrics
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n📊 {label} Results (Actual Price Scale)")
    print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R²: {r2:.4f}")
    return rmse, mae, r2
    
# 1. Load best parameters from JSON
with open("best_catboost_params.json", "r") as f:
    best_params = json.load(f)

# Keep only real CatBoost hyper-parameters. Besides the tuning R² score, the
# first training cell also writes a 'features_used' list into this same file;
# forwarding it to CatBoostRegressor would fail as an unknown keyword argument.
for meta_key in ('r2', 'features_used'):
    best_params.pop(meta_key, None)

# 2. Train the model on X_train
model = CatBoostRegressor(
    verbose=100, # Reduced verbosity for cleaner output
    random_state=42,
    loss_function='RMSE',
    **best_params
)

# The validation split drives early stopping (50 rounds).
model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=50
)

# 3. Evaluate in the original price scale on every split

# Training set
evaluate_actual_scale(model, X_train, y_train, label="CatBoost Training")

# Validation set
evaluate_actual_scale(model, X_valid, y_valid, label="CatBoost Validation")

# Test set
evaluate_actual_scale(model, X_test, y_test, label="CatBoost Test")


# 4. Save the trained model
joblib.dump(model, "catboost_model_valid_test.pkl")
print("✅ Trained CatBoost model saved to 'catboost_model_valid_test.pkl'")

📌 Applying one-hot encoding on: ['TOWN', 'FLAT_TYPE']
✅ One-hot encoding complete.
📌 Encoded columns preview: ['FLOOR_AREA_SQM', 'RESALE_PRICE', 'AGE', 'STOREY_NUMERIC', 'DATE_IDX', 'TOWN_ANG MO KIO', 'TOWN_BEDOK', 'TOWN_BISHAN', 'TOWN_BUKIT BATOK', 'TOWN_BUKIT MERAH', 'TOWN_BUKIT PANJANG', 'TOWN_BUKIT TIMAH', 'TOWN_CENTRAL AREA', 'TOWN_CHOA CHU KANG', 'TOWN_CLEMENTI', 'TOWN_GEYLANG', 'TOWN_HOUGANG', 'TOWN_JURONG EAST', 'TOWN_JURONG WEST', 'TOWN_KALLANG/WHAMPOA']
   FLOOR_AREA_SQM  RESALE_PRICE  AGE  STOREY_NUMERIC  DATE_IDX  \
0              67     12.868763   44              11     24153   
1              70     12.886644   49               8     24153   
2              59     12.889172   54               5     24153   
3              67     12.906694   43               8     24153   
4              67     12.911645   49              11     24153   

   TOWN_ANG MO KIO  TOWN_BEDOK  TOWN_BISHAN  TOWN_BUKIT BATOK  \
0                0           0            0                 0 