Feature : Price Estimate using XGBoost, RandomForest, LightGBM, CatBoost

In [7]:
pip install --upgrade xgboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
import xgboost
from xgboost import XGBRegressor
print(xgboost.__version__)
print(xgboost.__file__)
print(XGBRegressor.__module__)

3.0.5
C:\Users\sit\AppData\Roaming\Python\Python312\site-packages\xgboost\__init__.py
xgboost.sklearn


In [10]:
import sys
print(sys.executable)

C:\anaconda3\python.exe


In [1]:
import xgboost
from xgboost import XGBRegressor

print(xgboost.__version__)  # should be 3.1+ (or latest)
print(XGBRegressor.__module__)

3.0.5
xgboost.sklearn


In [8]:
import sys
!{sys.executable} -m pip install --upgrade scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime

# ======================================================
# 1. Load data
# ========================================================
df = pd.read_csv('raw_data_main.csv')

# Exclude outliers (remove rows where IS_OUTLIERS = 1)
df = df[df['IS_OUTLIERS'] != 1]

# Create DATE_IDX (optional)
df['DATE_IDX'] = df['YEAR'] * 12 + df['MONTH_NUM']

# Log-transform target
df['RESALE_PRICE'] = np.log1p(df['RESALE_PRICE'])

# --------------------------------------------------------
# 2. Drop unwanted columns BEFORE preparing features
# --------------------------------------------------------
drop_cols = ['IS_OUTLIERS', 'STOREY_RANGE', 'PRICE_PER_SQM', 'YEAR', 'MONTH_NUM','PRICE_TIER','SEASON','AGE_GROUP']
df = df.drop(columns=drop_cols, errors='ignore')

# Define categorical variables to encode
categorical_cols = ['TOWN', 'FLAT_TYPE']
categorical_cols = [col for col in categorical_cols if col in df.columns]

print(f"üìå Applying one-hot encoding on: {categorical_cols}")
df = pd.get_dummies(df, columns=categorical_cols, dtype=int)
print("‚úÖ One-hot encoding complete.")
print("üìå Encoded columns preview:", df.columns.tolist()[:20])
print(df.head())

# Optional: sample smaller subset for quick experiments
df = df.sample(20000, random_state=42)

# Create bin for stratified sampling
df['price_bin'] = pd.qcut(df['RESALE_PRICE'], q=4, labels=False)

# --------------------------------------------------------
# 3. Train / Validation / Test split
# --------------------------------------------------------
df_trainval, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['price_bin'],
    random_state=42
)

df_train, df_valid = train_test_split(
    df_trainval,
    test_size=0.25,
    stratify=df_trainval['price_bin'],
    random_state=42
)

# Drop helper column used for stratification
df_train = df_train.drop(columns=['price_bin'])
df_valid = df_valid.drop(columns=['price_bin'])
df_test  = df_test.drop(columns=['price_bin'])

# ========================================================
# 4. Prepare features and target (no further dropping needed)
# ========================================================
X_train = df_train.drop(columns=['RESALE_PRICE'])
y_train = df_train['RESALE_PRICE']

X_valid = df_valid.drop(columns=['RESALE_PRICE'])
y_valid = df_valid['RESALE_PRICE']

X_test  = df_test.drop(columns=['RESALE_PRICE'])
y_test  = df_test['RESALE_PRICE']

# Ensure all numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_valid = X_valid.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test  = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

# Optional: sanity check
for name, dfX in [("train", X_train), ("valid", X_valid), ("test", X_test)]:
    print(f"‚úÖ {name} set numeric dtypes:", dfX.dtypes.value_counts())

timestamp = datetime.now().strftime("%H:%M:%S")
print(f"{timestamp} \n‚úÖ Data ready for training (train/valid/test) - {timestamp}")


üìå Applying one-hot encoding on: ['TOWN', 'FLAT_TYPE']
‚úÖ One-hot encoding complete.
üìå Encoded columns preview: ['FLOOR_AREA_SQM', 'RESALE_PRICE', 'AGE', 'STOREY_NUMERIC', 'DATE_IDX', 'TOWN_ANG MO KIO', 'TOWN_BEDOK', 'TOWN_BISHAN', 'TOWN_BUKIT BATOK', 'TOWN_BUKIT MERAH', 'TOWN_BUKIT PANJANG', 'TOWN_BUKIT TIMAH', 'TOWN_CENTRAL AREA', 'TOWN_CHOA CHU KANG', 'TOWN_CLEMENTI', 'TOWN_GEYLANG', 'TOWN_HOUGANG', 'TOWN_JURONG EAST', 'TOWN_JURONG WEST', 'TOWN_KALLANG/WHAMPOA']
   FLOOR_AREA_SQM  RESALE_PRICE  AGE  STOREY_NUMERIC  DATE_IDX  \
0              59     12.842652   50               8     24150   
1              65     12.906694   50               8     24150   
2              65     12.945629   50               8     24150   
3              65     12.985400   50               8     24150   
4              68     12.994532   49               8     24150   

   TOWN_ANG MO KIO  TOWN_BEDOK  TOWN_BISHAN  TOWN_BUKIT BATOK  \
0                0           0            0                 0 

In [25]:
# =================================================================================================
# Training on Random Forest GridSearch and XGBoost GridSearch to find best result
# =================================================================================================

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import xgboost as xgb

pd.set_option('display.max_colwidth', None)

# ========================================================
# 2Ô∏è‚É£ Random Forest GridSearch
# ========================================================
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_param_grid = {
    'n_estimators': [500, 800, 1200],
    'max_depth': [25, 30, 40],
 #   'min_samples_split': [2, 5, 10]
    'min_samples_split': [5]
}

rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)
rf_grid.fit(X_train, y_train)

rf_results = pd.DataFrame(rf_grid.cv_results_).sort_values(by='mean_test_score', ascending=False)
print(f"{timestamp}\nüìä Random Forest Top Results:")
print(rf_results[['mean_test_score','std_test_score','params']].head())

# ========================================================
# 3Ô∏è‚É£ XGBoost GridSearch (without early stopping)
# ========================================================
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    tree_method='hist',
    random_state=42
)

xgb_param_grid = {
#    'n_estimators': [800, 1200, 1500],
    'n_estimators': [1200],
    'max_depth': [6],
#    'max_depth': [6, 10, 12],
#    'learning_rate': [0.01, 0.05, 0.1],
     'learning_rate': [0.1, 0.21],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0,1.4]
}

xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)
xgb_grid.fit(X_train, y_train)

xgb_results = pd.DataFrame(xgb_grid.cv_results_).sort_values(by='mean_test_score', ascending=False)
print(f"{timestamp}\nüìä XGBoost Top Results:")
print(xgb_results[['mean_test_score','std_test_score','params']].head())

# ========================================================
# 4Ô∏è‚É£ Combine Top Results
# ========================================================
rf_top = rf_results[['mean_test_score','std_test_score','params']].head().copy()
rf_top['model'] = 'RandomForest'

xgb_top = xgb_results[['mean_test_score','std_test_score','params']].head().copy()
xgb_top['model'] = 'XGBoost'

combined_top = pd.concat([rf_top, xgb_top]).sort_values(by='mean_test_score', ascending=False).reset_index(drop=True)
print("\nüèÜ Combined Top 5 Results (RF + XGB):")
print(combined_top)

# ========================================================
# 5Ô∏è‚É£ Print best params
# ========================================================
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"\nüìå Grid Search completed at {timestamp}")

print(f"{timestamp}\n‚úÖ Best Random Forest params:")
print(rf_grid.best_params_)
print(f"R¬≤: {rf_grid.best_score_:.4f}")

timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"{timestamp}\n‚úÖ Best XGBoost params:")
print(xgb_grid.best_params_)
print(f"R¬≤: {xgb_grid.best_score_:.4f}")

# ========================================================
# 6Ô∏è‚É£ Retrain XGBoost best model with Early Stopping
# ========================================================
xgb_best_params = xgb_grid.best_params_
xgb_best = xgb.XGBRegressor(
    **xgb_best_params,
    objective='reg:squarederror',
    n_jobs=-1,
    tree_method='hist',
    random_state=42
)

# Early stopping on validation set
xgb_best.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=20,
    verbose=True
)

# ========================================================
# 7Ô∏è‚É£ Evaluate function
# ========================================================
def evaluate(model, X, y, label="Model"):
    preds = np.expm1(model.predict(X))
    y_true = np.expm1(y)
    rmse = np.sqrt(mean_squared_error(y_true, preds))
    mae = mean_absolute_error(y_true, preds)
    r2 = r2_score(y_true, preds)
    print(f"\nüìä {label} Test Metrics:")
    print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R¬≤: {r2:.4f}")

# Evaluate
evaluate(rf_grid.best_estimator_, X_test, y_test, label="Random Forest")
evaluate(xgb_best, X_test, y_test, label="XGBoost with Early Stopping")

Fitting 3 folds for each of 9 candidates, totalling 27 fits
11:08:45
üìä Random Forest Top Results:
   mean_test_score  std_test_score  \
4         0.857309        0.001226   
5         0.857307        0.001092   
7         0.857258        0.001224   
8         0.857254        0.001076   
3         0.857198        0.001287   

                                                            params  
4   {'max_depth': 30, 'min_samples_split': 5, 'n_estimators': 800}  
5  {'max_depth': 30, 'min_samples_split': 5, 'n_estimators': 1200}  
7   {'max_depth': 40, 'min_samples_split': 5, 'n_estimators': 800}  
8  {'max_depth': 40, 'min_samples_split': 5, 'n_estimators': 1200}  
3   {'max_depth': 30, 'min_samples_split': 5, 'n_estimators': 500}  
Fitting 3 folds for each of 12 candidates, totalling 36 fits


12 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sit\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sit\AppData\Roaming\Python\Python312\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\sit\AppData\Roaming\Python\Python312\site-packages\xgboost\sklearn.py", line 1247, in fit
    self._Booster = train(
                    ^^^^^^
  File "C:\Users\sit\AppData\Roaming\Python\Python312\site-packages\xgboost\cor

11:08:45
üìä XGBoost Top Results:
   mean_test_score  std_test_score  \
1         0.889657        0.002858   
5         0.887185        0.002575   
0         0.886666        0.002386   
4         0.883848        0.002466   
3         0.879793        0.003004   

                                                                                                     params  
1   {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1200, 'subsample': 1.0}  
5   {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1200, 'subsample': 1.0}  
0   {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1200, 'subsample': 0.8}  
4   {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 1200, 'subsample': 0.8}  
3  {'colsample_bytree': 0.8, 'learning_rate': 0.21, 'max_depth': 6, 'n_estimators': 1200, 'subsample': 1.0}  

üèÜ Combined Top 5 Results (RF + XGB):
   mean_test_score  std_test_score 

TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [26]:
from datetime import datetime
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ========================================================
# üîç Training GridSearchCV for LightGBM and CatBoost
# ========================================================
print(f"\n‚è≥ Starting GridSearch for LightGBM at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

lgb_model = LGBMRegressor(objective='regression', random_state=42, n_jobs=-1)

lgb_param_grid = {
#    'n_estimators': [500, 800],
    'n_estimators': [800, 1200],
    'max_depth': [6, 12],
#    'learning_rate': [0.01, 0.05, 0.1],
    'learning_rate': [0.1],
#    'subsample': [0.8, 1.0],
#    'colsample_bytree': [0.8, 1.0]
    'subsample': [0.3, 0.8],
    'colsample_bytree': [0.3, 0.8]
}

lgb_grid = GridSearchCV(
    estimator=lgb_model,
    param_grid=lgb_param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

lgb_grid.fit(X_train, y_train)

print(f"\n‚úÖ LightGBM GridSearch completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# --------------------------------------------------------
print(f"\n‚è≥ Starting GridSearch for CatBoost at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

cat_model = CatBoostRegressor(
    verbose=0,
    random_state=42,
    loss_function='RMSE'
)

cat_param_grid = {
#    'iterations': [500, 800],
    'iterations': [800,1200],
    'depth': [6, 10],
#    'learning_rate': [0.01, 0.05, 0.1],
    'learning_rate': [0.1],
#    'l2_leaf_reg': [3, 5]
     'l2_leaf_reg': [3, 8]
}

cat_grid = GridSearchCV(
    estimator=cat_model,
    param_grid=cat_param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

cat_grid.fit(X_train, y_train)

print(f"\n‚úÖ CatBoost GridSearch completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# --------------------------------------------------------
# Results Summary
# --------------------------------------------------------
print(f"\nüìå Final Results Summary at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

print(f"\n‚úÖ Best LightGBM params:")
print(lgb_grid.best_params_)
print(f"R¬≤: {lgb_grid.best_score_:.4f}")

print(f"\n‚úÖ Best CatBoost params:")
print(cat_grid.best_params_)
print(f"R¬≤: {cat_grid.best_score_:.4f}")

# --------------------------------------------------------
# Evaluation Function
# --------------------------------------------------------
def evaluate(model, X, y, label="Model"):
    preds = np.expm1(model.predict(X))
    y_true = np.expm1(y)
    rmse = np.sqrt(mean_squared_error(y_true, preds))
    mae = mean_absolute_error(y_true, preds)
    r2 = r2_score(y_true, preds)
    print(f"\nüìä {label} Test Metrics at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:")
    print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R¬≤: {r2:.4f}")

# --------------------------------------------------------
# Final Evaluation on Test Set
# --------------------------------------------------------
evaluate(lgb_grid.best_estimator_, X_test, y_test, label="LightGBM")
evaluate(cat_grid.best_estimator_, X_test, y_test, label="CatBoost")



‚è≥ Starting GridSearch for LightGBM at 2025-09-15 11:16:03
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000710 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 421
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 34
[LightGBM] [Info] Start training from score 13.077856

‚úÖ LightGBM GridSearch completed at 2025-09-15 11:17:09

‚è≥ Starting GridSearch for CatBoost at 2025-09-15 11:17:09
Fitting 3 folds for each of 8 candidates, totalling 24 fits

‚úÖ CatBoost GridSearch completed at 2025-09-15 11:18:32

üìå Final Results Summary at 2025-09-15 11:18:32

‚úÖ Best LightGBM params:
{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 800, 'subsample': 0.3}
R¬≤: 0.8853

‚úÖ Best CatBoost params:
{'depth': 6, 'iterations': 1200, 'l2_leaf_reg': 3, 'learning_rate': 0.1}
R¬≤: 0

In [27]:
# ========================================================
# üîç  Weighted Ensemble Evaluation for XGB, LightGBM, CatBoost
# ========================================================

import xgboost as xgb
from datetime import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print(f"\n‚è≥ Starting Weighted Ensemble Evaluation at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# --------------------------------------------------------
# Convert to DMatrix
# --------------------------------------------------------
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest  = xgb.DMatrix(X_test)

# --------------------------------------------------------
# Set Parameters
# --------------------------------------------------------
xgb_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.1,
    'max_depth': 6,
    'n_estimators': 1200,  # used in num_boost_round
    'subsample': 1.0,
    'colsample_bytree': 0.8,
    'eval_metric': 'rmse',
    'seed': 42
}

# --------------------------------------------------------
# Train with Early Stopping
# --------------------------------------------------------
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1200,
    evals=[(dvalid, 'validation')],
    early_stopping_rounds=20,
    verbose_eval=False
)

# --------------------------------------------------------
# Individual Predictions (reverse log-transform)
# --------------------------------------------------------
pred_cat = np.expm1(cat_grid.best_estimator_.predict(X_test))
pred_xgb = np.expm1(xgb_model.predict(dtest))
pred_lgb = np.expm1(lgb_grid.best_estimator_.predict(X_test))

# --------------------------------------------------------
# Weighted Ensemble Prediction
# --------------------------------------------------------
ensemble_pred = (0.7 * pred_cat + 0.2 * pred_xgb + 0.1 * pred_lgb)
y_true = np.expm1(y_test)

# --------------------------------------------------------
# Evaluation
# --------------------------------------------------------
rmse = np.sqrt(mean_squared_error(y_true, ensemble_pred))
mae = mean_absolute_error(y_true, ensemble_pred)
r2 = r2_score(y_true, ensemble_pred)

print(f"\nüìä Weighted Ensemble Test Metrics at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:")
print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R¬≤: {r2:.4f}")








‚è≥ Starting Weighted Ensemble Evaluation at 2025-09-15 11:18:49


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()



üìä Weighted Ensemble Test Metrics at 2025-09-15 11:18:51:
RMSE: 51,771.76 | MAE: 35,828.82 | R¬≤: 0.9079


In [36]:
# ========================================================
# Validating model with separate data
# ========================================================

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from datetime import datetime
import xgboost as xgb

print(f"\n‚è≥ Starting Validation on df_valid at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# --------------------------------------------------------
# Prepare validation features and target
# --------------------------------------------------------
X_valid_eval = df_valid.drop(columns=['RESALE_PRICE', 'price_bin'], errors='ignore')
y_valid_eval = df_valid['RESALE_PRICE']

# Ensure numeric
X_valid_eval = X_valid_eval.apply(pd.to_numeric, errors='coerce').fillna(0)

# Convert to DMatrix for XGBoost
dvalid_eval = xgb.DMatrix(X_valid_eval)

# --------------------------------------------------------
# Individual Predictions (reverse log-transform)
# --------------------------------------------------------
pred_cat_val = np.expm1(cat_grid.best_estimator_.predict(X_valid_eval))
pred_xgb_val = np.expm1(xgb_model.predict(dvalid_eval))
pred_lgb_val = np.expm1(lgb_grid.best_estimator_.predict(X_valid_eval))

# --------------------------------------------------------
# Weighted Ensemble Prediction
# --------------------------------------------------------
ensemble_pred_val = (0.4 * pred_cat_val + 0.3 * pred_xgb_val + 0.3 * pred_lgb_val)
y_true_val = np.expm1(y_valid_eval)

# --------------------------------------------------------
# Evaluation
# --------------------------------------------------------
rmse_val = np.sqrt(mean_squared_error(y_true_val, ensemble_pred_val))
mae_val = mean_absolute_error(y_true_val, ensemble_pred_val)
r2_val = r2_score(y_true_val, ensemble_pred_val)

print(f"\nüìä Weighted Ensemble Validation Metrics at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:")
print(f"RMSE: {rmse_val:,.2f} | MAE: {mae_val:,.2f} | R¬≤: {r2_val:.4f}")



‚è≥ Starting Validation on df_valid at 2025-09-12 14:36:32

üìä Weighted Ensemble Validation Metrics at 2025-09-12 14:36:33:
RMSE: 47,827.53 | MAE: 32,419.19 | R¬≤: 0.9180


In [3]:
# ========================================================
# üîç Training CatBoost with GridSearchCV
# ========================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostRegressor

# 1. Load and preprocess data
df = pd.read_csv('raw_data_main.csv')

# Optional: sample smaller subset for quick experiments
df = df.sample(10000, random_state=42)

# Create DATE_IDX (optional)
df['DATE_IDX'] = df['YEAR'] * 12 + df['MONTH_NUM']

# Log-transform target
df['RESALE_PRICE'] = np.log1p(df['RESALE_PRICE'])

# Drop unwanted columns BEFORE feature prep
drop_cols = ['IS_OUTLIERS', 'STOREY_RANGE', 'PRICE_PER_SQM', 'YEAR', 'MONTH_NUM', 'PRICE_TIER', 'SEASON', 'AGE_GROUP']
df = df.drop(columns=drop_cols, errors='ignore')

# 2. Split data
X = df.drop(columns=['RESALE_PRICE'])
y = df['RESALE_PRICE']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 3. Convert categorical features (assumes these columns are categorical but NOT one-hot encoded)
cat_features = ['TOWN', 'FLAT_TYPE']  # Add other categorical feature names here as needed

for col in cat_features:
    X_train[col] = X_train[col].astype('category')
    X_valid[col] = X_valid[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# 4. Define CatBoost model & parameter grid
cat_model = CatBoostRegressor(
    verbose=0,
    random_state=42,
    loss_function='RMSE'
)
'''
cat_param_grid = {
    'iterations': [800, 1200, 1600],
    'depth': [6, 10],
    'learning_rate': [0.03, 0.05, 0.1],
    'l2_leaf_reg': [3, 8, 10],
    'bagging_temperature': [0, 1],
    'border_count': [64, 128, 254],
    'grow_policy': ['SymmetricTree', 'Depthwise']
}
'''
cat_param_grid = {
    'iterations': [1200,1600],
    'depth': [6,10],
    'learning_rate': [0.05,0.1],
    'l2_leaf_reg': [3],
    'bagging_temperature': [0],
    'border_count': [128,254],
    'grow_policy': ['SymmetricTree']
}
# 5. Grid Search CV
cat_grid = GridSearchCV(
    estimator=cat_model,
    param_grid=cat_param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

cat_grid.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

print(f"\n‚úÖ Best CatBoost params: {cat_grid.best_params_}")
print(f"R¬≤: {cat_grid.best_score_:.4f}")



Fitting 3 folds for each of 16 candidates, totalling 48 fits

‚úÖ Best CatBoost params: {'bagging_temperature': 0, 'border_count': 128, 'depth': 6, 'grow_policy': 'SymmetricTree', 'iterations': 1600, 'l2_leaf_reg': 3, 'learning_rate': 0.05}
R¬≤: 0.8793


In [2]:
# ========================================================
# üîç Bayesian Optimization + Train CatBoost + Evaluate
# ========================================================
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
import numpy as np
from datetime import datetime
import json
import joblib
import pandas as pd

# -----------------------------
# Ensure X_train / X_valid are DataFrames
# -----------------------------
if not isinstance(X_train, pd.DataFrame):
    X_train = pd.DataFrame(X_train, columns=X_columns)
if not isinstance(X_valid, pd.DataFrame):
    X_valid = pd.DataFrame(X_valid, columns=X_columns)

# -----------------------------
# Detect categorical columns automatically
# -----------------------------
# Replace with your actual categorical types if needed
cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
print("Detected categorical features:", cat_features)

# -----------------------------
# Hyperparameter space
# -----------------------------
param_space = [
    Integer(800, 1600, name='iterations'),
    Integer(6, 10, name='depth'),
    Real(0.06, 0.12, name='learning_rate'),
    Integer(3, 6, name='l2_leaf_reg'),
    Real(0, 1, name='bagging_temperature'),
    Integer(64, 254, name='border_count'),
    Categorical(['SymmetricTree', 'Depthwise'], name='grow_policy')
]

# -----------------------------
# Objective function with debug
# -----------------------------
@use_named_args(param_space)
def objective(**params):
    print("\n--- New Iteration ---")
    print("Params:", params)
    print("X_train shape:", X_train.shape, "X_valid shape:", X_valid.shape)
    print("cat_features:", cat_features)

    model = CatBoostRegressor(
        verbose=0,
        random_state=42,
        loss_function='RMSE',
        **params
    )

    model.fit(
        X_train, y_train,
        cat_features=cat_features,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=50
    )

    preds = model.predict(X_valid)
    r2 = r2_score(y_valid, preds)
    print("R¬≤:", r2)
    return -r2  # minimize negative R¬≤

# -----------------------------
# Run Bayesian Optimization
# -----------------------------
print("üîç Running Bayesian Optimization for CatBoost...")
start_time = datetime.now()

results = gp_minimize(
    func=objective,
    dimensions=param_space,
    n_calls=30,
    n_initial_points=5,
    random_state=42,
    verbose=0
)

end_time = datetime.now()
print(f"\n‚è±Ô∏è Optimization finished in {end_time - start_time}")

# -----------------------------
# Extract best parameters
# -----------------------------
best_r2 = -results.fun
best_params = {dim.name: val for dim, val in zip(param_space, results.x)}
best_params['r2'] = best_r2

# Convert NumPy types to native Python types
best_params_clean = {
    k: float(v) if isinstance(v, (np.float32, np.float64))
    else int(v) if isinstance(v, (np.int32, np.int64))
    else v
    for k, v in best_params.items()
}

# Save best parameters
with open("best_catboost_params.json", "w") as f:
    json.dump(best_params_clean, f, indent=4)
print("üìÅ Saved best parameters to 'best_catboost_params.json'")

# -


Detected categorical features: []
üîç Running Bayesian Optimization for CatBoost...

--- New Iteration ---
Params: {'iterations': 1437, 'depth': 7, 'learning_rate': 0.10678146001636617, 'l2_leaf_reg': 5, 'bagging_temperature': 0.44583275285359125, 'border_count': 83, 'grow_policy': 'SymmetricTree'}
X_train shape: (12000, 35) X_valid shape: (4000, 35)
cat_features: []
R¬≤: 0.9354538710284577

--- New Iteration ---
Params: {'iterations': 1067, 'depth': 7, 'learning_rate': 0.09905330837693117, 'l2_leaf_reg': 3, 'bagging_temperature': 0.7219987722668249, 'border_count': 242, 'grow_policy': 'SymmetricTree'}
X_train shape: (12000, 35) X_valid shape: (4000, 35)
cat_features: []
R¬≤: 0.9359597123371913

--- New Iteration ---
Params: {'iterations': 1594, 'depth': 8, 'learning_rate': 0.09669918962929686, 'l2_leaf_reg': 3, 'bagging_temperature': 0.02306242504141576, 'border_count': 164, 'grow_policy': 'SymmetricTree'}
X_train shape: (12000, 35) X_valid shape: (4000, 35)
cat_features: []
R¬≤: 0.9

In [40]:
def evaluate(model, X, y, label="Model"):
    preds = np.expm1(model.predict(X))
    y_true = np.expm1(y)
    rmse = np.sqrt(mean_squared_error(y_true, preds))
    mae = mean_absolute_error(y_true, preds)
    r2 = r2_score(y_true, preds)
    print(f"\nüìä {label} Test Metrics at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}:")
    print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R¬≤: {r2:.4f}")


In [42]:
# ========================================================
# üß† Load best CatBoost parameters and train model
# ========================================================

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import json
import numpy as np

# 1. Load best parameters from JSON
with open("best_catboost_params.json", "r") as f:
    best_params = json.load(f)

# Remove R¬≤ score from params dict if it exists
best_params.pop('r2', None)

# 2. Train the model on X_train
model = CatBoostRegressor(
    verbose=1000,
    random_state=42,
    loss_function='RMSE',
    **best_params
)

model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=50
)

# 3. Predict on validation set
y_pred = model.predict(X_valid)

# 4. Evaluate model
evaluate(model, X_valid, y_valid, label="CatBoost Validation")

print(f"\nüìä Validation Results")
print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R¬≤: {r2:.4f}")

# 5. (Optional) Save the trained model
joblib.dump(model, "catboost_model_valid.pkl")
print("‚úÖ Trained CatBoost model saved to 'catboost_model_valid.pkl'")


0:	learn: 0.3087147	test: 0.3051864	best: 0.3051864 (0)	total: 116ms	remaining: 3m 5s
1000:	learn: 0.0721279	test: 0.0887405	best: 0.0887405 (1000)	total: 1m 47s	remaining: 1m 4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.08853360709
bestIteration = 1318

Shrink model to first 1319 iterations.

üìä CatBoost Validation Test Metrics at 2025-09-15 12:18:30:
RMSE: 47,910.55 | MAE: 32,354.26 | R¬≤: 0.9177

üìä Validation Results
RMSE: 51,771.76 | MAE: 35,828.82 | R¬≤: 0.9079
‚úÖ Trained CatBoost model saved to 'catboost_model_valid.pkl'


In [44]:
# save the list of features actually used in the model
joblib.dump(features_used, "CatBoost_features_used.pkl")

['CatBoost_features_used.pkl']

In [47]:
# ========================================================
# Download trained model on ensemble_model (TRIED, discard because too complex)
# ========================================================
import pickle

ensemble_model = {
    "catboost": cat_model,
    "xgboost_native": xgb_native_model,
    "lightgbm": lgb_model,
    "weights": {
        "catboost": 0.7,
        "xgboost_native": 0.2,
        "lightgbm": 0.1
    },
    "feature_names": list(X_train.columns)  # Save feature order
}

with open("E_Price_model.pkl", "wb") as f:
    pickle.dump(ensemble_model, f)

print("‚úÖ Saved weighted ensemble to 'E_Price_model.pkl'")


‚úÖ Saved weighted ensemble to 'E_Price_model.pkl'


In [1]:
# ========================================================
# üß† Train with different years and data size combination with best CatBoost parameters 
# ========================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import json

# ========================================================
# 1. Load data & Filter by year
# ========================================================
df = pd.read_csv('raw_data_main.csv')

# --- Filter data for years 2020 to 2025 ---
# df = df[(df['YEAR'] >= 2015) & (df['YEAR'] <= 2025)].copy()
# The .copy() ensures we're working on a new DataFrame to avoid a SettingWithCopyWarning.

# Exclude outliers (remove rows where IS_OUTLIERS = 1)
df = df[df['IS_OUTLIERS'] != 1]

# Create DATE_IDX (optional)
df['DATE_IDX'] = df['YEAR'] * 12 + df['MONTH_NUM']

# Log-transform target
df['RESALE_PRICE'] = np.log1p(df['RESALE_PRICE'])

# --------------------------------------------------------
# 2. Drop unwanted columns BEFORE preparing features
# --------------------------------------------------------
drop_cols = ['IS_OUTLIERS', 'STOREY_RANGE', 'PRICE_PER_SQM', 'MONTH_NUM','PRICE_TIER','SEASON','AGE_GROUP']
df = df.drop(columns=drop_cols, errors='ignore')

# Define categorical variables to encode
categorical_cols = ['TOWN', 'FLAT_TYPE']
categorical_cols = [col for col in categorical_cols if col in df.columns]

print(f"üìå Applying one-hot encoding on: {categorical_cols}")
df = pd.get_dummies(df, columns=categorical_cols, dtype=int)
print("‚úÖ One-hot encoding complete.")
print("üìå Encoded columns preview:", df.columns.tolist()[:20])
print(df.head())

# Optional: sample smaller subset for quick experiments
#df = df.sample(min(100000, len(df)), random_state=42)
# Updated to handle datasets smaller than 100k

# Create bin for stratified sampling
df['price_bin'] = pd.qcut(df['RESALE_PRICE'], q=4, labels=False)

# --------------------------------------------------------
# 3. Train / Validation / Test split
# --------------------------------------------------------
df_trainval, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['price_bin'],
    random_state=42
)

df_train, df_valid = train_test_split(
    df_trainval,
    test_size=0.25,
    stratify=df_trainval['price_bin'],
    random_state=42
)

# Drop helper column used for stratification
df_train = df_train.drop(columns=['price_bin'])
df_valid = df_valid.drop(columns=['price_bin'])
df_test = df_test.drop(columns=['price_bin'])

# ========================================================
# 4. Prepare features and target (no further dropping needed)
# ========================================================
X_train = df_train.drop(columns=['RESALE_PRICE'])
y_train = df_train['RESALE_PRICE']

X_valid = df_valid.drop(columns=['RESALE_PRICE'])
y_valid = df_valid['RESALE_PRICE']

X_test = df_test.drop(columns=['RESALE_PRICE'])
y_test = df_test['RESALE_PRICE']

# Ensure all numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_valid = X_valid.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

# Optional: sanity check
for name, dfX in [("train", X_train), ("valid", X_valid), ("test", X_test)]:
    print(f"‚úÖ {name} set numeric dtypes:", dfX.dtypes.value_counts())

timestamp = datetime.now().strftime("%H:%M:%S")
print(f"‚úÖ Data ready for training (train/valid/test) - {timestamp}")


# ========================================================
# 5. Load best CatBoost parameters and train model
# ========================================================
'''
def evaluate(model, X, y, label=""):
    """Helper function to evaluate the model and print metrics."""
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    
    print(f"\nüìä {label} Results")
    print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R¬≤: {r2:.4f}")
    return rmse, mae, r2
'''
def evaluate_actual_scale(model, X, y_log, label=""):
    """Evaluate model predictions in original price scale."""
    # Inverse transform
    y_pred_log = model.predict(X)
    y_pred = np.expm1(y_pred_log)
    y_true = np.expm1(y_log)

    # Metrics
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\nüìä {label} Results (Actual Price Scale)")
    print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R¬≤: {r2:.4f}")
    return rmse, mae, r2
    
# 1. Load best parameters from JSON
with open("best_catboost_params.json", "r") as f:
    best_params = json.load(f)

# Remove R¬≤ score from params dict if it exists
best_params.pop('r2', None)

# 2. Train the model on X_train
model = CatBoostRegressor(
    verbose=100, # Reduced verbosity for cleaner output
    random_state=42,
    loss_function='RMSE',
    **best_params
)

model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=50
)

# 3. Predict on validation set & Evaluate model
# evaluate(model, X_valid, y_valid, label="CatBoost Validation")

# Training set
evaluate_actual_scale(model, X_train, y_train, label="CatBoost Training")

# Validation set
evaluate_actual_scale(model, X_valid, y_valid, label="CatBoost Validation")

# Test set
evaluate_actual_scale(model, X_test, y_test, label="CatBoost Test")


# 4. Save the trained model
joblib.dump(model, "catboost_model_valid_test.pkl")
print("‚úÖ Trained CatBoost model saved to 'catboost_model_valid_test.pkl'")

üìå Applying one-hot encoding on: ['TOWN', 'FLAT_TYPE']
‚úÖ One-hot encoding complete.
üìå Encoded columns preview: ['FLOOR_AREA_SQM', 'RESALE_PRICE', 'AGE', 'YEAR', 'STOREY_NUMERIC', 'DATE_IDX', 'TOWN_ANG MO KIO', 'TOWN_BEDOK', 'TOWN_BISHAN', 'TOWN_BUKIT BATOK', 'TOWN_BUKIT MERAH', 'TOWN_BUKIT PANJANG', 'TOWN_BUKIT TIMAH', 'TOWN_CENTRAL AREA', 'TOWN_CHOA CHU KANG', 'TOWN_CLEMENTI', 'TOWN_GEYLANG', 'TOWN_HOUGANG', 'TOWN_JURONG EAST', 'TOWN_JURONG WEST']
   FLOOR_AREA_SQM  RESALE_PRICE  AGE  YEAR  STOREY_NUMERIC  DATE_IDX  \
0              91     12.971543   68  2023               2     24280   
1              74     12.971543   63  2023               8     24280   
2              84     13.012551   62  2023               5     24280   
3              84     13.017005   64  2023               5     24280   
4              89     13.023650   62  2023               2     24280   

   TOWN_ANG MO KIO  TOWN_BEDOK  TOWN_BISHAN  TOWN_BUKIT BATOK  ...  \
0                0           0       

In [12]:
# ========================================================
# üîÅ Compare Box-Cox vs Yeo-Johnson Target Transformations
# ========================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
import json

# Load data
df = pd.read_csv('raw_data_main.csv')

# Create DATE_IDX
df['DATE_IDX'] = df['YEAR'] * 12 + df['MONTH_NUM']

# Drop unwanted columns
drop_cols = ['IS_OUTLIERS', 'STOREY_RANGE', 'PRICE_PER_SQM', 'YEAR', 'MONTH_NUM','PRICE_TIER','SEASON','AGE_GROUP']
df = df.drop(columns=drop_cols, errors='ignore')

# One-hot encode categorical features
categorical_cols = ['TOWN', 'FLAT_TYPE']
categorical_cols = [col for col in categorical_cols if col in df.columns]
df = pd.get_dummies(df, columns=categorical_cols, dtype=int)

# Create stratification bin
df['price_bin'] = pd.qcut(df['RESALE_PRICE'], q=4, labels=False)

# Train/valid/test split
df_trainval, df_test = train_test_split(df, test_size=0.2, stratify=df['price_bin'], random_state=42)
df_train, df_valid = train_test_split(df_trainval, test_size=0.25, stratify=df_trainval['price_bin'], random_state=42)

# Drop bin column
for d in [df_train, df_valid, df_test]:
    d.drop(columns=['price_bin'], inplace=True)

# Prepare features and target
def prepare_X_y(df):
    X = df.drop(columns=['RESALE_PRICE'])
    y = df['RESALE_PRICE']
    X = X.apply(pd.to_numeric, errors='coerce').fillna(0)
    return X, y

X_train, y_train_raw = prepare_X_y(df_train)
X_valid, y_valid_raw = prepare_X_y(df_valid)
X_test, y_test_raw = prepare_X_y(df_test)

# Apply PowerTransformers
pt_boxcox = PowerTransformer(method='box-cox', standardize=False)
pt_yeojohnson = PowerTransformer(method='yeo-johnson', standardize=False)

# Box-Cox requires strictly positive values
y_train_bc = pt_boxcox.fit_transform(y_train_raw.values.reshape(-1, 1)).flatten()
y_valid_bc = pt_boxcox.transform(y_valid_raw.values.reshape(-1, 1)).flatten()
y_test_bc  = pt_boxcox.transform(y_test_raw.values.reshape(-1, 1)).flatten()

# Yeo-Johnson works with zero or negative values
y_train_yj = pt_yeojohnson.fit_transform(y_train_raw.values.reshape(-1, 1)).flatten()
y_valid_yj = pt_yeojohnson.transform(y_valid_raw.values.reshape(-1, 1)).flatten()
y_test_yj  = pt_yeojohnson.transform(y_test_raw.values.reshape(-1, 1)).flatten()

# Load best CatBoost parameters
with open("best_catboost_params.json", "r") as f:
    best_params = json.load(f)
best_params.pop('r2', None)

# Evaluation function
def evaluate_inverse(model, X, y_transformed, transformer, label=""):
    y_pred_transformed = model.predict(X)
    y_pred = transformer.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()
    y_true = transformer.inverse_transform(y_transformed.reshape(-1, 1)).flatten()
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\nüìä {label} Results (Actual Price Scale)")
    print(f"RMSE: {rmse:,.2f} | MAE: {mae:,.2f} | R¬≤: {r2:.4f}")
    return rmse, mae, r2

# Train and evaluate Box-Cox model
model_bc = CatBoostRegressor(verbose=0, random_state=42, loss_function='RMSE', **best_params)
model_bc.fit(X_train, y_train_bc, eval_set=(X_valid, y_valid_bc), early_stopping_rounds=50)

evaluate_inverse(model_bc, X_train, y_train_bc, pt_boxcox, label="Box-Cox Training")
evaluate_inverse(model_bc, X_valid, y_valid_bc, pt_boxcox, label="Box-Cox Validation")
evaluate_inverse(model_bc, X_test,  y_test_bc,  pt_boxcox, label="Box-Cox Test")

# Train and evaluate Yeo-Johnson model
model_yj = CatBoostRegressor(verbose=0, random_state=42, loss_function='RMSE', **best_params)
model_yj.fit(X_train, y_train_yj, eval_set=(X_valid, y_valid_yj), early_stopping_rounds=50)

evaluate_inverse(model_yj, X_train, y_train_yj, pt_yeojohnson, label="Yeo-Johnson Training")
evaluate_inverse(model_yj, X_valid, y_valid_yj, pt_yeojohnson, label="Yeo-Johnson Validation")
evaluate_inverse(model_yj, X_test,  y_test_yj,  pt_yeojohnson, label="Yeo-Johnson Test")



üìä Box-Cox Training Results (Actual Price Scale)
RMSE: 38,975.50 | MAE: 26,126.16 | R¬≤: 0.9470

üìä Box-Cox Validation Results (Actual Price Scale)
RMSE: 45,499.71 | MAE: 30,309.41 | R¬≤: 0.9273

üìä Box-Cox Test Results (Actual Price Scale)
RMSE: 45,632.29 | MAE: 30,464.16 | R¬≤: 0.9276

üìä Yeo-Johnson Training Results (Actual Price Scale)
RMSE: 38,932.62 | MAE: 26,102.33 | R¬≤: 0.9471

üìä Yeo-Johnson Validation Results (Actual Price Scale)
RMSE: 45,452.76 | MAE: 30,271.14 | R¬≤: 0.9275

üìä Yeo-Johnson Test Results (Actual Price Scale)
RMSE: 45,579.65 | MAE: 30,435.70 | R¬≤: 0.9277


(45579.653289440546, 30435.696462975953, 0.9277205101104038)