In [5]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import randint, uniform

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, PowerTransformer
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.inspection import permutation_importance
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
DATA_PATH = "D:\Ai_machine_learning_deep_learning_air_university_lab_islamabad\data\Real_Estate.csv"  
TARGET_COL = "House price of unit area"
RANDOM_STATE = 42

In [3]:
RF_N_ITER = 16
HGB_N_ITER = 16

In [6]:
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at: {DATA_PATH}. Update DATA_PATH variable.")
df = pd.read_csv(DATA_PATH)
print("Loaded:", DATA_PATH, "shape:", df.shape)

Loaded: D:\Ai_machine_learning_deep_learning_air_university_lab_islamabad\data\Real_Estate.csv shape: (414, 7)


In [7]:
df.head()

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725
2,2012-09-05 01:10:52.349449,1.1,1978.671,10,25.00385,121.528336,26.694267
3,2012-09-05 13:26:01.189083,22.2,1055.067,5,24.962887,121.482178,38.091638
4,2012-09-06 08:29:47.910523,8.5,967.4,6,25.011037,121.479946,21.65471


In [8]:
df.drop(columns=["Transaction date"], inplace=True)

In [9]:
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in data. Columns: {list(df.columns)}")
y = df[TARGET_COL].copy()
X = df.drop(columns=[TARGET_COL]).copy()

In [10]:
print("Missing per column:\n", X.isnull().sum())

Missing per column:
 House age                              0
Distance to the nearest MRT station    0
Number of convenience stores           0
Latitude                               0
Longitude                              0
dtype: int64


In [11]:
z = np.abs(stats.zscore(y))
outlier_mask = z > 3
print("Outlier (target) count to remove:", int(outlier_mask.sum()))
if outlier_mask.sum() > 0:
    X = X[~outlier_mask]
    y = y[~outlier_mask]

Outlier (target) count to remove: 0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RANDOM_STATE)
print("Train/test sizes:", X_train.shape, X_test.shape)

Train/test sizes: (331, 5) (83, 5)


In [13]:
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns:", num_cols)

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("power", PowerTransformer(method='yeo-johnson')),  # stabilizes skew, works with zeros/negatives
    ("scaler", RobustScaler())                          # robust to outliers
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols)
], remainder='drop')

Numeric columns: ['House age', 'Distance to the nearest MRT station', 'Number of convenience stores', 'Latitude', 'Longitude']


In [14]:
rf = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=1)
hgb = HistGradientBoostingRegressor(random_state=RANDOM_STATE)

In [15]:
kf = KFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)
def cv_r2_score(model):
    pipe = Pipeline([("pre", preprocessor), ("model", model)])
    return cross_val_score(pipe, X_train, y_train, scoring='r2', cv=kf, n_jobs=1).mean()

print("Baseline CV R2 (RandomForest):", cv_r2_score(rf))
print("Baseline CV R2 (HistGradientBoosting):", cv_r2_score(hgb))

Baseline CV R2 (RandomForest): nan
Baseline CV R2 (HistGradientBoosting): nan


In [16]:
rf_param_dist = {
    "model__n_estimators": randint(50, 400),
    "model__max_depth": randint(3, 25),
    "model__min_samples_split": randint(2, 10),
    "model__min_samples_leaf": randint(1, 6),
    "model__max_features": ['auto', 'sqrt', 'log2', 0.6, 0.8]
}

In [17]:
hgb_param_dist = {
    "model__max_iter": randint(50, 400),
    "model__max_depth": randint(2, 20),
    "model__learning_rate": uniform(0.01, 0.5),
    "model__min_samples_leaf": randint(1, 30),
    "model__l2_regularization": uniform(0.0, 2.0)
}

In [18]:
def randomized_search(pipe, param_dist, n_iter=20):
    rs = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=n_iter,
                            scoring='neg_mean_squared_error', cv=3, random_state=RANDOM_STATE, n_jobs=1, verbose=1)
    rs.fit(X_train, y_train)
    return rs

print("Running RandomizedSearch for RandomForest...")
rs_rf = randomized_search(Pipeline([("pre", preprocessor), ("model", rf)]), rf_param_dist, n_iter=RF_N_ITER)
print("RF best params:", rs_rf.best_params_, "best score (neg MSE):", rs_rf.best_score_)

print("Running RandomizedSearch for HistGradientBoosting...")
rs_hgb = randomized_search(Pipeline([("pre", preprocessor), ("model", hgb)]), hgb_param_dist, n_iter=HGB_N_ITER)
print("HGB best params:", rs_hgb.best_params_, "best score (neg MSE):", rs_hgb.best_score_)

Running RandomizedSearch for RandomForest...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
RF best params: {'model__max_depth': 10, 'model__max_features': 0.8, 'model__min_samples_leaf': 5, 'model__min_samples_split': 8, 'model__n_estimators': 171} best score (neg MSE): -126.77769385447759
Running RandomizedSearch for HistGradientBoosting...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
HGB best params: {'model__l2_regularization': 1.8771054180315003, 'model__learning_rate': 0.010389382920507164, 'model__max_depth': 2, 'model__max_iter': 363, 'model__min_samples_leaf': 22} best score (neg MSE): -131.27941414518838


In [19]:
best_rf_pipeline = rs_rf.best_estimator_
best_hgb_pipeline = rs_hgb.best_estimator_

In [20]:
best_rf_pipeline.fit(X_train, y_train)
best_hgb_pipeline.fit(X_train, y_train)

In [21]:
from sklearn.linear_model import Ridge
# Extract the trained models from pipelines to pass into StackingRegressor
best_rf_model = best_rf_pipeline.named_steps['model']
best_hgb_model = best_hgb_pipeline.named_steps['model']

stack = StackingRegressor(
    estimators=[('rf', best_rf_model), ('hgb', best_hgb_model)],
    final_estimator=Ridge(),
    passthrough=True,
    n_jobs=1
)

pipe_stack = Pipeline([("pre", preprocessor), ("model", stack)])
print("Training stacking ensemble...")
pipe_stack.fit(X_train, y_train)

Training stacking ensemble...


In [22]:
def evaluate(pipe):
    preds = pipe.predict(X_test)
    return {
        "r2": r2_score(y_test, preds),
        "mse": mean_squared_error(y_test, preds),
        "mae": mean_absolute_error(y_test, preds)
    }

evaluations = {
    "RF": evaluate(best_rf_pipeline),
    "HGB": evaluate(best_hgb_pipeline),
    "Stack": evaluate(pipe_stack)
}

print("\nEvaluation on test set:")
for name, metrics in evaluations.items():
    print(f"{name}: R2={metrics['r2']:.4f}, MSE={metrics['mse']:.4f}, MAE={metrics['mae']:.4f}")


Evaluation on test set:
RF: R2=0.5324, MSE=129.3644, MAE=9.7175
HGB: R2=0.5374, MSE=127.9792, MAE=9.6991
Stack: R2=0.5213, MSE=132.4190, MAE=9.8518


In [23]:
from sklearn.inspection import permutation_importance
prepped_X_test = preprocessor.transform(X_test)
perm = permutation_importance(pipe_stack.named_steps['model'], prepped_X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=1)
feat_names = num_cols
imp_df = pd.DataFrame({"feature": feat_names, "importance_mean": perm.importances_mean, "importance_std": perm.importances_std})
imp_df = imp_df.sort_values("importance_mean", ascending=False)
print("\nPermutation importances:\n", imp_df)


Permutation importances:
                                feature  importance_mean  importance_std
1  Distance to the nearest MRT station         0.796611        0.125534
2         Number of convenience stores         0.170542        0.045831
4                            Longitude         0.007509        0.010523
3                             Latitude         0.001476        0.008866
0                            House age        -0.002921        0.002562


In [24]:
best_name = max(evaluations.items(), key=lambda kv: kv[1]['r2'])[0]
if best_name == 'Stack':
    best_pipeline = pipe_stack
elif best_name == 'RF':
    best_pipeline = best_rf_pipeline
else:
    best_pipeline = best_hgb_pipeline

model_path = "best_model.joblib"
joblib.dump(best_pipeline, model_path)
print("\nSaved best model:", best_name, "->", model_path)
print("\nFinished.")


Saved best model: HGB -> best_model.joblib

Finished.
