In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
data = pd.read_csv("Boston House Prices Dataset.csv")

# Handling Outliers
def cap_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])

for column in ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']:
    cap_outliers(data, column)

# Feature Scaling - Normalization:
X = data.drop(columns=['MEDV'])
y = data['MEDV']
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
data_normalized = pd.DataFrame(X_normalized, columns=X.columns)
data_normalized['MEDV'] = y

# Feature Selection:
selected_features = ['RM', 'PTRATIO', 'LSTAT', 'NOX', 'CRIM', 'TAX', 'INDUS']
X = data[selected_features]
y = data['MEDV']

# Split the Dataset into Training and Test Sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)

# Define Base Models for Stacking:
rf_model = RandomForestRegressor(n_estimators=200, max_depth=12, min_samples_split=4, min_samples_leaf=2, max_features='sqrt', bootstrap=True, random_state=52)

xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=4, subsample=0.85, colsample_bytree=0.8, random_state=52)

extra_trees_model = ExtraTreesRegressor(n_estimators=200, max_depth=12, min_samples_split=4, min_samples_leaf=2, random_state=52)

# Train base models
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
extra_trees_model.fit(X_train, y_train)

# Get predictions from the base models:
rf_preds_train = rf_model.predict(X_train)
xgb_preds_train = xgb_model.predict(X_train)
extra_trees_preds_train = extra_trees_model.predict(X_train)
train_preds = np.column_stack((rf_preds_train, xgb_preds_train, extra_trees_preds_train))

# Train the Meta-Model (Ridge Regression)
meta_model = Ridge(alpha=1.0)
meta_model.fit(train_preds, y_train)

# Make Final Predictions:
rf_preds_test = rf_model.predict(X_test)
xgb_preds_test = xgb_model.predict(X_test)
extra_trees_preds_test = extra_trees_model.predict(X_test)
test_preds = np.column_stack((rf_preds_test, xgb_preds_test, extra_trees_preds_test))
final_preds = meta_model.predict(test_preds)

# Evaluate performance
mae = mean_absolute_error(y_test, final_preds)
mse = mean_squared_error(y_test, final_preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, final_preds)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-Squared (R²): {r2}")


Mean Absolute Error (MAE): 1.7306658251846434
Mean Squared Error (MSE): 5.955971510511089
Root Mean Squared Error (RMSE): 2.4404859168843998
R-Squared (R²): 0.91457788981086
