In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from scipy.optimize import minimize

# Load the dataset
data = pd.read_csv("Boston House Prices Dataset.csv")

# Handling Outliers
def cap_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])

for column in ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']:
    cap_outliers(data, column)

# Feature Scaling - Normalization:
X = data.drop(columns=['MEDV'])
y = data['MEDV']
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
data_normalized = pd.DataFrame(X_normalized, columns=X.columns)
data_normalized['MEDV'] = y

# Feature Selection:
selected_features = ['RM', 'PTRATIO', 'LSTAT', 'NOX', 'CRIM', 'TAX', 'INDUS']
X = data[selected_features]
y = data['MEDV']

# Split the Dataset into Training and Test Sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)

# Define Base Models for Weighted Ensemble:
rf_model = RandomForestRegressor(n_estimators=700, max_depth=25, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', bootstrap=True, random_state=52)

xgb_model = XGBRegressor(n_estimators=700, learning_rate=0.015, max_depth=8, subsample=0.98, colsample_bytree=0.95, random_state=52)

catboost_model = CatBoostRegressor(iterations=700, learning_rate=0.015, depth=8, verbose=0, random_state=52)

# Train base models
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
catboost_model.fit(X_train, y_train)

# Get predictions from the base models:
rf_preds_test = rf_model.predict(X_test)
xgb_preds_test = xgb_model.predict(X_test)
catboost_preds_test = catboost_model.predict(X_test)
predictions = np.column_stack((rf_preds_test, xgb_preds_test, catboost_preds_test))

# Function to optimize weights using a combination of MAE and RMSE
def loss_function(weights):
    weighted_pred = np.dot(predictions, weights)
    return 0.5 * mean_absolute_error(y_test, weighted_pred) + 0.5 * np.sqrt(mean_squared_error(y_test, weighted_pred))

# Initial weights
def constraint(weights):
    return np.sum(weights) - 1

initial_weights = [1/3, 1/3, 1/3]
bounds = [(0,1)] * 3
constraints = ({'type': 'eq', 'fun': constraint})

# Optimize weights
result = minimize(loss_function, initial_weights, bounds=bounds, constraints=constraints, method='SLSQP', options={'maxiter': 500})
best_weights = result.x

# Final weighted prediction
final_preds = np.dot(predictions, best_weights)

# Evaluate performance
mae = mean_absolute_error(y_test, final_preds)
mse = mean_squared_error(y_test, final_preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, final_preds)

print(f"Optimized Weights: {best_weights}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-Squared (R²): {r2}")


Optimized Weights: [2.28169366e-17 6.29963438e-01 3.70036562e-01]
Mean Absolute Error (MAE): 1.7428445582902345
Mean Squared Error (MSE): 6.149701266140147
Root Mean Squared Error (RMSE): 2.4798591222366135
R-Squared (R²): 0.9117993666928338
