# Training the Model

In [43]:
# Load the dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
df = pd.read_csv('housing_v2.csv')

# One Hot Encoding for 'bedroom_bins'
df = pd.get_dummies(df, columns=['bedroom_bins'], drop_first=True)

# Log-transform the target variable
df['log_median_house_value'] = np.log1p(df['median_house_value'])
pd.set_option('display.max_columns', None)


df = df.rename(columns={'ocean_proximity_<1H OCEAN': 'ocean_proximity_Less_1H OCEAN'})

# Experiminet:
# These features were dropped as they are already correlated to other features derived from these:
df = df.drop(columns=['total_rooms', 'total_bedrooms', 'population', 'households', 'households_per_room'])

df.head()

Unnamed: 0,longitude,latitude,housing_median_age,median_income,median_house_value,ocean_proximity_Less_1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,rooms_per_household,bedrooms_per_room,population_per_household,distance_to_major_city,distance_to_coast,rooms_income_interaction,population_per_bedroom,location_cluster,bedroom_bins_1-2,bedroom_bins_2-3,bedroom_bins_3-4,bedroom_bins_4-5,bedroom_bins_5-10,log_median_house_value
0,-1.327835,1.052548,0.982143,1.995505,452600.0,0,0,0,1,0,1.509871,-1.978498,0.192015,1.070796,-0.560872,1.095754,0.269186,1,0,0,0,1,0,13.022766
1,-1.322844,1.043185,-0.607019,1.98838,358500.0,0,0,0,1,0,-0.266995,-0.055321,-0.985206,1.056172,-0.582773,2.547903,-0.657595,1,0,0,0,0,1,12.789687
2,-1.332827,1.038503,1.856182,1.656444,352100.0,0,0,0,1,0,1.603113,-1.912304,0.266154,1.060883,-0.580436,1.17508,0.209788,1,0,0,0,0,1,12.771673
3,-1.337818,1.038503,1.856182,1.049948,341300.0,0,0,0,1,0,0.568638,-0.752705,-0.076961,1.065706,-0.575072,0.594707,-0.084189,1,0,0,0,0,1,12.74052
4,-1.337818,1.038503,1.856182,0.170631,342200.0,0,0,0,1,0,0.621044,-0.796494,-0.52512,1.065706,-0.575072,0.00919,-0.491822,1,0,0,0,0,1,12.743154


In [44]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np

# Split the data
X = df.drop(columns=['median_house_value', 'log_median_house_value'])
y = df['log_median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
linear_reg = LinearRegression()
gbr = GradientBoostingRegressor(random_state=42)
svr = SVR()
xgbr = xgb.XGBRegressor(objective ='reg:squarederror')
lasso = Lasso()
ridge = Ridge()
nn = MLPRegressor(random_state=42)  # Simple neural network for regression

# List of models and their names
models = [linear_reg, gbr, svr, xgbr, lasso, ridge, nn]
model_names = [
    "Linear Regression", "Gradient Boosting Regressor", "Support Vector Regression",
    "XGBoost Regressor", "Lasso Regression", "Ridge Regression", "Neural Networks"
]

# Function to train and predict using the model
def train_and_predict(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    return model.predict(X_test)

# Function to calculate errors
def calculate_errors(y_test, y_pred):
    mse = mean_squared_error(np.expm1(y_test), np.expm1(y_pred))
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))
    return mse, rmse, mae

# Iterate over models, train, predict, and display errors
for i, model in enumerate(models):
    y_pred = train_and_predict(model, X_train, y_train, X_test)
    mse, rmse, mae = calculate_errors(y_test, y_pred)
    print(f"{model_names[i]}:\nMSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\n{'-'*40}")



Linear Regression:
MSE: 7379448142.2924
RMSE: 85903.71436842764
MAE: 48590.14928022824
----------------------------------------
Gradient Boosting Regressor:
MSE: 3045955541.2924137
RMSE: 55190.17613028983
MAE: 36388.80343660589
----------------------------------------
Support Vector Regression:
MSE: 2971206773.471002
RMSE: 54508.77703151119
MAE: 35896.268625829325
----------------------------------------
XGBoost Regressor:
MSE: 2260840875.273453
RMSE: 47548.300445688415
MAE: 30138.388998342107
----------------------------------------
Lasso Regression:
MSE: 13890403317.506948
RMSE: 117857.5551990917
MAE: 87406.2216125053
----------------------------------------
Ridge Regression:
MSE: 8172221379.72651
RMSE: 90400.33948900031
MAE: 48824.69664989376
----------------------------------------
Neural Networks:
MSE: 3286588218.402244
RMSE: 57328.773041137414
MAE: 37341.62908136892
----------------------------------------


as we could see, XGBoost was the fastest, so will focus on it and fine tune it to make it faster:

In [39]:
# from sklearn.model_selection import RandomizedSearchCV

# # Define hyperparameters space for XGBoost
# xgbr_params = {
#     'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
#     'max_depth': [3, 4, 5, 6, 7, 8, 10],
#     'n_estimators': [50, 100, 150, 200, 300],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
#     'gamma': [0, 0.1, 0.2, 0.3, 0.5]
# }

# xgbr_search = RandomizedSearchCV(xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
#                                 param_distributions=xgbr_params,
#                                 n_iter=50,
#                                 scoring='neg_mean_squared_error',
#                                 n_jobs=-1,
#                                 cv=5,
#                                 verbose=3,
#                                 random_state=42)

# xgbr_search.fit(X_train, y_train)

# # Best hyperparameters for XGBoost
# xgbr_best_params = xgbr_search.best_params_

# print("Best hyperparameters for XGBoost Regressor:")
# print(xgbr_best_params)


### Code was commented as it takes time to come up with these results.

Result:

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best hyperparameters for XGBoost Regressor:
{'subsample': 0.8, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.9}

In [40]:
# Hyperparameters for the XGBoost model (based on the earlier optimization)
params = {
    'objective': 'reg:squarederror',
    'subsample': 0.8,
    'n_estimators': 200,
    'max_depth': 7,
    'learning_rate': 0.1,
    'gamma': 0,
    'colsample_bytree': 0.9
}


# Train the XGBoost model with the best hyperparameters
best_xgbr = xgb.XGBRegressor(**params, random_state=42)
best_xgbr.fit(X_train, y_train)

# Predict on the test set
y_pred = best_xgbr.predict(X_test)

# Revert the log transformation for predictions and test values
y_pred_original = np.expm1(y_pred)
y_test_original = np.expm1(y_test)

# Compute the MAE and RMSE on the original scale
mse = mean_squared_error(y_test_original, y_pred_original)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_original, y_pred_original)

print(f"Improved XGBoost Regressor:\nMSE: {mse}\nRMSE: {rmse}\nMAE: {mae}")


Improved XGBoost Regressor:
MSE: 2029770015.5770974
RMSE: 45052.96899846998
MAE: 28190.255035163813



## Documentation for XGBoost Regression Model

**Objective**: The purpose of this cell is to train an XGBoost regression model using pre-determined hyperparameters, predict on the test dataset, and evaluate the performance using Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and Mean Absolute Error (MAE).

### Step-by-Step Explanation:

1. **Setting Hyperparameters**:
    - The hyperparameters for the XGBoost model are based on a previous optimization process.
    - The `params` dictionary contains the optimized hyperparameters:

        * `objective`: Objective function for the model, which in this case is 'reg:squarederror' suitable for regression tasks.
        * `subsample`: Proportion of training data to randomly sample in each boosting round.
        * `n_estimators`: Number of boosting rounds or trees to build. It’s important to tune it properly in order to avoid under/over-fitting.
        * `max_depth`: Maximum depth of a tree. Increasing this value makes the model more complex and likely to overfit.
        * `learning_rate`: Boosting learning rate (step size).
        * `gamma`: Regularization term to prevent over-fitting. It specifies a minimum loss reduction required to make a further partition.
        * `colsample_bytree`: Proportion of features randomly sampled for building trees.

2. **Training the Model**:
    - The `xgb.XGBRegressor()` function initializes the XGBoost regressor model.
    - `**params` is a technique in Python called argument unpacking, which passes the items in the `params` dictionary as individual key-value pairs to the function.
    - `random_state=42` is set to ensure reproducibility.
    - `.fit(X_train, y_train)` is used to train the model on the training dataset.

3. **Prediction**:
    - The model predicts on the test dataset using `.predict(X_test)`.
    - The predictions and test values are then reverted from their log-transformed scale (if they were previously log-transformed) using `np.expm1()`.

4. **Performance Evaluation**:
    - The Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and Mean Absolute Error (MAE) are computed for the original scale of test values and predictions.
    - The results are printed to the console for easy reference.

### Output:

The output for this cell indicates the performance of the trained XGBoost regressor model:

- **MSE (Mean Squared Error)**: This is the average of the squared differences between the predicted and actual values. A value of 2,029,770,015.58 suggests that the model's predictions deviate on average by the square root of this value from the actual values.

- **RMSE (Root Mean Squared Error)**: This is the square root of the MSE. An RMSE of 45,052.97 means that on average, the model's predictions are around ±45,052.97 units away from the actual values.

- **MAE (Mean Absolute Error)**: It represents the average absolute difference between the predicted and actual values. An MAE of 28,190.26 indicates that the model's predictions are, on average, off by ±28,190.26 units from the actual values.

It's important to interpret these values in the context of the dataset and the specific problem at hand. Lower values of RMSE and MAE are preferable as they indicate better model performance.


## Save the model so it could be used by the backend for prediction:


In [45]:
best_xgbr.save_model('XGBoost.json')

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

# Split the data
X = df.drop(columns=['median_house_value'])
y = df['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameters space for Random Forest
rf_params = {
    'n_estimators': [10, 50, 100, 150, 200, 300, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_search = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                               param_distributions=rf_params,
                               n_iter=100,
                               scoring='neg_mean_squared_error',
                               n_jobs=-1,
                               cv=3,
                               verbose=3,
                               random_state=42)

rf_search.fit(X_train, y_train)

# Best hyperparameters for Random Forest
rf_best_params = rf_search.best_params_

print("Best hyperparameters for Random Forest Regressor:")
print(rf_best_params)

# Evaluate the model
rf_best = RandomForestRegressor(**rf_best_params, random_state=42)
rf_best.fit(X_train, y_train)
y_pred_rf = rf_best.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print(f"\nRandom Forest Regressor:\nMSE: {mse_rf}\nRMSE: {rmse_rf}\nMAE: {mae_rf}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 3/3] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=500;, score=-306488900.401 total time=   9.9s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=500;, score=-382767337.389 total time=  12.1s
[CV 1/3] END bootstrap=False, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300;, score=-179523506.102 total time=  11.3s
[CV 3/3] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=-244023448.481 total time=   6.6s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500;, score=-366143323.808 total time=  11.4s
[CV 3/3] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;



[CV 3/3] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=-245800828.169 total time=   5.9s
[CV 3/3] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=-253444538.564 total time=   2.1s
[CV 2/3] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=150;, score=-248340505.886 total time=   3.6s
[CV 2/3] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=50;, score=-287434866.701 total time=   1.2s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=500;, score=-237650447.936 total time=  15.6s
[CV 2/3] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=10;, score=-595169917.888 total time=   0.2s
[CV 1/3] END bootstrap=F

84 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/waseemmilhim/anaconda3/envs/Udacity/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/waseemmilhim/anaconda3/envs/Udacity/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/waseemmilhim/anaconda3/envs/Udacity/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/waseemmilhim/anaconda3/envs/Udacity/lib/python3.10/sit

Best hyperparameters for Random Forest Regressor:
{'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}

Random Forest Regressor:
MSE: 147955799.0405594
RMSE: 12163.708276695861
MAE: 6860.105557919893


Best hyperparameters for Random Forest Regressor:
{'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}

Random Forest Regressor:
* MSE: 2172075001.5949993
* RMSE: 46605.52544060628
* MAE: 30380.52708898579

In [48]:
# Given the best hyperparameters
best_params = {
    'n_estimators': 300,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': None,
    'bootstrap': False
}

# Train the model using the best hyperparameters
model = RandomForestRegressor(**best_params, random_state=42)
model.fit(X_train, y_train)

# Predict using the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"Random Forest Regressor with Best Hyperparameters:")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


Random Forest Regressor with Best Hyperparameters:
MSE: 147955799.0405594
RMSE: 12163.708276695861
MAE: 6860.105557919893
[CV 2/3] END bootstrap=False, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=-213819257.939 total time=   3.2s
[CV 3/3] END bootstrap=False, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=-226437767.911 total time=   3.3s
[CV 3/3] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=-325291488.496 total time=   2.0s
[CV 2/3] END bootstrap=True, max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=150;, score=-232560580.436 total time=   3.6s
[CV 1/3] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=500;, score=-236348348.416 total time=  16.1s
[CV 1/3] END bootstrap=True, max_depth=10, max_features=sqrt, m

[CV 3/3] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300;, score=-238064040.382 total time=   6.7s
[CV 2/3] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=300;, score=-272188701.752 total time=   6.8s
[CV 1/3] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=-234180225.593 total time=   6.8s
[CV 3/3] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=-243226483.109 total time=   6.1s
[CV 2/3] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=150;, score=-196805905.932 total time=   5.8s
[CV 1/3] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=150;, score=-201943626.477 total time=   4.9s
[CV 2/3] END bootstr

[CV 1/3] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=-250138666.476 total time=   1.1s
[CV 1/3] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=-176859886.751 total time=   7.3s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=10;, score=-465108971.779 total time=   0.2s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=10;, score=-452175553.284 total time=   0.3s
[CV 1/3] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=-226747044.167 total time=   1.8s
[CV 1/3] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300;, score=nan total time=   0.0s
[CV 3/3] END bootstrap=True, max_depth