# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.datasets import fetch_california_housing

# Import Data

In [2]:
data = fetch_california_housing()

df = pd.DataFrame(data.data, columns=data.feature_names)

df['MedianHouseValue'] = data.target

In [3]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


# Feature Selection 

In [4]:
numerical_features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
target = 'MedianHouseValue'

# Train Test Split

In [5]:
X = df[numerical_features]
y = df[target]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Outlier Removal

In [7]:
# only on the training set!!! I'll probably use it because there are some very high values for columns due to resorts as explained in the doc

# Numerical Transformers

In [8]:
# numeric_transformer = Pipeline(steps=[
#     ('mean_imputer', SimpleImputer(strategy='mean')),  # Impute with mean
#     ('scaler', StandardScaler())  # Scale with StandardScaler
# ])

numeric_transformer = Pipeline(steps=[
    ('mean_imputer', SimpleImputer(strategy='mean')),  # Impute with mean
])

# Categorical Transformers

In [9]:
# DOES NOT APPLY

# Preprocessing Pipeline

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', numeric_transformer, numerical_features),
    ])

# Modeling Functions

In [11]:
def show_results(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return {'mae': mae, 'mse': mse, 'rmse': rmse, 'mape': mape, 'r2': r2}

In [12]:
def get_output_df(X_test, y_pred, y_test):
    df_compare = X_test.copy()
    
    df_compare['pred'] = y_pred
    df_compare['target'] = y_test.values

    df_compare['absolute_error'] = (df_compare['pred'] - df_compare['target']).abs()
    
    return df_compare

In [13]:
# Variable that stores the test scores of the different models to decide which one to use
report_ac = {}

# Modeling Parametrizations

In [14]:
metric_to_optimize = 'neg_mean_squared_error'

In [15]:
param_grid_lr = {
    # Linear Regression has no hyperparameters
}

param_grid_ridge = {
    'regressor__alpha': [0.01, 0.1, 1, 10, 100]
}

param_grid_lasso = {
    'regressor__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]
}

param_grid_elastic_net = {
    'regressor__alpha': [0.01, 0.1, 1, 10],
    'regressor__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

param_grid_svr = {
    'regressor__C': [0.1, 1, 10],
    'regressor__epsilon': [0.01, 0.1, 0.5],
    'regressor__kernel': ['linear', 'rbf']
}

param_grid_knn = {
    'regressor__n_neighbors': [3, 5, 7, 10],
    'regressor__weights': ['uniform', 'distance']
}

param_grid_mlp = {
    'regressor__hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'regressor__activation': ['relu', 'tanh'],
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__learning_rate': ['constant', 'adaptive']
}

param_grid_rf = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [5, 10, 20],
    'regressor__min_samples_split': [2, 5]
}

param_grid_gb = {
    'regressor__n_estimators': [100, 200],
    'regressor__learning_rate': [0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 5]
}

# Linear Regression

In [16]:
lr_model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [17]:
grid_search_lr = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid_lr,
    cv=5,
    scoring=metric_to_optimize,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

In [18]:
grid_search_lr.fit(X_train, y_train)

lr_model = grid_search_lr.best_estimator_

y_pred_lr = lr_model.predict(X_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [19]:
print("Best hyperparameters:", grid_search_lr.best_params_)

results = show_results(y_test, y_pred_lr)
print(results)

report_ac['lr'] = results

Best hyperparameters: {}
{'mae': 0.5317411607371965, 'mse': 0.5517948548705345, 'rmse': 0.7428289539796725, 'mape': 0.3179830385714785, 'r2': 0.5836532783592616}


In [20]:
#get_output_df(X_test, y_pred_lr, y_test)

# Ridge Regression

In [21]:
ridge_model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])

In [22]:
grid_search_ridge = GridSearchCV(
    estimator=ridge_model,
    param_grid=param_grid_ridge,
    cv=5,
    scoring=metric_to_optimize,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

In [23]:
grid_search_ridge.fit(X_train, y_train)

ridge_model = grid_search_ridge.best_estimator_

y_pred_ridge = ridge_model.predict(X_test)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [24]:
print("Best hyperparameters:", grid_search_ridge.best_params_)

results = show_results(y_test, y_pred_ridge)
print(results)

report_ac['ridge'] = results

Best hyperparameters: {'regressor__alpha': 10}
{'mae': 0.5317480373002378, 'mse': 0.5507068791073516, 'rmse': 0.7420962734762597, 'mape': 0.31796187147871324, 'r2': 0.5844741905845705}


# Lasso Regression

In [25]:
lasso_model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', Lasso())
])

In [26]:
grid_search_lasso = GridSearchCV(
    estimator=lasso_model,
    param_grid=param_grid_lasso,
    cv=5,
    scoring=metric_to_optimize,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

In [27]:
grid_search_lasso.fit(X_train, y_train)

lasso_model = grid_search_lasso.best_estimator_

y_pred_lasso = lasso_model.predict(X_test)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [28]:
print("Best hyperparameters:", grid_search_lasso.best_params_)

results = show_results(y_test, y_pred_lasso)
print(results)

report_ac['lasso'] = results

Best hyperparameters: {'regressor__alpha': 0.0001}
{'mae': 0.5317415962242504, 'mse': 0.5515403463438775, 'rmse': 0.7426576239047691, 'mape': 0.31797653042705615, 'r2': 0.5838453131159613}


# Elastic Net

In [29]:
elastic_net_model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet())
])

In [30]:
grid_search_elastic_net = GridSearchCV(
    estimator=elastic_net_model,
    param_grid=param_grid_elastic_net,
    cv=5,
    scoring=metric_to_optimize,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

In [31]:
grid_search_elastic_net.fit(X_train, y_train)

elastic_net_model = grid_search_elastic_net.best_estimator_

y_pred_elastic_net = elastic_net_model.predict(X_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [32]:
print("Best hyperparameters:", grid_search_elastic_net.best_params_)

results = show_results(y_test, y_pred_elastic_net)
print(results)

report_ac['elastic_net'] = results

Best hyperparameters: {'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.1}
{'mae': 0.5326220643857013, 'mse': 0.5411470761952027, 'rmse': 0.7356269952871515, 'mape': 0.31816625713578517, 'r2': 0.5916873651310035}


# SVR

In [33]:
svr_model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

In [34]:
grid_search_svr = GridSearchCV(
    estimator=svr_model,
    param_grid=param_grid_svr,
    cv=5,
    scoring=metric_to_optimize,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

In [35]:
grid_search_svr.fit(X_train, y_train)

svr_model = grid_search_svr.best_estimator_

y_pred_svr = svr_model.predict(X_test)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


KeyboardInterrupt: 

In [None]:
print("Best hyperparameters:", grid_search_svr.best_params_)

results = show_results(y_test, y_pred_svr)
print(results)

report_ac['svr'] = results

# KNN Regressor

In [36]:
knn_model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor())
])

In [37]:
grid_search_knn = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid_knn,
    cv=5,
    scoring=metric_to_optimize,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

In [38]:
grid_search_knn.fit(X_train, y_train)

knn_model = grid_search_knn.best_estimator_

y_pred_knn = knn_model.predict(X_test)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [39]:
print("Best hyperparameters:", grid_search_knn.best_params_)

results = show_results(y_test, y_pred_knn)
print(results)

report_ac['knn'] = results

Best hyperparameters: {'regressor__n_neighbors': 10, 'regressor__weights': 'distance'}
{'mae': 0.8039570202402038, 'mse': 1.0859532733260229, 'rmse': 1.0420908181756632, 'mape': 0.5222934334224002, 'r2': 0.18061380744407163}


# MLP Regressor

In [40]:
mlp_model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor())
])

In [41]:
grid_search_mlp = GridSearchCV(
    estimator=mlp_model,
    param_grid=param_grid_mlp,
    cv=5,
    scoring=metric_to_optimize,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

In [42]:
grid_search_mlp.fit(X_train, y_train)

mlp_model = grid_search_mlp.best_estimator_

y_pred_mlp = mlp_model.predict(X_test)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [43]:
print("Best hyperparameters:", grid_search_mlp.best_params_)

results = show_results(y_test, y_pred_mlp)
print(results)

report_ac['mlp'] = results

Best hyperparameters: {'regressor__activation': 'tanh', 'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': (100,), 'regressor__learning_rate': 'adaptive'}
{'mae': 0.5520300243546152, 'mse': 0.5194088061915604, 'rmse': 0.7207002193641684, 'mape': 0.3713556988768809, 'r2': 0.6080895794145731}


# Random Forest Regressor

In [44]:
rf_model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [45]:
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    cv=5,
    scoring=metric_to_optimize,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

In [46]:
grid_search_rf.fit(X_train, y_train)

rf_model = grid_search_rf.best_estimator_

y_pred_rf = rf_model.predict(X_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [47]:
print("Best hyperparameters:", grid_search_rf.best_params_)

results = show_results(y_test, y_pred_rf)
print(results)

report_ac['rf'] = results

Best hyperparameters: {'regressor__max_depth': 20, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
{'mae': 0.33217386372478536, 'mse': 0.2621752098399331, 'rmse': 0.5120304774522051, 'mape': 0.18840193972268118, 'r2': 0.8021804876416625}


# Gradient Booster Regressor

In [48]:
gb_model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor())
])

In [49]:
grid_search_gb = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid_gb,
    cv=5,
    scoring=metric_to_optimize,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

In [50]:
grid_search_gb.fit(X_train, y_train)

gb_model = grid_search_gb.best_estimator_

y_pred_gb = gb_model.predict(X_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [51]:
print("Best hyperparameters:", grid_search_gb.best_params_)

results = show_results(y_test, y_pred_gb)
print(results)

report_ac['gb'] = results

Best hyperparameters: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 5, 'regressor__n_estimators': 200}
{'mae': 0.30152646635610725, 'mse': 0.2141203224296276, 'rmse': 0.4627313717802453, 'mape': 0.169895544589425, 'r2': 0.8384394245554359}


In [52]:
pd.set_option('display.max_rows', None)

get_output_df(X_test, y_pred_gb, y_test).sort_values(by=['absolute_error'], ascending=False)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,pred,target,absolute_error
89,1.2434,52.0,2.929412,0.917647,396.0,4.658824,37.8,-122.27,1.254116,5.00001,3.745894
20349,7.3004,32.0,5.724138,0.758621,63.0,2.172414,34.17,-119.08,4.414824,1.25,3.164824
15303,2.3182,24.0,5.574932,1.20436,812.0,2.212534,33.17,-117.36,1.848012,5.00001,3.151998
17819,1.7361,42.0,3.0,1.0,26.0,1.857143,37.39,-121.9,2.099192,5.00001,2.900818
15311,1.9891,26.0,4.606704,1.27933,1703.0,1.902793,33.19,-117.38,2.15193,5.0,2.84807
15856,1.4886,52.0,4.862745,1.333333,124.0,2.431373,37.76,-122.38,1.672252,4.5,2.827748
18464,4.2039,11.0,6.753927,1.031414,881.0,4.612565,37.19,-121.74,2.247397,5.00001,2.752613
19389,1.625,26.0,5.6,0.8,9.0,1.8,37.75,-120.85,3.589411,0.85,2.739411
18461,4.6458,17.0,4.901961,1.019608,141.0,2.764706,37.26,-121.76,2.31158,5.00001,2.68843
20322,2.5714,8.0,3.0375,0.9375,102.0,1.275,34.23,-119.14,2.313008,5.00001,2.687002


In [53]:
report_ac

{'lr': {'mae': 0.5317411607371965,
  'mse': 0.5517948548705345,
  'rmse': 0.7428289539796725,
  'mape': 0.3179830385714785,
  'r2': 0.5836532783592616},
 'ridge': {'mae': 0.5317480373002378,
  'mse': 0.5507068791073516,
  'rmse': 0.7420962734762597,
  'mape': 0.31796187147871324,
  'r2': 0.5844741905845705},
 'lasso': {'mae': 0.5317415962242504,
  'mse': 0.5515403463438775,
  'rmse': 0.7426576239047691,
  'mape': 0.31797653042705615,
  'r2': 0.5838453131159613},
 'elastic_net': {'mae': 0.5326220643857013,
  'mse': 0.5411470761952027,
  'rmse': 0.7356269952871515,
  'mape': 0.31816625713578517,
  'r2': 0.5916873651310035},
 'knn': {'mae': 0.8039570202402038,
  'mse': 1.0859532733260229,
  'rmse': 1.0420908181756632,
  'mape': 0.5222934334224002,
  'r2': 0.18061380744407163},
 'mlp': {'mae': 0.5520300243546152,
  'mse': 0.5194088061915604,
  'rmse': 0.7207002193641684,
  'mape': 0.3713556988768809,
  'r2': 0.6080895794145731},
 'rf': {'mae': 0.33217386372478536,
  'mse': 0.26217520983993