In [8]:
# Imports
import numpy as np
from sklearn.base import clone
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Load the California housing dataset and separate features from the target
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Hold out 10% of the rows as a test set (shuffled, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=43,
)

# Preprocessing steps placed in front of every model:
# median imputation first, then feature standardisation
base_pipeline = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]

#### 1 - Create 5 pipelines with 5 different models as final estimator (keep the imputer and scaler unchanged):

In [9]:
# Candidate regressors keyed by the display name used when printing results.
# Estimators that accept a random_state are seeded with 43.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('SVM', SVR()),
    ('Decision Tree', DecisionTreeRegressor(random_state=43)),
    ('Random Forest', RandomForestRegressor(random_state=43)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=43)),
])

# Function to evaluate model
def evaluate_model(model, X, y):
    """Score a fitted estimator on one dataset.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Estimator (or pipeline) used to produce predictions.
    X : features passed to ``model.predict``.
    y : true target values compared against the predictions.

    Returns
    -------
    tuple
        ``(r2, mae, mse)`` -- note the order: R2 first, then MAE, then MSE.
    """
    predictions = model.predict(X)
    # Same call order as the returned tuple: r2, mae, mse
    scorers = (r2_score, mean_absolute_error, mean_squared_error)
    return tuple(scorer(y, predictions) for scorer in scorers)

#### 2 - For each algorithm, print the R2, MSE and MAE on both train set and test set.


In [10]:
# Create, fit and evaluate one pipeline per model.
#
# Each pipeline gets its own clones of the shared preprocessing steps: with the
# default memory=None, Pipeline.fit fits the step objects in place, so reusing
# the very same imputer/scaler instances from base_pipeline would make every
# pipeline alias (and refit) a single preprocessing state.
# The fitted pipelines are kept in `fitted_pipelines` (keyed by model name) so
# that all five remain available after the loop, not only the last one.
fitted_pipelines = {}
for model_name, model in models.items():
    print(f"\n~~~\n{model_name}\n")

    # Create pipeline: fresh, unfitted copies of the base steps + this model
    pipeline = [(step_name, clone(step)) for step_name, step in base_pipeline]
    pipeline.append(('model', model))
    pipe = Pipeline(pipeline)

    # Fit the model
    pipe.fit(X_train, y_train)
    fitted_pipelines[model_name] = pipe

    # Evaluate on train set
    train_r2, train_mae, train_mse = evaluate_model(pipe, X_train, y_train)
    print("TRAIN")
    print(f"r2 score:        {train_r2}")
    print(f"MAE:             {train_mae}")
    print(f"MSE:             {train_mse}")

    # Evaluate on test set
    test_r2, test_mae, test_mse = evaluate_model(pipe, X_test, y_test)
    print("\nTEST")
    print(f"r2 score:        {test_r2}")
    print(f"MAE:             {test_mae}")
    print(f"MSE:             {test_mse}")


~~~
Linear Regression

TRAIN
r2 score:        0.6054131599242079
MAE:             0.5330920012614552
MSE:             0.5273648371379568

TEST
r2 score:        0.6128959462132963
MAE:             0.5196420310323715
MSE:             0.49761195027083815

~~~
SVM

TRAIN
r2 score:        0.7496108582936637
MAE:             0.383564516332599
MSE:             0.3346447867133921

TEST
r2 score:        0.729508064989969
MAE:             0.3897680598426778
MSE:             0.34771017765429973

~~~
Decision Tree

TRAIN
r2 score:        1.0
MAE:             4.221907539810565e-17
MSE:             9.24499456646287e-32

TEST
r2 score:        0.6228217144931267
MAE:             0.4403051356589147
MSE:             0.4848526395290697

~~~
Random Forest

TRAIN
r2 score:        0.9741263135396302
MAE:             0.12000198560508221
MSE:             0.03458015083247723

TEST
r2 score:        0.8119778189909694
MAE:             0.3194169859011629
MSE:             0.24169750554364758

~~~
Gradient Boostin