# Esercitazione lab 5

# Richieste:

- Modificare un esempio e premettere la ricerca di una buona scelta degli iperparametri.
- Creare MLP con due layer nascosti e confrontare le prestazioni
- Confronto tra MLP classifier e un modello ibrido

## Carico il dataset [digits] + scaling

In [3]:
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler

X, y = load_digits(return_X_y=True)

scaler = StandardScaler()
X_std = scaler.fit_transform(X)

## Nested CV [classificazione]

In [20]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np

def nested_cv(model, param_grid, X, y, outer_splits=5,
              inner_splits=5, scoring=['accuracy'], random_state=42, verbose=True):

    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    score_results = {metric: [] for metric in scoring}

    best_param_overall = None
    best_score = -np.inf 

    for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X), 1):
        if verbose:
            print(f"\nPerforming Outer Fold {outer_fold}/{outer_splits}")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)
        if verbose:
            print("Performing GridSearchCV...")

        grid_search = GridSearchCV(model, param_grid, cv=inner_cv,
                                   n_jobs=-1, scoring=scoring[0])
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        if verbose:
            print(f" Best Params: {best_params}")

        y_pred = best_model.predict(X_test)

        if 'accuracy' in scoring:
            acc = accuracy_score(y_test, y_pred)
            score_results['accuracy'].append(acc)

            if acc > best_score:
                best_score = acc
                best_param_overall = best_params

            if verbose:
                print(f" Accuracy: {acc:.4f}")

    result = {}
    for metric, scores in score_results.items():
        result[f"Nested CV {metric.upper()}"] = f"{np.mean(scores):.4f} ± {np.std(scores):.4f}"

    result["Best Parameters with highest accuracy"] = best_param_overall

    return result

## MLP classifier

In [6]:
import time
from sklearn.neural_network import MLPClassifier

t0 = time.time()

mlp_model = MLPClassifier(random_state=42, max_iter=1000)

mlp_params = {
    'hidden_layer_sizes': [(50,100,50), (100,)],
    #'activation': ['tanh', 'relu'],
    #'solver': ['sgd', 'adam'],
    #'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

mlp_results = nested_cv(mlp_model, mlp_params, X_std, y, 
                          outer_splits = 5, inner_splits = 5, 
                          scoring = ['accuracy'])
t1 = time.time()

print(f'MLP Results: \n {mlp_results}')
print(f'Training MLP in {(t1 - t0):.3f}s')


Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
 Accuracy: 0.9833

Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
 Accuracy: 0.9750

Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant'}
 Accuracy: 0.9638

Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
 Accuracy: 0.9833

Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
 Accuracy: 0.9666
MLP Results: 
 {'Nested CV ACCURACY': '0.9744 ± 0.0082', 'Best Parameters with highest accuracy': {'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}}
Training MLP in 54.183s


## Problema con make_regression

### Nested CV da utilizzare

In [7]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

def nested_cv_regression(model, param_grid, X, y, outer_splits=5,
                         inner_splits=5, scoring=None, random_state=42, verbose=True):
    if scoring is None:
        scoring = ['r2']  # Default metric

    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    score_results = {metric: [] for metric in scoring}

    best_param_overall = None
    best_r2_score = -np.inf 

    for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X), 1):
        if verbose:
            print(f"\nPerforming Outer Fold {outer_fold}/{outer_splits}")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)
        if verbose:
            print("Performing GridSearchCV...")

        grid_search = GridSearchCV(model, param_grid, cv=inner_cv,
                                   n_jobs=-1, scoring=scoring[0])
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        if verbose:
            print(f" Best Params: {best_params}")

        y_pred = best_model.predict(X_test)

        if 'r2' in scoring:
            r2 = r2_score(y_test, y_pred)
            score_results['r2'].append(r2)
            if r2 > best_r2_score:
                best_r2_score = r2
                best_param_overall = best_params
            if verbose:
                print(f" R²: {r2:.4f}")

        if 'mae' in scoring:
            mae = mean_absolute_error(y_test, y_pred)
            score_results['mae'].append(mae)
            if verbose:
                print(f" MAE: {mae:.4f}")

        if 'rmse' in scoring:
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            score_results['rmse'].append(rmse)
            if verbose:
                print(f" RMSE: {rmse:.4f}")

    result = {}
    for metric, scores in score_results.items():
        result[f"Nested CV {metric.upper()}"] = f"{np.mean(scores):.4f} ± {np.std(scores):.4f}"

    result["Best Parameters with highest R2 score"] = best_param_overall

    return result

### MLP regressor

Carico il dataset con `make_regression`

In [12]:
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=50, noise=2.6, bias=1, random_state= 42)
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [16]:
from sklearn.neural_network import MLPRegressor

t0 = time.time()

mlp_regr_params = {
    'hidden_layer_sizes': [(5, 5, 5), (15,),
                           (20,), (25,)],
}

mlp_regr_model = MLPRegressor(solver = 'sgd', max_iter=500)


mlp_regr_results = nested_cv_regression(mlp_regr_model, mlp_regr_params, X_std, y, 
                          outer_splits = 5, inner_splits = 5, 
                          scoring = ['r2'])

t1 = time.time()

print(f'MLP Regression Results: \n {mlp_regr_results}')
print(f'Training MLP in {(t1 - t0):.3f}s')


Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (25,)}
 R²: 0.9631

Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (20,)}
 R²: 0.9733

Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (25,)}
 R²: 0.9647

Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (25,)}
 R²: 0.9730

Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'hidden_layer_sizes': (25,)}
 R²: 0.9663
MLP Regression Results: 
 {'Nested CV R2': '0.9681 ± 0.0043', 'Best Parameters with highest R2 score': {'hidden_layer_sizes': (20,)}}
Training MLP in 1.344s


# Confronto tra MLP classifier e un modello ibrido 
- costruito mettendo prima un bernoulli RBM e poi seguita da un logistic regression, oppure un altro classificatore. \
  Mettiamo le prime feature che abbiamo imparato, che capiscono bene come clusterizzare. \
Avro' una versione piu' compatta e mi semplifica la costruzione del modello finale, con una classificazione semplice messo alla fine.

Carico il dataset

In [32]:
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)

Pipeline

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM

# Modelli
scaler = MinMaxScaler()
rbm = BernoulliRBM(random_state=0)
logistic = LogisticRegression(max_iter=1000, solver='liblinear')  

# Pipeline
pipeline = Pipeline([
    ('scaler', scaler),
    ('rbm', rbm),
    ('logistic', logistic)
])

# Hyperparameter grid
rbm_log_param = {
    'rbm__n_components': [32, 64, 100],
    'rbm__learning_rate': [0.01, 0.06],
    'rbm__n_iter': [10, 20],
    'logistic__C': [0.1, 1.0, 10]
}

Il modello

In [34]:
import time

start_time = time.time()

rbm_log_results = nested_cv(pipeline, rbm_log_param, X_std, y,
                        outer_splits=5, inner_splits=5,
                        scoring=['accuracy'])

end_time = time.time()

print(f'RBM + log Results: \n{rbm_log_results}')
print(f'Tempo di esecuzione {end_time - start_time}')


Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'logistic__C': 10, 'rbm__learning_rate': 0.06, 'rbm__n_components': 100, 'rbm__n_iter': 20}
 Accuracy: 0.9306

Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'logistic__C': 10, 'rbm__learning_rate': 0.06, 'rbm__n_components': 64, 'rbm__n_iter': 20}
 Accuracy: 0.9278

Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'logistic__C': 10, 'rbm__learning_rate': 0.01, 'rbm__n_components': 100, 'rbm__n_iter': 10}
 Accuracy: 0.8468

Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'logistic__C': 10, 'rbm__learning_rate': 0.01, 'rbm__n_components': 100, 'rbm__n_iter': 10}
 Accuracy: 0.8635

Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'logistic__C': 10, 'rbm__learning_rate': 0.01, 'rbm__n_components': 100, 'rbm__n_iter': 10}
 Accuracy: 0.8273
RBM + log Results: 
{'Nested CV ACCURACY': '0.8792 ± 0.0424', 'Best Parameters with highest accuracy': {'l