# Pipelines
In questo esercizio bisognerà creare una Pipeline che esegua i seguenti passaggi:
1. Scaling
2. Riduzione della dimensionalità
3. Visualizzazione delle dimensioni ottenute
4. Applicazione del modello
5. Calcolo dell'accuracy


In [1]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_wine
import pandas as pd
import time

In [2]:
# Load the wine dataset and split it into the feature matrix `X` and the
# target vector `y` used by the cells below.
wine_dataset = load_wine()
X, y = wine_dataset.data, wine_dataset.target

# Sanity check: show the shape of the feature matrix and the feature names.
print(X.shape, wine_dataset.feature_names, sep="\n")

(178, 13)
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


# Nested CV

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score, root_mean_squared_error, mean_absolute_error

def nested_cv(model, param_grid, X, y, outer_splits=5,
              inner_splits=5, scoring=('accuracy', 'roc_auc'),
              random_state=42, verbose=True):
    """Run nested cross-validation and summarize the outer-fold scores.

    For every outer fold a ``GridSearchCV`` (the inner cross-validation) is
    fitted on the outer training split, and its best estimator is then
    evaluated on the held-out outer test split.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    X : pandas.DataFrame or indexable array
        Feature matrix. DataFrames are indexed with ``.iloc``; anything else
        is indexed with integer index arrays.
    y : pandas.DataFrame, pandas.Series or array-like
        Target values; flattened to a 1-D array.
    outer_splits, inner_splits : int
        Number of folds of the outer and inner ``KFold``.
    scoring : str or sequence of str
        Metrics to compute on the outer test splits. The names evaluated
        there are ``'accuracy'``, ``'roc_auc'``, ``'r2'``, ``'mae'`` and
        ``'rmse'``. The first entry also drives the inner grid search; any
        other name is passed to ``GridSearchCV`` unchanged and is reported as
        ``nan`` in the summary because it has no outer-fold implementation.
    random_state : int
        Seed used by both the outer and the inner ``KFold``.
    verbose : bool
        When true, print progress and per-fold scores.

    Returns
    -------
    dict
        One ``"Nested CV <METRIC>"`` entry per requested metric, formatted as
        ``"mean ± std"`` (NaN folds ignored), plus
        ``"Best Parameters with highest accuracy"``: the parameters selected
        in the outer fold with the highest accuracy, or ``None`` when
        ``'accuracy'`` is not among the requested metrics.

    Raises
    ------
    ValueError
        If ``scoring`` is empty.
    """
    # Accept a single metric name as well as any sequence of names, and work
    # on a private list so the caller's object is never shared.
    if isinstance(scoring, str):
        scoring = [scoring]
    else:
        scoring = list(scoring)
    if not scoring:
        raise ValueError("`scoring` must contain at least one metric name.")

    # Make sure `y` is a 1-D array.
    if isinstance(y, pd.DataFrame):
        y = y.values.ravel()
    elif isinstance(y, pd.Series):
        y = y.values
    else:
        y = np.ravel(y)

    # GridSearchCV only understands scikit-learn scorer names, so translate
    # the short error-metric aliases used by this function. The negated
    # scorers keep "greater is better" semantics for the search.
    sklearn_scorer_names = {
        'mae': 'neg_mean_absolute_error',
        'rmse': 'neg_root_mean_squared_error',
    }
    inner_scoring = sklearn_scorer_names.get(scoring[0], scoring[0])

    # Regression metrics evaluated on the outer test split, in print order.
    regression_metrics = (
        ('r2', r2_score, 'R2'),
        ('mae', mean_absolute_error, 'MAE'),
        ('rmse', root_mean_squared_error, 'RMSE'),
    )

    # OUTER CROSS-VALIDATION
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    # The inner splitter does not depend on the outer fold: build it once.
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)
    score_results = {metric: [] for metric in scoring}

    best_param_overall = None
    best_score = -np.inf

    for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X), 1):
        if verbose:
            print(f"\nPerforming Outer Fold {outer_fold}/{outer_splits}")

        # Positional indexing needs `.iloc` on a DataFrame.
        if isinstance(X, pd.DataFrame):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        else:
            X_train, X_test = X[train_idx], X[test_idx]

        y_train, y_test = y[train_idx], y[test_idx]

        # INNER CROSS-VALIDATION with GridSearchCV
        if verbose:
            print("Performing GridSearchCV...")

        grid_search = GridSearchCV(model, param_grid, cv=inner_cv,
                                   n_jobs=-1, scoring=inner_scoring)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        if verbose:
            print(f" Best Params: {best_params}")

        # Evaluate the selected model on the outer test split.
        y_pred = best_model.predict(X_test)

        if 'accuracy' in score_results:
            acc = accuracy_score(y_test, y_pred)
            score_results['accuracy'].append(acc)
            # Strict comparison: on ties the earliest fold's parameters win.
            if acc > best_score:
                best_score = acc
                best_param_overall = best_params
            if verbose:
                print(f" Accuracy: {acc:.4f}")

        if 'roc_auc' in score_results:
            try:
                proba = best_model.predict_proba(X_test)
                if proba.shape[1] == 2:
                    # Binary target: score with the second probability column.
                    auc = roc_auc_score(y_test, proba[:, 1])
                else:
                    # Multiclass target: one-vs-rest AUC over all classes.
                    auc = roc_auc_score(
                        y_test, proba, multi_class='ovr',
                        labels=getattr(best_model, 'classes_', None))
            except AttributeError:
                if verbose:
                    print("Controlla se il modello ha un metodo `predict_proba`.")
                auc = np.nan
            except ValueError as exc:
                # E.g. a class is missing from this fold's test split.
                if verbose:
                    print(f" AUC not computable on this fold: {exc}")
                auc = np.nan
            else:
                if verbose:
                    print(f" AUC: {auc:.4f}")
            score_results['roc_auc'].append(auc)

        for metric, score_fn, label in regression_metrics:
            if metric not in score_results:
                continue
            value = score_fn(y_test, y_pred)
            score_results[metric].append(value)
            if verbose:
                print(f" {label}: {value:.4f}")

    # Aggregate every metric across the outer folds (mean and std).
    result = {}
    for metric, scores in score_results.items():
        if scores and not np.all(np.isnan(scores)):
            mean, std = np.nanmean(scores), np.nanstd(scores)
        else:
            # Nothing usable was collected: report NaN without triggering
            # numpy's empty/all-NaN slice warnings.
            mean = std = np.nan
        result[f"Nested CV {metric.upper()}"] = f"{mean:.4f} ± {std:.4f}"

    result["Best Parameters with highest accuracy"] = best_param_overall

    return result

# Pipeline SVC
Creo una Pipeline dove:
1. Effettuo uno scaling con Standard Scaler (Z-score normalization)
2. Riduco la dimensionalità con PCA
3. Uso un modello di classificazione SVC

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

# Three chained steps: standardize the features, project them onto two
# principal components, then classify with a support vector classifier.
svc_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("svc", SVC()),
])

## Parametri per la Nested CV

In [11]:
# Hyper-parameter grid explored by the inner grid search. The `svc__` prefix
# addresses the pipeline step named 'svc'.
params_grid_svc = dict(
    svc__kernel=['linear', 'rbf'],
    # Regularization parameter: the regularization strength is inversely
    # proportional to C.
    svc__C=[0.1, 1, 10],
    # Kernel coefficient for 'rbf', 'poly' and 'sigmoid' (not used by 'linear').
    svc__gamma=['scale', 'auto'],
)

In [15]:
# Time the whole nested cross-validation run. perf_counter() is a monotonic,
# high-resolution clock, so the measured interval cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()

result = nested_cv(model = svc_pipeline,
                   param_grid = params_grid_svc,
                   X = X,
                   y = y,
                   outer_splits = 5,
                   inner_splits = 5,
                   scoring = ['accuracy'])

end_time = time.perf_counter()

print(f'Result SVC: \n{result}')
print(f'Elapsed time: {end_time - start_time} s')


Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
 Accuracy: 1.0000

Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
 Accuracy: 0.9444

Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
 Accuracy: 0.9722

Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}
 Accuracy: 0.9714

Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'svc__C': 1, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'}
 Accuracy: 0.9143
Result SVC: 
{'Nested CV ACCURACY': '0.9605 ± 0.0290', 'Best Parameters with highest accuracy': {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}}
Elapsed time: 1.0145187377929688 s
