# Metodi basati su prototipi
Confronto un metodo basato su prototipi (`KNN`) con un modello di `Logistic Regression`.

## Dataset sintetico
Genero un dataset sintetico per un task di classificazione binaria (1000 osservazioni, 20 feature) con `make_classification`.

In [3]:
from sklearn.datasets import make_classification

# Binary-classification dataset: 1000 observations, 20 features
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    random_state=42,
)

print(X.shape)
print(y[:5])  # first 5 labels

(1000, 20)
[1 0 1 1 0]


## Nested Cross-Validation

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score, root_mean_squared_error, mean_absolute_error

def nested_cv(model, param_grid, X, y, outer_splits=5,
              inner_splits=5, scoring=['accuracy', 'roc_auc'],
              random_state=42, verbose=True):
    """Run nested cross-validation and summarize the outer-fold scores.

    For every outer fold a ``GridSearchCV`` (the inner loop) tunes ``model``
    over ``param_grid`` on the outer-training portion; the refitted best
    estimator is then scored on the held-out outer-test portion.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Hyperparameter grid handed to ``GridSearchCV``.
    X : pandas.DataFrame or array-like
        Feature matrix. DataFrames are indexed with ``.iloc``, anything else
        with plain ``X[idx]`` indexing.
    y : pandas.DataFrame, pandas.Series or array-like
        Target values; flattened to a 1-D array.
    outer_splits, inner_splits : int
        Number of folds for the outer and inner ``KFold`` splitters.
    scoring : str or sequence of str
        Metrics to report on each outer fold. The names handled here are
        ``'accuracy'``, ``'roc_auc'``, ``'r2'``, ``'mae'`` and ``'rmse'``.
        The FIRST entry is also the criterion optimized by the inner grid
        search; ``'mae'`` and ``'rmse'`` are translated to the corresponding
        scikit-learn scorer names for that purpose.
    random_state : int
        Seed used by both the outer and the inner ``KFold`` shuffles.
    verbose : bool
        When true, print progress and the per-fold scores.

    Returns
    -------
    dict
        One ``"Nested CV <METRIC>"`` entry per requested metric, formatted as
        ``"mean ± std"`` (NaN-aware), plus
        ``"Best Parameters with highest accuracy"``: the parameters of the
        outer fold with the highest test accuracy, or ``None`` when
        ``'accuracy'`` is not among the requested metrics.
    """
    # Metric names used by this function that are not valid scikit-learn
    # scorer strings; GridSearchCV needs the "neg_*" variants (higher = better).
    scorer_aliases = {
        'mae': 'neg_mean_absolute_error',
        'rmse': 'neg_root_mean_squared_error',
    }

    # Accept a single metric name as well as a sequence of names; without this
    # a bare string would be iterated character by character.
    metrics = [scoring] if isinstance(scoring, str) else list(scoring)

    # Criterion optimized by the inner grid search (first requested metric)
    inner_scoring = scorer_aliases.get(metrics[0], metrics[0])

    # Make sure `y` is a 1-D array
    if isinstance(y, pd.DataFrame):  # pandas DataFrame
        y = y.values.ravel()
    elif isinstance(y, pd.Series):  # pandas Series
        y = y.values
    else:  # NumPy array or other array-like
        y = np.ravel(y)

    # OUTER CROSS-VALIDATION
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    score_results = {metric: [] for metric in metrics}

    best_param_overall = None
    best_score = -np.inf

    for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X), 1):
        if verbose:
            print(f"\nPerforming Outer Fold {outer_fold}/{outer_splits}")

        # Positional indexing: .iloc for DataFrames, plain indexing otherwise
        if isinstance(X, pd.DataFrame):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        else:
            X_train, X_test = X[train_idx], X[test_idx]

        y_train, y_test = y[train_idx], y[test_idx]

        # INNER CROSS-VALIDATION with GridSearchCV
        inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)
        if verbose:
            print("Performing GridSearchCV...")

        grid_search = GridSearchCV(model, param_grid, cv=inner_cv,
                                   n_jobs=-1, scoring=inner_scoring)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        if verbose:
            print(f" Best Params: {best_params}")

        # Evaluate the tuned model on the outer fold's held-out data
        y_pred = best_model.predict(X_test)

        # Metric computation
        if 'accuracy' in metrics:
            acc = accuracy_score(y_test, y_pred)
            score_results['accuracy'].append(acc)
            # Remember the parameters of the best-scoring outer fold
            if acc > best_score:
                best_score = acc
                best_param_overall = best_params
            if verbose:
                print(f" Accuracy: {acc:.4f}")

        if 'roc_auc' in metrics:
            try:
                # Column 1 of predict_proba is used as the positive-class score
                y_score = best_model.predict_proba(X_test)[:, 1]
                auc = roc_auc_score(y_test, y_score)
                score_results['roc_auc'].append(auc)
                if verbose:
                    print(f" AUC: {auc:.4f}")
            except AttributeError:
                # No predict_proba: record NaN so the fold is skipped by nanmean/nanstd
                if verbose:
                    print("Controlla se il modello ha un metodo `predict_proba`.")
                score_results['roc_auc'].append(np.nan)

        if 'r2' in metrics:
            r2score = r2_score(y_test, y_pred)
            score_results['r2'].append(r2score)

            if verbose:
                print(f" R2: {r2score:.4f}")

        if 'mae' in metrics:
            mae = mean_absolute_error(y_test, y_pred)
            score_results['mae'].append(mae)

            if verbose:
                print(f" MAE: {mae:.4f}")
        if 'rmse' in metrics:
            rmse = root_mean_squared_error(y_test, y_pred)
            score_results['rmse'].append(rmse)

            if verbose:
                print(f" RMSE: {rmse:.4f}")

    # Aggregate metrics across outer folds (mean and standard deviation)
    result = {}
    for metric, scores in score_results.items():
        result[f"Nested CV {metric.upper()}"] = f"{np.nanmean(scores):.4f} ± {np.nanstd(scores):.4f}"

    result["Best Parameters with highest accuracy"] = best_param_overall

    return result

## Pipeline 1: Scaling, PCA e KNN

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# Pipeline 1: standardize the features, project them with PCA, classify with KNN.
# PCA is seeded because the grid below also tries svd_solver='randomized',
# which is stochastic: without a fixed random_state the selected parameters and
# scores could change from one run of the notebook to the next.
pipeline_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(random_state=42)),
    ('knn', KNeighborsClassifier())
])

# Hyperparameter grid for the pipeline above ("<step>__<parameter>" keys)
params_grid_knn = {
    'pca__n_components': [2, 3, 5],
    'pca__svd_solver': ['full', 'randomized'],
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

In [8]:
import time

# Time the full nested cross-validation of the KNN pipeline
start_time = time.time()
results = nested_cv(
    pipeline_knn,
    params_grid_knn,
    X,
    y,
    scoring=['accuracy'],
)
end_time = time.time()

# Report the aggregated scores and the wall-clock duration
print("Result KNN")
print(results)
print(f'Elapsed time: {end_time - start_time} s')


Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'knn__metric': 'euclidean', 'knn__n_neighbors': 5, 'knn__weights': 'uniform', 'pca__n_components': 5, 'pca__svd_solver': 'randomized'}
 Accuracy: 0.7300

Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'knn__metric': 'manhattan', 'knn__n_neighbors': 5, 'knn__weights': 'uniform', 'pca__n_components': 5, 'pca__svd_solver': 'randomized'}
 Accuracy: 0.7700

Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'knn__metric': 'euclidean', 'knn__n_neighbors': 7, 'knn__weights': 'uniform', 'pca__n_components': 5, 'pca__svd_solver': 'full'}
 Accuracy: 0.7150

Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'knn__metric': 'manhattan', 'knn__n_neighbors': 5, 'knn__weights': 'distance', 'pca__n_components': 5, 'pca__svd_solver': 'randomized'}
 Accuracy: 0.7650

Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'knn__metric': 'euclidean', 'knn__n_neighbors': 

## Pipeline 2: Scaling, Kernel PCA, Logistic Regression

In [15]:
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import LogisticRegression

# Pipeline 2: standardize the features, project them with Kernel PCA, classify
# with logistic regression.
# Both estimators are seeded so the notebook is reproducible: KernelPCA's
# iterative eigensolvers and the 'saga'/'liblinear' solvers tried in the grid
# below rely on random numbers when random_state is left unset.
pipeline_logistic_regression = Pipeline([
    ('scaler', StandardScaler()),
    ('kpca', KernelPCA(random_state=42)),
    ('logisticRegression', LogisticRegression(random_state=42))
])

# Hyperparameter grid for the pipeline above ("<step>__<parameter>" keys)
params_grid_logistic_regression = {
    'kpca__n_components': [2, 3, 5],
    'kpca__kernel': ['linear', 'rbf', 'poly'],
    'logisticRegression__solver': ['lbfgs', 'liblinear', 'saga']
}

In [17]:
# Time the full nested cross-validation of the logistic-regression pipeline
start_time = time.time()
result_logistic_regression = nested_cv(
    pipeline_logistic_regression,
    params_grid_logistic_regression,
    X,
    y,
    scoring=['accuracy'],
)
end_time = time.time()

# Report the aggregated scores and the wall-clock duration
print("Result Logistic Regression")
print(result_logistic_regression)
print(f'Elapsed time: {end_time - start_time} s')


Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'kpca__kernel': 'rbf', 'kpca__n_components': 5, 'logisticRegression__solver': 'lbfgs'}
 Accuracy: 0.7400

Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'kpca__kernel': 'rbf', 'kpca__n_components': 5, 'logisticRegression__solver': 'lbfgs'}
 Accuracy: 0.8100

Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'kpca__kernel': 'linear', 'kpca__n_components': 5, 'logisticRegression__solver': 'lbfgs'}
 Accuracy: 0.7850

Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'kpca__kernel': 'rbf', 'kpca__n_components': 5, 'logisticRegression__solver': 'lbfgs'}
 Accuracy: 0.7800

Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'kpca__kernel': 'linear', 'kpca__n_components': 5, 'logisticRegression__solver': 'lbfgs'}
 Accuracy: 0.8100
Result Logistic Regression
{'Nested CV ACCURACY': '0.7850 ± 0.0257', 'Best Parameters with highest accuracy': {'kpca__kernel