# Esercitazione lab2

# Richieste:
-  Confronti tra: `NNC`, `LDA`, `QDA`, `GaussianNB`, `RidgeClassifier`, `Perceptron`

## Load breast cancer dataset

In [1]:
from sklearn import datasets
import numpy as np

b_cancer = datasets.load_breast_cancer()
X = b_cancer.data # Matrice delle X
y = b_cancer.target # Vettore delle y

In [2]:
print(b_cancer.feature_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [3]:
print(b_cancer.target_names)

['malignant' 'benign']


## Normalization

In [4]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
X_norm = mms.fit_transform(X) 

## Nested CV [classificazione]

In [18]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np

def nested_cv(model, param_grid, X, y, outer_splits=5,
              inner_splits=5, scoring=['accuracy'], random_state=42, verbose=True):

    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    score_results = {metric: [] for metric in scoring}

    best_param_overall = None
    best_score = -np.inf 

    for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X), 1):
        if verbose:
            print(f"\nPerforming Outer Fold {outer_fold}/{outer_splits}")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)
        if verbose:
            print("Performing GridSearchCV...")

        grid_search = GridSearchCV(model, param_grid, cv=inner_cv,
                                   n_jobs=-1, scoring=scoring[0])
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        if verbose:
            print(f" Best Params: {best_params}")

        y_pred = best_model.predict(X_test)

        if 'accuracy' in scoring:
            acc = accuracy_score(y_test, y_pred)
            score_results['accuracy'].append(acc)

            if acc > best_score:
                best_score = acc
                best_param_overall = best_params

            if verbose:
                print(f" Accuracy: {acc:.4f}")

    result = {}
    for metric, scores in score_results.items():
        result[f"Nested CV {metric.upper()}"] = f"{np.mean(scores):.4f} ± {np.std(scores):.4f}"

    result["Best Parameters with highest accuracy"] = best_param_overall

    return result

# Confronti tra modelli

## Nearest Centroid Neighbor

In [19]:
import time
from sklearn.neighbors import NearestCentroid

start_time = time.time()

nc_model = NearestCentroid()

nc_params = {'metric': ['euclidean', 'manhattan']}

nc_results = nested_cv(nc_model, nc_params, X_norm, y,
                        outer_splits=5, inner_splits=5,
                        scoring=['accuracy'])

end_time = time.time()

print(f'NCA Results: \n{nc_results}')
print(f'Tempo di esecuzione {end_time - start_time}')


Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'metric': 'manhattan'}
 Accuracy: 0.9737

Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'metric': 'manhattan'}
 Accuracy: 0.9298

Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'metric': 'manhattan'}
 Accuracy: 0.9474

Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'metric': 'euclidean'}
 Accuracy: 0.9386

Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'metric': 'manhattan'}
 Accuracy: 0.9204
NCA Results: 
{'Nested CV ACCURACY': '0.9420 ± 0.0182', 'Best Parameters (highest accuracy)': {'metric': 'manhattan'}}
Tempo di esecuzione 2.106163501739502


In [12]:
import pandas as pd
nc_params_df = pd.DataFrame(nc_results["Best Parameters per fold"])
nc_params_df

Unnamed: 0,metric
0,manhattan
1,manhattan
2,manhattan
3,euclidean
4,manhattan


## Latent Discriminant Analysis

In [13]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda_model = LinearDiscriminantAnalysis()

lda_params = {'solver': ['svd', 'lsqr', 'eigen']}

start_time = time.time()

lda_results = nested_cv(lda_model, lda_params, X_norm, y,
                        outer_splits=5, inner_splits=5,
                        scoring=['accuracy'])

end_time = time.time()

print(f'LDA Results: \n{lda_results}')
print(f'Tempo di esecuzione {end_time - start_time}')


 Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9561

 Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9737

 Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9386

 Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9474

 Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9558
LDA Results: 
{'Nested CV ACCURACY': '0.9543 ± 0.0116', 'Best Parameters per fold': [{'solver': 'svd'}, {'solver': 'svd'}, {'solver': 'svd'}, {'solver': 'svd'}, {'solver': 'svd'}]}
Tempo di esecuzione 0.39138364791870117


## Quadratic Discriminant Analysis

In [17]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda_model = QuadraticDiscriminantAnalysis()

qda_params = {'reg_param': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 0.9]}

start_time = time.time()

qda_results = nested_cv(qda_model, qda_params, X_norm, y,
                        outer_splits=5, inner_splits=5,
                        scoring=['accuracy', 'auc-roc'])

end_time = time.time()

print(f'QDA Results: \n{qda_results}')
print(f'Tempo di esecuzione {end_time - start_time}')

QDA Results: 
{'Nested CV ACCURACY': '0.9596 ± 0.0071', 'Nested CV AUC-ROC': '0.9516 ± 0.0053', 'Best Parameters per fold': [{'reg_param': 0.001}, {'reg_param': 0.001}, {'reg_param': 0.0}, {'reg_param': 0.0001}, {'reg_param': 0.001}]}
Tempo di esecuzione 0.5321633815765381


## Gaussian Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB

gnb_model = GaussianNB()

gnb_params = {
    'var_smoothing': np.logspace(-12, -6, 7)  
}

start_time = time.time()

gnb_results = nested_cv(gnb_model, gnb_params, X_norm, y,
                        outer_splits=5, inner_splits=5,
                        scoring=['accuracy'])

end_time = time.time()

print(f'GNB Results: \n{gnb_results}')
print(f'Tempo di esecuzione {end_time - start_time}')


 Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'var_smoothing': 1e-12}
 Accuracy: 0.9649

 Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'var_smoothing': 1e-12}
 Accuracy: 0.9211

 Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'var_smoothing': 1e-12}
 Accuracy: 0.9386

 Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'var_smoothing': 1e-12}
 Accuracy: 0.9298

 Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'var_smoothing': 1e-12}
 Accuracy: 0.9292
GNB Results: 
{'Nested CV ACCURACY': '0.9367 ± 0.0152', 'Best Parameters per fold': [{'var_smoothing': 1e-12}, {'var_smoothing': 1e-12}, {'var_smoothing': 1e-12}, {'var_smoothing': 1e-12}, {'var_smoothing': 1e-12}]}
Tempo di esecuzione 0.1956024169921875


## Ridge Classifier

In [15]:
from sklearn.linear_model import RidgeClassifier

rc_model = RidgeClassifier(max_iter=5000, random_state=42)


rc_params = {
    'alpha': [0.01, 0.1, 1.0, 10.0],  
    'fit_intercept': [True, False],          
    'tol': [1e-4, 1e-3, 1e-2],              
    'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],  
}

start_time = time.time()

rc_results = nested_cv(rc_model, rc_params, X_norm, y,
                        outer_splits=5, inner_splits=5,
                        scoring=['accuracy'])

end_time = time.time()

print(f'RC Results: \n{rc_results}')
print(f'Tempo di esecuzione {end_time - start_time}')


 Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'alpha': 0.1, 'fit_intercept': True, 'solver': 'sparse_cg', 'tol': 0.01}
 Accuracy: 0.9561

 Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'alpha': 1.0, 'fit_intercept': True, 'solver': 'svd', 'tol': 0.0001}
 Accuracy: 0.9649

 Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'alpha': 0.01, 'fit_intercept': True, 'solver': 'sparse_cg', 'tol': 0.001}
 Accuracy: 0.9474

 Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'alpha': 0.1, 'fit_intercept': True, 'solver': 'svd', 'tol': 0.0001}
 Accuracy: 0.9649

 Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'alpha': 0.01, 'fit_intercept': True, 'solver': 'sag', 'tol': 0.01}
 Accuracy: 0.9646
RC Results: 
{'Nested CV ACCURACY': '0.9596 ± 0.0070', 'Best Parameters per fold': [{'alpha': 0.1, 'fit_intercept': True, 'solver': 'sparse_cg', 'tol': 0.01}, {'alpha': 1.0, 'fit_intercept': True, 'solver':

## Perceptron

In [17]:
from sklearn.linear_model import Perceptron

ppn_model = Perceptron(random_state=42)

ppn_params = {
    'penalty': [None, 'l2', 'l1', 'elasticnet'],        
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2],                 
    #'l1_ratio': [0.15, 0.5, 0.7, 0.9],
    #'fit_intercept': [True, False],                    
    #'max_iter': [500, 1000, 2000],                     
    #'shuffle': [True, False],                          
    #'eta0': [0.1, 0.5, 1.0, 5.0],                     
    #'early_stopping': [False, True],                   
}

start_time = time.time()

ppn_results = nested_cv(ppn_model, ppn_params, X_norm, y,
                        outer_splits=5, inner_splits=5,
                        scoring=['accuracy'])

end_time = time.time()

print(f'PPN Results: \n{ppn_results}')
print(f'Tempo di esecuzione {end_time - start_time}')


 Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'alpha': 0.0001, 'penalty': 'l1'}
 Accuracy: 0.9737

 Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'alpha': 0.0001, 'penalty': 'l1'}
 Accuracy: 0.9737

 Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'alpha': 1e-05, 'penalty': 'l2'}
 Accuracy: 0.9298

 Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'alpha': 1e-05, 'penalty': 'l2'}
 Accuracy: 0.9737

 Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'alpha': 1e-05, 'penalty': 'l1'}
 Accuracy: 0.9646
PPN Results: 
{'Nested CV ACCURACY': '0.9631 ± 0.0170', 'Best Parameters per fold': [{'alpha': 0.0001, 'penalty': 'l1'}, {'alpha': 0.0001, 'penalty': 'l1'}, {'alpha': 1e-05, 'penalty': 'l2'}, {'alpha': 1e-05, 'penalty': 'l2'}, {'alpha': 1e-05, 'penalty': 'l1'}]}
Tempo di esecuzione 2.0428638458251953


In [35]:
ppn_params_df = pd.DataFrame(ppn_results["Best Parameters per fold"])
ppn_params_df

Unnamed: 0,alpha,early_stopping,eta0,fit_intercept,l1_ratio,max_iter,penalty,shuffle
0,0.0001,False,1.0,True,0.9,500,elasticnet,True
1,1e-05,False,0.5,True,0.15,500,l2,True
2,1e-05,False,1.0,True,0.15,500,l2,True
3,0.0001,False,0.1,True,0.15,500,l2,True
4,1e-05,False,0.1,True,0.5,500,elasticnet,True
