# Discriminazione Lineare
Confronto diversi modelli di analisi discriminante (lineare, LDA, e quadratica, QDA) utilizzando il dataset `breast_cancer`.

## Import del dataset
Importo il dataset breast cancer e noto che ci sono 569 osservazioni e 30 features. Le etichette sono memorizzate separatamente dalle features e possono assumere due valori: 'malignant' o 'benign'.

In [31]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset bundled with scikit-learn.
breast_cancer = load_breast_cancer()

# Feature matrix: one named column per entry of `feature_names`.
X = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)
# Labels wrapped in a DataFrame (no column names are given).
y = pd.DataFrame(data=breast_cancer.target)

# Show the feature-matrix shape, then the feature names and the class names,
# one item per line.
print(
    X.shape,
    list(breast_cancer.feature_names),
    list(breast_cancer.target_names),
    sep="\n",
)

(569, 30)
['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension']
['malignant', 'benign']


## Standardizzazione e Scaling

In [35]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature scaling statistics from the full feature matrix.
# NOTE(review): the scaler is fitted on all rows before any cross-validation
# split, so rows later used as held-out folds contribute to the scaling
# statistics -- confirm this is acceptable for the reported scores.
scaler = StandardScaler()
scaler.fit(X)

# Apply the fitted scaler and restore the original feature names as columns.
X_standardized = pd.DataFrame(
    data=scaler.transform(X),
    columns=breast_cancer.feature_names,
)

# Sanity check: the mean of a standardized column should be close to zero.
X_standardized["mean radius"].mean()

-3.153111437248248e-15

## Nested Cross-Validation

In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score, root_mean_squared_error, mean_absolute_error

def nested_cv(model, param_grid, X, y, outer_splits=5,
              inner_splits=5, scoring=['accuracy', 'roc_auc'],
              random_state=42, verbose=True):
    """Evaluate a model with nested cross-validation.

    The outer loop splits the data into train/test folds with a shuffled
    ``KFold``. For every outer fold, an inner ``GridSearchCV`` (also using a
    shuffled ``KFold``) selects the hyper-parameters on the training part, and
    the refitted best estimator is scored on the held-out outer test fold.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Hyper-parameter grid passed to ``GridSearchCV``.
    X : pandas.DataFrame or indexable array
        Feature matrix. DataFrames are indexed with ``.iloc``; anything else
        is indexed with ``X[idx]``.
    y : pandas.DataFrame, pandas.Series or array-like
        Targets; flattened to a 1-D numpy array.
    outer_splits, inner_splits : int
        Number of folds of the outer and inner ``KFold``.
    scoring : str or sequence of str
        Metrics to report on the outer test folds. The metrics computed here
        are ``'accuracy'``, ``'roc_auc'``, ``'r2'``, ``'mae'`` and ``'rmse'``.
        The first entry also drives model selection in the inner grid search.
    random_state : int
        Seed used by both the outer and the inner ``KFold``.
    verbose : bool
        When true, print progress and per-fold scores.

    Returns
    -------
    dict
        One ``"Nested CV <METRIC>"`` entry per requested metric, formatted as
        ``"mean ± std"`` (NaN values ignored), plus
        ``"Best Parameters with highest accuracy"``: the parameters selected in
        the outer fold with the highest accuracy, or ``None`` when
        ``'accuracy'`` was not requested.

    Raises
    ------
    ValueError
        If ``scoring`` is empty.
    """
    # Normalize `scoring` first: a bare string would otherwise be iterated
    # character by character, and the caller's sequence is copied, not aliased.
    metrics = [scoring] if isinstance(scoring, str) else list(scoring)
    if not metrics:
        raise ValueError("`scoring` must contain at least one metric name.")

    # GridSearchCV expects scikit-learn scorer names; translate the short
    # aliases reported by this function so they can drive the inner search.
    scorer_aliases = {
        'mae': 'neg_mean_absolute_error',
        'rmse': 'neg_root_mean_squared_error',
    }
    inner_scoring = scorer_aliases.get(metrics[0], metrics[0])

    # Make sure `y` is a 1-D numpy array so it can be indexed by fold indices.
    y = np.asarray(y).ravel()

    # Both splitters are loop-invariant, so build them once.
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)
    score_results = {metric: [] for metric in metrics}

    best_param_overall = None
    best_score = -np.inf

    # OUTER CROSS-VALIDATION
    for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X), 1):
        if verbose:
            print(f"\nPerforming Outer Fold {outer_fold}/{outer_splits}")

        # DataFrames need positional indexing through .iloc.
        if isinstance(X, pd.DataFrame):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        else:
            X_train, X_test = X[train_idx], X[test_idx]

        y_train, y_test = y[train_idx], y[test_idx]

        # INNER CROSS-VALIDATION with GridSearchCV
        if verbose:
            print("Performing GridSearchCV...")

        grid_search = GridSearchCV(model, param_grid, cv=inner_cv,
                                   n_jobs=-1, scoring=inner_scoring)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        if verbose:
            print(f" Best Params: {best_params}")

        # Evaluate the selected model on the outer fold's held-out data.
        y_pred = best_model.predict(X_test)

        if 'accuracy' in score_results:
            acc = accuracy_score(y_test, y_pred)
            score_results['accuracy'].append(acc)
            # Remember the parameters of the most accurate outer fold.
            if acc > best_score:
                best_score = acc
                best_param_overall = best_params
            if verbose:
                print(f" Accuracy: {acc:.4f}")

        if 'roc_auc' in score_results:
            # Prefer positive-class probabilities; fall back to the decision
            # function for models that do not expose `predict_proba`.
            try:
                y_score = best_model.predict_proba(X_test)[:, 1]
            except AttributeError:
                try:
                    y_score = best_model.decision_function(X_test)
                except AttributeError:
                    y_score = None

            if y_score is None:
                if verbose:
                    print("Controlla se il modello ha un metodo `predict_proba`.")
                # Keep one entry per fold so the summary stays aligned.
                score_results['roc_auc'].append(np.nan)
            else:
                auc = roc_auc_score(y_test, y_score)
                score_results['roc_auc'].append(auc)
                if verbose:
                    print(f" AUC: {auc:.4f}")

        if 'r2' in score_results:
            r2score = r2_score(y_test, y_pred)
            score_results['r2'].append(r2score)
            if verbose:
                print(f" R2: {r2score:.4f}")

        if 'mae' in score_results:
            mae = mean_absolute_error(y_test, y_pred)
            score_results['mae'].append(mae)
            if verbose:
                print(f" MAE: {mae:.4f}")

        if 'rmse' in score_results:
            rmse = root_mean_squared_error(y_test, y_pred)
            score_results['rmse'].append(rmse)
            if verbose:
                print(f" RMSE: {rmse:.4f}")

    # Aggregate every metric across outer folds (mean and standard deviation,
    # ignoring NaN entries).
    result = {}
    for metric, scores in score_results.items():
        result[f"Nested CV {metric.upper()}"] = f"{np.nanmean(scores):.4f} ± {np.nanstd(scores):.4f}"

    result["Best Parameters with highest accuracy"] = best_param_overall

    return result

## LDA - Linear Discriminant Analysis

In [42]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import time

# LDA estimator whose solver is tuned by the inner grid search.
lda = LinearDiscriminantAnalysis()

lda_parameters = {
    'solver': ["svd", "lsqr", "eigen"]
}

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()

# Nested cross-validation on the standardized features, reporting accuracy only.
lda_results = nested_cv(model = lda,
                    param_grid = lda_parameters,
                    X = X_standardized,
                    y = y,
                    outer_splits = 5,
                    inner_splits = 5,
                    scoring = ['accuracy'])

end_time = time.perf_counter()

# The label previously contained a stray double quote instead of a colon.
print(f'LDA results: \n{lda_results}')
print(f'Tempo di esecuzione {end_time - start_time}')


Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9561

Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9737

Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9386

Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9474

Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'solver': 'svd'}
 Accuracy: 0.9558
LDA results" 
{'Nested CV ACCURACY': '0.9543 ± 0.0116', 'Best Parameters with highest accuracy': {'solver': 'svd'}}
Tempo di esecuzione 1.5341382026672363


## QDA - Quadratic Discriminant Analysis

In [43]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# QDA estimator whose regularization strength is tuned by the inner grid search.
qda = QuadraticDiscriminantAnalysis()

qda_parameters = {
    'reg_param': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 0.9]
}

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()

# Nested cross-validation on the standardized features, reporting accuracy only.
qda_results = nested_cv(model = qda,
                        param_grid = qda_parameters,
                        X = X_standardized,
                        y = y,
                        outer_splits = 5,
                        inner_splits = 5,
                        scoring = ['accuracy'])

end_time = time.perf_counter()

# The label previously contained a stray double quote instead of a colon.
print(f'QDA results: \n{qda_results}')
print(f'Tempo di esecuzione {end_time - start_time}')


Performing Outer Fold 1/5
Performing GridSearchCV...
 Best Params: {'reg_param': 0.1}
 Accuracy: 0.9561

Performing Outer Fold 2/5
Performing GridSearchCV...
 Best Params: {'reg_param': 0.5}
 Accuracy: 0.9737

Performing Outer Fold 3/5
Performing GridSearchCV...
 Best Params: {'reg_param': 0.1}
 Accuracy: 0.9737

Performing Outer Fold 4/5
Performing GridSearchCV...
 Best Params: {'reg_param': 0.5}
 Accuracy: 0.9649

Performing Outer Fold 5/5
Performing GridSearchCV...
 Best Params: {'reg_param': 0.1}
 Accuracy: 0.9558
QDA results" 
{'Nested CV ACCURACY': '0.9648 ± 0.0079', 'Best Parameters with highest accuracy': {'reg_param': 0.5}}
Tempo di esecuzione 1.4440438747406006
