# **Part 01: Apprentissage avec des Benchmarks**

## **1. Préparation et exploration des données**

In [8]:
# Load every benchmark dataset through data_recovery and print its shape
# (the original note says the files live in the datasets directory).
from prepdata import data_recovery

# Dataset names passed to data_recovery, one per entry.
datasets = [
    "abalone8",
    "abalone17",
    "abalone20",
    "autompg",
    "australian",
    "balance",
    "bankmarketing",
    "bupa",
    "german",
    "glass",
    "hayes",
    "heart",
    "iono",
    "libras",
    "newthyroid",
    "pageblocks",
    "pima",
    "satimage",
    "segmentation",
    "sonar",
    "spambase",
    "splice",
    "vehicle",
    "wdbc",
    "wine",
    "wine4",
    "yeast3",
    "yeast6",
]

for name in datasets:
    X, y = data_recovery(name)
    # One line per dataset: name, shape of X, shape of y.
    print(f"{name} {X.shape} {y.shape}")


abalone8 (4177, 10) (4177,)
abalone17 (4177, 10) (4177,)
abalone20 (4177, 10) (4177,)
autompg (392, 7) (392,)
australian (690, 14) (690,)
balance (625, 4) (625,)
bankmarketing (45211, 51) (45211,)
bupa (345, 6) (345,)
german (1000, 24) (1000,)
glass (214, 9) (214,)
hayes (132, 4) (132,)
heart (270, 13) (270,)
iono (351, 34) (351,)
libras (360, 90) (360,)
newthyroid (215, 5) (215,)
pageblocks (5473, 10) (5473,)
pima (768, 8) (768,)
satimage (6435, 36) (6435,)
segmentation (2310, 19) (2310,)
sonar (208, 60) (208,)
spambase (4597, 57) (4597,)
splice (3175, 60) (3175,)
vehicle (846, 18) (846,)
wdbc (569, 30) (569,)
wine (178, 13) (178,)
wine4 (1599, 11) (1599,)
yeast3 (1484, 8) (1484,)
yeast6 (1484, 8) (1484,)


In [9]:
# Build a summary table with one row of descriptive statistics per dataset.
import numpy as np
import pandas as pd
from prepdata import data_recovery

datasets = [
    "abalone8",
    "abalone17",
    "abalone20",
    "autompg",
    "australian",
    "balance",
    "bankmarketing",
    "bupa",
    "german",
    "glass",
    "hayes",
    "heart",
    "iono",
    "libras",
    "newthyroid",
    "pageblocks",
    "pima",
    "satimage",
    "segmentation",
    "sonar",
    "spambase",
    "splice",
    "vehicle",
    "wdbc",
    "wine",
    "wine4",
    "yeast3",
    "yeast6",
]

threshold = 0.3            # minority class strictly below 30% => imbalanced

info_rows = []
imbalanced_datasets = []   # names of the datasets flagged as imbalanced

for name in datasets:
    X, y = data_recovery(name)
    n, d = X.shape

    # Share of samples labelled 1 and share labelled 0.
    ratio_positive = np.mean(y == 1)
    ratio_negative = np.mean(y == 0)

    # The minority class is whichever of the two shares is smaller.
    minority_ratio = min(ratio_positive, ratio_negative)
    is_imbalanced = minority_ratio < threshold

    # Missing values: number of NaN entries in the feature matrix.
    n_missing = np.isnan(X).sum()
    has_missing = n_missing > 0

    row = dict(
        dataset=name,
        n_samples=n,
        n_features=d,
        ratio_positive=ratio_positive,
        ratio_negative=ratio_negative,
        minority_ratio=minority_ratio,
        imbalanced=is_imbalanced,
        has_missing=has_missing,
        n_missing=int(n_missing),
        problem_type="binary_classification",
    )
    info_rows.append(row)

    # Keep a separate list of the imbalanced datasets for later cells.
    if is_imbalanced:
        imbalanced_datasets.append(name)

info_df = pd.DataFrame(info_rows)
print(info_df)
info_df.to_csv("dataset_info_summary.csv", index=False)

# Report the imbalanced datasets.
print("\nDatasets déséquilibrés (< 30% de la classe minoritaire) :")
print(imbalanced_datasets)
print("Nombre de datasets déséquilibrés :", len(imbalanced_datasets))

# Number of datasets that contain at least one missing value.
n_datasets_with_missing = info_df["has_missing"].sum()
print("Nombre de datasets avec des valeurs manquantes :", n_datasets_with_missing)



          dataset  n_samples  n_features  ratio_positive  ratio_negative  \
0        abalone8       4177          10        0.135983        0.864017   
1       abalone17       4177          10        0.013886        0.986114   
2       abalone20       4177          10        0.006225        0.993775   
3         autompg        392           7        0.375000        0.625000   
4      australian        690          14        0.444928        0.555072   
5         balance        625           4        0.460800        0.539200   
6   bankmarketing      45211          51        0.116985        0.883015   
7            bupa        345           6        0.420290        0.579710   
8          german       1000          24        0.300000        0.700000   
9           glass        214           9        0.327103        0.672897   
10          hayes        132           4        0.227273        0.772727   
11          heart        270          13        0.444444        0.555556   
12          

## **2. Protocole commun**

In [10]:
# Le but est de définir une fonction générique avec un split train/test, les métriques et la mesure du temps d’apprentissage. On va l'utiliser par la suite pour tous les datasets.
import numpy as np
from time import perf_counter

from sklearn.model_selection import train_test_split
# Importation des métriques qu'on va utiliser
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
)

def evaluate_model(model, X, y, test_size=0.3, random_state=0):
    """Fit ``model`` on a stratified train split and score it on the held-out part.

    Args:
        model: Configured estimator exposing ``fit`` and ``predict``. It is
            fitted in place. ``predict_proba`` (preferred) or
            ``decision_function`` is used for the AUC when available.
        X: Features of the full dataset.
        y: Labels of the full dataset; also used to stratify the split.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``auc``, ``train_time``
        (seconds spent in ``fit``), ``n_train`` and ``n_test``. ``auc`` is
        NaN when no score could be computed; a ``RuntimeWarning`` then
        explains why.
    """
    import warnings

    # Stratified split so both parts keep the class proportions of y.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )

    # Only the call to fit is timed.
    t0 = perf_counter()
    model.fit(X_train, y_train)
    train_time = perf_counter() - t0

    # Hard class predictions for accuracy and F1.
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same 0.0 as the default "warn" setting when
    # no positive sample is predicted, without the UndefinedMetricWarning.
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # AUC needs continuous scores rather than class labels.
    try:
        if hasattr(model, "predict_proba"):
            # Column 1 is used as the positive-class probability.
            y_scores = model.predict_proba(X_test)[:, 1]
        else:
            # Some models (e.g. linear SVMs) only expose decision_function.
            y_scores = model.decision_function(X_test)
        auc = roc_auc_score(y_test, y_scores)
    except Exception as exc:
        # Best effort: keep the benchmark running, but say why the AUC is missing.
        warnings.warn(
            f"AUC could not be computed for {type(model).__name__}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        auc = np.nan

    return {
        "accuracy": acc,
        "f1": f1,
        "auc": auc,
        "train_time": train_time,
        "n_train": len(y_train),
        "n_test": len(y_test),
    }


## **3. Approches non paramétriques**

In [None]:
# Sur cette partie, on va se concentrer sur le KNN avec ses différentes variantes.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import GridSearchCV

#Définition des variantes KNN


# Settings shared by the three fixed-k variants: k=5, Minkowski metric with p=2
# (i.e. Euclidean distance).
knn_common = dict(n_neighbors=5, metric="minkowski", p=2)

# 1) Baseline KNN: every neighbour has the same vote.
knn_base = KNeighborsClassifier(weights="uniform", **knn_common)

# 2) Distance-weighted KNN: closer neighbours weigh more.
knn_distance = KNeighborsClassifier(weights="distance", **knn_common)

# 3) KNN on standardised features: StandardScaler followed by the baseline KNN.
knn_scaled = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(weights="uniform", **knn_common),
)

# 4) KNN whose k is chosen by cross-validation (GridSearchCV),
#    searching n_neighbors over {1, 3, 5, 7, 11}.
def make_knn_cv():
    """Return a new, unfitted GridSearchCV over a StandardScaler + KNN pipeline.

    The search tries n_neighbors in [1, 3, 5, 7, 11] with 5-fold
    cross-validation, scores candidates with "f1" and runs with n_jobs=-1.
    """
    candidate_ks = [1, 3, 5, 7, 11]
    scaled_knn = make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(weights="uniform"),
    )
    # make_pipeline names the KNN step "kneighborsclassifier", hence the key.
    return GridSearchCV(
        scaled_knn,
        {"kneighborsclassifier__n_neighbors": candidate_ks},
        scoring="f1",
        cv=5,
        n_jobs=-1,
    )

# 5) SMOTE + standardisation + KNN, meant for imbalanced datasets only; the
#    "imbalanced" column of the summary table decides when it is used.
# NOTE(review): SMOTE runs before the scaler, so synthetic samples are
# generated from unscaled features — confirm this order is intended.
knn_smote = ImbPipeline([
    ("smote", SMOTE(random_state=0)),
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=5, weights="uniform", metric="minkowski", p=2)),
])

# Registry used by the benchmark loop. "knn_cv" maps to the factory itself so
# that a new GridSearchCV is created each time it is evaluated.
models_knn = dict(
    knn_base=knn_base,
    knn_distance=knn_distance,
    knn_scaled=knn_scaled,
    knn_cv=make_knn_cv,
    knn_smote=knn_smote,
)


# Evaluate every KNN variant on every dataset.

results_rows = []

for ds in datasets:
    # Imbalance flag of this dataset, read from the summary table (first match).
    is_imbalanced = bool(info_df.loc[info_df["dataset"] == ds, "imbalanced"].iloc[0])

    # Load X, y for this dataset.
    X, y = data_recovery(ds)

    for model_name, model_or_factory in models_knn.items():
        # The SMOTE variant is only run on imbalanced datasets.
        if model_name == "knn_smote" and not is_imbalanced:
            continue

        # Registry entries are either ready estimators or zero-argument factories.
        model = model_or_factory() if callable(model_or_factory) else model_or_factory

        res = {
            **evaluate_model(model, X, y),
            "dataset": ds,
            "model_family": "knn",
            "model_name": model_name,
            "imbalanced": is_imbalanced,
        }
        results_rows.append(res)

results_knn_df = pd.DataFrame(results_rows)



In [None]:
# Accuracy table: one row per dataset, one column per KNN variant.
# The same pivot can be built for f1, auc and train_time.
acc_table = pd.pivot_table(
    results_knn_df,
    values="accuracy",
    index="dataset",
    columns="model_name",
)
print(acc_table)

model_name     knn_base    knn_cv  knn_distance  knn_scaled  knn_smote
dataset                                                               
abalone17      0.984051  0.977671      0.984051    0.985646   0.901914
abalone20      0.993620  0.985646      0.993620    0.993620   0.938596
abalone8       0.834928  0.806220      0.836523    0.830144   0.671451
australian     0.671498  0.859903      0.657005    0.835749        NaN
autompg        0.737288  0.762712      0.788136    0.754237        NaN
balance        0.904255  0.930851      0.904255    0.888298        NaN
bankmarketing  0.882409  0.875184      0.881303    0.892215   0.854615
bupa           0.653846  0.596154      0.653846    0.634615        NaN
german         0.676667  0.676667      0.683333    0.700000        NaN
glass          0.738462  0.830769      0.753846    0.723077        NaN
hayes          0.825000  0.875000      0.825000    0.875000   0.850000
heart          0.641975  0.839506      0.654321    0.814815        NaN
iono  

In [16]:
# Same comparison with F1, since accuracy is not always the best indicator on
# imbalanced datasets.
f1_table = pd.pivot_table(
    results_knn_df,
    values="f1",
    index="dataset",
    columns="model_name",
)
print(f1_table)

model_name     knn_base    knn_cv  knn_distance  knn_scaled  knn_smote
dataset                                                               
abalone17      0.000000  0.125000      0.000000    0.000000   0.102190
abalone20      0.000000  0.000000      0.000000    0.000000   0.049383
abalone8       0.181818  0.278932      0.196078    0.171206   0.339744
australian     0.626374  0.828402      0.607735    0.808989        NaN
autompg        0.651685  0.695652      0.725275    0.674157        NaN
balance        0.898876  0.924855      0.898876    0.881356        NaN
bankmarketing  0.356595  0.432070      0.365142    0.406656   0.471597
bupa           0.526316  0.522727      0.526316    0.558140        NaN
german         0.340136  0.469945      0.362416    0.357143        NaN
glass          0.638298  0.755556      0.652174    0.608696        NaN
hayes          0.461538  0.666667      0.461538    0.666667   0.666667
heart          0.591549  0.816901      0.600000    0.788732        NaN
iono  

In [18]:
# Same table for the AUC.
auc_table = pd.pivot_table(
    results_knn_df,
    values="auc",
    index="dataset",
    columns="model_name",
)
print(auc_table)

model_name     knn_base    knn_cv  knn_distance  knn_scaled  knn_smote
dataset                                                               
abalone17      0.566836  0.553569      0.567550    0.565433   0.702054
abalone20      0.489968  0.495987      0.489968    0.547151   0.596810
abalone8       0.679764  0.582487      0.676948    0.671769   0.711253
australian     0.710208  0.910491      0.700000    0.898015        NaN
autompg        0.846898  0.847666      0.862869    0.867322        NaN
balance        0.972801  0.983328      0.973939    0.967680        NaN
bankmarketing  0.766091  0.671589      0.765568    0.797585   0.793856
bupa           0.712689  0.586364      0.717424    0.664583        NaN
german         0.611217  0.619841      0.620794    0.677910        NaN
glass          0.833333  0.825216      0.849567    0.771104        NaN
hayes          0.976703  0.973118      0.978495    0.973118   0.913978
heart          0.726235  0.908333      0.720988    0.847531        NaN
iono  

In [15]:
# Alternative layout of the results table.
# info_df is expected to provide at least the columns: dataset, imbalanced.
# results_knn_df already has its own "imbalanced" column, so it is dropped
# before merging; otherwise pandas suffixes the duplicates as
# "imbalanced_x"/"imbalanced_y" and no plain "imbalanced" column remains.
knn_with_info = results_knn_df.drop(columns=["imbalanced"], errors="ignore").merge(
    info_df[["dataset", "imbalanced"]],
    on="dataset",
    how="left",
)

# Helper that pivots one metric and prefixes the column names with that metric.
def make_metric_table(df, value_col):
    """Pivot ``value_col`` into a dataset x model_name table.

    Rows are indexed by "dataset" and there is one column per "model_name",
    renamed to "<value_col>_<model_name>".
    """
    pivot_spec = {
        "index": "dataset",
        "columns": "model_name",
        "values": value_col,
    }
    pivoted = df.pivot_table(**pivot_spec)

    # Prepend the metric name so tables of different metrics can sit side by side.
    prefix = f"{value_col}_"
    pivoted.columns = [prefix + format(model) for model in pivoted.columns]
    return pivoted

# One prefixed table per metric.
acc_tab = make_metric_table(knn_with_info, "accuracy")
f1_tab = make_metric_table(knn_with_info, "f1")
auc_tab = make_metric_table(knn_with_info, "auc")

# Imbalance flag, one row per dataset, indexed like the metric tables.
imb_tab = info_df[["dataset", "imbalanced"]].set_index("dataset")

# Final table: the imbalance flag followed by every metric column.
knn_full_table = pd.concat([imb_tab, acc_tab, f1_tab, auc_tab], axis="columns")

print(knn_full_table.head(5))



            imbalanced  accuracy_knn_base  accuracy_knn_cv  \
dataset                                                      
abalone8          True           0.834928         0.806220   
abalone17         True           0.984051         0.977671   
abalone20         True           0.993620         0.985646   
autompg          False           0.737288         0.762712   
australian       False           0.671498         0.859903   

            accuracy_knn_distance  accuracy_knn_scaled  accuracy_knn_smote  \
dataset                                                                      
abalone8                 0.836523             0.830144            0.671451   
abalone17                0.984051             0.985646            0.901914   
abalone20                0.993620             0.993620            0.938596   
autompg                  0.788136             0.754237                 NaN   
australian               0.657005             0.835749                 NaN   

            f1_knn

## **4. Approches paramétriques linéaires**

In [19]:
# Là on va se concentrer sur les modèles linéaires : régression logistique et SVM linéaire, avec deux variantes pour chacun.
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic regression

# Hyper-parameters shared by both logistic-regression variants.
logreg_kwargs = dict(penalty="l2", C=1.0, solver="lbfgs", max_iter=1000)

# Variant 1: baseline logistic regression on standardised features.
logreg_base = make_pipeline(
    StandardScaler(),
    LogisticRegression(**logreg_kwargs),
)

# Variant 2: same model with class_weight="balanced", which gives more weight
# to the minority class.
logreg_balanced = make_pipeline(
    StandardScaler(),
    LogisticRegression(class_weight="balanced", **logreg_kwargs),
)

# Linear SVM

# Hyper-parameters shared by both LinearSVC variants.
svm_kwargs = dict(C=1.0, max_iter=5000)

# Variant 1: baseline LinearSVC on standardised features.
svm_lin_base = make_pipeline(
    StandardScaler(),
    LinearSVC(**svm_kwargs),
)

# Variant 2: LinearSVC with class_weight="balanced" for imbalanced datasets.
svm_lin_balanced = make_pipeline(
    StandardScaler(),
    LinearSVC(class_weight="balanced", **svm_kwargs),
)

# Registry of linear models used by the benchmark loop.
models_linear = dict(
    logreg_base=logreg_base,
    logreg_balanced=logreg_balanced,
    svm_lin_base=svm_lin_base,
    svm_lin_balanced=svm_lin_balanced,
)

# Evaluate the linear models on every dataset.
results_linear_rows = []

for ds in datasets:
    # Imbalance flag of this dataset, read from the summary table (first match).
    is_imbalanced = bool(info_df.loc[info_df["dataset"] == ds, "imbalanced"].iloc[0])

    X, y = data_recovery(ds)

    for model_name, model in models_linear.items():
        # The "balanced" variants are only run on imbalanced datasets.
        if "balanced" in model_name and not is_imbalanced:
            continue

        res = {
            **evaluate_model(model, X, y),
            "dataset": ds,
            "model_family": "linear",
            "model_name": model_name,
            "imbalanced": is_imbalanced,
        }
        results_linear_rows.append(res)

results_linear_df = pd.DataFrame(results_linear_rows)
print(results_linear_df.head(5))




   accuracy        f1       auc  train_time  n_train  n_test    dataset  \
0  0.863636  0.022857  0.755936    0.097377     2923    1254   abalone8   
1  0.649123  0.373219  0.757858    0.031555     2923    1254   abalone8   
2  0.864434  0.011628  0.751648    0.018558     2923    1254   abalone8   
3  0.643541  0.376569  0.757383    0.007366     2923    1254   abalone8   
4  0.984848  0.000000  0.883114    0.025880     2923    1254  abalone17   

  model_family        model_name  imbalanced  
0       linear       logreg_base        True  
1       linear   logreg_balanced        True  
2       linear      svm_lin_base        True  
3       linear  svm_lin_balanced        True  
4       linear       logreg_base        True  




In [20]:
# Accuracy table for the linear models: datasets as rows, models as columns.
acc_table = pd.pivot_table(
    results_linear_df,
    values="accuracy",
    index="dataset",
    columns="model_name",
)
print(acc_table)

model_name     logreg_balanced  logreg_base  svm_lin_balanced  svm_lin_base
dataset                                                                    
abalone17             0.803030     0.984848          0.818182      0.986443
abalone20             0.861244     0.993620          0.859649      0.993620
abalone8              0.649123     0.863636          0.643541      0.864434
australian                 NaN     0.859903               NaN      0.859903
autompg                    NaN     0.838983               NaN      0.889831
balance                    NaN     0.968085               NaN      0.968085
bankmarketing         0.842598     0.901504          0.850044      0.901283
bupa                       NaN     0.653846               NaN      0.644231
german                     NaN     0.776667               NaN      0.773333
glass                      NaN     0.707692               NaN      0.738462
hayes                 0.875000     0.875000          0.875000      0.850000
heart       

In [21]:
# F1-score is the main metric for comparing the linear models: unlike
# accuracy, it stays meaningful when the classes are imbalanced.
f1_table = pd.pivot_table(
    results_linear_df,
    values="f1",
    index="dataset",
    columns="model_name",
)
print(f1_table)

model_name     logreg_balanced  logreg_base  svm_lin_balanced  svm_lin_base
dataset                                                                    
abalone17             0.101818     0.000000          0.109375      0.000000
abalone20             0.064516     0.000000          0.063830      0.000000
abalone8              0.373219     0.022857          0.376569      0.011628
australian                 NaN     0.841530               NaN      0.846561
autompg                    NaN     0.786517               NaN      0.860215
balance                    NaN     0.965116               NaN      0.965116
bankmarketing         0.552692     0.450658          0.562392      0.420597
bupa                       NaN     0.550000               NaN      0.543210
german                     NaN     0.567742               NaN      0.552632
glass                      NaN     0.486486               NaN      0.540541
hayes                 0.782609     0.666667          0.782609      0.666667
heart       

## **5. Approches paramétriques non linéaires**

In [None]:
# Maintenant, on va utiliser les modèles non linéaires  : Decision Tree, RandomForest, AdaBoost et GradientBoosting.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from imblearn.over_sampling import ADASYN 
from imblearn.pipeline import Pipeline as ImbPipeline


# Decision trees

# Settings shared by both tree variants: no depth limit, fixed seed.
tree_kwargs = dict(max_depth=None, random_state=0)

# Variant 1: plain decision tree.
tree_base = DecisionTreeClassifier(**tree_kwargs)

# Variant 2: ADASYN followed by a decision tree (for imbalanced data).
tree_adasyn = ImbPipeline([
    ("adasyn", ADASYN(random_state=0)),
    ("tree", DecisionTreeClassifier(**tree_kwargs)),
])

# Random forests

# Settings shared by both forest variants.
rf_kwargs = dict(n_estimators=100, max_depth=None, n_jobs=-1, random_state=0)

# Variant 1: standard RandomForest.
rf_base = RandomForestClassifier(**rf_kwargs)

# Variant 2: ADASYN followed by a RandomForest.
rf_adasyn = ImbPipeline([
    ("adasyn", ADASYN(random_state=0)),
    ("rf", RandomForestClassifier(**rf_kwargs)),
])


# AdaBoost

# Settings shared by both AdaBoost variants.
ada_kwargs = dict(n_estimators=100, learning_rate=1.0, random_state=0)

# Variant 1: plain AdaBoost.
ada_base = AdaBoostClassifier(**ada_kwargs)

# Variant 2: ADASYN followed by AdaBoost.
ada_adasyn = ImbPipeline([
    ("adasyn", ADASYN(random_state=0)),
    ("ada", AdaBoostClassifier(**ada_kwargs)),
])


# Gradient boosting

# Settings shared by both gradient-boosting variants.
gb_kwargs = dict(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)

# Variant 1: standard Gradient Boosting.
gb_base = GradientBoostingClassifier(**gb_kwargs)

# Variant 2: ADASYN followed by Gradient Boosting.
gb_adasyn = ImbPipeline([
    ("adasyn", ADASYN(random_state=0)),
    ("gb", GradientBoostingClassifier(**gb_kwargs)),
])


# Non-linear registry: 4 model types x 2 variants each (plain and ADASYN).
models_nonlinear = dict(
    tree_base=tree_base,
    tree_adasyn=tree_adasyn,
    rf_base=rf_base,
    rf_adasyn=rf_adasyn,
    ada_base=ada_base,
    ada_adasyn=ada_adasyn,
    gb_base=gb_base,
    gb_adasyn=gb_adasyn,
)

# Evaluate the non-linear models on every dataset.
results_nonlinear_rows = []

for ds in datasets:
    # Imbalance flag of this dataset, read from the summary table (first match).
    is_imbalanced = bool(info_df.loc[info_df["dataset"] == ds, "imbalanced"].iloc[0])

    # Load the prepared X, y.
    X, y = data_recovery(ds)

    for model_name, model in models_nonlinear.items():
        # The "adasyn" variants are only run on imbalanced datasets; on balanced
        # ones they often bring nothing and may even hurt.
        if "adasyn" in model_name and not is_imbalanced:
            continue

        # Score with the common protocol, then attach the metadata needed by
        # the global benchmark.
        res = {
            **evaluate_model(model, X, y),
            "dataset": ds,
            "model_family": "nonlinear",
            "model_name": model_name,
            "imbalanced": is_imbalanced,
        }
        results_nonlinear_rows.append(res)

# Results DataFrame for the non-linear methods.
results_nonlinear_df = pd.DataFrame(results_nonlinear_rows)
print(results_nonlinear_df.head(5))




   accuracy        f1       auc  train_time  n_train  n_test   dataset  \
0  0.775120  0.237838  0.557094    0.031587     2923    1254  abalone8   
1  0.755183  0.241975  0.557864    0.059197     2923    1254  abalone8   
2  0.852472  0.170404  0.743235    0.297291     2923    1254  abalone8   
3  0.798246  0.321716  0.745309    0.332926     2923    1254  abalone8   
4  0.863636  0.000000  0.740104    0.347575     2923    1254  abalone8   

  model_family   model_name  imbalanced  
0    nonlinear    tree_base        True  
1    nonlinear  tree_adasyn        True  
2    nonlinear      rf_base        True  
3    nonlinear    rf_adasyn        True  
4    nonlinear     ada_base        True  


In [24]:
# Save the non-linear results to a CSV file in the results directory, because
# these models take longer to run than the others.
from pathlib import Path

results_dir = Path("resultats")
# to_csv does not create missing directories, so make sure the target exists.
results_dir.mkdir(parents=True, exist_ok=True)
results_nonlinear_df.to_csv(results_dir / "results_nonlinear_models.csv", index=False)

In [25]:
# F1-score table for the non-linear models: datasets as rows, models as columns.
f1_table = pd.pivot_table(
    results_nonlinear_df,
    values="f1",
    index="dataset",
    columns="model_name",
)
print(f1_table)

model_name     ada_adasyn  ada_base  gb_adasyn   gb_base  rf_adasyn   rf_base  \
dataset                                                                         
abalone17        0.092050  0.000000   0.080645  0.000000   0.046512  0.000000   
abalone20        0.061856  0.000000   0.063492  0.000000   0.111111  0.000000   
abalone8         0.344988  0.000000   0.372263  0.078431   0.321716  0.170404   
australian            NaN  0.845714        NaN  0.828729        NaN  0.852459   
autompg               NaN  0.853933        NaN  0.904762        NaN  0.840909   
balance               NaN  0.982456        NaN  0.870056        NaN  0.848837   
bankmarketing    0.511901  0.445513   0.557692  0.516228   0.500000  0.491778   
bupa                  NaN  0.611765        NaN  0.620690        NaN  0.602740   
german                NaN  0.554839        NaN  0.545455        NaN  0.449275   
glass                 NaN  0.682927        NaN  0.809524        NaN  0.780488   
hayes            0.900000  1

In [None]:
# Accuracy table for the non-linear models; accuracy can be misleading on
# imbalanced datasets.
acc_table = pd.pivot_table(
    results_nonlinear_df,
    values="accuracy",
    index="dataset",
    columns="model_name",
)
print(acc_table)

model_name     ada_adasyn  ada_base  gb_adasyn   gb_base  rf_adasyn   rf_base  \
dataset                                                                         
abalone17        0.826954  0.985646   0.909091  0.982456   0.967305  0.986443   
abalone20        0.927432  0.992026   0.952951  0.990431   0.974482  0.993620   
abalone8         0.551834  0.863636   0.725678  0.850080   0.798246  0.852472   
australian            NaN  0.869565        NaN  0.850242        NaN  0.869565   
autompg               NaN  0.889831        NaN  0.932203        NaN  0.881356   
balance               NaN  0.984043        NaN  0.877660        NaN  0.861702   
bankmarketing    0.895680  0.897965   0.905043  0.906591   0.902978  0.904306   
bupa                  NaN  0.682692        NaN  0.682692        NaN  0.721154   
german                NaN  0.770000        NaN  0.766667        NaN  0.746667   
glass                 NaN  0.800000        NaN  0.876923        NaN  0.861538   
hayes            0.950000  1