In [1]:
import numpy as np
import joblib
import json
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_recall_fscore_support,
    classification_report
)
from lime.lime_tabular import LimeTabularExplainer
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix
)
from sklearn.base import clone
from sklearn.metrics import accuracy_score, f1_score

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


# Chargement des données

In [2]:
# Label encoder persisted by an earlier step (presumably a scikit-learn
# LabelEncoder — confirm); its `classes_` attribute supplies the class names.
le = joblib.load('models/label_encoder.pkl')
classes = le.classes_

# Feature matrices (files named *_scaled) paired with their label vectors
X_train, y_train = np.load('data/X_train_scaled.npy'), np.load('data/y_train.npy')
X_test, y_test = np.load('data/X_test_scaled.npy'), np.load('data/y_test.npy')

# Cell output: number of classes followed by the class array
len(classes), classes


# HYPERPARAM TUNING

In [3]:
# Candidate values for a randomized hyper-parameter search.
# NOTE(review): neither `param_dist` nor `rf_base` is consumed by any later
# visible cell, and no search is launched here — confirm whether the tuning
# step was removed on purpose.
param_dist = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', 0.5],
)

# Baseline estimator: fixed seed, use every available core
rf_base = RandomForestClassifier(n_jobs=-1, random_state=42)

# Meilleur modèle RF + métriques sur le test

In [4]:
# Final Random Forest. The hyper-parameters are hard-coded here; replace
# n_estimators / max_depth with the best values from your own tuning run.
best_rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    n_estimators=300,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
).fit(X_train, y_train)  # trained on the whole training set; fit() returns the estimator

# Score the fitted model on the test split
y_pred_test = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
test_f1_macro = f1_score(y_test, y_pred_test, average="macro")

# Single print call, one item per line (same text as line-by-line prints)
print(
    "\nEVALUATION DU MODELE RANDOM FOREST",
    "=" * 60,
    f"Test Accuracy        : {test_accuracy:.4f}",
    f"F1_macro (test)      : {test_f1_macro:.4f}",
    f"Nombre de classes    : {len(classes)}",
    f"Classes              : {classes}",
    sep="\n",
)


# Classification report + matrice de confusion

In [5]:
# Per-class precision / recall / F1 on the test split.
# `labels` pins the report rows to every encoded class index so that
# `target_names` always lines up, even if a class never shows up in
# y_test / y_pred_test (assumes labels are encoded as 0..len(classes)-1,
# as the per-class metrics cell below also does — confirm).
# `zero_division=0` matches the detailed report printed later and silences
# the undefined-metric warning for classes without predictions.
print("\nClassification report :")
print(classification_report(
    y_test,
    y_pred_test,
    labels=np.arange(len(classes)),
    target_names=classes,
    zero_division=0,
))

In [6]:
# Confusion matrix over the complete class list. `labels` guarantees one
# row/column per class even when a class is missing from the test data
# (assumes labels are encoded as 0..len(classes)-1 — confirm).
cm_labels = np.arange(len(classes))
cm = confusion_matrix(y_test, y_pred_test, labels=cm_labels)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,           # write the count inside each cell
    fmt="d",              # integer formatting (counts, not floats)
    cmap="Blues",
    cbar=True,
    xticklabels=classes,  # show class names instead of bare indices
    yticklabels=classes,
    ax=ax,
)

ax.set_title("Matrice de confusion ‚Äì Random Forest")
ax.set_xlabel("Pr√©dictions")
ax.set_ylabel("V√©rit√©s")
fig.tight_layout()
plt.show()

# Métriques détaillées :

In [7]:
# ---- Global metrics on the test split ----
accuracy    = accuracy_score(y_test, y_pred_test)
macro_f1    = f1_score(y_test, y_pred_test, average="macro")
weighted_f1 = f1_score(y_test, y_pred_test, average="weighted")

print("=== M√©triques globales (Random Forest) ===")
print(f"Accuracy globale      : {accuracy:.4f}")
print(f"F1-macro              : {macro_f1:.4f}")
print(f"F1-pond√©r√© (weighted) : {weighted_f1:.4f}")


# ---- Per-class metrics ----
# One shared label list so the table and the text report below always cover
# the same classes in the same order (assumes labels are encoded as
# 0..len(classes)-1 — confirm).
class_ids = np.arange(len(classes))

precision, recall, f1, support = precision_recall_fscore_support(
    y_test,
    y_pred_test,
    labels=class_ids,
    zero_division=0
)

# Requested metric names (not used elsewhere in the visible cells)
metrics_names = ['Precision', 'Recall', 'F1-Score', 'Test Accuracy']

# ---- Detailed table: one row per class ----
metrics_df = pd.DataFrame({
    "Classe": classes,
    "Support": support,
    "Precision": precision,
    "Recall": recall,
    "F1-Score": f1,
})

# Global test accuracy, repeated on every row
metrics_df["Test Accuracy"] = accuracy

print("\n=== M√©triques d√©taill√©es par classe ===")
display(metrics_df.round(3))


# ---- scikit-learn text report ----
# `labels` keeps `target_names` aligned with the table above even when a
# class is absent from y_test / y_pred_test.
print("\n=== Rapport de classification (sklearn) ===")
print(classification_report(
    y_test,
    y_pred_test,
    labels=class_ids,
    target_names=classes,
    zero_division=0
))

# Visualisation des métriques

In [8]:
# precision_score / recall_score are not part of the notebook's top import
# cell, so they are imported here (pyplot already is, no re-import needed).
from sklearn.metrics import precision_score, recall_score

# ---- Global (support-weighted) metrics ----
# zero_division=0 mirrors the per-class metrics cell and avoids the
# undefined-metric warning for classes that are never predicted.
precision_weighted = precision_score(y_test, y_pred_test, average="weighted", zero_division=0)
recall_weighted    = recall_score(y_test, y_pred_test, average="weighted", zero_division=0)
f1_global          = weighted_f1      # switch to macro_f1 if preferred
test_acc           = accuracy

model_name = "Random Forest"

# One entry per panel: (x-axis label, title prefix, value, bar colour).
# Title prefixes keep the exact spacing used by the original figure.
metric_panels = [
    ("Precision",     "Precision : ",     precision_weighted, "#F08080"),
    ("Recall",        "Recall: ",         recall_weighted,    "#90EE90"),
    ("F1-Score",      "F1-Score : ",      f1_global,          "#DDA0DD"),
    ("Test Accuracy", "Test Accuracy: ",  test_acc,           "#87CEFA"),
]

# ---- 2x2 figure, one horizontal bar per metric ----
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# axes.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1)
for ax, (xlabel, title_prefix, value, color) in zip(axes.flat, metric_panels):
    ax.barh([model_name], [value], color=color)
    ax.set_xlim(0, 1)
    ax.set_xlabel(xlabel)
    ax.set_title(f"{title_prefix}{value:.4f}")
    # numeric value written at the tip of the bar
    ax.text(value, 0, f"{value:.4f}", va="center", ha="left")

plt.tight_layout()
plt.show()


# Barplot F1-score par classe

In [9]:
# Order the per-class table by descending F1
metrics_sorted = metrics_df.sort_values(by="F1-Score", ascending=False)

# One shade of green per class
colors = sns.color_palette("Greens", n_colors=len(metrics_sorted))

# Horizontal bars: one per class, length = F1-score
fig, ax = plt.subplots(figsize=(12, 7))
ax.barh(metrics_sorted["Classe"], metrics_sorted["F1-Score"], color=colors)

ax.set_xlim(0, 1)
ax.set_title("F1-Score par classe ‚Äì Random Forest")
ax.set_xlabel("F1-Score")
ax.set_ylabel("Classe")
fig.tight_layout()
plt.show()



# Vraie courbe d’overfitting

In [10]:
# Learning curve: refit a fresh copy of best_rf on growing prefixes of the
# training set and record train / test accuracy for each size.
train_sizes_frac = np.linspace(0.1, 1.0, 10)

train_scores = []
test_scores  = []

for frac in train_sizes_frac:
    n_samples = int(len(X_train) * frac)
    # NOTE(review): prefixes are taken in stored order; if the training file
    # is sorted (e.g. by class) the small subsets will be biased — confirm
    # the data was shuffled when it was split.
    X_sub = X_train[:n_samples]
    y_sub = y_train[:n_samples]

    # clone() copies the hyper-parameters only, so each fit starts untrained
    model_clone = clone(best_rf)
    model_clone.fit(X_sub, y_sub)

    # Loop-local prediction names: the notebook-level `y_pred_test` (the
    # predictions of best_rf used by the metric cells) must not be clobbered.
    sub_pred_train = model_clone.predict(X_sub)
    sub_pred_test  = model_clone.predict(X_test)

    train_scores.append(accuracy_score(y_sub, sub_pred_train))
    test_scores.append(accuracy_score(y_test, sub_pred_test))

train_scores = np.array(train_scores)
test_scores  = np.array(test_scores)

train_errors = 1 - train_scores
test_errors  = 1 - test_scores

x = train_sizes_frac * 100
idx_opt = np.argmin(test_errors)   # training size with the lowest test error
x_opt = x[idx_opt]
y_max = max(train_errors.max(), test_errors.max())

plt.figure(figsize=(8, 5))
plt.plot(x, train_errors, marker="o", linewidth=2, label="Training error")
plt.plot(x, test_errors, marker="o", linewidth=2, label="Validation/Test error")

plt.axvline(x_opt, color="black", linestyle="--")
plt.text(x_opt, y_max * 0.95, "Good-fit", ha="center", va="bottom")

# Under-fitting / Over-fitting arrows
plt.annotate(
    "Under-fitting",
    xy=(x.min()+5, y_max*0.88),
    xytext=(x.min()+20, y_max*0.88),
    arrowprops=dict(arrowstyle="<->"),
    ha="center",
    va="center"
)
plt.annotate(
    "Over-fitting",
    # clamp the arrow tip to the plotted range: when the optimum sits at the
    # largest training size, x_opt + 5 would fall outside the data
    xy=(min(x_opt+5, x.max()), y_max*0.88),
    xytext=(x.max()-10, y_max*0.88),
    arrowprops=dict(arrowstyle="<->"),
    ha="center",
    va="center"
)

plt.xlabel("Training set size (%)")
plt.ylabel("Error (1 - accuracy)")
plt.title("Under-fitting vs Over-fitting ‚Äì Random Forest")
plt.legend(loc="upper right")
plt.grid(True)
plt.tight_layout()
plt.show()

# Courbes ROC (One-vs-Rest)

In [11]:
# Evaluate the model trained earlier in this notebook. The pickle at
# "models/tuned/random_forest_best_only_eval.pkl" is only written by the
# save cell further down, so loading it here fails on a fresh top-to-bottom
# run (or silently scores a stale file from a previous session).
best_model = best_rf

# Number of classes
n_classes = len(classes)

# 1) One-vs-Rest binarisation of the labels:
#    y_test_bin is a 0/1 matrix with one column per class
#    (assumes labels are encoded as 0..n_classes-1 — confirm)
y_test_bin = label_binarize(y_test, classes=np.arange(n_classes))
if y_test_bin.shape[1] == 1:
    # With two classes label_binarize returns a single column (positive
    # class only); expand it so column i matches predict_proba column i.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

# 2) Predicted class probabilities on the test split, one column per class
#    (columns follow best_model.classes_; presumably the same 0..n_classes-1
#    order — verify if some class could be missing from y_train)
y_score = best_model.predict_proba(X_test)

# 3) ROC curve and AUC for each class
fpr = {}   # False Positive Rate
tpr = {}   # True Positive Rate
roc_auc = {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# 4) Micro-average ROC: all (sample, class) pairs flattened together
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# 5) Plot
plt.figure(figsize=(10, 7))

# Micro-average (global) curve
plt.plot(
    fpr["micro"], tpr["micro"],
    label=f"Micro-average (AUC = {roc_auc['micro']:.3f})",
    color="magenta", linestyle="--", linewidth=3
)

# Per-class curves (many classes, hence thin and semi-transparent)
for i, cls_name in enumerate(classes):
    plt.plot(
        fpr[i], tpr[i],
        lw=1, alpha=0.5,
        label=f"{cls_name} (AUC = {roc_auc[i]:.3f})"
    )

# Diagonal = random classifier
plt.plot([0, 1], [0, 1], "k--", label="Random")

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves ‚Äì One-vs-Rest")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
plt.grid(True)
plt.tight_layout()
plt.show()


# AUC Scores

In [12]:
# Per-class AUC table, built and ordered (highest AUC first) in one chain
auc_scores = (
    pd.DataFrame({
        "Classe": classes,
        "AUC": [roc_auc[k] for k in range(n_classes)],
    })
    .sort_values("AUC", ascending=False)
    .reset_index(drop=True)
)

# Unweighted mean of the per-class AUCs, for information
macro_auc = auc_scores["AUC"].mean()

print("=== AUC par classe ===")
display(auc_scores)

print(f"\nAUC moyenne (macro) : {macro_auc:.4f}")
print(f"AUC micro-average   : {roc_auc['micro']:.4f}")


# AUC par classe (visualisation)

In [13]:
# Ascending order, so the bar of the highest AUC ends up at the top
auc_sorted = auc_scores.sort_values("AUC", ascending=True)

# Yellow-to-green gradient, one colour per bar
colors = sns.color_palette("YlGn", n_colors=len(auc_sorted))

# Title suffix depends on whether the scored model exposes `n_estimators`
auc_title_suffix = "Random Forest" if hasattr(best_model, 'n_estimators') else "Naive Bayes"

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(
    auc_sorted["Classe"],
    auc_sorted["AUC"],
    color=colors,
    edgecolor="black"
)

ax.set_xlim(0.0, 1.0)
ax.set_xlabel("AUC Score")
ax.set_ylabel("Crop Class")
ax.set_title("AUC Score per Class ‚Äì " + auc_title_suffix)

# Write each AUC value just past the end of its bar
for bar_pos, auc_value in enumerate(auc_sorted["AUC"]):
    ax.text(auc_value + 0.005, bar_pos, f"{auc_value:.3f}", va="center")

fig.tight_layout()
plt.show()

# Sauvegarder le modèle RF + infos

In [14]:
import os

# Output locations for the fitted model and its summary
rf_model_path = "models/tuned/random_forest_best_only_eval.pkl"
rf_info_path = "results/tuning/best_rf_info.json"

# Create the target folders up front: joblib.dump / open(..., "w") raise
# FileNotFoundError when the parent directory does not exist yet.
for out_path in (rf_model_path, rf_info_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

joblib.dump(best_rf, rf_model_path)

# Summary written next to the model as JSON
best_rf_info = {
    "model_name": "Random Forest",
    "model_path": rf_model_path,
    "test_accuracy": float(test_accuracy),
    "test_f1_macro": float(test_f1_macro),
    "n_classes": len(classes),
    # .tolist() converts NumPy scalars to plain Python values, so the dump
    # also works when the class labels are numeric rather than strings
    "classes": np.asarray(classes).tolist()
}

with open(rf_info_path, "w") as f:
    json.dump(best_rf_info, f, indent=4)

# Cell output: the saved summary
best_rf_info

# Xai_lime

In [15]:
# 1) Plain NumPy views of the data for LIME (no copy when already arrays)
X_train_arr = np.asarray(X_train)
X_test_arr  = np.asarray(X_test)

# 2) Feature names: reuse `feature_names` when the notebook already defines
#    it, otherwise fall back to generic names.
if "feature_names" in globals():
    print("‚úÖ feature_names d√©j√† d√©fini, utilisation des noms existants.")
else:
    feature_names = [f"feature_{i}" for i in range(X_train_arr.shape[1])]
    print("‚ÑπÔ∏è feature_names n'√©tait pas d√©fini : cr√©ation de noms g√©n√©riques.")

# A leftover `feature_names` from another dataset would silently mislabel
# the explanation, so check it against the number of columns.
if len(feature_names) != X_train_arr.shape[1]:
    raise ValueError(
        f"feature_names has {len(feature_names)} entries but the data has "
        f"{X_train_arr.shape[1]} columns"
    )

# 3) LIME explainer. LIME perturbs the instance with random samples, so a
#    fixed random_state is required to get the same explanation on re-run
#    (42 matches the seed used for the Random Forest).
explainer = LimeTabularExplainer(
    X_train_arr,
    feature_names=feature_names,
    class_names=list(classes),
    mode="classification",
    discretize_continuous=True,
    random_state=42
)

# 4) Instance to explain
idx = 10  # change this index to explain another test row
instance = X_test_arr[idx]
true_label_idx  = int(y_test[idx])
true_label_name = classes[true_label_idx]

print(f"üîç Instance test n¬∞{idx}")
print(f"   Vraie classe : {true_label_name} (index {true_label_idx})")

# 5) Class predicted by the Random Forest for this instance
pred_label_idx  = int(best_rf.predict(instance.reshape(1, -1))[0])
pred_label_name = classes[pred_label_idx]
print(f"   Classe pr√©dite : {pred_label_name} (index {pred_label_idx})")

# 6) LIME explanation for that specific label
exp = explainer.explain_instance(
    instance,
    best_rf.predict_proba,   # LIME queries the model through its probabilities
    num_features=10,         # keep the 10 most influential features
    labels=[pred_label_idx]  # explain the predicted label, not LIME's default label 1
)

# 7) Bar chart of the contributions for that label
fig = exp.as_pyplot_figure(label=pred_label_idx)
plt.title(f"Explication LIME ‚Äì Random Forest ‚Äì Instance {idx}", fontsize=14)
plt.tight_layout()
plt.show()

# 8) Detailed notebook view (table + contributions)
exp.show_in_notebook(show_table=True)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=37cd5642-eb00-4cb1-969e-a9bc85cf5e83' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>