In [3]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/56/b0/e3efafd9c97ed931f6453bd71aa8feaffc9217e6121af65fda06cf32f608/xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Obtaining dependency information for nvidia-nccl-cu12 from https://files.pythonhosted.org/packages/73/61/fa7a709b3f2d57038d99c220eba816b21466567835e4d46300ff674ed975/nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl.metadata
  Downloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [38;5;70m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0

In [4]:
# ===========================================
#  PROJET GREEN AI - PREDICTION FIRE_SIZE
#  Modélisation "simple mais maîtrisée"
# ===========================================

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from xgboost import XGBRegressor


# -----------------------------
# 1. Data loading
# -----------------------------
# low_memory=False turns off chunked parsing, so each column's dtype is
# inferred from the whole file rather than chunk by chunk.
df = pd.read_csv(
    filepath_or_buffer="1992_FPA_FOD_cons.csv",
    low_memory=False,
)


# ----------------------------------------------------------
# 2. Feature engineering spécifique : NDVI_mean_scalar
#    NDVI_mean est une liste de valeurs ('0.1' '0.07' ...)
#    -> on la convertit en VRAI NDVI moyen (float)
# ----------------------------------------------------------
def parse_ndvi_cell(s):
    """Convert a stringified NDVI sequence into its numeric mean.

    The cell is expected to look like ``'0.1' '0.07' '0.19' ...``. Enclosing
    square brackets, double quotes and comma separators (``[0.1, 0.07]``) are
    tolerated as well.

    Parameters
    ----------
    s : object
        Raw cell value. Non-string values are passed through ``str()``
        before parsing.

    Returns
    -------
    float
        Mean of the finite numbers found in the cell, or NaN when the cell
        is missing, blank, or contains no finite number.
    """
    if pd.isna(s):
        return np.nan

    # Quotes and brackets are decoration only: treat them as separators.
    cleaned = str(s).translate(str.maketrans("'\"[]", "    "))

    vals = []
    for token in cleaned.split():
        try:
            # A comma glued to a token is a list separator, not part of the number.
            value = float(token.strip(","))
        except ValueError:
            # Non-numeric token: ignore it and keep the remaining values.
            continue
        # float() accepts "nan" and "inf"; keeping them would turn the whole
        # mean into NaN/inf even though the other entries are usable.
        if np.isfinite(value):
            vals.append(value)

    if not vals:
        return np.nan
    return float(np.mean(vals))

# Collapse every raw NDVI_mean cell into one float, stored as a new column.
ndvi_raw = df["NDVI_mean"]
df["NDVI_mean_scalar"] = ndvi_raw.map(parse_ndvi_cell)


# ------------------------------------
# 3. Feature selection
# ------------------------------------
# Numeric predictors, assembled group by group (order matters downstream:
# it fixes the column order fed to the preprocessing step).

# Same-day weather
_weather_vars = ["pr", "tmmn", "tmmx", "rmin", "rmax", "sph", "vs", "th", "srad", "etr"]
# Fire danger indices
_danger_indices = ["fm100", "fm1000", "bi", "vpd", "erc"]
# 5-day rolling means: one "<name>_5D_mean" column per variable listed above
_rolling_means = [f"{name}_5D_mean" for name in _weather_vars + _danger_indices]
# Topography
_topography = ["Elevation", "Slope", "Aspect", "TRI", "TPI"]

features_numeric = [
    *_weather_vars,
    *_danger_indices,
    *_rolling_means,
    *_topography,
    # Aggregated NDVI
    "NDVI_mean_scalar",
]

# Categorical predictors (one-hot encoded later on)
features_categorical = [
    "EVT",
    "EVH",
    "EVC",
    "Ecoregion_US_L3CODE",
    "Ecoregion_US_L4CODE",
]

# Column holding the regression target
target = "FIRE_SIZE"


# -------------------------------------------------
# 4. Modelling sub-frame + basic FIRE_SIZE cleaning
# -------------------------------------------------
# Work on a copy so the raw frame `df` is left untouched.
df_model = df[features_numeric + features_categorical + [target]].copy()

# Coerce the target like every numeric feature: a stray non-numeric entry
# becomes NaN instead of breaking the `> 0` comparison or np.log1p below.
df_model[target] = pd.to_numeric(df_model[target], errors="coerce")

# Keep only fires with a strictly positive size (NaN targets fail the
# comparison and are dropped as well).
df_model = df_model[df_model[target] > 0]

# Safe conversion of the numeric features to float: unparseable values -> NaN,
# which the imputation step of the preprocessing pipeline handles later.
for col in features_numeric:
    df_model[col] = pd.to_numeric(df_model[col], errors="coerce")

# Model log(1 + size) to dampen the influence of mega-fires.
df_model["target_log"] = np.log1p(df_model[target])


# -----------------------------
# 5. Train / test split
# -----------------------------
feature_columns = features_numeric + features_categorical
X = df_model[feature_columns]
y = df_model["target_log"]

# 80 % train / 20 % test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# --------------------------------------------
# 6. Preprocessing pipeline
#    - NaN imputation
#    - standardisation of the numeric columns
#    - one-hot encoding of the categorical columns
# --------------------------------------------
from sklearn.pipeline import Pipeline as SkPipeline  # alias used by the model pipelines further down

# Numeric branch: median imputation, then standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = SkPipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation, then one-hot encoding.
# handle_unknown="ignore" keeps transform() from failing on categories that
# were not present when the encoder was fitted.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = SkPipeline(steps=categorical_steps)

# Route each column group to its own branch.
column_branches = [
    ("num", numeric_pipeline, features_numeric),
    ("cat", categorical_pipeline, features_categorical),
]
preprocessor = ColumnTransformer(transformers=column_branches)


# -----------------------------
# 7. Baseline model: ElasticNet
#    -> linear, simple and cheap to train
# -----------------------------
enet_regressor = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
model_enet = SkPipeline(steps=[
    ("preprocess", preprocessor),
    ("model", enet_regressor),
])

model_enet.fit(X_train, y_train)
y_pred_enet = model_enet.predict(X_test)

# Metrics are computed on the log-transformed target.
enet_rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_enet))
enet_mae_log = mean_absolute_error(y_test, y_pred_enet)
enet_r2_log = r2_score(y_test, y_pred_enet)

print("=== ElasticNet (baseline Green AI) ===")
print("RMSE log :", enet_rmse_log)
print("MAE log  :", enet_mae_log)
print("R² log   :", enet_r2_log)


# -----------------------------
# 8. Main model: XGBoost
#    -> non-linear, but configured conservatively
#       (bounded number of trees, tree_method='hist', no heavy tuning)
# -----------------------------
model_xgb = SkPipeline(steps=[
    # Pipeline.fit fits its steps in place. Reusing the very same
    # `preprocessor` instance as model_enet would make both pipelines share
    # one fitted transformer, so refitting either would silently change the
    # other. clone() gives this pipeline its own unfitted copy with the same
    # parameters.
    ("preprocess", clone(preprocessor)),
    ("model", XGBRegressor(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.9,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        tree_method="hist",   # more efficient / cheaper variant
        n_jobs=4,
        random_state=42
    ))
])

model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

# Metrics on the log-transformed target.
print("\n=== XGBoost (log) ===")
print("RMSE log :", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))
print("MAE log  :", mean_absolute_error(y_test, y_pred_xgb))
print("R² log   :", r2_score(y_test, y_pred_xgb))

# Back to real units (acres): invert the log1p transform on both the
# ground truth and the predictions before scoring.
y_test_real, y_pred_real = np.expm1(y_test), np.expm1(y_pred_xgb)

rmse_real = np.sqrt(mean_squared_error(y_test_real, y_pred_real))
mae_real = mean_absolute_error(y_test_real, y_pred_real)

print("\n=== XGBoost (surface réelle) ===")
print("RMSE réel :", rmse_real)
print("MAE réel  :", mae_real)


# ----------------------------------------------------
# 9. Quick interpretation: most important features
# ----------------------------------------------------
# Feature names as they come out of the fitted preprocessing step, aligned
# position by position with the booster's importance scores.
fitted_steps = model_xgb.named_steps
feature_names = fitted_steps["preprocess"].get_feature_names_out()
importances = fitted_steps["model"].feature_importances_

# Indices sorted by decreasing importance, truncated to the top 15.
order_desc = np.argsort(importances)[::-1]
idx_sorted = order_desc[:15]

print("\n=== Top 15 features (XGBoost) ===")
for i in idx_sorted:
    print(f"{feature_names[i]:40s} -> {importances[i]:.4f}")


=== ElasticNet (baseline Green AI) ===
RMSE log : 1.185545844170466
MAE log  : 0.9236331566428228
R² log   : 0.028502361769987328

=== XGBoost (log) ===
RMSE log : 1.0717072613571375
MAE log  : 0.8184621773381953
R² log   : 0.2061153757002816

=== XGBoost (surface réelle) ===
RMSE réel : 54.4028181367064
MAE réel  : 10.245447979593221

=== Top 15 features (XGBoost) ===
cat__Ecoregion_US_L3CODE_45.0            -> 0.0390
cat__Ecoregion_US_L3CODE_25.0            -> 0.0118
cat__Ecoregion_US_L4CODE_65f             -> 0.0118
cat__Ecoregion_US_L3CODE_39.0            -> 0.0114
cat__Ecoregion_US_L3CODE_66.0            -> 0.0093
cat__Ecoregion_US_L3CODE_84.0            -> 0.0089
cat__Ecoregion_US_L4CODE_66g             -> 0.0065
cat__Ecoregion_US_L4CODE_84b             -> 0.0064
cat__Ecoregion_US_L3CODE_36.0            -> 0.0059
cat__Ecoregion_US_L4CODE_65d             -> 0.0057
cat__EVT_7997.0                          -> 0.0056
cat__Ecoregion_US_L4CODE_69d             -> 0.0056
cat__Ecoregion_U