# 00 — Setup & Conversion
## FireSpreadNet · Next Day Wildfire Spread

**Ce notebook est autonome** : il ne dépend d'aucun script externe.  
Il suffit de placer ce notebook dans le même dossier que les autres notebooks,  
et d'avoir les fichiers `.tfrecord` accessibles quelque part sur le disque.

### Ce que fait ce notebook :
1. Détecte automatiquement le dossier contenant les fichiers `.tfrecord`
2. Installe les dépendances manquantes si nécessaire
3. Convertit les TFRecords → numpy `.npz` (dans `../data/processed/`, relatif au dossier des notebooks)
4. Sauvegarde un fichier `setup_config.json` lu par tous les autres notebooks

### Usage :
- Exécute toutes les cellules de haut en bas une seule fois
- Les autres notebooks (`01_EDA`, `02_Preprocessing`, etc.) peuvent ensuite être lancés directement

## Étape 1 — Dépendances

In [None]:
import importlib, subprocess, sys

def ensure(pkg, import_name=None):
    """Make sure a package is importable, installing it with pip if it is not.

    Args:
        pkg: Distribution name handed to ``pip install``.
        import_name: Module name to probe with ``find_spec``; defaults to
            ``pkg``. Use it when the two names differ.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    # ``import importlib`` alone does not guarantee that the ``util``
    # submodule is loaded, so import it explicitly before using it.
    import importlib.util

    import_name = import_name or pkg
    if importlib.util.find_spec(import_name) is not None:
        print(f"  {pkg} — OK")
        return

    print(f"Installing {pkg}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])
    # The path finders cache directory listings; reset them so the package
    # installed just now is visible to later find_spec()/import calls made
    # from this same interpreter session.
    importlib.invalidate_caches()

print("Checking dependencies...")
for _pkg in ("numpy", "tqdm", "matplotlib", "seaborn", "pandas"):
    ensure(_pkg)

# TFRecord parsing backend: the first candidate that is already installed
# wins (tensorflow is preferred over the lightweight tfrecord package).
_BACKEND = None
for _candidate in ("tensorflow", "tfrecord"):
    if importlib.util.find_spec(_candidate) is not None:
        _BACKEND = _candidate
        print(f"  {_candidate} — OK (TFRecord backend)")
        break
else:
    # Neither backend is present: install the lightweight one.
    print("  Installing tfrecord (lightweight, no TF needed)...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tfrecord", "-q"])
    _BACKEND = "tfrecord"

print(f"\nTFRecord backend: {_BACKEND}")
print("All dependencies OK.")

## Étape 2 — Détection automatique des chemins

In [None]:
from pathlib import Path

# ── Notebook directory (works wherever the notebooks are placed) ──────────────
NB_DIR = Path().resolve()   # current working directory when running the notebook

# ── Locate TFRecord files ─────────────────────────────────────────────────────
# Ordered from the most specific location to the broadest one. Each root is
# scanned recursively, so putting NB_DIR / NB_DIR.parent first would make the
# targeted sub-paths unreachable and force a full tree walk every time.
TFRECORD_SEARCH_ROOTS = [
    NB_DIR.parent / "data" / "raw" / "ml_tracks" / "a.fire_danger",
    NB_DIR / "data" / "raw" / "ml_tracks" / "a.fire_danger",
    NB_DIR.parent / "data" / "raw",
    NB_DIR / "data" / "raw",
    NB_DIR.parent / "data",
    NB_DIR / "data",
    NB_DIR,
    NB_DIR.parent,
]


def _is_tfrecord_file(path):
    """Return True for a regular file that is not a ``.index`` sidecar."""
    return path.is_file() and not path.name.endswith(".index")


RAW_DATA_DIR = None
for root in TFRECORD_SEARCH_ROOTS:
    if not root.is_dir():
        continue
    # "*.tfrecord*" already covers plain "*.tfrecord", so one scan is enough.
    # Sorting makes the selected directory independent of traversal order.
    tfr_files = sorted(p for p in root.rglob("*.tfrecord*") if _is_tfrecord_file(p))
    if tfr_files:
        # Use the directory containing the first record file found
        RAW_DATA_DIR = tfr_files[0].parent
        break

if RAW_DATA_DIR is None:
    print("⚠️  TFRecord files not found automatically.")
    print("    Set RAW_DATA_DIR manually below:")
    RAW_DATA_DIR = NB_DIR.parent / "data" / "raw" / "ml_tracks" / "a.fire_danger"
    print(f"    Using default: {RAW_DATA_DIR}")
else:
    # Count only real record files directly inside the selected directory.
    tfr_count = sum(1 for p in RAW_DATA_DIR.glob("*.tfrecord*") if _is_tfrecord_file(p))
    print(f"✅ Found TFRecord data in: {RAW_DATA_DIR}")
    print(f"   {tfr_count} .tfrecord file(s) detected")

# ── Output directories ────────────────────────────────────────────────────────
PROCESSED_DIR = NB_DIR.parent / "data" / "processed"
FIGURES_DIR   = NB_DIR.parent / "results" / "figures"
MODELS_DIR    = NB_DIR.parent / "saved_models"

for d in [PROCESSED_DIR, FIGURES_DIR, MODELS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print(f"\nProcessed output : {PROCESSED_DIR}")
print(f"Figures output   : {FIGURES_DIR}")
print(f"Models output    : {MODELS_DIR}")

## Étape 3 — Configuration (embarquée, sans import externe)

In [None]:
# ══════════════════════════════════════════════════════════════
# CONFIGURATION — Next Day Wildfire Spread (Huot et al., 2022)
# Fully embedded: no external config.py import is needed
# ══════════════════════════════════════════════════════════════

GRID_SIZE = 64          # side of the square spatial patch (pixels)
CELL_SIZE_KM = 1.0      # nominal resolution, about 1 km per pixel
TIMESTEP_H = 24         # time step: one day

# Human-readable names of the 12 input channels.
# The order MUST match TFRECORD_INPUT_KEYS below: channel i of the converted
# arrays holds the data read from TFRECORD_INPUT_KEYS[i].
# In GRIDMET, 'th' is the wind direction and 'vs' the wind velocity, so the
# label at index 1 is "wind_direction" and the one at index 2 is "wind_speed".
# Data-source attributions are carried over from the original notebook
# comments — verify them against the dataset paper.
FEATURE_CHANNELS = [
    "elevation",        # SRTM — elevation (m)
    "wind_direction",   # GRIDMET 'th' — wind direction (degrees)
    "wind_speed",       # GRIDMET 'vs' — wind speed (m/s)
    "min_temp",         # GRIDMET 'tmmn' — minimum temperature (K)
    "max_temp",         # GRIDMET 'tmmx' — maximum temperature (K)
    "humidity",         # GRIDMET 'sph' — specific humidity (kg/kg)
    "precipitation",    # GRIDMET 'pr' — precipitation (mm)
    "drought_index",    # GRIDMET 'PDSI' — drought index
    "ndvi",             # VIIRS NDVI — vegetation index (−1 to 1)
    "erc",              # GRIDMET 'ERC' — Energy Release Component
    "population",       # LandScan — population density (people/km²)
    "prev_fire_mask",   # FIRMS/VIIRS — previous-day fire mask
]
N_INPUT_CHANNELS = len(FEATURE_CHANNELS)  # 12

# Actual feature keys stored in the .tfrecord files, in channel order.
# NOTE(review): the public release of this dataset appears to use lowercase
# 'pdsi' and 'erc' — confirm the key case against your files before converting.
TFRECORD_INPUT_KEYS = [
    "elevation", "th", "vs", "tmmn", "tmmx",
    "sph", "pr", "PDSI", "NDVI", "ERC",
    "population", "PrevFireMask",
]
TFRECORD_TARGET_KEY = "FireMask"

# Labels and keys are paired by position, so a length mismatch would silently
# mislabel every channel that follows the gap.
if len(TFRECORD_INPUT_KEYS) != N_INPUT_CHANNELS:
    raise ValueError(
        "FEATURE_CHANNELS and TFRECORD_INPUT_KEYS must have the same length "
        f"({N_INPUT_CHANNELS} != {len(TFRECORD_INPUT_KEYS)})"
    )

# Index of each channel by name (used by the physics-based models)
CH = {name: i for i, name in enumerate(FEATURE_CHANNELS)}

N_PIXELS = GRID_SIZE * GRID_SIZE  # 4096

print(f"Canaux d'entrée ({N_INPUT_CHANNELS}) :")
for i, name in enumerate(FEATURE_CHANNELS):
    print(f"  [{i:2d}] {name}")

## Étape 4 — Conversion TFRecord → NumPy (.npz)

In [None]:
import numpy as np
from tqdm.auto import tqdm

ALL_FEATURES = TFRECORD_INPUT_KEYS + [TFRECORD_TARGET_KEY]

# ── Helpers ───────────────────────────────────────────────────────────────────

def find_tfrecords(root: Path, split_tag: str):
    """Find the TFRecord files of one dataset split under ``root``.

    A file qualifies when its name contains ``split_tag`` followed later by
    ``.tfrecord`` (so suffixed variants such as ``.tfrecord.gz`` are kept).
    ``.index`` sidecar files and directories are skipped.

    Args:
        root: Directory searched recursively.
        split_tag: Substring identifying the split in the file name
            (e.g. ``"train"``, ``"eval"``, ``"test"``).

    Returns:
        A list of ``Path`` objects: files ending exactly in ``.tfrecord``
        first, then the suffixed variants, each group sorted by path.
    """
    # "*.tfrecord*" is a superset of "*.tfrecord": a single tree walk is
    # enough and cannot yield duplicates.
    candidates = (
        path
        for path in root.rglob(f"*{split_tag}*.tfrecord*")
        if path.is_file() and not path.name.endswith(".index")
    )
    return sorted(
        candidates,
        key=lambda path: (not path.name.endswith(".tfrecord"), path),
    )


def parse_tf(tfrecord_files, max_samples=None):
    """Parse TFRecord files into NumPy arrays using TensorFlow.

    Every feature listed in ``ALL_FEATURES`` is read as ``N_PIXELS`` float32
    values and reshaped to ``GRID_SIZE x GRID_SIZE``. Samples whose last
    input channel and target both have a maximum of exactly 0 are skipped.

    Args:
        tfrecord_files: Iterable of paths to the TFRecord files to read.
        max_samples: Stop once this many samples have been kept; a falsy
            value means no limit.

    Returns:
        A tuple ``(X, Y)`` of stacked float32 arrays, ``X`` with one plane
        per key of ``TFRECORD_INPUT_KEYS`` and ``Y`` with a single plane.
        When no sample is kept, both are empty arrays of shape ``(0,)``.
    """
    import tensorflow as tf

    feature_spec = {
        key: tf.io.FixedLenFeature([N_PIXELS], tf.float32) for key in ALL_FEATURES
    }
    inputs, targets = [], []

    def limit_reached():
        return bool(max_samples) and len(inputs) >= max_samples

    for path in tqdm(tfrecord_files, desc="TF backend"):
        for serialized in tf.data.TFRecordDataset(str(path)):
            example = tf.io.parse_single_example(serialized, feature_spec)
            planes = [
                example[key].numpy().reshape(GRID_SIZE, GRID_SIZE)
                for key in TFRECORD_INPUT_KEYS
            ]
            x = np.stack(planes, axis=0).astype(np.float32)
            y = example[TFRECORD_TARGET_KEY].numpy()
            y = y.reshape(1, GRID_SIZE, GRID_SIZE).astype(np.float32)
            # Keep the sample only if the last input channel or the target
            # has a non-zero maximum.
            if not (x[-1].max() != 0 or y.max() != 0):
                continue
            inputs.append(x)
            targets.append(y)
            if limit_reached():
                break
        if limit_reached():
            break

    if not inputs:
        return np.empty((0,)), np.empty((0,))
    return np.stack(inputs), np.stack(targets)


def parse_tfrecord_pkg(tfrecord_files, max_samples=None):
    """Parse TFRecord files into NumPy arrays with the ``tfrecord`` package.

    Every feature listed in ``ALL_FEATURES`` is requested as ``"float"`` and
    reshaped to ``GRID_SIZE x GRID_SIZE``. Samples whose last input channel
    and target both have a maximum of exactly 0 are skipped.

    Args:
        tfrecord_files: Iterable of paths to the TFRecord files to read.
        max_samples: Stop once this many samples have been kept; a falsy
            value means no limit.

    Returns:
        A tuple ``(X, Y)`` of stacked float32 arrays, ``X`` with one plane
        per key of ``TFRECORD_INPUT_KEYS`` and ``Y`` with a single plane.
        When no sample is kept, both are empty arrays of shape ``(0,)``.
    """
    import tfrecord as tfr

    description = dict.fromkeys(ALL_FEATURES, "float")
    inputs, targets = [], []

    def limit_reached():
        return bool(max_samples) and len(inputs) >= max_samples

    for path in tqdm(tfrecord_files, desc="tfrecord pkg"):
        loader = tfr.tfrecord_loader(str(path), index_path=None, description=description)
        for record in loader:
            planes = [
                np.array(record[key], dtype=np.float32).reshape(GRID_SIZE, GRID_SIZE)
                for key in TFRECORD_INPUT_KEYS
            ]
            x = np.stack(planes, axis=0)
            y = np.array(record[TFRECORD_TARGET_KEY], dtype=np.float32)
            y = y.reshape(1, GRID_SIZE, GRID_SIZE)
            # Keep the sample only if the last input channel or the target
            # has a non-zero maximum.
            if not (x[-1].max() != 0 or y.max() != 0):
                continue
            inputs.append(x)
            targets.append(y)
            if limit_reached():
                break
        if limit_reached():
            break

    if not inputs:
        return np.empty((0,)), np.empty((0,))
    return np.stack(inputs), np.stack(targets)


# Pick the parser matching the backend selected during dependency setup.
parse_fn = parse_tf if _BACKEND == "tensorflow" else parse_tfrecord_pkg

# ── Conversion ────────────────────────────────────────────────────────────────

# File-name tag of each split: train=train, val=eval, test=test
SPLIT_TAGS = {"train": "train", "val": "eval", "test": "test"}

# ⚠️  Optional: cap the number of samples per split for a quick trial run
MAX_SAMPLES = None   # e.g. 2000 for a quick test, None = everything

split_stats = {}

for split, tag in SPLIT_TAGS.items():
    out_path = PROCESSED_DIR / f"{split}.npz"
    if out_path.exists():
        # An existing archive is reused as-is, whatever MAX_SAMPLES is now;
        # delete the .npz file to force a new conversion.
        # The context manager closes the archive's file handle, which
        # np.load() otherwise keeps open.
        with np.load(out_path) as data:
            n_cached = int(data['X'].shape[0])
        print(f"  {split}: déjà converti — {n_cached} échantillons ({out_path.name})")
        split_stats[split] = n_cached
        continue

    files = find_tfrecords(RAW_DATA_DIR, tag)
    if not files:
        print(f"  ⚠️  {split} ({tag}): aucun fichier trouvé dans {RAW_DATA_DIR}")
        continue

    print(f"\n  Conversion {split} ({len(files)} fichier(s))...")
    X, Y = parse_fn(files, MAX_SAMPLES)

    if len(X) == 0:
        print(f"  ⚠️  Aucun échantillon valide pour {split}")
        continue

    # Replace NaNs by 0 in the inputs; binarise the target (strictly
    # positive → 1.0, everything else → 0.0).
    X = np.nan_to_num(X, nan=0.0)
    Y = (np.nan_to_num(Y, nan=0.0) > 0).astype(np.float32)

    np.savez_compressed(out_path, X=X, Y=Y)
    split_stats[split] = len(X)
    print(f"  ✅ {split}: {len(X)} échantillons → {out_path.name}")
    print(f"     X shape: {X.shape}, Y shape: {Y.shape}")
    print(f"     Taux pixels feu: {Y.mean():.4f}")

print("\nConversion terminée !")

## Étape 5 — Statistiques de normalisation (à partir du train)

In [None]:
import json

# Per-channel mean / standard deviation, keyed by the FEATURE_CHANNELS names.
norm_stats = {}
train_npz = PROCESSED_DIR / "train.npz"

if train_npz.exists():
    print("Calcul des statistiques par canal (train set)...")
    # Read the array inside a context manager so the archive's file handle
    # is closed; the loaded array stays usable afterwards.
    with np.load(train_npz) as data:
        X_train = data['X']   # (N, C, H, W)
    for i, name in enumerate(FEATURE_CHANNELS):
        # Accumulate in float64 to limit rounding error on large sums.
        ch = X_train[:, i].astype(np.float64)
        norm_stats[name] = {
            "mean": float(np.nanmean(ch)),
            # The epsilon keeps the std strictly positive for constant channels.
            "std":  float(np.nanstd(ch)) + 1e-8,
        }
    print(f"  {len(norm_stats)} canaux traités\n")
    print(f"  {'Canal':<20} {'Moyenne':>12} {'Écart-type':>12}")
    print(f"  {'-'*44}")
    for name, s in norm_stats.items():
        print(f"  {name:<20} {s['mean']:>12.4f} {s['std']:>12.4f}")
else:
    print("⚠️  train.npz introuvable — statistiques par défaut utilisées.")
    # Approximate fallback values; the original author attributes them to
    # the literature (Huot et al., 2022).
    norm_stats = {
        "elevation":      {"mean": 1200.0, "std": 800.0},
        "wind_speed":     {"mean": 3.5,    "std": 1.5},
        "wind_direction": {"mean": 200.0,  "std": 80.0},
        "min_temp":       {"mean": 285.0,  "std": 8.0},
        "max_temp":       {"mean": 305.0,  "std": 8.0},
        "humidity":       {"mean": 0.005,  "std": 0.003},
        "precipitation":  {"mean": 1.0,    "std": 5.0},
        "drought_index":  {"mean": 0.0,    "std": 3.0},
        "ndvi":           {"mean": 0.3,    "std": 0.2},
        "erc":            {"mean": 40.0,   "std": 25.0},
        "population":     {"mean": 50.0,   "std": 200.0},
        "prev_fire_mask": {"mean": 0.0,    "std": 1.0},
    }

## Étape 6 — Sauvegarde de la configuration (setup_config.json)
Ce fichier est chargé par tous les autres notebooks au démarrage.

In [None]:
import json

# Directory locations are stored as plain strings so they are JSON-serialisable.
_path_entries = (
    ("PROCESSED_DIR", PROCESSED_DIR),
    ("FIGURES_DIR", FIGURES_DIR),
    ("MODELS_DIR", MODELS_DIR),
    ("RAW_DATA_DIR", RAW_DATA_DIR),
)
setup_config = {key: str(location) for key, location in _path_entries}

# Remaining settings are copied verbatim, in the same key order as before.
setup_config.update(
    GRID_SIZE=GRID_SIZE,
    CELL_SIZE_KM=CELL_SIZE_KM,
    TIMESTEP_H=TIMESTEP_H,
    FEATURE_CHANNELS=FEATURE_CHANNELS,
    N_INPUT_CHANNELS=N_INPUT_CHANNELS,
    CH=CH,
    TFRECORD_INPUT_KEYS=TFRECORD_INPUT_KEYS,
    TFRECORD_TARGET_KEY=TFRECORD_TARGET_KEY,
    norm_stats=norm_stats,
    split_samples=split_stats,
    tfrecord_backend=_BACKEND,
)

# The file is written next to the notebooks, in the working directory.
CONFIG_PATH = NB_DIR / "setup_config.json"
with open(CONFIG_PATH, "w") as config_file:
    json.dump(setup_config, config_file, indent=2)

print(f"✅  setup_config.json sauvegardé : {CONFIG_PATH}")
for hint in (
    "\nLes autres notebooks peuvent le charger avec :",
    "  import json",
    "  from pathlib import Path",
    "  cfg = json.load(open(Path().resolve() / 'setup_config.json'))",
):
    print(hint)

## Étape 7 — Résumé & vérification finale

In [None]:
from pathlib import Path

print("=" * 55)
print("  RÉSUMÉ DU SETUP — FireSpreadNet")
print("=" * 55)

# Label → path of every location the setup is expected to have produced.
checks = {
    "Données brutes (TFRecord)": RAW_DATA_DIR,
    "Données traitées (npz)":    PROCESSED_DIR,
    "Figures":                    FIGURES_DIR,
    "Modèles":                    MODELS_DIR,
    "Config (setup_config.json)": NB_DIR / "setup_config.json",
}

all_ok = True
for label, path in checks.items():
    exists = Path(path).exists()
    status = "✅" if exists else "❌"
    print(f"  {status}  {label}")
    print(f"       {path}")
    if not exists:
        all_ok = False

print()
for split in ["train", "val", "test"]:
    npz = PROCESSED_DIR / f"{split}.npz"
    if npz.exists():
        # Close each archive after reading the sample count; np.load()
        # otherwise keeps one open file handle per split.
        with np.load(npz) as d:
            n = d['X'].shape[0]
        print(f"  ✅  {split}.npz — {n} échantillons")
    else:
        print(f"  ❌  {split}.npz — non trouvé")
        all_ok = False

print()
if all_ok:
    print("  ✅  Setup complet — tu peux lancer les autres notebooks !")
else:
    print("  ⚠️  Certains fichiers sont manquants — vérifie les chemins ci-dessus.")

print("=" * 55)