In [7]:
import pandas as pd
import numpy as np

# 1. LOADING
df = pd.read_parquet("../data/processed/NIGERIA_FINAL_FULL.parquet")

# Rainfall conversion (only when needed).
# Heuristic: a column mean below 1 is taken to mean the raw values are a
# per-second rate, which is scaled by 86400 s/day and 30.4 days (presumably the
# average month length, so the monthly total is approximate — TODO confirm the
# raw unit). Otherwise the column is copied unchanged.
if 'Rainf_f_tavg' in df.columns:
    if df['Rainf_f_tavg'].mean() < 1:
        df['Rainfall_mm'] = df['Rainf_f_tavg'] * 86400 * 30.4
    else:
        df['Rainfall_mm'] = df['Rainf_f_tavg']

# Target: share of the population in phase 3-5, in percent.
# A zero (or negative) population would turn the ratio into +/-inf, so those
# rows are masked and get a NaN target instead.
valid_population = df['population'].where(df['population'] > 0)
df['target_pct'] = (df['phase35'] / valid_population) * 100

In [8]:

# ==============================================================================
# FONCTION AVANCÉE DE FEATURE ENGINEERING
# ==============================================================================
def create_advanced_features(df):
    """Build seasonal, rolling, anomaly and lag features per district.

    The input is sorted by ``admin2`` then ``date`` into a new frame (the
    caller's frame is left untouched) and the following columns are added:

    * ``month``, ``month_sin``, ``month_cos``: calendar month and its cyclical
      encoding.
    * ``is_lean_season``: 1 for months 6-9, 0 otherwise.
    * For each climate column present among ``Rainfall_mm``,
      ``SoilMoi00_10cm_tavg``, ``Tair_f_tavg`` and ``NDVI``:
      ``<col>_roll3m`` / ``<col>_roll6m`` (mean of the 3 / 6 previous rows of
      the district), ``<col>_anomaly`` (z-score against the district's mean and
      std for that calendar month), ``<col>_lag{1,3,6}`` and
      ``<col>_anomaly_lag{1,3,6}``.
    * ``drought_during_lean``: rainfall anomaly multiplied by the lean-season
      flag (only when ``Rainfall_mm_anomaly`` exists).

    Lags and rolling windows are counted in rows, so they equal months only if
    each district has one row per month with no gaps (assumption — verify
    against the input data).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``admin2`` and a datetime-like ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The enriched frame without the rows whose 6-row lag of the first
        available climate column is NaN. If no climate column is present, the
        frame with only the seasonal features is returned.
    """
    print("Demarrage de l'ingenierie avancee...")

    df = df.sort_values(by=['admin2', 'date']).reset_index(drop=True)

    # A. CYCLICAL SEASONALITY (keeps December adjacent to January)
    df['month'] = df['date'].dt.month
    month_angle = 2 * np.pi * df['month'] / 12
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    # B. LEAN SEASON
    # In Nigeria (Sahel) the lean season is typically June to September.
    df['is_lean_season'] = df['month'].isin([6, 7, 8, 9]).astype(int)

    # Climate variables to process (NDVI only if available)
    cols_meteo = ['Rainfall_mm', 'SoilMoi00_10cm_tavg', 'Tair_f_tavg', 'NDVI']
    cols_meteo = [c for c in cols_meteo if c in df.columns]

    for col in cols_meteo:
        print(f"   Traitement : {col}")

        # 1. ROLLING STATISTICS (3 rows = short season, 6 rows = long season)
        # shift(1) keeps the current row out of its own window; the rolling
        # mean runs inside each district so a window never spans two districts.
        for window in (3, 6):
            df[f'{col}_roll{window}m'] = df.groupby('admin2')[col].transform(
                lambda s, w=window: s.shift(1).rolling(window=w).mean()
            )

        # 2. ANOMALIES (Z-SCORES)
        # Reference climatology = mean/std of this column for the same
        # calendar month in the same district, broadcast back onto every row.
        # NOTE(review): the climatology uses every row of the frame, including
        # rows dated after the one being scored, so the z-score is not strictly
        # "historical" — confirm this look-ahead is acceptable for modelling.
        by_district_month = df.groupby(['admin2', 'month'])[col]
        clim_mean = by_district_month.transform('mean')
        clim_std = by_district_month.transform('std')
        # 1e-6 avoids a division by zero when a month never varies.
        df[f'{col}_anomaly'] = (df[col] - clim_mean) / (clim_std + 1e-6)

        # 3. LAGS (on the raw value and on the anomaly)
        for lag in [1, 3, 6]:
            df[f'{col}_lag{lag}'] = df.groupby('admin2')[col].shift(lag)
            df[f'{col}_anomaly_lag{lag}'] = df.groupby('admin2')[f'{col}_anomaly'].shift(lag)

    # C. INTERACTIONS
    # Rainfall anomaly kept only during the lean season (0 elsewhere).
    if 'Rainfall_mm_anomaly' in df.columns:
        df['drought_during_lean'] = df['Rainfall_mm_anomaly'] * df['is_lean_season']

    # D. CLEAN-UP
    # Drop the rows whose 6-row lag is undefined (the first 6 rows of each
    # district). With no climate column there is no lag to filter on.
    if cols_meteo:
        df_final = df.dropna(subset=[f'{cols_meteo[0]}_lag6'])
    else:
        print("   Aucune variable climatique trouvee : variables saisonnieres uniquement.")
        df_final = df

    print(f"Termine. Dimensions finales : {df_final.shape}")
    return df_final

# RUN THE FEATURE PIPELINE
df_features = create_advanced_features(df)

# Sanity check: preview the rainfall anomaly for a single district
print("\nExemple d'Anomalie (Z-Score) pour Abadam :")
cols_view = ['date', 'month', 'Rainfall_mm', 'Rainfall_mm_anomaly', 'is_lean_season']
display(
    df_features
    .loc[df_features['admin2'].eq('Abadam'), cols_view]
    .head(10)
)

# Persist the engineered dataset
df_features.to_parquet("../data/processed/NIGERIA_DATASET_MODEL_ADVANCED.parquet")
print("Sauvegarde effectuee : NIGERIA_DATASET_MODEL_ADVANCED.parquet")


Demarrage de l'ingenierie avancee...
   Traitement : Rainfall_mm
   Traitement : SoilMoi00_10cm_tavg
   Traitement : Tair_f_tavg
Termine. Dimensions finales : (39096, 71)

Exemple d'Anomalie (Z-Score) pour Abadam :


Unnamed: 0,date,month,Rainfall_mm,Rainfall_mm_anomaly,is_lean_season
66,2020-07-01,7,123.381542,-0.002298,1
67,2020-08-01,8,169.966442,0.253828,1
68,2020-09-01,9,88.914138,-0.090626,1
69,2020-10-01,10,8.491211,-0.284646,0
70,2020-11-01,11,0.0,-0.662058,0
71,2020-12-01,12,0.0,-0.533568,0
72,2021-01-01,1,0.0,-0.446316,0
73,2021-02-01,2,0.0,-0.522239,0
74,2021-03-01,3,0.0,-1.017226,0
75,2021-04-01,4,0.54966,-1.083169,0


Sauvegarde effectuee : NIGERIA_DATASET_MODEL_ADVANCED.parquet




#  Data Dictionary

**Project:** Food Security Prediction in Nigeria
**Granularity:** Per District (`admin2`) and per Month.

### 1. Metadata & Identifiers

| Variable | Type | Description | Role |
| --- | --- | --- | --- |
| `date` | `datetime` | The first day of the reference month (e.g., `2021-06-01`). | Temporal Index |
| `admin2` | `string` | Name of the administrative district (LGA - Local Government Area). | Spatial Index |
| `month` | `int` | Month number (1 = Jan, 12 = Dec). | Seasonal Feature |

### 2. Target Variables

*The outcome variables the model aims to predict.*

| Variable | Type | Description | Formula / Source |
| --- | --- | --- | --- |
| **`target_pct`** | `float` | **Current Target.** Percentage of the population in Acute Food Insecurity (IPC Phase 3+). | `(phase35 / population) * 100` |
| **`target_future_3m`** | `float` | **Prediction Target.** The value of `target_pct` observed 3 months ahead. | `target_pct` shifted by -3 months |

### 3. Climate & Environmental Indicators

*Data sourced from NASA FLDAS, transformed for analysis.*

| Variable | Type | Unit | Description | Interpretation |
| --- | --- | --- | --- | --- |
| `Rainfall_mm` | `float` | mm/month | Total monthly precipitation. | Immediate water availability. |
| `SoilMoi00_10cm_tavg` | `float` | m³/m³ | Surface soil moisture (0-10cm). | Key indicator for sowing conditions. |
| `Tair_f_tavg` | `float` | Kelvin/°C | Average air temperature. | Heat stress factor (evapotranspiration). |
| `NDVI` | `float` | 0-1 | *(If avail.)* Normalized Difference Vegetation Index. | Vegetation health/greenness. |

### 4. Feature Engineering (Derived Variables)

*Variables created to provide "memory" and context to the model.*

#### A. Temporal & Seasonal Features

| Variable | Description | Rationale |
| --- | --- | --- |
| `month_sin` / `month_cos` | Cyclical encoding of the month. | Allows the model to understand that December (12) is close to January (1). |
| **`is_lean_season`** | Binary flag (1 for Jun-Jul-Aug-Sep, 0 otherwise). | Marks the critical **Lean Season** (empty stocks before harvest). |

#### B. Rolling Statistics (Trends)

*Note: Calculated on **previous** months only (Shift=1) to prevent data leakage.*

| Suffix | Example | Description |
| --- | --- | --- |
| `_mean_3m` (or `_roll3m`) | `Rainfall_mm_roll3m` | 3-month rolling mean. Indicates short-term agricultural season quality. |
| `_mean_6m` (or `_roll6m`) | `Rainfall_mm_roll6m` | 6-month rolling mean. Indicates long-term water accumulation (groundwater). |

#### C. Lags (Past Values)

| Suffix | Example | Description |
| --- | --- | --- |
| `_lag1` | `target_pct_lag1` | Situation 1 month ago (Inertia). |
| `_lag3` | `Rainfall_mm_lag3` | Situation 3 months ago (Crop maturation cycle). |
| `_lag6` | `Rainfall_mm_lag6` | Situation 6 months ago (Impact of the previous season). |
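| `_lag12` | `Rainfall_mm_lag12` | Situation 1 year ago (Annual comparison). *Note: `create_advanced_features` only builds lags 1, 3 and 6; a 12-month lag must be added separately.* |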

#### D. Anomalies & Risk Indicators (Advanced)

*Key features for crisis detection.*

| Variable | Description | Formula | Interpretation |
| --- | --- | --- | --- |
| **`_anomaly`** (Z-Score) | Standardized deviation from the historical mean for this specific month/region. | `(Value - Hist_Mean) / Std_Dev` | **< -1**: Severe deficit (e.g., Drought). <br> **> +1**: Surplus (e.g., Potential Flood). |
| `drought_during_lean` | Interaction Feature. | `Anomaly_Rain * is_lean_season` | Amplifies the signal if a drought occurs *during* the critical lean season. |

---

###  Assumptions & Limitations

1. **Sparse IPC Data:** Target variable (`target_pct`) is only available 2-3 times per year. The model is trained strictly on these known data points.
2. **Monthly Aggregation:** FLDAS data is aggregated by month. Short-term extreme events (e.g., 2-day flash floods) might be smoothed out.
3. **Lag vs. Reality:** We assume climate shocks take between 1 and 6 months to impact food security figures.