# 📊 Exploratory Data Analysis - Madagascar Vanilla Prices

Ce notebook analyse les données de prix de la vanille malgache pour :
1. Comprendre les tendances et patterns
2. Identifier la saisonnalité
3. Détecter les anomalies
4. Préparer les données pour le modeling

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot configuration: shared style, default figure size and font size
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

# Paths, relative to the notebook's working directory
DATA_PATH = Path('../data/processed')
OUTPUT_PATH = Path('../outputs/figures')
# Every figure below is written with plt.savefig(OUTPUT_PATH / ...), which
# raises FileNotFoundError when the directory is missing. Create it up front
# so the notebook runs on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

print("‚úÖ Configuration loaded")

In [None]:
# Load the processed price table and use the parsed 'date' column as index
df = (
    pd.read_csv(DATA_PATH / 'vanilla_prices.csv', parse_dates=['date'])
    .set_index('date')
)

# Quick overview: size, covered period and available columns
first_day = df.index.min().date()
last_day = df.index.max().date()
print(f"üìä Dataset shape: {df.shape}")
print(f"üìÖ P√©riode: {first_day} ‚Üí {last_day}")
print(f"\nüìã Colonnes: {list(df.columns)}")

## 1. Vue d'ensemble des données

In [None]:
# Descriptive statistics of the price column (USD/kg)
print("üìà Statistiques des prix (USD/kg):")
print("="*40)

prices = df['price_usd_kg']
stats = prices.describe()
# Coefficient of variation: standard deviation as a percentage of the mean
cv_pct = stats['std'] / stats['mean'] * 100

print(f"Moyenne: ${stats['mean']:.2f}")
print(f"M√©diane: ${prices.median():.2f}")
print(f"Min: ${stats['min']:.2f}")
print(f"Max: ${stats['max']:.2f}")
print(f"√âcart-type: ${stats['std']:.2f}")
print(f"\nCoef. de variation: {cv_pct:.1f}%")

In [None]:
# Missing values.
# Count NaNs per column and list every column that has at least one.
missing = df.isnull().sum()
if missing.sum() > 0:
    print("‚ö†Ô∏è Valeurs manquantes:")
    print(missing[missing > 0])
# The success message is specifically about the target column, so test that
# column on its own. Previously it was tied to "no NaN anywhere", so a NaN in
# any other column (presumably the lag / moving-average features have leading
# NaNs - TODO confirm) hid the status of the target.
if missing['price_usd_kg'] == 0:
    print("‚úÖ Aucune valeur manquante dans la variable cible")

## 2. Évolution temporelle des prix

In [None]:
# Main time-series figure: price level (top) and monthly % change (bottom)
prices = df['price_usd_kg']
pct_change = df['price_pct_change']
mean_price = prices.mean()
line_color = '#2E86AB'

fig, axes = plt.subplots(2, 1, figsize=(14, 10))
ax1, ax2 = axes

# --- Top panel: raw prices with a shaded area and the overall mean ---
ax1.plot(df.index, prices, linewidth=2, color=line_color)
ax1.fill_between(df.index, prices, alpha=0.3, color=line_color)
ax1.set_title('Prix de la Vanille de Madagascar (2010-2024)', fontsize=14, fontweight='bold')
ax1.set_ylabel('Prix (USD/kg)')
ax1.axhline(y=mean_price, color='red', linestyle='--', label=f'Moyenne: ${mean_price:.0f}')

# Key events: (text, arrow tip, text position, colour). Positions are
# hard-coded (date, price) pairs, not derived from the data.
events = [
    ('Pic historique\n(sp√©culation)', ('2018-01-01', 600), ('2015-01-01', 550), 'red'),
    ('COVID-19', ('2020-03-01', 350), ('2021-06-01', 450), 'gray'),
]
for text, (tip_date, tip_price), (text_date, text_price), tone in events:
    ax1.annotate(text,
                 xy=(pd.Timestamp(tip_date), tip_price),
                 xytext=(pd.Timestamp(text_date), text_price),
                 arrowprops=dict(arrowstyle='->', color=tone),
                 fontsize=10, color=tone)

ax1.legend(loc='upper left')

# --- Bottom panel: month-over-month change, green when positive else red ---
# NaN changes are treated as 0 for colouring only, hence red.
colors = np.where(pct_change.fillna(0) > 0, 'green', 'red').tolist()
ax2.bar(df.index, pct_change*100, color=colors, alpha=0.7, width=20)
ax2.axhline(y=0, color='black', linewidth=0.5)
ax2.set_title('Variation Mensuelle des Prix (%)', fontsize=14, fontweight='bold')
ax2.set_ylabel('Variation (%)')
ax2.set_xlabel('Date')

plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'price_evolution.png', dpi=150, bbox_inches='tight')
plt.show()

print("‚úÖ Figure sauvegard√©e: outputs/figures/price_evolution.png")

## 3. Analyse de la saisonnalité

In [None]:
# Seasonality: price distribution and average price per calendar month.
# df['month'] must hold integer month numbers 1-12: the month_names[x-1]
# lookup below relies on it.
month_names = ['Jan', 'F√©v', 'Mar', 'Avr', 'Mai', 'Jun', 'Jul', 'Ao√ª', 'Sep', 'Oct', 'Nov', 'D√©c']

# Mean and standard deviation per month. Reindex to all 12 months so the bar
# chart stays aligned with month_names even if a month is absent from the data
# (a missing month becomes NaN and is simply not drawn).
monthly_avg = (
    df.groupby('month')['price_usd_kg']
    .agg(['mean', 'std'])
    .reindex(range(1, 13))
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# --- Left panel: boxplot per month ---
ax1 = axes[0]
df_plot = df.copy()
df_plot['month_name'] = df_plot['month'].map(lambda x: month_names[x-1])

# Plot by month name with an explicit order: box i always sits at x = i
# (Jan = 0 ... Dec = 11) and is labelled from the data, so the tick labels and
# the shaded spans below cannot drift when a month is missing.
sns.boxplot(data=df_plot, x='month_name', y='price_usd_kg', order=month_names,
            ax=ax1, palette='coolwarm')
ax1.set_title('Distribution des Prix par Mois', fontsize=14, fontweight='bold')
ax1.set_xlabel('Mois')
ax1.set_ylabel('Prix (USD/kg)')

# Seasonal zones. Each box occupies [i - 0.5, i + 0.5], so a span covering
# whole months runs from (first - 0.5) to (last + 0.5). The harvest span was
# previously 4..6, which only reached the centres of the May and Jul boxes.
ax1.axvspan(3.5, 6.5, alpha=0.2, color='green', label='R√©colte (Mai-Jul)')
ax1.axvspan(-0.5, 2.5, alpha=0.2, color='red', label='Cyclones (Jan-Mar)')
ax1.legend(loc='upper right')

# --- Right panel: average price per month ---
ax2 = axes[1]
# Same colour coding as the zones: Jan-Mar, May-Jul, and the rest
colors = ['#FF6B6B' if i in [0, 1, 2] else '#4ECDC4' if i in [4, 5, 6] else '#95E1D3' for i in range(12)]
# NOTE(review): error bars are +/- std/2 (total length = 1 std) while the
# title says "avec ecart-type" - confirm this is the intended convention.
bars = ax2.bar(month_names, monthly_avg['mean'], yerr=monthly_avg['std']/2,
               capsize=3, color=colors, edgecolor='black', linewidth=0.5)
ax2.set_title('Prix Moyen Mensuel (avec √©cart-type)', fontsize=14, fontweight='bold')
ax2.set_xlabel('Mois')
ax2.set_ylabel('Prix Moyen (USD/kg)')
# Overall mean as a reference line
ax2.axhline(y=df['price_usd_kg'].mean(), color='black', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'seasonality_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Time-series decomposition of the price column
from statsmodels.tsa.seasonal import seasonal_decompose

# Multiplicative model with a 12-observation seasonal period
decomposition = seasonal_decompose(df['price_usd_kg'], model='multiplicative', period=12)

fig, axes = plt.subplots(4, 1, figsize=(14, 12))

# One panel per component: (series, panel title, y-axis label)
panels = [
    (decomposition.observed, 'Donn√©es Observ√©es', 'Prix'),
    (decomposition.trend, 'Tendance', 'Tendance'),
    (decomposition.seasonal, 'Saisonnalit√©', 'Facteur'),
    (decomposition.resid, 'R√©sidus', 'R√©sidus'),
]
for ax, (component, title, ylabel) in zip(axes, panels):
    component.plot(ax=ax, title=title)
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'time_series_decomposition.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Analyse par année

In [None]:
# Per-year statistics of the price, plus the max-min range
yearly_stats = df.groupby('year')['price_usd_kg'].agg(['mean', 'min', 'max', 'std'])
yearly_stats = yearly_stats.assign(range=yearly_stats['max'] - yearly_stats['min'])

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# --- Left panel: average price per year, one colormap step per year ---
n_years = len(yearly_stats)
colors = plt.cm.RdYlGn(np.linspace(0, 1, n_years))
bars = ax1.bar(yearly_stats.index, yearly_stats['mean'], color=colors, edgecolor='black')
ax1.set_title('Prix Moyen Annuel de la Vanille', fontsize=14, fontweight='bold')
ax1.set_xlabel('Ann√©e')
ax1.set_ylabel('Prix Moyen (USD/kg)')

# Write each bar's value just above its top, centred horizontally
for bar in bars:
    height = bar.get_height()
    center = bar.get_x() + bar.get_width()/2.
    ax1.text(center, height, f'${height:.0f}',
             ha='center', va='bottom', fontsize=8)

# --- Right panel: within-year standard deviation ---
ax2.bar(yearly_stats.index, yearly_stats['std'], color='coral', edgecolor='black')
ax2.set_title('Volatilit√© Annuelle (√âcart-type)', fontsize=14, fontweight='bold')
ax2.set_xlabel('Ann√©e')
ax2.set_ylabel('√âcart-type (USD/kg)')

plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'yearly_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Analyse des corrélations

In [None]:
# Correlation between the price and the engineered features
lag_columns = [f'price_lag{k}' for k in (1, 3, 6, 12)]
moving_avg_columns = [f'price_ma{k}' for k in (3, 6, 12)]
features_to_analyze = [
    'price_usd_kg',
    *lag_columns,
    *moving_avg_columns,
    'month', 'quarter',
    'harvest_season', 'cyclone_season',
]

# Keep only rows where every selected feature is present
df_corr = df[features_to_analyze].dropna()
correlation_matrix = df_corr.corr()

fig, ax = plt.subplots(figsize=(12, 10))
heatmap_options = dict(annot=True, fmt='.2f', cmap='RdYlBu_r',
                       center=0, square=True, linewidths=0.5)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Matrice de Corr√©lation des Features', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Autocorrelation and partial autocorrelation of the price (24 lags)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (plotting function, panel title), drawn left to right
correlograms = [
    (plot_acf, 'Autocorr√©lation (ACF)'),
    (plot_pacf, 'Autocorr√©lation Partielle (PACF)'),
]
for ax, (draw, title) in zip(axes, correlograms):
    draw(df['price_usd_kg'].dropna(), lags=24, ax=ax, title=title)

plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'autocorrelation.png', dpi=150, bbox_inches='tight')
plt.show()

# Fixed interpretation notes: printed as-is, not derived from the plots
print("üí° L'ACF d√©cro√Æt lentement ‚Üí s√©rie non-stationnaire, tendance pr√©sente")
print("üí° La PACF a un pic significatif au lag 1 ‚Üí mod√®le AR(1) possible")

## 6. Distribution des prix

In [None]:
# Distribution of the price: histogram, KDE, normal Q-Q plot, Shapiro-Wilk
from scipy import stats

# Drop NaNs once: hist / probplot / shapiro do not skip missing values
prices = df['price_usd_kg'].dropna()
mean_price = prices.mean()
median_price = prices.median()

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Histogram with mean and median markers
ax1 = axes[0]
ax1.hist(prices, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
ax1.axvline(mean_price, color='red', linestyle='--', label=f'Moyenne: ${mean_price:.0f}')
ax1.axvline(median_price, color='green', linestyle='--', label=f'M√©diane: ${median_price:.0f}')
ax1.set_title('Distribution des Prix', fontsize=14, fontweight='bold')
ax1.set_xlabel('Prix (USD/kg)')
ax1.set_ylabel('Fr√©quence')
ax1.legend()

# KDE plot
ax2 = axes[1]
sns.kdeplot(prices, ax=ax2, fill=True, color='steelblue')
ax2.set_title('Densit√© de Probabilit√© (KDE)', fontsize=14, fontweight='bold')
ax2.set_xlabel('Prix (USD/kg)')

# Q-Q plot against the normal distribution
ax3 = axes[2]
stats.probplot(prices, dist='norm', plot=ax3)
ax3.set_title('Q-Q Plot (Normalit√©)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'price_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Normality test on the WHOLE series.
# The previous code tested only the first 50 rows, on the assumption that
# Shapiro-Wilk is limited to 50 observations. scipy.stats.shapiro has no such
# limit (its documentation only warns that the p-value may be inaccurate for
# N > 5000), and the first 50 rows of a time series are just its earliest
# period, not a representative sample of the distribution.
stat, p_value = stats.shapiro(prices)
print(f"\nüìä Test de Shapiro-Wilk: p-value = {p_value:.4f}")
if p_value < 0.05:
    print("‚ö†Ô∏è La distribution n'est PAS normale (p < 0.05)")
else:
    print("‚úÖ La distribution peut √™tre consid√©r√©e normale (p >= 0.05)")

## 7. Stationnarité

In [None]:
from statsmodels.tsa.stattools import adfuller, kpss

# Test ADF
def adf_test(series, name=''):
    """Run the Augmented Dickey-Fuller test on a series and print a report.

    Missing values are dropped before testing, and the lag order is chosen
    by AIC (``autolag='AIC'``).

    Args:
        series: Series to test; must provide ``dropna()``.
        name: Label shown in the report header.

    Returns:
        The test's p-value. The series is reported as stationary when
        the p-value is <= 0.05.
    """
    # adfuller returns (statistic, p-value, lags used, ...); only the first
    # three entries are reported here.
    adf_stat, p_value, used_lags = adfuller(series.dropna(), autolag='AIC')[:3]

    verdict = (
        "   ‚úÖ S√©rie STATIONNAIRE (p <= 0.05)"
        if p_value <= 0.05
        else "   ‚ö†Ô∏è S√©rie NON-STATIONNAIRE (p > 0.05)"
    )

    print(f"\nüìä Test ADF - {name}")
    print(f"   Statistique: {adf_stat:.4f}")
    print(f"   P-value: {p_value:.4f}")
    print(f"   Lags utilis√©s: {used_lags}")
    print(verdict)
    return p_value

# Derived series stored on df: first difference and log-difference of price
prices = df['price_usd_kg']
df['price_diff'] = prices.diff()
df['price_log_diff'] = np.log(prices).diff()

# ADF on the original series, then on each transformation.
# The final call stays a bare expression so the notebook still displays
# its returned p-value as the cell output.
adf_test(prices, 'Prix originaux')
adf_test(df['price_diff'], 'Diff√©rence premi√®re')
adf_test(df['price_log_diff'], 'Log-diff√©rence')

## 8. Détection des outliers

In [None]:
# Outlier detection with the 1.5 * IQR rule on the price column
price = df['price_usd_kg']
Q1, Q3 = price.quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Strict comparisons: values exactly on a bound are not outliers, and NaN
# prices compare False on both sides, so they are never flagged.
below = price < lower_bound
above = price > upper_bound
outliers = df[below | above]

print("üìä D√©tection d'outliers (m√©thode IQR):")
print(f"   Q1: ${Q1:.2f}")
print(f"   Q3: ${Q3:.2f}")
print(f"   IQR: ${IQR:.2f}")
print(f"   Bornes: [${lower_bound:.2f}, ${upper_bound:.2f}]")
print(f"\n   Nombre d'outliers: {len(outliers)}")

# List each flagged date with its price
if not outliers.empty:
    print("\n   Dates des outliers:")
    for when, value in outliers['price_usd_kg'].items():
        print(f"   - {when.date()}: ${value:.2f}")

## 9. Résumé de l'EDA

### 🔍 Principales observations :

1. **Tendance** : Forte hausse 2014-2018, puis baisse progressive
2. **Saisonnalité** : Légère hausse post-récolte (juin-août)
3. **Volatilité** : Très élevée (coefficient de variation > 50%)
4. **Non-stationnarité** : Nécessite une différenciation pour la modélisation
5. **Autocorrélation** : Forte corrélation avec les lags récents (AR)

### 📋 Recommandations pour le modeling :

- Utiliser la **différenciation** pour stationnariser
- Inclure des **lag features** (lag1, lag3, lag12)
- Ajouter des **indicateurs saisonniers**
- Considérer des **transformations log** pour stabiliser la variance
- Tester les modèles : **SARIMA**, **Prophet**, **XGBoost**

In [None]:
# Build the EDA summary and print it (nothing is written to disk here)
price = df['price_usd_kg']
mean_price = price.mean()
std_price = price.std()

summary = {
    'observations': len(df),
    'start_date': df.index.min().strftime('%Y-%m-%d'),
    'end_date': df.index.max().strftime('%Y-%m-%d'),
    'mean_price': mean_price,
    'median_price': price.median(),
    'min_price': price.min(),
    'max_price': price.max(),
    'std_price': std_price,
    # Coefficient of variation, in percent
    'cv_percent': (std_price / mean_price * 100)
}

print("\n" + "="*50)
print("üìã R√âSUM√â DE L'ANALYSE EXPLORATOIRE")
print("="*50)
# Floats are shown with two decimals; everything else is printed as-is
for key, value in summary.items():
    shown = f"{value:.2f}" if isinstance(value, float) else value
    print(f"{key}: {shown}")