# 🎯 MODÈLE SIMPLE - RANDOM FOREST

**Objectif** : Tester rapidement un modèle classique pour prédire le TauxGrippe

**Modèle** : RandomForest (le plus simple et robuste)

**Données** : train_weather_merged_complete.csv

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import warnings
# Install a catch-all "ignore" filter so that no warning of any category is
# displayed for the rest of the session.
warnings.simplefilter('ignore')

# Confirmation message once every import above has succeeded.
print("‚úì Imports OK")

## 1. CHARGER LES DONNÉES

In [None]:
# Load the training CSV into a DataFrame.
csv_path = 'data_plus/train_weather_merged_complete.csv'
df = pd.read_csv(csv_path)

# Report the (rows, columns) shape and the list of column names.
print(f"‚úì Charg√© : {df.shape}")
print(f"\nColonnes : {list(df.columns)}")

# Last expression of the cell: in a notebook this renders the first rows.
df.head()

## 2. PRÉPARATION DES FEATURES

In [None]:
# Candidate predictor columns, grouped by theme.
candidate_features = [
    # Temperature, humidity, wind, visibility
    't', 'td', 'u', 'ff', 'vv',
    # Ground temperature, cloud cover
    'tminsol', 'nbas', 'n',
    # Precipitation
    'rr24', 'rr12', 'rr6',
    # Pressure, min/max temperature
    'pres', 'tn12', 'tx12',
    # Time index + region
    'week_year', 'region_code',
]

# Retain only the candidates that actually exist in the loaded table;
# candidates that are absent are dropped without any message.
feature_cols = [name for name in candidate_features if name in df.columns]

print(f"‚úì {len(feature_cols)} features s√©lectionn√©es")
print(f"  {feature_cols}")

# Name of the column to predict.
target = 'TauxGrippe'

# Independent copies, so later edits to X / y never touch df.
X = df.loc[:, feature_cols].copy()
y = df.loc[:, target].copy()

print(f"\nX : {X.shape}")
print(f"y : {y.shape}")

In [None]:
# Handle NaN values: replace the missing entries of each feature column with
# that column's median.
#
# NOTE(review): the medians are computed on the full table, before the
# train/test split performed later in this file, so test rows influence the
# values used to fill training rows.
print(f"NaN avant : {X.isnull().sum().sum()}")

# Only visit the columns that actually contain missing values.
missing_per_column = X.isnull().sum()
for col in missing_per_column[missing_per_column > 0].index:
    # Assign the filled column back to X. Calling fillna(inplace=True) on
    # `X[col]` is chained assignment: it triggers a FutureWarning in recent
    # pandas, and with Copy-on-Write it modifies a temporary Series and
    # leaves X unchanged.
    X[col] = X[col].fillna(X[col].median())

# A column made entirely of NaN has a NaN median and therefore stays
# unfilled; the count below makes that visible.
print(f"NaN apr√®s : {X.isnull().sum().sum()}")
print("‚úì Donn√©es pr√™tes")

## 3. SPLIT TRAIN/TEST

In [None]:
# 80/20 split: 20% of the rows are held out for testing. Rows are shuffled
# with a fixed seed so the same partition is produced on every run.
split_options = {'test_size': 0.2, 'random_state': 42, 'shuffle': True}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train : {X_train.shape}")
print(f"Test  : {X_test.shape}")

## 4. ENTRAÎNEMENT RANDOM FOREST

In [None]:
print("üå≤ Entra√Ænement Random Forest...")

# Hyper-parameters of the forest, gathered in one mapping.
forest_params = {
    'n_estimators': 100,       # number of trees
    'max_depth': 10,           # cap on the depth of each tree
    'min_samples_split': 5,    # minimum samples needed to split a node
    'random_state': 42,        # fixed seed for reproducible fits
    'n_jobs': -1,              # -1 = use all available processors
    'verbose': 1,              # print progress while fitting/predicting
}

# Build the regressor and fit it on the training partition.
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)
print("\n‚úì Mod√®le entra√Æn√©!")

## 5. ÉVALUATION

In [None]:
def _regression_scores(actual, predicted):
    """Return the (RMSE, MAE, R-squared) triple for true vs. predicted values."""
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    return rmse, mae, r2


# Predictions on both partitions.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Same three metrics for the training set and for the held-out test set.
rmse_train, mae_train, r2_train = _regression_scores(y_train, y_pred_train)
rmse_test, mae_test, r2_test = _regression_scores(y_test, y_pred_test)

# Text report: a 60-character rule above and below the heading.
banner = "=" * 60
print(banner)
print("R√âSULTATS")
print(banner)

print("\nüìä TRAIN:")
print(f"  RMSE : {rmse_train:.2f}")
print(f"  MAE  : {mae_train:.2f}")
print(f"  R¬≤   : {r2_train:.4f}")

print("\nüìä TEST:")
print(f"  RMSE : {rmse_test:.2f}")
print(f"  MAE  : {mae_test:.2f}")
print(f"  R¬≤   : {r2_test:.4f}")

print("\n" + banner)

## 6. FEATURE IMPORTANCE

In [None]:
# Feature importances: pair every feature name with the importance score of
# the fitted forest and sort from most to least important.
importances = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("üìà TOP 10 FEATURES:")
print(importances.head(10).to_string(index=False))

# Bar chart of the ten highest scores.
# The axes are created explicitly and handed to pandas through `ax=`.
# Without it, DataFrame.plot opens a figure of its own, so a preceding
# plt.figure(figsize=...) call only produces an extra blank figure and its
# figsize never reaches the chart.
fig, ax = plt.subplots(figsize=(10, 6))
importances.head(10).plot(
    x='feature', y='importance', kind='barh', color='steelblue', ax=ax
)
ax.set_title('Top 10 Features Importantes', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

## 7. EXEMPLES DE PRÉDICTIONS

In [None]:
# Actual vs. predicted table for the test set, with the absolute error of
# each prediction.
actual_values = y_test.values
comparaison = pd.DataFrame({
    'R√©el': actual_values,
    'Pr√©dit': y_pred_test,
    'Erreur': np.abs(actual_values - y_pred_test),
})

print("üîç EXEMPLES DE PR√âDICTIONS (10 premiers):")
print(comparaison.head(10).to_string(index=False))

# Scatter plot of predicted against actual values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_test, alpha=0.5)

# Dashed red y = x reference line spanning the range of the actual values;
# a perfect prediction would fall on it.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

ax.set_xlabel('TauxGrippe R√©el', fontsize=12)
ax.set_ylabel('TauxGrippe Pr√©dit', fontsize=12)
ax.set_title(f'Pr√©dictions vs R√©alit√© (R¬≤={r2_test:.3f})', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## 8. SAUVEGARDE

In [None]:
# Write each result table to CSV (without the index column) and confirm it.
# Order: test-set predictions first, then the feature importances.
exports = (
    (comparaison, 'predictions_test_simple.csv',
     "‚úì Pr√©dictions sauvegard√©es : predictions_test_simple.csv"),
    (importances, 'feature_importance_simple.csv',
     "‚úì Feature importance sauvegard√©e : feature_importance_simple.csv"),
)
for table, filename, confirmation in exports:
    table.to_csv(filename, index=False)
    print(confirmation)

## ✅ CONCLUSION

**Modèle testé** : Random Forest (100 arbres)

**Performance** : Voir les métriques ci-dessus

**Prochaines étapes** :
- Tester avec plus de features (lags, moyennes mobiles)
- Essayer d'autres modèles (XGBoost, LightGBM)
- Optimiser les hyperparamètres