# 🚀 KAGGLE INFLUENZA PREDICTION - STRATÉGIE OPTIMISÉE

**Objectif**: Maximiser le score Kaggle

**Stratégie**:
- Features météo + Features temporelles cycliques + **Moyennes historiques**
- Validation sur 2011 (imite le test 2012-2013)
- Ensemble XGBoost + LightGBM + CatBoost

**Structure**:
- Train: 2004-2010
- Validation: 2011
- Test: 2012-2013 (à prédire)

In [None]:
# IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import pickle
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings emitted by
# pandas / xgboost / lightgbm — consider narrowing it while debugging.
warnings.filterwarnings('ignore')

# Confirmation message once all imports above have succeeded.
print("‚úì Imports r√©ussis!")

## 1. CHARGEMENT DES DONNÉES

In [None]:
# Load the cleaned training dataset and convert its 'date' column to datetimes.
df_train_full = pd.read_csv('data_plus/train_synop_cleaned_complet.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Quick sanity report: table shape, covered period and number of regions.
date_column = df_train_full['date']
region_count = df_train_full['region_code'].nunique()
print(f"‚úì Dataset charg√©: {df_train_full.shape}")
print(f"P√©riode: {date_column.min()} √† {date_column.max()}")
print(f"R√©gions: {region_count}")
df_train_full.head()

## 2. FEATURE ENGINEERING STRATÉGIQUE

In [None]:
def create_temporal_features(df):
    """Add calendar and cyclical (sin/cos) time features derived from ``df['date']``.

    Args:
        df: DataFrame with a datetime64 ``date`` column and a ``saison`` column
            whose labels are 'Hiver', 'Printemps', 'Ete' or 'Automne'.

    Returns:
        A copy of ``df`` (the input is not modified) with the added columns
        ``year``, ``month``, ``week_of_year`` (ISO week number),
        ``day_of_year``, ``week_sin``/``week_cos``, ``month_sin``/``month_cos``
        and ``saison_encoded``. Season labels absent from the mapping are
        encoded as NaN.
    """
    df = df.copy()

    # Calendar components
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    # isocalendar() returns pandas' nullable UInt32 dtype; left as is, it makes
    # week_of_year and the derived sin/cos columns masked (UInt32 / Float64)
    # arrays instead of plain NumPy ones. Convert to a NumPy dtype: int64 when
    # every date is valid, float64 (NaN for missing dates) otherwise.
    iso_week = df['date'].dt.isocalendar().week
    if iso_week.notna().all():
        df['week_of_year'] = iso_week.astype('int64')
    else:
        df['week_of_year'] = iso_week.astype('float64')
    df['day_of_year'] = df['date'].dt.dayofyear

    # Cyclical encodings so that the end of a year sits next to the start of
    # the following one (period of 52 weeks / 12 months).
    week_angle = 2 * np.pi * df['week_of_year'] / 52
    month_angle = 2 * np.pi * df['month'] / 12
    df['week_sin'] = np.sin(week_angle)
    df['week_cos'] = np.cos(week_angle)
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    # Ordinal encoding of the season label
    saison_map = {'Hiver': 1, 'Printemps': 2, 'Ete': 3, 'Automne': 4}
    df['saison_encoded'] = df['saison'].map(saison_map)

    return df

def create_historical_features(df, reference=None):
    """Add historical ``TauxGrippe`` statistics (seasonal profile per region).

    Args:
        df: DataFrame to annotate. Needs the grouping columns ``region_code``,
            ``week_of_year``, ``month`` and ``saison``; it also needs
            ``TauxGrippe`` when ``reference`` is None.
        reference: Optional DataFrame the statistics are computed from. When
            given, ``df`` only receives values looked up from ``reference``,
            so its own target never enters its features. This is the leak-free
            way to build features for validation rows or for unlabeled rows.
            Groups absent from ``reference`` get NaN. Defaults to None, which
            keeps the original behaviour (statistics computed on ``df`` itself).

    Returns:
        A copy of ``df`` with six added columns: ``TauxGrippe_hist_week_mean``,
        ``TauxGrippe_hist_month_mean``, ``TauxGrippe_hist_season_mean``,
        ``TauxGrippe_region_mean``, ``TauxGrippe_region_std`` and
        ``TauxGrippe_week_global_mean``.
    """
    df = df.copy()

    # (output column, grouping keys, aggregation)
    feature_specs = [
        ('TauxGrippe_hist_week_mean', ['region_code', 'week_of_year'], 'mean'),
        ('TauxGrippe_hist_month_mean', ['region_code', 'month'], 'mean'),
        ('TauxGrippe_hist_season_mean', ['region_code', 'saison'], 'mean'),
        ('TauxGrippe_region_mean', ['region_code'], 'mean'),
        ('TauxGrippe_region_std', ['region_code'], 'std'),
        ('TauxGrippe_week_global_mean', ['week_of_year'], 'mean'),
    ]

    for column, keys, stat in feature_specs:
        if reference is None:
            # Original behaviour: group statistic broadcast onto the frame's own rows.
            df[column] = df.groupby(keys)['TauxGrippe'].transform(stat)
        else:
            # One row per group in the reference data, then a left join so that
            # row order and row count of df are preserved.
            table = (
                reference.groupby(keys)['TauxGrippe']
                .agg(stat)
                .rename(column)
                .reset_index()
            )
            looked_up = df[keys].merge(table, on=keys, how='left')
            # Assign positionally: the merge result carries a fresh RangeIndex.
            df[column] = looked_up[column].to_numpy()

    return df

# Run both feature builders on the full training table.
print("üìä Cr√©ation des features temporelles...")
df_train_full = df_train_full.pipe(create_temporal_features)

# NOTE(review): the historical statistics are computed here over every year in
# df_train_full, i.e. before any train/validation split is made.
print("üéØ Cr√©ation des features historiques...")
df_train_full = df_train_full.pipe(create_historical_features)

print(f"‚úì Features cr√©√©es. Shape: {df_train_full.shape}")

## 3. SÉLECTION DES FEATURES

In [None]:
# Weather measurements
meteo_features = [
    't', 'u', 'td', 'ff', 'vv', 'tminsol', 'pres',
    'rr3', 'rr6', 'rr12', 'rr24', 'n',
]

# Calendar / cyclical features
temporal_features = [
    'week_of_year', 'month', 'week_sin', 'week_cos',
    'month_sin', 'month_cos', 'saison_encoded',
]

# Historical target statistics
historical_features = [
    'TauxGrippe_hist_week_mean', 'TauxGrippe_hist_month_mean',
    'TauxGrippe_hist_season_mean', 'TauxGrippe_region_mean',
    'TauxGrippe_region_std', 'TauxGrippe_week_global_mean',
]

# Region identifier
region_features = ['region_code']

# Full candidate list, then keep only the columns actually present in the data
all_features = [
    *meteo_features,
    *temporal_features,
    *historical_features,
    *region_features,
]
available_features = [f for f in all_features if f in df_train_full.columns]

# Report how many features of each family were retained
print(f"‚úì {len(available_features)} features s√©lectionn√©es")
for family_label, family in (
    ("M√©t√©o", meteo_features),
    ("Temporelles", temporal_features),
    ("Historiques", historical_features),
):
    print(f"  {family_label}: {sum(f in available_features for f in family)}")

## 4. SPLIT DE VALIDATION STRATÉGIQUE

**Stratégie**: Utiliser 2011 comme validation pour imiter le test (2012-2013)

In [None]:
# Split: train on 2004-2010, validate on 2011
df_train = df_train_full[df_train_full['year'] <= 2010].copy()
df_val = df_train_full[df_train_full['year'] == 2011].copy()

# Leakage fix: the TauxGrippe_* historical columns were computed on the whole
# table, 2011 included, so every validation row carried an average that
# contained its own target. The 2012-2013 test rows cannot have that, so the
# 2011 score was optimistic. Rebuild the columns from training years only.
df_train = create_historical_features(df_train)
hist_group_keys = {
    'TauxGrippe_hist_week_mean': ['region_code', 'week_of_year'],
    'TauxGrippe_hist_month_mean': ['region_code', 'month'],
    'TauxGrippe_hist_season_mean': ['region_code', 'saison'],
    'TauxGrippe_region_mean': ['region_code'],
    'TauxGrippe_region_std': ['region_code'],
    'TauxGrippe_week_global_mean': ['week_of_year'],
}
for hist_col, keys in hist_group_keys.items():
    # All training rows of a group share one value, so one row per group is a
    # lookup table; the left join keeps df_val's row order and count.
    lookup = df_train[keys + [hist_col]].drop_duplicates(subset=keys)
    df_val[hist_col] = df_val[keys].merge(lookup, on=keys, how='left')[hist_col].to_numpy()

print(f"‚úì Train: {df_train.shape[0]} obs ({df_train['year'].min()}-{df_train['year'].max()})")
print(f"‚úì Validation: {df_val.shape[0]} obs ({df_val['year'].min()})")

# Feature matrices and targets
X_train = df_train[available_features]
y_train = df_train['TauxGrippe']
X_val = df_val[available_features]
y_val = df_val['TauxGrippe']

# Median imputation fitted on the training split only, then applied to validation
# (groups unseen in training years yield NaN above and are imputed here).
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns, index=X_val.index)

print(f"‚úì Donn√©es pr√©par√©es")

## 5. ENTRAÎNEMENT DES MODÈLES

In [None]:
# Validation metrics per model name, filled in by each training section
results = {}

# XGBoost
print("üöÄ [1/3] XGBoost...")
# early_stopping_rounds is an estimator parameter: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0, where it raises a TypeError.
xgb_model = xgb.XGBRegressor(
    n_estimators=500, max_depth=7, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, min_child_weight=3,
    gamma=0.1, random_state=42, n_jobs=-1,
    early_stopping_rounds=50,
)
# The 2011 validation split is the evaluation set monitored for early stopping.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Score on the validation split
y_val_pred_xgb = xgb_model.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
results['XGBoost'] = {'RMSE': rmse_xgb, 'R¬≤': r2_score(y_val, y_val_pred_xgb)}
print(f"‚úì XGBoost - RMSE: {rmse_xgb:.2f}")

# LightGBM
print("‚ö° [2/3] LightGBM...")
# NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq`
# is > 0 (its default is 0) — confirm whether row subsampling is really active.
lgb_model = lgb.LGBMRegressor(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
# Early stopping (patience 50) monitored on the 2011 split, evaluation logging disabled
lgb_callbacks = [lgb.early_stopping(50), lgb.log_evaluation(0)]
lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=lgb_callbacks,
)

# Score on the validation split
y_val_pred_lgb = lgb_model.predict(X_val)
rmse_lgb = np.sqrt(mean_squared_error(y_val, y_val_pred_lgb))
r2_lgb = r2_score(y_val, y_val_pred_lgb)
results['LightGBM'] = {'RMSE': rmse_lgb, 'R¬≤': r2_lgb}
print(f"‚úì LightGBM - RMSE: {rmse_lgb:.2f}")

# CatBoost
print("üê± [3/3] CatBoost...")
cat_model = CatBoostRegressor(
    iterations=500,
    depth=7,
    learning_rate=0.05,
    subsample=0.8,
    random_state=42,
    verbose=False,
)
# Early stopping (patience 50) monitored on the 2011 split
cat_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
    verbose=False,
)

# Score on the validation split
y_val_pred_cat = cat_model.predict(X_val)
rmse_cat = np.sqrt(mean_squared_error(y_val, y_val_pred_cat))
r2_cat = r2_score(y_val, y_val_pred_cat)
results['CatBoost'] = {'RMSE': rmse_cat, 'R¬≤': r2_cat}
print(f"‚úì CatBoost - RMSE: {rmse_cat:.2f}")

In [None]:
# Ensemble: weighted average of the three models, each weighted by the inverse
# of its validation RMSE (weights normalised to sum to 1).
print("üéØ [4/4] Ensemble...")
val_rmse = {'XGBoost': rmse_xgb, 'LightGBM': rmse_lgb, 'CatBoost': rmse_cat}
weights = {name: 1/score for name, score in val_rmse.items()}
total = sum(weights.values())
weights = {name: raw/total for name, raw in weights.items()}

# Blend the validation predictions in the same model order as the weights
val_predictions = {
    'XGBoost': y_val_pred_xgb,
    'LightGBM': y_val_pred_lgb,
    'CatBoost': y_val_pred_cat,
}
y_val_pred_ensemble = sum(weights[name] * val_predictions[name] for name in weights)

rmse_ensemble = np.sqrt(mean_squared_error(y_val, y_val_pred_ensemble))
r2_ensemble = r2_score(y_val, y_val_pred_ensemble)
results['Ensemble'] = {'RMSE': rmse_ensemble, 'R¬≤': r2_ensemble}
print(f"‚úì Ensemble - RMSE: {rmse_ensemble:.2f}")

w_xgb, w_lgb, w_cat = weights['XGBoost'], weights['LightGBM'], weights['CatBoost']
print(f"\nPoids: XGB={w_xgb:.3f}, LGB={w_lgb:.3f}, CAT={w_cat:.3f}")

## 6. COMPARAISON DES RÉSULTATS

In [None]:
# One row per model, best (lowest) RMSE first
df_results = pd.DataFrame(results).T.sort_values('RMSE')
print("\nüìä PERFORMANCES SUR VALIDATION (2011):")
print(df_results)

# Model with the smallest validation RMSE
rmse_by_model = df_results['RMSE']
best_model = rmse_by_model.idxmin()
best_rmse = df_results.loc[best_model, 'RMSE']
print(f"\nüèÜ MEILLEUR MOD√àLE: {best_model} (RMSE={best_rmse:.2f})")

# Horizontal bar chart of the RMSE per model, saved to disk and displayed
fig, ax = plt.subplots(figsize=(10, 6))
rmse_by_model.sort_values().plot.barh(ax=ax, color='steelblue')
ax.set_title(
    'Comparaison des Mod√®les (RMSE sur Validation 2011)',
    fontsize=14,
    fontweight='bold',
)
ax.set_xlabel('RMSE')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('model_comparison.png', dpi=150)
plt.show()

## 7. FEATURE IMPORTANCE

In [None]:
# XGBoost feature importances, most important first
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

top_features = feature_importance.head(15)
print("\nüìä Top 15 Features:")
print(top_features)

# Horizontal bar chart of the 15 strongest features, saved to disk and displayed
fig, ax = plt.subplots(figsize=(10, 8))
top_series = top_features.set_index('feature')['importance']
top_series.sort_values().plot.barh(ax=ax, color='coral')
ax.set_title('Top 15 Features (XGBoost)', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance')
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=150)
plt.show()

# Persist the complete ranking
feature_importance.to_csv('feature_importance.csv', index=False)

## 8. RÉENTRAÎNEMENT FINAL (2004-2011)

In [None]:
print("üîÑ R√©entra√Ænement sur TOUTES les donn√©es (2004-2011)...")

# Full training table; the imputer is re-fitted on it, so the object pickled
# later holds the medians of the complete dataset.
y_full = df_train_full['TauxGrippe']
X_full = df_train_full[available_features]
X_full = pd.DataFrame(
    imputer.fit_transform(X_full),
    columns=X_full.columns,
    index=X_full.index,
)

# No evaluation set is passed below: unlike the validation runs, each final
# model trains for its full 500 rounds without early stopping.

# XGBoost
print("üöÄ XGBoost...")
xgb_final = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    random_state=42,
    n_jobs=-1,
)
xgb_final.fit(X_full, y_full, verbose=False)

# LightGBM
# NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq`
# is > 0 (its default is 0) — confirm whether row subsampling is really active.
print("‚ö° LightGBM...")
lgb_final = lgb.LGBMRegressor(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
lgb_final.fit(X_full, y_full)

# CatBoost
print("üê± CatBoost...")
cat_final = CatBoostRegressor(
    iterations=500,
    depth=7,
    learning_rate=0.05,
    subsample=0.8,
    random_state=42,
    verbose=False,
)
cat_final.fit(X_full, y_full, verbose=False)

print("\n‚úì Mod√®les finaux entra√Æn√©s!")

## 9. SAUVEGARDE DES MODÈLES

In [None]:
# Pickle the final models, the fitted imputer, the ensemble weights and the
# feature list; each goes to its own file in the working directory.
artifacts = {
    'xgb_final.pkl': xgb_final,
    'lgb_final.pkl': lgb_final,
    'cat_final.pkl': cat_final,
    'imputer.pkl': imputer,
    'weights.pkl': weights,
    'features.pkl': available_features,
}
for artifact_name, artifact in artifacts.items():
    with open(artifact_name, 'wb') as f:
        pickle.dump(artifact, f)

# List what was written, in the order it was saved
print("‚úì Mod√®les sauvegard√©s:")
for artifact_name in artifacts:
    print(f"  - {artifact_name}")

## 10. RÉSUMÉ FINAL

In [None]:
# Closing recap: split sizes, validation scores per model and the next step.
banner = "=" * 80
print("\n" + banner)
print("‚úÖ ENTRA√éNEMENT TERMIN√â!")
print(banner)

print("\nüìä STATISTIQUES:")
print(f"   Train: {len(df_train)} obs (2004-2010)")
print(f"   Validation: {len(df_val)} obs (2011)")
print(f"   Features: {len(available_features)}")

print("\nüéØ PERFORMANCES (validation 2011):")
for model, metrics in results.items():
    model_rmse = metrics['RMSE']
    model_r2 = metrics['R¬≤']
    print(f"   {model:12s}: RMSE={model_rmse:6.2f} | R¬≤={model_r2:.4f}")

print(f"\nüèÜ MEILLEUR MOD√àLE: {best_model} (RMSE={best_rmse:.2f})")

print("\nüí° PROCHAINE √âTAPE:")
print("   Ex√©cutez KAGGLE_PREDICT.ipynb pour g√©n√©rer les pr√©dictions!")
print(banner)