# 🔍 Analyse du Data Drift

Ce notebook analyse la dérive des données entre l'entraînement et la production.

## Objectifs
1. Comparer les distributions des features
2. Détecter les drifts statistiques
3. Visualiser les changements
4. Recommandations pour le re-entraînement

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy import stats
import warnings
# Silence every warning for the rest of the session. This keeps cell output
# clean, but it also hides deprecation and numerical warnings raised by the
# libraries used below.
warnings.filterwarnings('ignore')

# Plot configuration: Matplotlib style sheet plus the seaborn "husl" palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

## 1. Chargement des données

In [None]:
# Load the reference (training-time) dataset, then print its size and column
# names before previewing the first rows.
reference_path = '../data/reference_data.csv'
reference_df = pd.read_csv(reference_path)

n_reference = len(reference_df)
reference_columns = reference_df.columns.tolist()
print(f"üìä Donn√©es de r√©f√©rence: {n_reference} √©chantillons")
print(f"Colonnes: {reference_columns}")
reference_df.head()

In [None]:
# Load the production logs. The file is read as JSON Lines: every non-blank
# line is parsed as one JSON document.
# The encoding is pinned to UTF-8 so decoding does not depend on the locale of
# the machine running the notebook.
with open('../production_logs.json', 'r', encoding='utf-8') as f:
    production_logs = [json.loads(line) for line in f if line.strip()]

production_df = pd.DataFrame(production_logs)
# Expand the 'input' field of each record into one column per key.
# NOTE(review): assumes every log record carries an 'input' dict of feature
# values -- confirm against the code that writes the log.
input_df = pd.json_normalize(production_df['input'])

print(f"üìä Donn√©es de production: {len(input_df)} √©chantillons")
print(f"Colonnes: {input_df.columns.tolist()}")
input_df.head()

## 2. Statistiques descriptives

In [None]:
# Features compared between the reference and the production datasets.
features = ['age', 'income', 'loan_amount', 'employment_length', 'credit_score']

print("\nüìä STATISTIQUES DESCRIPTIVES\n")
print("=" * 80)

for feature in features:
    # A feature can only be compared when both datasets contain it.
    if feature not in reference_df.columns or feature not in input_df.columns:
        continue

    ref_col = reference_df[feature]
    prod_col = input_df[feature]

    print(f"\n{feature.upper()}:")
    ref_mean, ref_std = ref_col.mean(), ref_col.std()
    print(f"  R√©f√©rence - Moyenne: {ref_mean:.2f}, Std: {ref_std:.2f}")
    prod_mean, prod_std = prod_col.mean(), prod_col.std()
    print(f"  Production - Moyenne: {prod_mean:.2f}, Std: {prod_std:.2f}")

    # Shift of the production mean, as a percentage of the reference mean.
    diff = (prod_mean - ref_mean) / ref_mean * 100
    print(f"  Diff√©rence: {diff:+.2f}%")

## 3. Tests statistiques de drift

In [None]:
# Two-sample Kolmogorov-Smirnov test per feature, comparing the reference
# values with the production values (missing values dropped on both sides).
# A feature is flagged as drifting when its p-value falls below ALPHA.
ALPHA = 0.05
# Fixed column layout: drift_df keeps these columns even when no feature could
# be tested, so later cells can still read drift_df['Drift'].
DRIFT_COLUMNS = ['Feature', 'KS Statistic', 'P-Value', 'Drift', 'Status']
drift_results = []

print("\nüîç D√âTECTION DE DRIFT (Test Kolmogorov-Smirnov)\n")
print("=" * 80)

for feature in features:
    # Only features present in both datasets can be tested.
    if feature not in reference_df.columns or feature not in input_df.columns:
        continue

    ref_values = reference_df[feature].dropna()
    prod_values = input_df[feature].dropna()

    # An empty sample has no distribution to compare; report it and move on
    # instead of handing it to the test.
    if ref_values.empty or prod_values.empty:
        print(f"\n{feature}:")
        print("  Status: N/A (aucune valeur exploitable)")
        continue

    ks_stat, p_value = stats.ks_2samp(ref_values, prod_values)

    # Interpret the result against the significance level.
    drift_detected = p_value < ALPHA
    status = "üî¥ DRIFT D√âTECT√â" if drift_detected else "üü¢ OK"

    drift_results.append({
        'Feature': feature,
        'KS Statistic': ks_stat,
        'P-Value': p_value,
        'Drift': drift_detected,
        'Status': status,
    })

    print(f"\n{feature}:")
    print(f"  KS Statistic: {ks_stat:.4f}")
    print(f"  P-Value: {p_value:.4f}")
    print(f"  Status: {status}")

# Results table; 'Drift' is forced to bool so it remains a valid boolean mask
# even when the table is empty.
drift_df = pd.DataFrame(drift_results, columns=DRIFT_COLUMNS).astype({'Drift': bool})
drift_df

## 4. Visualisations des distributions

In [None]:
# Overlaid histograms of the reference and production values, one panel per
# feature. Only features present in both datasets get a panel.
plot_features = [
    name for name in features
    if name in reference_df.columns and name in input_df.columns
]
n_cols = 2
# Ceiling division sizes the grid to the number of panels; at least one row is
# kept so plt.subplots always receives a valid grid.
n_rows = max(1, -(-len(plot_features) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, plot_features):
    ref_values = reference_df[feature].dropna().to_numpy()
    prod_values = input_df[feature].dropna().to_numpy()

    # Both histograms share one set of bin edges computed over the pooled
    # values; otherwise each call picks its own 30 bins and the overlaid bars
    # cover different intervals.
    bin_edges = np.histogram_bin_edges(
        np.concatenate([ref_values, prod_values]), bins=30
    )

    ax.hist(ref_values, bins=bin_edges, alpha=0.5, label='R√©f√©rence', color='blue')
    ax.hist(prod_values, bins=bin_edges, alpha=0.5, label='Production', color='red')

    ax.set_title(f'Distribution - {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Fr√©quence')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Drop every panel that did not receive a feature (the grid can hold more
# panels than there are features to plot).
for unused_ax in axes[len(plot_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.savefig('../notebooks/drift_analysis_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nüíæ Graphique sauvegard√©: drift_analysis_distributions.png")

## 5. Box plots comparatifs

In [None]:
# Side-by-side box plots (reference vs production), one panel per feature.
# Only features present in both datasets get a panel.
box_features = [
    name for name in features
    if name in reference_df.columns and name in input_df.columns
]
# squeeze=False makes plt.subplots return a 2-D array of Axes even for a single
# panel, so the loop below works for any number of features.
fig, axes = plt.subplots(1, max(1, len(box_features)), figsize=(20, 4), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, box_features):
    data_to_plot = [
        reference_df[feature].dropna(),
        input_df[feature].dropna(),
    ]

    ax.boxplot(data_to_plot)
    # Tick labels are set on the axis rather than through a boxplot keyword,
    # which keeps the call independent of the labels/tick_labels rename.
    ax.set_xticklabels(['R√©f√©rence', 'Production'])
    ax.set_title(feature, fontweight='bold')
    ax.grid(True, alpha=0.3)

# Drop the placeholder panel left over when there is nothing to plot.
for unused_ax in axes[len(box_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.savefig('../notebooks/drift_analysis_boxplots.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nüíæ Graphique sauvegard√©: drift_analysis_boxplots.png")

## 6. Analyse approfondie avec Evidently

In [None]:
# Optional deeper analysis with Evidently: builds a data-drift and data-quality
# report and saves it as HTML. Best effort: any failure is reported with a
# message and the notebook carries on.
try:
    # NOTE(review): ColumnMapping is not used in this cell; the import is kept
    # in case other cells rely on it. These import paths may not exist in every
    # Evidently release, in which case the ImportError branch below is taken
    # even though the package is installed -- confirm the pinned version.
    from evidently import ColumnMapping
    from evidently.report import Report
    from evidently.metric_preset import DataDriftPreset, DataQualityPreset

    # Restrict the report to features present in both datasets, the same guard
    # the other cells apply, so one missing column does not abort the report.
    shared_features = [
        name for name in features
        if name in reference_df.columns and name in input_df.columns
    ]
    reference_data = reference_df[shared_features]
    production_data = input_df[shared_features]

    # Build the report.
    report = Report(metrics=[
        DataDriftPreset(),
        DataQualityPreset()
    ])

    report.run(reference_data=reference_data, current_data=production_data)

    # Save the report.
    report.save_html('../notebooks/evidently_drift_report.html')

    print("\n‚úÖ Rapport Evidently g√©n√©r√©: evidently_drift_report.html")
    print("üìÇ Ouvrez le fichier HTML dans votre navigateur pour voir le rapport complet.")

except ImportError:
    print("‚ö†Ô∏è Evidently n'est pas install√©. Installez-le avec: pip install evidently")
except Exception as e:
    print(f"‚ö†Ô∏è Erreur lors de la g√©n√©ration du rapport Evidently: {e}")

## 7. Recommandations

In [None]:
# Turn the number of drifting features into a recommendation:
#   0 drifting features -> model considered stable
#   1 or 2              -> monitor and plan a retraining
#   3 or more           -> urgent retraining
separator = "=" * 80
print("\n" + separator)
print("üìã RECOMMANDATIONS")
print(separator)

# Number of features flagged as drifting.
n_drifts = int(drift_df['Drift'].sum())

if n_drifts > 2:
    advice_lines = [
        f"\nüî¥ STATUT: Drift significatif d√©tect√© sur {n_drifts} features",
        "\nüí° Actions recommand√©es (URGENT):",
        "   1. ‚ö†Ô∏è RE-ENTRA√éNER LE MOD√àLE imm√©diatement avec des donn√©es r√©centes",
        "   2. Analyser les causes du drift (changement de comportement client, saisonnalit√©, etc.)",
        "   3. Augmenter la fr√©quence de monitoring",
        "   4. Consid√©rer un syst√®me de re-entra√Ænement automatique",
    ]
elif n_drifts > 0:
    advice_lines = [
        f"\n‚ö†Ô∏è STATUT: Drift d√©tect√© sur {n_drifts} feature(s)",
        "\nüí° Recommandations:",
        "   1. Monitorer de pr√®s l'√©volution des performances",
        "   2. Collecter plus de donn√©es de production",
        "   3. Planifier un re-entra√Ænement dans les prochaines semaines",
    ]
else:
    advice_lines = [
        "\n‚úÖ STATUT: Aucun drift significatif d√©tect√©",
        "\nüí° Le mod√®le est stable et peut continuer √† fonctionner en production.",
        "   Continuez √† monitorer r√©guli√®rement les performances.",
    ]

for advice in advice_lines:
    print(advice)

# List each drifting feature with its p-value.
print("\nüìä Features avec drift:")
drifted = drift_df[drift_df['Drift']]
for feature_name, feature_p_value in zip(drifted['Feature'], drifted['P-Value']):
    print(f"   - {feature_name}: p-value = {feature_p_value:.4f}")

print("\n" + separator)

## 8. Export du rapport

In [None]:
# Export the drift results table to a timestamped CSV file.
from datetime import datetime

# The timestamp (current local time, to the second) goes into the file name so
# successive runs write to different files.
timestamp = format(datetime.now(), '%Y%m%d_%H%M%S')
report_path = f'../notebooks/drift_report_{timestamp}.csv'
drift_df.to_csv(report_path, index=False)

print(f"\nüíæ Rapport sauvegard√©: drift_report_{timestamp}.csv")
print("\n‚úÖ Analyse termin√©e!")