# ML-Scheduler Data Collection - HYDATIS Cluster
## Collecte des données historiques Prometheus

**Environnement**: Jupyter Notebook HYDATIS - wassimmezrani namespace  
**Cluster**: 6 nodes (3 masters + 3 workers)  
**Prometheus**: http://10.110.190.83:9090  

**Objectif**: Collecter et préparer les données pour 3 algorithmes ML:
- XGBoost: patterns temporels de charge des nodes
- Q-Learning: historique des placements et des performances
- Isolation Forest: comportements normaux/anormaux des nodes

**Volumes montés**:
- `/data` (50Gi): Données historiques
- `/models` (20Gi): Modèles ML
- `/experiments` (15Gi): Expérimentations
- `/home/jovyan` (19Gi): Workspace

In [None]:
# Configuration et imports pour environnement Jupyter HYDATIS
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor
import os
from pathlib import Path

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
plt.style.use('default')

# Prometheus endpoints for the HYDATIS cluster: in-cluster service name and
# the externally reachable address.
PROMETHEUS_URL = "http://prometheus-k8s.monitoring.svc.cluster.local:9090"
PROMETHEUS_EXTERNAL_URL = "http://10.110.190.83:9090"

# HTTP timeout (seconds), pause between collection queries (seconds) and
# number of connection attempts per endpoint.
REQUEST_TIMEOUT = 120
COLLECTION_PAUSE = 2.0
MAX_RETRIES = 3

# Volumes expected to be mounted in the Jupyter environment.
DATA_PATH = Path("/data")
MODELS_PATH = Path("/models")
EXPERIMENTS_PATH = Path("/experiments")
WORKSPACE_PATH = Path("/home/jovyan")

# Report which volumes are present and how much free space each one has.
print("=" * 60)
print("ML-SCHEDULER DATA COLLECTION - HYDATIS CLUSTER")
print("=" * 60)
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Environnement: Jupyter Notebook (wassimmezrani)")
print(f"\nVolumes vérifiés:")
for path_name, path in [("DATA", DATA_PATH), ("MODELS", MODELS_PATH),
                        ("EXPERIMENTS", EXPERIMENTS_PATH), ("WORKSPACE", WORKSPACE_PATH)]:
    exists = path.exists()
    if not exists:
        print(f"  {path_name:12} {path}: ERREUR")
        continue
    try:
        stat_info = os.statvfs(path)
    except (OSError, AttributeError):
        # statvfs can fail on the path (OSError) or be missing on non-POSIX
        # platforms (AttributeError); the volume exists, so still report OK.
        print(f"  {path_name:12} {path}: OK")
    else:
        available_gb = (stat_info.f_bavail * stat_info.f_frsize) / (1024**3)
        print(f"  {path_name:12} {path}: OK ({available_gb:.1f}GB disponible)")

# Create the working directory for collected ML data if it is missing.
ml_data_dir = DATA_PATH / "ml_scheduler_data"
ml_data_dir.mkdir(parents=True, exist_ok=True)
print(f"\nRépertoire données ML: {ml_data_dir}")
print("=" * 60)

## Connexion et Validation Prometheus

In [None]:
def test_prometheus_connection():
    """Probe the configured Prometheus endpoints and return the first usable one.

    The external LoadBalancer URL is tried first, then the in-cluster service
    URL. Each endpoint gets up to ``MAX_RETRIES`` attempts of an instant
    ``up`` query, with a 2-second pause between failed attempts. Once an
    endpoint answers successfully, a second query checks whether node
    metrics are exposed; the endpoint is returned either way and only the
    printed message differs.

    Returns:
        The base URL of the first endpoint that answered successfully, or
        ``None`` when every attempt on every endpoint failed.
    """
    # External LoadBalancer first: considered more reliable from Jupyter.
    urls_to_test = [
        (PROMETHEUS_EXTERNAL_URL, "LoadBalancer externe"),
        (PROMETHEUS_URL, "Service interne cluster"),
    ]
    probe_timeout = REQUEST_TIMEOUT // 4

    for url, description in urls_to_test:
        query_endpoint = f"{url}/api/v1/query"

        for attempt in range(MAX_RETRIES):
            print(f"Test {description}: {url} (tentative {attempt + 1}/{MAX_RETRIES})")
            try:
                response = requests.get(query_endpoint, params={'query': 'up'},
                                        timeout=probe_timeout)

                if response.status_code != 200:
                    print(f"  ERROR: HTTP {response.status_code}")
                else:
                    data = response.json()
                    if data['status'] == 'success':
                        services_up = len(data['data']['result'])
                        print(f"  SUCCESS: Connexion réussie")
                        print(f"  Services UP: {services_up}")

                        # Extra check: are node-level metrics being scraped?
                        test_response = requests.get(
                            query_endpoint,
                            params={'query': 'node_cpu_seconds_total'},
                            timeout=probe_timeout,
                        )
                        if (test_response.status_code == 200
                                and test_response.json()['status'] == 'success'):
                            print(f"  OK: Métriques nodes disponibles")
                        else:
                            print(f"  WARNING: Connexion OK mais métriques limitées")
                        return url

                    # HTTP 200 but the API itself reported a failure.
                    print(f"  ERROR: Statut Prometheus inattendu: {data['status']}")

            except (requests.exceptions.RequestException, ValueError,
                    KeyError, TypeError) as e:
                # ValueError covers a non-JSON body; KeyError/TypeError cover
                # a JSON body that lacks the expected fields.
                print(f"  ERROR tentative {attempt + 1}: {e}")

            # Back off before the next attempt on the same endpoint.
            if attempt < MAX_RETRIES - 1:
                time.sleep(2)

    print("ERROR: Aucune connexion Prometheus disponible après tous les essais")
    return None

# Probe the endpoints (retries happen inside the helper) and report the result.
active_prometheus_url = test_prometheus_connection()
if not active_prometheus_url:
    print(f"\nERROR: Prometheus inaccessible")
    print(f"Vérifier la connectivité réseau et les services LoadBalancer")
else:
    print(f"\nSUCCESS: Prometheus actif: {active_prometheus_url}")
    print(f"Configuration optimisée pour cluster HYDATIS")

In [None]:
def check_cluster_metrics():
    """Check which essential metrics and how much history Prometheus exposes.

    Runs one instant query per essential metric against the module-level
    ``active_prometheus_url``, then a 15-day range query on
    ``node_cpu_seconds_total`` to measure how many days of history the first
    returned series spans.

    Returns:
        A ``(cluster_ready, results)`` tuple. ``results`` maps each essential
        metric label to the number of series returned (0 on any failure) and
        holds a ``'historical_days'`` entry. ``cluster_ready`` is ``True``
        when at least 4 essential metrics returned data and at least 7 days
        of history are available. Without an active Prometheus URL the
        result is ``(False, {})``.
    """
    if not active_prometheus_url:
        print("ERROR: Pas de connexion Prometheus")
        # Keep the two-value shape: the caller unpacks the result.
        return False, {}

    print("Vérification métriques cluster HYDATIS...")

    # Essential metrics, keyed by the label shown in the report.
    essential_queries = {
        "Nodes cluster": "kube_node_info",
        "CPU nodes": "node_cpu_seconds_total",
        "Mémoire nodes": "node_memory_MemAvailable_bytes",
        "Pods running": "kube_pod_status_phase{phase='Running'}",
        "Load average": "node_load1"
    }

    results = {}
    for name, query in essential_queries.items():
        count = 0
        try:
            response = requests.get(f"{active_prometheus_url}/api/v1/query",
                                    params={'query': query}, timeout=10)
            if response.status_code != 200:
                print(f"  ERROR {name}: HTTP {response.status_code}")
            else:
                data = response.json()
                if data['status'] == 'success' and data['data']['result']:
                    count = len(data['data']['result'])
                    print(f"  OK {name}: {count} métriques")
                else:
                    print(f"  ERROR {name}: Aucune donnée")
        except Exception as e:
            # Best-effort probe: any failure just marks this metric as absent.
            print(f"  ERROR {name}: {e}")
        results[name] = count

    # Measure how far back historical data goes (15-day window, 1h step).
    end_time = datetime.now()
    start_time = end_time - timedelta(days=15)
    historical_days = 0

    try:
        params = {
            'query': 'node_cpu_seconds_total',
            'start': start_time.timestamp(),
            'end': end_time.timestamp(),
            'step': '1h'
        }
        response = requests.get(f"{active_prometheus_url}/api/v1/query_range",
                                params=params, timeout=30)

        if response.status_code != 200:
            print(f"  ERROR Données historiques: HTTP {response.status_code}")
        else:
            data = response.json()
            if data['status'] == 'success' and data['data']['result']:
                # Only the first series is inspected to estimate the span.
                values = data['data']['result'][0]['values']
                if values:
                    first_ts = datetime.fromtimestamp(float(values[0][0]))
                    last_ts = datetime.fromtimestamp(float(values[-1][0]))
                    historical_days = (last_ts - first_ts).days
                    print(f"  OK Données historiques: {historical_days} jours disponibles")
                    print(f"    Période: {first_ts.strftime('%Y-%m-%d')} → {last_ts.strftime('%Y-%m-%d')}")
                else:
                    print(f"  ERROR Données historiques: Aucune valeur")
            else:
                print(f"  ERROR Données historiques: Pas de résultats")
    except Exception as e:
        print(f"  ERROR Données historiques: {e}")

    results['historical_days'] = historical_days

    # Count only the essential metrics; 'historical_days' lives in the same
    # dict and must not be counted as an available metric.
    total_metrics = sum(1 for name in essential_queries if results[name] > 0)
    cluster_ready = total_metrics >= 4 and historical_days >= 7

    print(f"\nÉvaluation cluster:")
    print(f"  Métriques disponibles: {total_metrics}/{len(essential_queries)}")
    print(f"  Données historiques: {historical_days} jours")
    print(f"  Status: {'READY' if cluster_ready else 'NOT READY'}")

    return cluster_ready, results

# Run the cluster check; the readiness flag and the per-metric counts are
# both reused further down.
(cluster_ready, cluster_metrics) = check_cluster_metrics()

## Configuration des Métriques ML

In [None]:
# Per-algorithm metric selection and storage location for the HYDATIS cluster.
# Each entry holds a description, the Prometheus metric selectors to collect,
# and the directory under DATA_PATH where that algorithm's data is stored.
ML_METRICS_CONFIG = {
    # Node load prediction over time.
    "xgboost_features": {
        "description": "Prédiction charge temporelle nodes HYDATIS",
        "metrics": [
            "node_cpu_seconds_total",
            "node_memory_MemAvailable_bytes",
            "node_memory_MemTotal_bytes",
            'node_filesystem_avail_bytes{mountpoint="/"}',
            "node_load1",
            "node_load5",
            "node_load15",
            "node_network_receive_bytes_total",
            "node_network_transmit_bytes_total",
        ],
        "storage_path": DATA_PATH / "xgboost_data",
    },
    # Pod scheduling optimisation.
    "qlearning_features": {
        "description": "Optimisation scheduling pods cluster HYDATIS",
        "metrics": [
            "kube_pod_info",
            "kube_pod_status_phase",
            "container_cpu_usage_seconds_total",
            "container_memory_usage_bytes",
            "kube_pod_container_resource_requests",
            "kube_pod_container_resource_limits",
        ],
        "storage_path": DATA_PATH / "qlearning_data",
    },
    # Node behaviour anomaly detection.
    "isolation_features": {
        "description": "Détection anomalies comportement nodes HYDATIS",
        "metrics": [
            "node_cpu_seconds_total",
            "node_memory_MemAvailable_bytes",
            "node_network_receive_bytes_total",
            "node_network_transmit_bytes_total",
            'node_filesystem_avail_bytes{mountpoint="/"}',
            "node_load1",
            "node_load5",
        ],
        "storage_path": DATA_PATH / "isolation_data",
    },
}

# Make sure every algorithm has its storage directory, and print a summary.
print("Configuration stockage données ML:")
for algo_name in ML_METRICS_CONFIG:
    config = ML_METRICS_CONFIG[algo_name]
    storage_path = config["storage_path"]
    storage_path.mkdir(parents=True, exist_ok=True)
    for summary_line in (
        f"  {algo_name:20} → {storage_path}",
        f"  {'':20}   {config['description']}",
        f"  {'':20}   {len(config['metrics'])} métriques configurées",
    ):
        print(summary_line)

print(f"\nTotal algorithmes: {len(ML_METRICS_CONFIG)}")
# Several algorithms share metrics, so count distinct selectors only.
print(f"Total métriques uniques: {len({metric for entry in ML_METRICS_CONFIG.values() for metric in entry['metrics']})}")

## Collecte Données Nodes (XGBoost + Isolation Forest)

In [None]:
                        # Identifier si master ou worker
                        if node_name.startswith('10.110.190.'):
                            node_ip = node_name
                            if node_ip in ['10.110.190.32', '10.110.190.33', '10.110.190.34']:
                                node_role = 'master'
                                ip_mapping = {'32': '1', '33': '2', '34': '3'}
                                node_name = f"master{ip_mapping[node_ip.split('.')[-1]]}"
                            elif node_ip in ['10.110.190.35', '10.110.190.36', '10.110.190.37']:
                                node_role = 'worker'
                                ip_mapping = {'35': '1', '36': '2', '37': '3'}
                                node_name = f"worker{ip_mapping[node_ip.split('.')[-1]]}"
                            else:
                                node_role = 'unknown'
                        else:
                            node_role = 'unknown'
                            node_ip = 'unknown'

## Collecte Données Pods (Q-Learning)

In [None]:
# Static IP → node name mapping for the six HYDATIS nodes.
_HYDATIS_NODE_NAMES = {
    '10.110.190.32': 'master1',
    '10.110.190.33': 'master2',
    '10.110.190.34': 'master3',
    '10.110.190.35': 'worker1',
    '10.110.190.36': 'worker2',
    '10.110.190.37': 'worker3'
}


def _normalize_node_name(raw_name):
    """Drop any ``:port`` suffix and map known HYDATIS IPs to node names."""
    host = raw_name.split(':')[0]
    return _HYDATIS_NODE_NAMES.get(host, host)


def _series_to_rows(series, metric_name, description):
    """Flatten one Prometheus range-query series into a list of row dicts."""
    labels = series['metric']
    pod_name = labels.get('pod', 'unknown')
    namespace = labels.get('namespace', 'unknown')
    node_name = _normalize_node_name(
        labels.get('node', labels.get('instance', 'unknown'))
    )
    container = labels.get('container', '')
    resource = labels.get('resource', '')
    base_metric = metric_name.split('{')[0]  # strip the label selector

    rows = []
    for timestamp_str, value_str in series['values']:
        try:
            timestamp = datetime.fromtimestamp(float(timestamp_str))
            value = float(value_str)
        except (ValueError, TypeError):
            # Malformed sample: skip it rather than reuse a stale timestamp.
            continue
        rows.append({
            'timestamp': timestamp,
            'pod_name': pod_name,
            'namespace': namespace,
            'node_name': node_name,
            'container': container,
            'resource': resource,
            'metric': base_metric,
            'value': value,
            'description': description
        })
    return rows


def collect_pod_scheduling_data_optimized(days_back=15, step_hours=2):
    """Collect pod-level metrics from Prometheus for the Q-Learning dataset.

    Issues one ``query_range`` request per pod metric against the
    module-level ``active_prometheus_url`` and flattens every sample into
    one row. A failure on one metric is reported and does not stop the
    collection of the others.

    Args:
        days_back: Length of the collection window in days, ending now.
        step_hours: Resolution of the range query, in hours.

    Returns:
        A DataFrame with columns ``timestamp``, ``pod_name``, ``namespace``,
        ``node_name``, ``container``, ``resource``, ``metric``, ``value`` and
        ``description``; an empty DataFrame when there is no active
        Prometheus URL or nothing could be collected.
    """
    if not active_prometheus_url:
        return pd.DataFrame()

    end_time = datetime.now()
    start_time = end_time - timedelta(days=days_back)

    print(f"\nCOLLECTE DONNÉES PODS SCHEDULING")
    print(f"Période: {start_time.strftime('%Y-%m-%d %H:%M')} → {end_time.strftime('%Y-%m-%d %H:%M')}")
    print(f"Résolution: {step_hours}h pour réduire le volume")

    # (PromQL selector, label shown in progress output and stored per row).
    pod_metrics = [
        ('kube_pod_info', 'Pod Information'),
        ('kube_pod_status_phase{phase="Running"}', 'Running Pods'),
        ('container_cpu_usage_seconds_total', 'Container CPU'),
        ('container_memory_usage_bytes', 'Container Memory'),
        ('kube_pod_container_resource_requests{resource="cpu"}', 'CPU Requests'),
        ('kube_pod_container_resource_requests{resource="memory"}', 'Memory Requests')
    ]

    all_pod_data = []

    for i, (metric_name, description) in enumerate(pod_metrics, 1):
        print(f"[{i}/{len(pod_metrics)}] Collecte {description}...")

        params = {
            'query': metric_name,
            'start': start_time.timestamp(),
            'end': end_time.timestamp(),
            'step': f'{step_hours}h'
        }

        try:
            response = requests.get(f"{active_prometheus_url}/api/v1/query_range",
                                    params=params, timeout=180)

            if response.status_code != 200:
                print(f"    ERROR HTTP {response.status_code}")
            else:
                data = response.json()
                if data['status'] == 'success' and data['data']['result']:
                    series_list = data['data']['result']
                    rows_before = len(all_pod_data)
                    for series in series_list:
                        all_pod_data.extend(
                            _series_to_rows(series, metric_name, description)
                        )
                    points_count = len(all_pod_data) - rows_before
                    print(f"    OK {len(series_list)} séries, {points_count} points")
                else:
                    print(f"    ERROR Pas de données")

        except Exception as e:
            # Deliberate best-effort: report and move on to the next metric.
            print(f"    ERROR Erreur: {e}")

        # Pause between queries only; nothing follows the last one.
        if i < len(pod_metrics):
            time.sleep(COLLECTION_PAUSE * 1.5)

    if not all_pod_data:
        print("\nERROR Aucune donnée pod collectée")
        return pd.DataFrame()

    df = pd.DataFrame(all_pod_data)

    print(f"\nRÉSULTATS COLLECTE PODS:")
    print(f"  Total points: {len(df):,}")
    print(f"  Pods uniques: {df['pod_name'].nunique()}")
    print(f"  Namespaces: {df['namespace'].nunique()}")
    print(f"  Nodes: {df[df['node_name'] != 'unknown']['node_name'].nunique()}")
    print(f"  Métriques: {df['metric'].nunique()}")

    # Five namespaces contributing the most rows.
    top_ns = df['namespace'].value_counts().head(5)
    print(f"\n  Top namespaces:")
    for ns, count in top_ns.items():
        print(f"    {ns:20}: {count:,} points")

    return df

# Only query Prometheus when the cluster check passed; otherwise keep an
# empty frame so later code can still test `.empty`.
pod_scheduling_df = (
    collect_pod_scheduling_data_optimized(days_back=15)
    if cluster_ready
    else pd.DataFrame()
)

## Analyse Qualité et Sauvegarde

In [None]:
def _quality_score(total_rows, date_range, completeness, unique_timestamps):
    """Return the ``(score, status)`` pair for one dataset, score out of 100."""
    score = 0
    if total_rows > 1000:
        score += 30  # enough volume
    if date_range >= 7:
        score += 25  # minimum period
    if date_range >= 14:
        score += 10  # preferred period
    if completeness >= 95:
        score += 25
    elif completeness >= 90:
        score += 20
    if unique_timestamps > 100:
        score += 10  # temporal diversity

    if score >= 85:
        status = "EXCELLENT"
    elif score >= 70:
        status = "BON"
    elif score >= 50:
        status = "MOYEN"
    else:
        status = "FAIBLE"
    return score, status


def analyze_and_save_data():
    """Score the collected datasets, save the usable ones and write metadata.

    Reads the module-level ``node_metrics_df`` and ``pod_scheduling_df``.
    For each non-empty dataset it prints basic statistics, computes a quality
    score and, when the score is at least 50, writes CSV and Parquet copies
    under ``DATA_PATH`` plus a random CSV sample under ``EXPERIMENTS_PATH``
    for datasets larger than 10,000 rows. When at least one dataset scored
    above 0, a JSON metadata file is written under ``DATA_PATH``.

    Returns:
        ``(ml_ready, metadata)`` where ``ml_ready`` is ``True`` when the mean
        score of the non-zero datasets is at least 60 and ``metadata`` is the
        dict that was written to disk; ``(False, {})`` when no dataset scored
        above 0.
    """
    print("\n" + "="*60)
    print("ANALYSE QUALITÉ DONNÉES ML-SCHEDULER HYDATIS")
    print("="*60)

    datasets = {
        'Node Metrics': node_metrics_df,
        'Pod Scheduling': pod_scheduling_df
    }

    quality_report = {}
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    for name, df in datasets.items():
        print(f"\n{name.upper()}:")

        if df.empty:
            print("   ERROR Dataset vide")
            quality_report[name] = {'status': 'EMPTY', 'score': 0}
            continue

        # Basic statistics.
        total_rows = len(df)
        unique_timestamps = df['timestamp'].nunique()
        date_range = (df['timestamp'].max() - df['timestamp'].min()).days

        print(f"   Lignes: {total_rows:,}")
        print(f"   Timestamps uniques: {unique_timestamps:,}")
        print(f"   Période couverte: {date_range} jours")

        # The pod frame carries both pod_name and node_name, so pod_name has
        # to be tested first or the pod summary is never printed.
        if 'pod_name' in df.columns:
            print(f"   Pods: {df['pod_name'].nunique()}")
            print(f"   Namespaces: {df['namespace'].nunique()}")
        elif 'node_name' in df.columns:
            print(f"   Nodes HYDATIS: {df['node_name'].nunique()}")
            print(f"   Métriques: {df['metric'].nunique()}")

        # Share of non-null cells, as a percentage.
        total_cells = total_rows * len(df.columns)
        missing_values = df.isnull().sum().sum()
        completeness = ((total_cells - missing_values) / total_cells) * 100
        print(f"   Complétude: {completeness:.1f}%")

        score, status = _quality_score(total_rows, date_range, completeness,
                                       unique_timestamps)
        print(f"   Score qualité: {score}/100 - {status}")

        quality_report[name] = {
            'status': status,
            'score': score,
            'rows': total_rows,
            'completeness': completeness,
            'days': date_range
        }

        # Persist only datasets of sufficient quality.
        if score < 50:
            continue

        dataset_name = name.lower().replace(' ', '_')

        csv_path = DATA_PATH / f"{dataset_name}_{timestamp}.csv"
        df.to_csv(csv_path, index=False)
        print(f"   Sauvegardé CSV: {csv_path}")

        parquet_path = DATA_PATH / f"{dataset_name}_{timestamp}.parquet"
        try:
            df.to_parquet(parquet_path, index=False)
        except ImportError as e:
            # No Parquet engine installed: the CSV copy already exists, so
            # report the problem and continue instead of aborting the run.
            print(f"   WARNING Parquet non sauvegardé (moteur manquant): {e}")
        else:
            print(f"   Sauvegardé Parquet: {parquet_path}")

        # Small random sample for experiments.
        if len(df) > 10000:
            sample_df = df.sample(n=min(5000, len(df)//10))
            sample_path = EXPERIMENTS_PATH / f"{dataset_name}_sample_{timestamp}.csv"
            sample_df.to_csv(sample_path, index=False)
            print(f"   Échantillon: {sample_path} ({len(sample_df)} lignes)")

    # Global score: mean over the datasets that scored above 0.
    valid_scores = [r['score'] for r in quality_report.values() if r['score'] > 0]
    if not valid_scores:
        return False, {}

    avg_score = sum(valid_scores) / len(valid_scores)
    ml_ready = avg_score >= 60
    global_status = "READY" if ml_ready else "NEEDS_IMPROVEMENT"

    print(f"\nSCORE GLOBAL HYDATIS: {avg_score:.1f}/100 - {global_status}")
    print(f"Prêt pour développement ML: {'SUCCESS OUI' if ml_ready else 'ERROR NON'}")

    datasets_summary = {}
    for name, df in datasets.items():
        if df.empty:
            datasets_summary[name] = {'rows': 0, 'columns': [], 'date_range': None}
            continue
        has_timestamp = 'timestamp' in df.columns
        datasets_summary[name] = {
            'rows': len(df),
            'columns': list(df.columns),
            'date_range': {
                'start': df['timestamp'].min().isoformat() if has_timestamp else None,
                'end': df['timestamp'].max().isoformat() if has_timestamp else None
            }
        }

    metadata = {
        'collection_info': {
            'date': datetime.now().isoformat(),
            'cluster': 'HYDATIS',
            'environment': 'Jupyter Notebook wassimmezrani',
            'prometheus_url': active_prometheus_url
        },
        'cluster_info': cluster_metrics,
        'data_quality': quality_report,
        'ml_algorithms_config': ML_METRICS_CONFIG,
        'datasets_summary': datasets_summary,
        'storage_paths': {
            'data': str(DATA_PATH),
            'models': str(MODELS_PATH),
            'experiments': str(EXPERIMENTS_PATH)
        }
    }

    metadata_path = DATA_PATH / f"ml_scheduler_metadata_{timestamp}.json"
    with open(metadata_path, 'w', encoding='utf-8') as f:
        # default=str serialises values json cannot encode natively, such as
        # the Path objects held in ML_METRICS_CONFIG.
        json.dump(metadata, f, indent=2, default=str)

    print(f"\nMétadonnées complètes: {metadata_path}")

    return ml_ready, metadata

# Final quality analysis and persistence; keep the readiness flag and the
# metadata that was written.
(ml_ready, collection_metadata) = analyze_and_save_data()

## Résumé et Prochaines Étapes

In [None]:
# Final recap of the collection run: environment, collected data, storage
# outcome, next steps and the files present on the mounted volumes.
_rule = "=" * 60
print("\n" + _rule)
print("RÉSUMÉ COLLECTE DONNÉES ML-SCHEDULER HYDATIS")
print(_rule)

print("\nENVIRONNEMENT:")
print("  Cluster: HYDATIS (6 nodes: 3 masters + 3 workers)")
print("  Jupyter: wassimmezrani namespace")
print(f"  Prometheus: {active_prometheus_url or 'Non disponible'}")
print(f"  Date collecte: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

print("\nDONNÉES COLLECTÉES:")
if node_metrics_df.empty:
    print("  ERROR Node Metrics: Aucune donnée collectée")
else:
    _node_span = node_metrics_df['timestamp'].max() - node_metrics_df['timestamp'].min()
    print(f"  SUCCESS Node Metrics: {len(node_metrics_df):,} points")
    print(f"     - Nodes HYDATIS: {node_metrics_df['node_name'].nunique()}")
    print(f"     - Métriques: {node_metrics_df['metric'].nunique()}")
    print(f"     - Période: {_node_span.days} jours")

if pod_scheduling_df.empty:
    print("  ERROR Pod Scheduling: Aucune donnée collectée")
else:
    print(f"  SUCCESS Pod Scheduling: {len(pod_scheduling_df):,} points")
    print(f"     - Pods uniques: {pod_scheduling_df['pod_name'].nunique()}")
    print(f"     - Namespaces: {pod_scheduling_df['namespace'].nunique()}")
    print(f"     - Métriques: {pod_scheduling_df['metric'].nunique()}")

print("\nSTOCKAGE:")
if ml_ready:
    _storage_lines = (
        f"  SUCCESS Données sauvegardées dans {DATA_PATH}",
        f"  SUCCESS Échantillons dans {EXPERIMENTS_PATH}",
        "  SUCCESS Métadonnées complètes générées",
    )
else:
    _storage_lines = ("  ERROR Qualité insuffisante - sauvegarde partielle",)
for _line in _storage_lines:
    print(_line)

print("\nSTATUS FINAL:")
if ml_ready:
    _status_lines = (
        "  SUCCESS READY FOR ML DEVELOPMENT",
        "  \nPROCHAINES ÉTAPES:",
        "    1. Développer XGBoost Predictor (prédiction charge nodes)",
        "    2. Développer Q-Learning Optimizer (optimisation scheduling)",
        "    3. Développer Isolation Forest Detector (détection anomalies)",
        "    4. Intégrer algorithmes dans ML-Scheduler",
        "    5. Tests et validation sur cluster HYDATIS",
        "    6. Déploiement production",
    )
else:
    _status_lines = (
        "  WARNING NEEDS IMPROVEMENT",
        "  \nACTIONS REQUISES:",
        "    1. Vérifier connectivité Prometheus",
        "    2. Augmenter période de rétention données",
        "    3. Relancer collecte avec paramètres ajustés",
    )
for _line in _status_lines:
    print(_line)

print("\nFICHIERS DISPONIBLES:")
data_files = list(DATA_PATH.glob("*"))
if data_files:
    print(f"  Répertoire /data: {len(data_files)} fichiers")
    # Last five entries in name order; directories are reported as 0 MB.
    for file_path in sorted(data_files)[-5:]:
        size_mb = 0
        if file_path.is_file():
            size_mb = file_path.stat().st_size / (1024*1024)
        print(f"    {file_path.name} ({size_mb:.1f} MB)")

experiment_files = list(EXPERIMENTS_PATH.glob("*"))
if experiment_files:
    print(f"  Répertoire /experiments: {len(experiment_files)} fichiers")

print("\n" + _rule)
print(f"COLLECTE {'TERMINÉE AVEC SUCCÈS' if ml_ready else 'PARTIELLEMENT RÉUSSIE'}")
print(_rule)