# HYDATIS XGBoost Load Predictor Development - Week 5

Development and training of XGBoost models for CPU and Memory load prediction.

## Week 5 Objectives
- **Target Accuracy**: 89% CPU prediction, 86% Memory prediction
- **30+ MLflow experiments** with hyperparameter optimization
- **Production-ready model artifacts** for scheduler integration
- **Real-time serving endpoint** with <100ms latency

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from datetime import datetime, timedelta
import json
import mlflow
import mlflow.xgboost
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

import sys
sys.path.append('/home/jovyan/work/src')
from ml_models.xgboost.model import HYDATISXGBoostPredictor, XGBoostTrainingPipeline
from ml_models.xgboost.training import XGBoostHyperparameterOptimizer, XGBoostProductionTrainer
from mlflow_configs.experiment_config import HYDATISMLflowManager
from data_collection.ml_dataset_builder import MLDatasetBuilder

# Use the seaborn-flavoured matplotlib style sheet for every figure below.
plt.style.use('seaborn-v0_8')

# Banner: notebook title, wall-clock time of this run, and the accuracy goals.
run_started_at = datetime.now()
print("HYDATIS XGBoost Load Predictor Development - Week 5")
print(f"Development Date: {run_started_at}")
print("Target: 89% CPU accuracy, 86% Memory accuracy")

## 1. Setup MLflow Experiment Tracking

In [None]:
# Connect the project's MLflow wrapper to the tracking server and let it
# prepare its environment before any run is logged.
tracking_server = "http://10.110.190.32:31380"
mlflow_manager = HYDATISMLflowManager(tracking_uri=tracking_server)
mlflow_manager.setup_mlflow_environment()

# Report what the manager ended up with.
print("✓ MLflow Environment Setup Complete")
print(f"✓ Tracking URI: {mlflow_manager.tracking_uri}")
registered_experiments = list(mlflow_manager.experiments.keys())
print(f"✓ Experiments: {registered_experiments}")

# Every run started from this notebook is filed under this experiment.
mlflow.set_experiment('hydatis-xgboost-load-prediction')
print("✓ Active experiment: hydatis-xgboost-load-prediction")

## 2. Load and Analyze Training Dataset

In [None]:
# Build or load the training dataset, then print a short profile of it.
# `Path` is imported here because the notebook's import cell does not provide
# it; without this import the cell stops with a NameError.
from pathlib import Path

dataset_builder = MLDatasetBuilder(prometheus_url="http://10.110.190.83:9090")

# Look for datasets written by earlier runs.
dataset_dir = Path("/data/ml_scheduler_longhorn/ml_datasets")
existing_datasets = sorted(dataset_dir.glob("xgboost_load_prediction_*.parquet"))

if existing_datasets:
    # Paths sort lexicographically, so "latest" here means the greatest name.
    # NOTE(review): presumably the file names embed a sortable timestamp - confirm.
    dataset_path = str(existing_datasets[-1])
    print(f"Using existing dataset: {dataset_path}")
else:
    # Nothing on disk yet: build a dataset from the last 30 days.
    print("Building new training dataset...")
    saved_files = dataset_builder.build_complete_ml_pipeline(days_back=30)
    # Coerce to str so the suffix check below works even if the builder
    # returns a path object; a missing/None entry becomes the empty string.
    dataset_path = str(saved_files.get('xgboost') or '')

    if not dataset_path:
        print("❌ Failed to build dataset")
        raise ValueError("Dataset creation failed")

print(f"✓ Dataset ready: {dataset_path}")

# Load the dataset; anything that is not parquet is read as CSV.
if dataset_path.endswith('.parquet'):
    df = pd.read_parquet(dataset_path)
else:
    df = pd.read_csv(dataset_path)

# Short profile of the loaded frame.
# NOTE(review): the "Features" count excludes only the two identifier columns,
# so any target_* columns are counted as features as well - confirm intent.
non_id_columns = [col for col in df.columns if col not in ['timestamp', 'instance']]

print("\nDataset Analysis:")
print(f"- Total samples: {len(df):,}")
print(f"- Features: {len(non_id_columns)}")
print(f"- Time span: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"- Nodes: {df['instance'].nunique()}")

# Columns carrying prediction targets are recognised by their name prefix.
target_cols = [col for col in df.columns if col.startswith('target_')]
print(f"- Target variables: {target_cols}")

## 3. Model Training with Hyperparameter Optimization

In [None]:
# Create the predictor, the production trainer (bound to the MLflow manager)
# and the hyperparameter optimizer used for reporting the trial budget.
predictor = HYDATISXGBoostPredictor()
trainer = XGBoostProductionTrainer(mlflow_manager)
optimizer = XGBoostHyperparameterOptimizer(n_trials=30)

print("Starting comprehensive XGBoost training...")
print(f"Hyperparameter optimization: {optimizer.n_trials} trials per model")

# Train on the dataset selected earlier in the notebook.
training_results = trainer.run_comprehensive_training(dataset_path)

print("\n=== TRAINING RESULTS ===")
# default=str keeps the dump from failing on values json cannot encode natively.
print(json.dumps(training_results, indent=2, default=str))

# Compare each model's achieved accuracy against its target.
target_achievements = training_results.get('target_achievements', {})

for model_name in target_achievements:
    outcome = target_achievements[model_name]
    goal = outcome['target']
    reached = outcome['actual_accuracy']
    goal_met = outcome['achieved']

    if goal_met:
        verdict = "✅ TARGET ACHIEVED"
    else:
        verdict = "❌ TARGET MISSED"

    print(f"\n{model_name.upper()} Model:")
    print(f"  Target: {goal:.1%}")
    print(f"  Actual: {reached:.1%}")
    print(f"  Status: {verdict}")

    if not goal_met:
        # Report how far the model is from its target.
        shortfall = goal - reached
        print(f"  Gap: {shortfall:.1%} improvement needed")

## 4. Model Performance Analysis

In [None]:
# Show which input features matter most to each trained model.
if 'cpu' in training_results and 'memory' in training_results:

    # Despite its name this mapping is indexed by model key ('cpu_model',
    # 'memory_model'); each entry maps feature name -> importance score.
    cpu_importance = predictor.get_feature_importance()

    if cpu_importance:
        # (importance key, subplot title, heading of the printed ranking)
        model_panels = [
            ('cpu_model',
             'CPU Model - Top 15 Feature Importance',
             "\nTop 10 CPU Prediction Features:"),
            ('memory_model',
             'Memory Model - Top 15 Feature Importance',
             "\nTop 10 Memory Prediction Features:"),
        ]

        # One horizontal bar chart per model, side by side; a model without
        # importance data leaves its panel empty.
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        for panel_axis, (model_key, panel_title, _) in zip(axes, model_panels):
            if model_key not in cpu_importance:
                continue

            ranking = pd.DataFrame(list(cpu_importance[model_key].items()),
                                   columns=['feature', 'importance'])
            ranking = ranking.sort_values('importance', ascending=False).head(15)

            sns.barplot(data=ranking, y='feature', x='importance', ax=panel_axis)
            panel_axis.set_title(panel_title)
            panel_axis.set_xlabel('XGBoost Importance Score')

        plt.tight_layout()
        plt.show()

        # Text version of the rankings; the heading is printed even when a
        # model has no importance data.
        for model_key, _, heading in model_panels:
            print(heading)
            if model_key in cpu_importance:
                top_ten = sorted(cpu_importance[model_key].items(),
                                 key=lambda pair: pair[1], reverse=True)[:10]
                for rank, (feature, importance) in enumerate(top_ten, 1):
                    print(f"{rank:2d}. {feature}: {importance:.3f}")

## 5. Model Validation and Testing

In [None]:
# Sanity-check the predictor on the most recent rows of the dataset.
# NOTE(review): these rows come from the same dataset that was passed to the
# trainer, so the scores below may not be a true hold-out estimate - confirm.
print("Running comprehensive model validation...")


def _mape_accuracy(actual, predicted):
    """Return ``1 - mean(|actual - predicted| / (actual + 1e-8))``.

    The small constant only guards against division by exactly zero; targets
    that are very close to zero still dominate the score.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return 1 - np.mean(np.abs(actual - predicted) / (actual + 1e-8))


# Everything except the identifiers and the two 5-minute targets is an input.
feature_cols = [col for col in df.columns
                if col not in ['timestamp', 'instance', 'target_cpu_5m', 'target_memory_5m']]

# Fill gaps with per-column medians taken from the numeric feature frame
# itself. Calling df.median() on the whole frame also visits non-numeric
# columns such as 'instance' and can raise a TypeError.
numeric_features = df[feature_cols].select_dtypes(include=[np.number])
X = numeric_features.fillna(numeric_features.median())

# Predict on the most recent rows.
test_rows = 100
test_sample = X.tail(test_rows)
predictions = predictor.predict_load(test_sample)

print("\nModel Testing Results:")
print(f"- Test samples: {len(test_sample)}")
print(f"- CPU predictions range: {predictions['cpu_prediction'].min():.3f} - {predictions['cpu_prediction'].max():.3f}")
print(f"- Memory predictions range: {predictions['memory_prediction'].min():.3f} - {predictions['memory_prediction'].max():.3f}")

# Predictions are checked against the closed interval [0, 1].
cpu_realistic = np.all((predictions['cpu_prediction'] >= 0) & (predictions['cpu_prediction'] <= 1))
memory_realistic = np.all((predictions['memory_prediction'] >= 0) & (predictions['memory_prediction'] <= 1))

print("\nPrediction Validation:")
print(f"- CPU predictions realistic: {'✅' if cpu_realistic else '❌'}")
print(f"- Memory predictions realistic: {'✅' if memory_realistic else '❌'}")

# Score predictions against the recorded targets where they are available.
# Memory is scored the same way as CPU so both stated goals are checked.
evaluations = [
    ('CPU', 'target_cpu_5m', 'cpu_prediction', 0.89),
    ('Memory', 'target_memory_5m', 'memory_prediction', 0.86),
]

header_printed = False
for label, target_col, prediction_key, goal in evaluations:
    if target_col not in df.columns:
        continue

    # Same trailing rows as test_sample, since X keeps df's row order.
    actual_values = df[target_col].tail(test_rows).values
    predicted_values = predictions[prediction_key]

    if len(actual_values) != len(predicted_values):
        continue

    accuracy = _mape_accuracy(actual_values, predicted_values)

    if not header_printed:
        print("\nTest Set Performance:")
        header_printed = True

    print(f"- {label} Test Accuracy: {accuracy:.3f} (Target: {goal:.3f})")
    print(f"- {label} Target Status: {'✅ ACHIEVED' if accuracy >= goal else '❌ NEEDS IMPROVEMENT'}")

## 6. Production Model Deployment

In [None]:
# Persist the predictor's models, then confirm the saved artifacts can be
# loaded into a fresh predictor and still produce predictions.
print("Preparing models for production deployment...")

model_dir = "/data/ml_scheduler_longhorn/models/xgboost"
saved_files = predictor.save_models(model_dir)

print("\nProduction Models Saved:")
for artifact_type in saved_files:
    print(f"- {artifact_type}: {saved_files[artifact_type]}")

# Round-trip check using a brand-new predictor instance.
test_predictor = HYDATISXGBoostPredictor()
load_success = test_predictor.load_models(model_dir)

if load_success:
    load_status = '✅ SUCCESS'
else:
    load_status = '❌ FAILED'
print(f"\nModel Loading Test: {load_status}")

if load_success:
    # Smoke test: predict on the first five rows of the validation sample.
    smoke_rows = test_sample.head(5)
    test_pred = test_predictor.predict_load(smoke_rows)
    prediction_count = len(test_pred['cpu_prediction'])
    print(f"✓ Loaded model predictions working: {prediction_count} predictions")

## 7. Serving Engine Testing

In [None]:
# Exercise the serving engine: one node, a small cluster, then its health report.
from ml_models.xgboost.serving import XGBoostServingEngine

print("Testing XGBoost serving engine...")

# The engine is pointed at the directory the models were saved to.
serving_engine = XGBoostServingEngine(model_dir=model_dir)

if serving_engine.model_loaded:
    engine_status = '✅ READY'
else:
    engine_status = '❌ NOT READY'
print(f"Serving engine status: {engine_status}")

if serving_engine.model_loaded:
    # --- Single node: first validation row, tagged with a node name ---
    sample_node_features = test_sample.iloc[0].to_dict()
    sample_node_features['instance'] = 'worker-1'

    node_prediction = serving_engine.predict_node_load(sample_node_features)

    print("\nSingle Node Prediction Test:")
    if 'error' in node_prediction:
        print(f"❌ Error: {node_prediction['error']}")
    else:
        print(f"✅ Success - Latency: {node_prediction['serving_metrics']['prediction_latency_ms']:.2f}ms")
        print(f"- CPU prediction: {node_prediction['cpu_prediction']['value']:.3f}")
        print(f"- Memory prediction: {node_prediction['memory_prediction']['value']:.3f}")
        print(f"- Capacity score: {node_prediction['capacity_forecast']['overall_capacity_score']:.3f}")

    # --- Cluster: the first three validation rows act as three nodes ---
    cluster_features = [
        {**test_sample.iloc[node_index].to_dict(), 'instance': f'worker-{node_index+1}'}
        for node_index in range(3)
    ]

    cluster_prediction = serving_engine.predict_cluster_load(cluster_features)

    print("\nCluster Prediction Test:")
    if 'error' in cluster_prediction:
        print(f"❌ Error: {cluster_prediction['error']}")
    else:
        print(f"✅ Success - Latency: {cluster_prediction['cluster_latency_ms']:.2f}ms")
        cluster_summary = cluster_prediction['cluster_summary']
        print(f"- Nodes analyzed: {cluster_summary['total_nodes']}")
        print(f"- Best node: {cluster_summary['best_node']}")
        print(f"- Avg CPU prediction: {cluster_summary['avg_cpu_prediction']:.3f}")
        preferred_nodes = list(cluster_prediction['scheduling_recommendations']['preferred_nodes'])
        print(f"- Scheduling recommendations: {preferred_nodes}")

    # --- Health report collected by the engine over the calls above ---
    health = serving_engine.get_serving_health()
    print("\nServing Engine Health:")
    print(f"- Status: {health['status']}")
    performance = health['performance_metrics']
    print(f"- Average latency: {performance['average_latency_ms']:.2f}ms")
    print(f"- Latency target met: {'✅' if performance['latency_target_met'] else '❌'}")
    print(f"- Error rate: {performance['error_rate']:.2%}")

## 8. Week 5 Completion Summary

In [None]:
# Assemble, print and persist the Week 5 completion summary.
from pathlib import Path  # local import: the notebook's import cell lacks it

# `health` and `node_prediction` are only created by the serving cell when the
# engine loaded its models; fall back to "not met / unknown" instead of
# failing with a NameError when it did not.
serving_ready = bool(serving_engine.model_loaded)
if serving_ready:
    serving_metrics = health['performance_metrics']
    api_working = 'error' not in node_prediction
    serving_success = bool(serving_metrics['latency_target_met'])
    average_latency_ms = serving_metrics['average_latency_ms']
    prediction_reliability = 1 - serving_metrics['error_rate']
else:
    api_working = False
    serving_success = False
    average_latency_ms = None
    prediction_reliability = None

# Coerce to plain bools so the flags serialise as JSON true/false.
cpu_success = bool(target_achievements.get('cpu', {}).get('achieved', False))
memory_success = bool(target_achievements.get('memory', {}).get('achieved', False))

week5_summary = {
    'xgboost_development': {
        'cpu_model_trained': 'cpu' in training_results,
        'memory_model_trained': 'memory' in training_results,
        'hyperparameter_optimization': optimizer.n_trials,
        'mlflow_experiments_logged': True,
        'target_achievements': target_achievements
    },
    'production_readiness': {
        'models_saved': bool(saved_files),
        'serving_engine_tested': serving_ready,
        'api_endpoints_working': api_working,
        'latency_target_met': serving_success,
        'model_registry_ready': True
    },
    'performance_metrics': {
        'cpu_accuracy_achieved': cpu_success,
        'memory_accuracy_achieved': memory_success,
        'serving_latency_ms': average_latency_ms,
        'prediction_reliability': prediction_reliability
    },
    'week6_readiness': {
        'load_prediction_models': True,
        'feature_pipeline_validated': True,
        'mlflow_tracking_operational': True,
        'ready_for_qlearning': True
    }
}

print("\n=== WEEK 5 COMPLETION STATUS ===")
# default=str matches the training-results dump earlier in the notebook and
# keeps non-JSON-native values from aborting the summary.
print(json.dumps(week5_summary, indent=2, default=str))

# Week 5 counts as complete only when both accuracy targets and the serving
# latency target are met.
overall_success = cpu_success and memory_success and serving_success

print(f"\n{'✅' if overall_success else '⚠️'} WEEK 5 STATUS: {'COMPLETE' if overall_success else 'PARTIAL SUCCESS'}")

if overall_success:
    print("🚀 Ready for Week 6: Q-Learning Placement Optimizer (+34% improvement target)")
else:
    print("📋 Action items for Week 5 completion:")
    if not cpu_success:
        print("   • Improve CPU prediction accuracy to 89%")
    if not memory_success:
        print("   • Improve Memory prediction accuracy to 86%")
    if not serving_success:
        print("   • Optimize serving latency to <100ms")

# Persist the summary, creating the artifacts directory on first use.
summary_path = Path('/home/jovyan/artifacts/week5_xgboost_completion.json')
summary_path.parent.mkdir(parents=True, exist_ok=True)
with summary_path.open('w', encoding='utf-8') as f:
    json.dump({
        'week5_summary': week5_summary,
        'training_results': training_results,
        'overall_success': overall_success,
        'completion_timestamp': datetime.now().isoformat()
    }, f, indent=2, default=str)

print("\n✓ Week 5 summary saved to artifacts")