# Coordination Engine Integration for Self-Healing Platform

## Overview
This notebook demonstrates how to integrate trained anomaly detection models with the Self-Healing Platform's coordination engine. It shows the complete workflow from anomaly detection to automated remediation actions.

## Prerequisites
- Trained anomaly detection models (from 02-anomaly-detection notebooks)
- Running coordination engine in the cluster
- Access to OpenShift API and Prometheus metrics

## Expected Outcomes
- Understand coordination engine API and integration patterns
- Implement real-time anomaly detection pipeline
- Demonstrate automated remediation workflows
- Test end-to-end self-healing scenarios

## References
- ADR-002: Hybrid Deterministic-AI Self-Healing Approach
- ADR-009: Bootstrap Deployment Automation Architecture
- ADR-012: Notebook Architecture for End-to-End Workflows

## Setup and Configuration

In [None]:
# Import required libraries
import sys
import os
from pathlib import Path

# Setup path for utils module - works from any directory
def find_utils_path():
    """Locate the notebooks ``utils`` directory regardless of the working directory.

    Candidate directories are probed in order and the first one that contains
    ``common_functions.py`` is returned.

    Returns:
        str | None: The matching directory as a string, or None when no
        candidate contains ``common_functions.py``.
    """
    candidates = []

    # ``dir()`` called inside a function only lists local names, so a
    # ``'__file__' in dir()`` guard here is always False. Look the name up in
    # the module globals instead; it is simply absent when the code runs in an
    # interactive kernel, in which case the candidate is skipped.
    module_file = globals().get('__file__')
    if module_file:
        candidates.append(Path(module_file).resolve().parent.parent / 'utils')

    candidates.extend([
        Path.cwd() / 'notebooks' / 'utils',
        Path.cwd().parent / 'utils',
        Path('/workspace/repo/notebooks/utils'),
        Path('/opt/app-root/src/notebooks/utils'),
    ])

    for candidate in candidates:
        if candidate.exists() and (candidate / 'common_functions.py').exists():
            return str(candidate)
    return None

# Put the shared helper directory first on the import path when it was found.
utils_path = find_utils_path()
if utils_path is not None:
    sys.path.insert(0, utils_path)
    print(f"✅ Utils path found: {utils_path}")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import requests
import json
import time
import joblib
import warnings
warnings.filterwarnings('ignore')

# The Kubernetes client is optional; record whether it could be imported.
try:
    from kubernetes import client, config
except ImportError:
    k8s_available = False
    print("⚠️ Kubernetes client not available - using simulation mode")
else:
    k8s_available = True
    print("✅ Kubernetes client available")

# Prefer the shared helpers from the utils module; when they cannot be
# imported, define minimal stand-ins with the same names.
try:
    from common_functions import (
        generate_synthetic_timeseries,
        load_processed_data,
        print_environment_info,
        setup_environment,
        validate_data_quality,
    )
except ImportError:
    print("⚠️ Using fallback implementations")

    def setup_environment():
        """Create the default data/model directories and return their locations."""
        for directory in ('/opt/app-root/src/data/processed', '/opt/app-root/src/models'):
            os.makedirs(directory, exist_ok=True)
        return {'data_dir': '/opt/app-root/src/data', 'models_dir': '/opt/app-root/src/models'}

    def print_environment_info(env_info):
        """Print the data directory recorded in ``env_info`` ('N/A' if missing)."""
        data_dir = env_info.get('data_dir', 'N/A')
        print(f"📁 Data dir: {data_dir}")

    def generate_synthetic_timeseries(*args, **kwargs):
        """Return a 100-row frame with one random ``value`` column; arguments are ignored."""
        random_values = np.random.random(100)
        return pd.DataFrame({'value': random_values})

    def validate_data_quality(df, name=''):
        """Report every input as valid without inspecting it."""
        return {'valid': True}

    def load_processed_data(filename):
        """Return None; the fallback does not load anything."""
        return None
else:
    print("✅ Common functions imported")

print("✅ Libraries imported successfully")

In [None]:
# Prepare the working directories and show where they are.
env_info = setup_environment()
print_environment_info(env_info)

# Connection settings for the coordination engine.
COORDINATION_ENGINE_CONFIG = dict(
    base_url='http://coordination-engine:8080',
    timeout=30,
    retry_attempts=3,
    retry_delay=5,
)

# Catalogue of self-healing actions, keyed by action name. Each row below is
# (action name, description, severity threshold, cooldown in minutes).
REMEDIATION_ACTIONS = {
    action_name: {
        'description': description,
        'severity_threshold': severity_threshold,
        'cooldown_minutes': cooldown_minutes,
    }
    for action_name, description, severity_threshold, cooldown_minutes in (
        ('pod_restart', 'Restart problematic pods', 0.7, 10),
        ('scale_up', 'Scale up deployment replicas', 0.8, 15),
        ('resource_adjustment', 'Adjust resource limits/requests', 0.6, 30),
        ('alert_escalation', 'Escalate to human operators', 0.9, 5),
    )
}

print(f"🎯 Coordination engine: {COORDINATION_ENGINE_CONFIG['base_url']}")
print(f"🔧 Available remediation actions: {len(REMEDIATION_ACTIONS)}")

## Coordination Engine Client

Implement a client for communicating with the coordination engine.

In [None]:
class CoordinationEngineClient:
    """
    Client for interacting with the Self-Healing Platform coordination engine.

    This client uses the coordination engine's KServe proxy to call ML models
    and submits incidents for remediation orchestration.

    Architecture (ADR-039, ADR-040):
        Notebook → Coordination Engine → KServe InferenceServices
                   /api/v1/detect        (user-deployed models)

    Every method is best-effort: a failure is printed and signalled through
    the return value (``None`` or ``False``) instead of being raised.
    """

    def __init__(self, base_url, timeout=30):
        """
        Args:
            base_url (str): Root URL of the coordination engine; trailing
                slashes are stripped.
            timeout (int | float): Timeout passed to every HTTP request.
        """
        self.base_url = base_url.rstrip('/')
        self.timeout = timeout
        self.session = requests.Session()

    def _request_json(self, method, path, failure_message, payload=None):
        """
        Send one request and return the decoded JSON body.

        Shared by all JSON endpoints so that they report errors identically.

        Args:
            method (str): HTTP method, e.g. 'GET' or 'POST'.
            path (str): Path appended to ``base_url`` (must start with '/').
            failure_message (str): Text printed in front of the error.
            payload (dict | None): JSON body to send, if any.

        Returns:
            Decoded JSON response, or None when the request, the status
            check, or the JSON decoding failed.
        """
        try:
            response = self.session.request(
                method,
                f"{self.base_url}{path}",
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            # Deliberately broad: the notebook must keep running when the
            # engine is unreachable or answers with something unexpected.
            print(f"❌ {failure_message}: {e}")
            return None

    def health_check(self):
        """
        Check if coordination engine is healthy

        Returns:
            bool: True when ``/health`` answers with HTTP 200, False on any
            other status or on error.
        """
        try:
            response = self.session.get(
                f"{self.base_url}/health",
                timeout=self.timeout
            )
            return response.status_code == 200
        except Exception as e:
            print(f"❌ Health check failed: {e}")
            return False

    def detect_anomaly(self, model_name, instances):
        """
        Call coordination engine's KServe proxy for anomaly detection.

        The coordination engine proxies requests to KServe InferenceServices,
        allowing users to add custom models via values-hub.yaml configuration.

        Args:
            model_name (str): KServe model name (e.g., 'anomaly-detector', 'predictive-analytics')
                             Users can add custom models via values-hub.yaml
            instances (list): List of feature vectors [[f1, f2, ...]]

        Returns:
            dict: Detection results from KServe model, or None on failure
                {
                    "predictions": [-1, 1],  # -1=anomaly, 1=normal
                    "model_name": "anomaly-detector",
                    "model_version": "v2"
                }

        Example:
            result = client.detect_anomaly(
                model_name="anomaly-detector",
                instances=[[0.5, 1.2, 0.8]]
            )
        """
        return self._request_json(
            'POST',
            '/api/v1/detect',
            'Anomaly detection failed',
            payload={
                "model": model_name,
                "instances": instances
            }
        )

    def submit_incident(self, incident_data):
        """
        Submit incident to coordination engine for remediation.

        Args:
            incident_data (dict): Incident details including anomaly info
                {
                    'timestamp': '2025-01-07T...',
                    'type': 'anomaly_detected',
                    'severity': 'high',
                    'metrics': {...},
                    'prediction': {...}
                }

        Returns:
            dict: Incident ID and status, or None on failure
                {
                    'id': 'incident-123',
                    'status': 'pending'
                }
        """
        return self._request_json(
            'POST',
            '/api/v1/incidents',
            'Failed to submit incident',
            payload=incident_data
        )

    def get_incident_status(self, incident_id):
        """
        Get status of an incident

        Args:
            incident_id (str): Incident identifier

        Returns:
            dict: Incident status and remediation details, or None on failure
        """
        from urllib.parse import quote

        # Percent-encode the identifier so characters such as '/', '?' or
        # spaces cannot change which URL is requested.
        encoded_id = quote(str(incident_id), safe='')
        return self._request_json(
            'GET',
            f"/api/v1/incidents/{encoded_id}",
            'Failed to get incident status'
        )

    def list_models(self):
        """
        List all registered KServe models available through coordination engine

        Returns:
            dict: List of available models, or None on failure
                {
                    'models': ['anomaly-detector', 'predictive-analytics', ...],
                    'count': 2
                }
        """
        return self._request_json(
            'GET',
            '/api/v1/models',
            'Failed to list models'
        )

    def get_metrics(self):
        """
        Get coordination engine metrics

        Returns:
            tuple: ``(True, body_text)`` when ``/metrics`` answers with HTTP
            200, otherwise ``(False, None)``.
        """
        try:
            response = self.session.get(
                f"{self.base_url}/metrics",
                timeout=self.timeout
            )
            succeeded = response.status_code == 200
            return succeeded, response.text if succeeded else None
        except Exception as e:
            print(f"❌ Failed to get metrics: {e}")
            return False, None

# Build the shared client from the notebook-wide configuration.
coord_client = CoordinationEngineClient(
    COORDINATION_ENGINE_CONFIG['base_url'],
    timeout=COORDINATION_ENGINE_CONFIG['timeout'],
)

# Probe the engine once so that connectivity problems show up right away.
print("🔍 Testing coordination engine connection...")
engine_is_healthy = coord_client.health_check()
if not engine_is_healthy:
    print("⚠️ Coordination engine not available - using simulation mode")
    print("   Make sure coordination engine is deployed with KServe integration enabled")
else:
    print("✅ Coordination engine is healthy")

    # Model listing is optional; stay quiet when the call yields nothing.
    models = coord_client.list_models()
    if models:
        print(f"📊 Available models: {models.get('models', [])}")

print("✅ Coordination engine client initialized")

## Architecture: Coordination Engine as KServe Proxy

This notebook demonstrates the **production-ready architecture** where the coordination engine acts as a proxy to KServe InferenceServices.

### Architecture Overview (ADR-039, ADR-040)

```
┌────────────────┐           ┌─────────────────────┐           ┌───────────────────────┐
│   Notebook     │           │ Coordination Engine │           │ KServe Models         │
│                │           │                     │           │                       │
│  - Extract     │           │ - KServe proxy      │           │ - anomaly-detector    │
│    features    │──POST────>│ - Incident mgmt     │──HTTP────>│ - predictive-analytics│
│  - Submit      │           │ - Remediation       │           │ - user-custom-models  │
│    incidents   │<──JSON────│   orchestration     │<──JSON────│                       │
└────────────────┘           └─────────────────────┘           └───────────────────────┘
      /api/v1/detect                                               KServe v1 API
      /api/v1/incidents
```

### Benefits

- ✅ **Central orchestration**: Coordination engine as single point of control
- ✅ **Easy forking**: Users modify notebook without changing ML infrastructure
- ✅ **Extensible**: Add custom models via `values-hub.yaml` (ADR-040)
- ✅ **Platform agnostic**: Works on vanilla Kubernetes and OpenShift
- ✅ **GitOps-native**: All configuration in version control

### How It Works

1. **Notebook extracts features** from metrics
2. **Calls coordination engine** `/api/v1/detect` with model name and instances
3. **Coordination engine proxies** to KServe InferenceService
4. **KServe model returns** predictions
5. **If anomaly detected**, notebook submits incident to `/api/v1/incidents`
6. **Coordination engine orchestrates** remediation

### Adding Custom Models (ADR-040)

Users can add their own KServe models in `values-hub.yaml`:

```yaml
coordinationEngine:
  kserve:
    enabled: true
    namespace: self-healing-platform
    services:
      # Platform default models
      anomaly_detector: "anomaly-detector-predictor"
      predictive_analytics: "predictive-analytics-predictor"
      
      # Add your custom models ⬇️
      disk_failure_predictor: "disk-failure-predictor-predictor"
      postgres_query_anomaly: "postgres-query-anomaly-predictor"
```

**Workflow**:
1. Train model in notebook or locally
2. Deploy as KServe InferenceService
3. Register in `values-hub.yaml`
4. Use from any notebook via coordination engine

See: `docs/guides/USER-MODEL-DEPLOYMENT-GUIDE.md`

### No Local ML Model Needed

This notebook **does not** include a local ML model (no `AnomalyDetectionPipeline`).
All ML inference is handled by the coordination engine's KServe proxy.

## Self-Healing Workflow Implementation

Implement the complete self-healing workflow from anomaly detection to remediation.

In [None]:
class SelfHealingWorkflow:
    """
    End-to-end self-healing workflow built on the coordination engine.

    For each set of metrics the workflow:
    1. Turns the metrics into a feature vector
    2. Asks the coordination engine for predictions (it proxies to KServe)
    3. Submits one incident to the coordination engine per anomalous prediction

    Model inference, incident management and remediation orchestration are
    delegated to the coordination engine; this class only keeps a local
    history of the incidents it submitted.
    """

    # Metric keys in the order they appear in the feature vector.
    _FEATURE_KEYS = ('cpu_usage', 'memory_usage', 'response_time', 'pod_count')

    def __init__(self, coord_client):
        self.coord_client = coord_client  # the only external dependency
        self.remediation_history = []
        self.cooldown_tracker = {}

    def process_metrics(self, metrics_data, model_name="anomaly-detector"):
        """
        Run one set of metrics through the coordination engine.

        The metrics are converted to features and sent to
        ``detect_anomaly``; each prediction equal to -1 is submitted as an
        incident and, when the submission succeeds, appended to
        ``remediation_history``.

        Args:
            metrics_data (dict): Metrics to analyze
            model_name (str): KServe model to use (default: 'anomaly-detector')

        Returns:
            dict: The detection result as returned by the client, or None
            when the client returned nothing.
        """
        print(f"📊 Processing metrics at {datetime.now()}")

        feature_rows = self._extract_features(metrics_data).tolist()

        print(f"🔍 Calling coordination engine /api/v1/detect (model: {model_name})")
        detection_result = self.coord_client.detect_anomaly(
            model_name=model_name,
            instances=feature_rows
        )
        if not detection_result:
            print("⚠️ Detection failed - coordination engine unavailable")
            return None

        predictions = detection_result.get('predictions', [])
        print(f"📊 Received {len(predictions)} predictions from KServe")

        for prediction in predictions:
            # -1 marks an anomaly (sklearn IsolationForest convention).
            if prediction != -1:
                continue
            self._submit_anomaly(metrics_data, model_name, prediction)

        return detection_result

    def _submit_anomaly(self, metrics_data, model_name, prediction):
        """Submit one incident for an anomalous prediction and record it on success."""
        print(f"🚨 Anomaly detected! (prediction: {prediction})")

        incident = dict(
            timestamp=datetime.now().isoformat(),
            type='anomaly_detected',
            severity='high',
            metrics=metrics_data,
            model=model_name,
            prediction=prediction,
            status='pending',
        )

        print("📝 Submitting incident to coordination engine...")
        incident_result = self.coord_client.submit_incident(incident)
        if not incident_result:
            print("❌ Failed to submit incident")
            return

        incident_id = incident_result.get('id', 'unknown')
        print(f"✅ Incident submitted: {incident_id}")

        history_entry = {
            'incident_id': incident_id,
            'timestamp': datetime.now(),
            'metrics': metrics_data,
            'model': model_name,
            'prediction': prediction
        }
        self.remediation_history.append(history_entry)

    def _extract_features(self, metrics_data):
        """
        Build the feature vector for one set of metrics.

        The values of cpu_usage, memory_usage, response_time and pod_count
        are used directly, in that order; a missing key counts as 0.

        Args:
            metrics_data (dict): Metrics to convert to features

        Returns:
            np.array: Array holding a single feature row
        """
        feature_row = [metrics_data.get(key, 0) for key in self._FEATURE_KEYS]
        return np.array([feature_row])

    def get_remediation_summary(self):
        """
        Summarize the incidents recorded in ``remediation_history``.

        Returns:
            dict: Incident count, distinct model names, and the timestamps of
            the first and last recorded incident; or a zero count with a
            message when nothing has been recorded.
        """
        if not self.remediation_history:
            return {
                'total_incidents': 0,
                'message': 'No incidents submitted yet'
            }

        history = pd.DataFrame(self.remediation_history)
        oldest, newest = history.iloc[0], history.iloc[-1]

        return {
            'total_incidents': len(history),
            'models_used': history['model'].unique().tolist(),
            'first_incident': oldest['timestamp'],
            'last_incident': newest['timestamp']
        }

# Initialize self-healing workflow
# The workflow only needs the coordination engine client created earlier;
# it is kept in a module-level variable so the following cells can use it.
print("🔧 Initializing self-healing workflow...")
healing_workflow = SelfHealingWorkflow(coord_client)
print("✅ Self-healing workflow initialized")
print("\n📌 Workflow uses coordination engine's KServe proxy for ML inference")
print("   Models are deployed as KServe InferenceServices (ADR-039)")

In [None]:
# Simulate different anomaly scenarios
print("🎭 Running Self-Healing Demonstration")
print("=" * 50)


def _report_anomaly_count(result):
    """Print how many predictions in a ``process_metrics`` result are anomalies.

    ``process_metrics`` returns the detection response dict (whose
    ``'predictions'`` list uses -1 for an anomaly) or None when detection
    failed. The result has no ``'is_anomaly'`` column, so indexing it that way
    raises; count the -1 predictions instead and tolerate a missing result.
    """
    if not result:
        print("Result: detection unavailable")
        return
    anomaly_count = sum(1 for p in result.get('predictions', []) if p == -1)
    print(f"Result: {anomaly_count} anomalies detected")


# Scenario 1: Normal metrics (should not trigger)
print("\n📊 Scenario 1: Normal Operations")
normal_metrics = {
    'cpu_usage': 45.2,
    'memory_usage': 62.1,
    'pod_count': 12,
    'response_time': 150
}
result1 = healing_workflow.process_metrics(normal_metrics)
_report_anomaly_count(result1)

# Scenario 2: High CPU (moderate anomaly)
print("\n📊 Scenario 2: High CPU Usage")
high_cpu_metrics = {
    'cpu_usage': 95.8,
    'memory_usage': 68.3,
    'pod_count': 12,
    'response_time': 450,
    'cpu_high': True  # Extra flag: not part of the feature vector, only travels with the incident
}
result2 = healing_workflow.process_metrics(high_cpu_metrics)
_report_anomaly_count(result2)

# Scenario 3: Memory leak (severe anomaly)
print("\n📊 Scenario 3: Memory Leak Detected")
memory_leak_metrics = {
    'cpu_usage': 78.2,
    'memory_usage': 98.7,
    'pod_count': 15,
    'response_time': 2500,
    'memory_leak': True  # Extra flag: not part of the feature vector, only travels with the incident
}
result3 = healing_workflow.process_metrics(memory_leak_metrics)
_report_anomaly_count(result3)

# Scenario 4: Critical system failure
print("\n📊 Scenario 4: Critical System Failure")
critical_metrics = {
    'cpu_usage': 99.9,
    'memory_usage': 99.2,
    'pod_count': 3,  # Many pods crashed
    'response_time': 10000,
    'cpu_high': True,
    'memory_leak': True
}
result4 = healing_workflow.process_metrics(critical_metrics)
_report_anomaly_count(result4)

# Wait a moment to simulate time passing
print("\n⏰ Waiting 2 seconds...")
time.sleep(2)

# Scenario 5: Re-send the same critical metrics.
# NOTE: process_metrics does not consult the workflow's cooldown_tracker, so
# this notebook does not suppress the repeat; any cooldown would have to be
# applied by the coordination engine.
print("\n📊 Scenario 5: Testing Cooldown Period")
result5 = healing_workflow.process_metrics(critical_metrics)  # Same critical metrics
_report_anomaly_count(result5)

print("\n🎉 Demonstration completed!")

In [None]:
# Walk through the coordination engine integration scenario by scenario.
print("🎭 Coordination Engine Integration Demo")
print("=" * 60)
print("\n📌 Architecture: Notebook → Coordination Engine → KServe Models")
print("   - Notebook calls /api/v1/detect")
print("   - Coordination engine proxies to KServe InferenceService")
print("   - If anomaly detected, submit incident via /api/v1/incidents\n")

# Scenario 1: metrics in a normal range (no anomaly expected)
print("\n📊 Scenario 1: Normal Operations")
print("-" * 60)
normal_metrics = dict(
    cpu_usage=45.2,
    memory_usage=62.1,
    pod_count=12,
    response_time=150,
)
print(f"Metrics: {normal_metrics}")
result1 = healing_workflow.process_metrics(normal_metrics, model_name="anomaly-detector")
if result1:
    predictions = result1.get('predictions', [])
    anomalies = len([p for p in predictions if p == -1])
    print(f"✅ Result: {anomalies}/{len(predictions)} anomalies detected\n")

# Scenario 2: high CPU together with a slow response time (anomaly likely)
print("\n📊 Scenario 2: High CPU & Response Time")
print("-" * 60)
high_cpu_metrics = dict(
    cpu_usage=95.8,
    memory_usage=68.3,
    pod_count=12,
    response_time=1200,
)
print(f"Metrics: {high_cpu_metrics}")
result2 = healing_workflow.process_metrics(high_cpu_metrics, model_name="anomaly-detector")
if result2:
    predictions = result2.get('predictions', [])
    anomalies = len([p for p in predictions if p == -1])
    print(f"📊 Result: {anomalies}/{len(predictions)} anomalies detected")
    if anomalies > 0:
        print("   → Incident submitted to coordination engine for remediation\n")

# Scenario 3: critical system state (anomaly expected)
print("\n📊 Scenario 3: Critical System State")
print("-" * 60)
critical_metrics = dict(
    cpu_usage=99.9,
    memory_usage=98.7,
    pod_count=3,  # Many pods crashed
    response_time=5000,
)
print(f"Metrics: {critical_metrics}")
result3 = healing_workflow.process_metrics(critical_metrics, model_name="anomaly-detector")
if result3:
    predictions = result3.get('predictions', [])
    anomalies = len([p for p in predictions if p == -1])
    print(f"🚨 Result: {anomalies}/{len(predictions)} anomalies detected")
    if anomalies > 0:
        print("   → Incident submitted to coordination engine for remediation\n")

# Scenario 4: exercise a custom model when one is registered
print("\n📊 Scenario 4: Custom Model Test (Optional)")
print("-" * 60)
print("Testing if custom models are available...")
models_list = coord_client.list_models()
if not models_list:
    print("⚠️ Could not retrieve model list - coordination engine may not support /api/v1/models yet")
else:
    available_models = models_list.get('models', [])
    print(f"✅ Available models: {available_models}")

    # Anything that is not one of the platform defaults counts as custom.
    default_models = ['anomaly-detector', 'predictive-analytics']
    custom_models = [m for m in available_models if m not in default_models]

    if not custom_models:
        print("   No custom models deployed yet")
        print("   Users can add custom models via values-hub.yaml (see ADR-040)")
    else:
        custom_model = custom_models[0]
        print(f"\n🔬 Testing custom model: {custom_model}")
        result4 = healing_workflow.process_metrics(critical_metrics, model_name=custom_model)
        if result4:
            print("✅ Custom model call successful!")

print("\n🎉 Demonstration completed!")
print("\n" + "=" * 60)
print("\n" + "=" * 60)

In [None]:
# Report what the workflow submitted during the demonstration.
print("📋 Self-Healing Workflow Summary")
print("=" * 60)

summary = healing_workflow.get_remediation_summary()

if summary.get('total_incidents', 0) <= 0:
    # Nothing was submitted: show the explanatory message instead.
    print(summary.get('message', 'No incidents submitted'))
else:
    print(f"Total incidents submitted: {summary['total_incidents']}")
    print(f"Models used: {', '.join(summary['models_used'])}")

    if summary.get('first_incident'):
        print(f"First incident: {summary['first_incident']}")
    if summary.get('last_incident'):
        print(f"Last incident: {summary['last_incident']}")

# Closing banner with follow-up steps and references, printed line by line.
closing_lines = (
    "\n" + "=" * 60,
    "\n✅ Self-healing workflow demonstration completed successfully!",
    "\n🔗 Next Steps:",
    "   1. Deploy coordination engine with KServe integration (see GitHub issue #18)",
    "   2. Ensure KServe models are deployed and healthy",
    "   3. Run this notebook to test end-to-end integration",
    "   4. Add custom models via values-hub.yaml (ADR-040)",
    "\n📚 References:",
    "   - ADR-039: User-Deployed KServe Models",
    "   - ADR-040: Extensible KServe Model Registry",
    "   - GitHub Issue #18: Coordination Engine KServe Integration",
    "   - docs/guides/USER-MODEL-DEPLOYMENT-GUIDE.md",
)
for closing_line in closing_lines:
    print(closing_line)