# Prometheus Metrics Monitoring

## Overview
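This notebook defines the Prometheus monitoring configuration for the self-healing platform: custom metric definitions, a scrape configuration, alert rules, and a Grafana dashboard covering all platform components. Each definition is built as a Python dictionary and printed as JSON, and a simulated set of metric collections is saved for validation. Deploying the definitions to the cluster is a follow-up step (see Next Steps).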

## Prerequisites
- Completed: All Phase 1-6 notebooks
- Prometheus deployed in cluster
- Grafana available for dashboards
- AlertManager configured

## Learning Objectives
- Add custom Prometheus metrics
- Create monitoring dashboards
- Set up alerts and notifications
- Monitor platform health
- Track component performance

## Key Concepts
- **Custom Metrics**: Application-specific metrics
- **Dashboards**: Visualize metrics in Grafana
- **Alerts**: Proactive notifications
- **Health Checks**: Component status monitoring
- **Performance Tracking**: Latency and throughput metrics

## Setup Section

In [None]:
import sys
import os
import json
import logging
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import requests
from typing import Dict, List, Any

# Setup path for utils module - works from any directory
def find_utils_path():
    """Locate the notebooks ``utils`` directory regardless of the working directory.

    Candidate locations are tried in order and the first existing directory
    that contains ``common_functions.py`` wins. If none match, the current
    working directory and up to four of its ancestors are searched for a
    ``notebooks/utils`` directory, which is accepted without checking its
    contents.

    Returns:
        The directory path as a string, or ``None`` if nothing was found.
    """
    # ``__file__`` is a module-level name and is normally undefined in a
    # notebook kernel. It must be looked up in globals(): inside a function,
    # dir() lists only local names, so a dir()-based check never sees it and
    # the script-relative candidate would be skipped unconditionally.
    module_file = globals().get('__file__')
    cwd = Path.cwd()

    candidates = []
    if module_file:
        candidates.append(Path(module_file).parent.parent / 'utils')
    candidates.extend([
        cwd / 'notebooks' / 'utils',
        cwd.parent / 'utils',
        Path('/workspace/repo/notebooks/utils'),
        Path('/opt/app-root/src/notebooks/utils'),
        Path('/opt/app-root/src/openshift-aiops-platform/notebooks/utils'),
    ])
    for candidate in candidates:
        if candidate.exists() and (candidate / 'common_functions.py').exists():
            return str(candidate)

    # Fallback: walk upwards from the working directory (cwd + 4 ancestors).
    for base in [cwd, *cwd.parents][:5]:
        utils_dir = base / 'notebooks' / 'utils'
        if utils_dir.exists():
            return str(utils_dir)
    return None

# Put the shared utils directory at the front of sys.path when it can be located
utils_path = find_utils_path()
if not utils_path:
    print("⚠️ Utils path not found - will use fallback implementations")
else:
    sys.path.insert(0, utils_path)
    print(f"✅ Utils path found: {utils_path}")

# Prefer the shared setup helper; define a local stand-in when it cannot be imported
try:
    from common_functions import setup_environment
except ImportError as import_error:
    print(f"⚠️ Common functions not available: {import_error}")

    def setup_environment():
        """Create the default data/model directories and return their locations."""
        for directory in ('/opt/app-root/src/data/processed', '/opt/app-root/src/models'):
            os.makedirs(directory, exist_ok=True)
        return dict(
            data_dir='/opt/app-root/src/data',
            models_dir='/opt/app-root/src/models',
        )
else:
    print("✅ Common functions imported")

# Logging goes through a module-level logger at INFO level
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Prepare the environment and report what the helper returned
env_info = setup_environment()
logger.info(f"Environment ready: {env_info}")

# Output locations used by the later cells
DATA_DIR = Path('/opt/app-root/src/data')
PROCESSED_DIR = DATA_DIR.joinpath('processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Endpoints can be overridden through environment variables
NAMESPACE = 'self-healing-platform'
REQUEST_TIMEOUT = 30
PROMETHEUS_URL = os.environ.get('PROMETHEUS_URL', 'http://prometheus:9090')
GRAFANA_URL = os.environ.get('GRAFANA_URL', 'http://grafana:3000')

logger.info("Prometheus monitoring initialized")
logger.info(f"Prometheus URL: {PROMETHEUS_URL}")
logger.info(f"Grafana URL: {GRAFANA_URL}")

## Implementation Section

### 1. Define Custom Metrics

In [None]:
# Metric catalogue for the self-healing platform, grouped by component
def _metric(kind, help_text, **extra):
    """Build one metric definition; extra keys (labels/buckets) follow 'type' and 'help'."""
    return {'type': kind, 'help': help_text, **extra}


custom_metrics = {
    'anomaly_detection': {
        'anomalies_detected_total': _metric(
            'counter', 'Total anomalies detected',
            labels=['anomaly_type', 'severity'],
        ),
        'anomaly_detection_latency_ms': _metric(
            'histogram', 'Anomaly detection latency',
            buckets=[10, 50, 100, 500, 1000, 5000],
        ),
        'model_accuracy': _metric(
            'gauge', 'Model accuracy score',
            labels=['model_name'],
        ),
    },
    'remediation': {
        'remediation_actions_total': _metric(
            'counter', 'Total remediation actions executed',
            labels=['action_type', 'status'],
        ),
        'remediation_success_rate': _metric(
            'gauge', 'Remediation success rate',
            labels=['action_type'],
        ),
        'remediation_latency_ms': _metric(
            'histogram', 'Remediation execution latency',
            buckets=[100, 500, 1000, 5000, 10000],
        ),
    },
    'platform_health': {
        'platform_uptime_seconds': _metric('counter', 'Platform uptime in seconds'),
        'component_health': _metric(
            'gauge', 'Component health status (1=healthy, 0=unhealthy)',
            labels=['component'],
        ),
        'incident_resolution_time_seconds': _metric(
            'histogram', 'Time to resolve incidents',
            buckets=[10, 30, 60, 300, 600, 1800],
        ),
    },
}

# Report the number of categories and show the full catalogue as JSON
logger.info(f"Custom metrics defined: {len(custom_metrics)} categories")
print(json.dumps(custom_metrics, indent=2))

### 2. Create Prometheus Scrape Config

In [None]:
# Scrape configuration: pods discovered in the platform namespace plus one static target
_pod_discovery_job = {
    'job_name': 'self-healing-platform',
    'kubernetes_sd_configs': [
        dict(role='pod', namespaces={'names': [NAMESPACE]}),
    ],
    'relabel_configs': [
        # Keep only targets whose scrape annotation label equals "true"
        dict(
            source_labels=['__meta_kubernetes_pod_annotation_prometheus_io_scrape'],
            action='keep',
            regex='true',
        ),
        # Copy a non-empty path annotation label into __metrics_path__
        dict(
            source_labels=['__meta_kubernetes_pod_annotation_prometheus_io_path'],
            action='replace',
            target_label='__metrics_path__',
            regex='(.+)',
        ),
    ],
}

_coordination_engine_job = {
    'job_name': 'coordination-engine',
    'static_configs': [dict(targets=['coordination-engine:8080'])],
}

scrape_config = {
    'global': dict(scrape_interval='15s', evaluation_interval='15s'),
    'scrape_configs': [_pod_discovery_job, _coordination_engine_job],
}

logger.info("Prometheus scrape config created")
print(json.dumps(scrape_config, indent=2))

### 3. Create Alert Rules

In [None]:
# Alert rules for platform monitoring.
# One rule group with four alerts; each alert must hold for its 'for' duration
# before it fires.
alert_rules = {
    'groups': [
        {
            'name': 'self-healing-platform',
            'interval': '30s',
            'rules': [
                {
                    'alert': 'HighAnomalyDetectionLatency',
                    # histogram_quantile() needs the per-bucket series (the
                    # `_bucket` suffix with its `le` label) turned into rates
                    # and aggregated by `le`. The bare histogram name matches
                    # no series, so the previous expression never fired.
                    'expr': (
                        'histogram_quantile(0.95, '
                        'sum by (le) (rate(anomaly_detection_latency_ms_bucket[5m]))'
                        ') > 1000'
                    ),
                    'for': '5m',
                    'labels': {'severity': 'warning'},
                    'annotations': {'summary': 'High anomaly detection latency'}
                },
                {
                    'alert': 'LowRemediationSuccessRate',
                    'expr': 'remediation_success_rate < 0.85',
                    'for': '10m',
                    'labels': {'severity': 'critical'},
                    'annotations': {'summary': 'Low remediation success rate'}
                },
                {
                    'alert': 'ComponentUnhealthy',
                    # component_health is defined as 1=healthy, 0=unhealthy
                    'expr': 'component_health == 0',
                    'for': '2m',
                    'labels': {'severity': 'critical'},
                    'annotations': {'summary': 'Component unhealthy'}
                },
                {
                    'alert': 'ModelAccuracyDegraded',
                    'expr': 'model_accuracy < 0.80',
                    'for': '15m',
                    'labels': {'severity': 'warning'},
                    'annotations': {'summary': 'Model accuracy degraded'}
                }
            ]
        }
    ]
}

logger.info(f"Alert rules created: {len(alert_rules['groups'][0]['rules'])} rules")
print(json.dumps(alert_rules, indent=2))

### 4. Create Grafana Dashboard

In [None]:
# Grafana dashboard configuration.
# Six panels, each with a single PromQL target.
grafana_dashboard = {
    'dashboard': {
        'title': 'Self-Healing Platform Monitoring',
        'description': 'Comprehensive monitoring dashboard for self-healing platform',
        'tags': ['self-healing', 'platform', 'monitoring'],
        'timezone': 'browser',
        'panels': [
            {
                'title': 'Anomalies Detected',
                'targets': [{'expr': 'rate(anomalies_detected_total[5m])'}],
                'type': 'graph'
            },
            {
                'title': 'Remediation Success Rate',
                'targets': [{'expr': 'remediation_success_rate'}],
                'type': 'gauge'
            },
            {
                'title': 'Component Health',
                'targets': [{'expr': 'component_health'}],
                'type': 'stat'
            },
            {
                'title': 'Detection Latency (p95)',
                # Quantiles are computed from the `_bucket` series, rated and
                # aggregated by `le`; the bare histogram name matches no
                # series and would leave the panel empty.
                'targets': [{'expr': (
                    'histogram_quantile(0.95, '
                    'sum by (le) (rate(anomaly_detection_latency_ms_bucket[5m])))'
                )}],
                'type': 'graph'
            },
            {
                'title': 'Model Accuracy',
                'targets': [{'expr': 'model_accuracy'}],
                'type': 'gauge'
            },
            {
                'title': 'Incident Resolution Time',
                # Same bucket-based form as the detection latency panel
                'targets': [{'expr': (
                    'histogram_quantile(0.95, '
                    'sum by (le) (rate(incident_resolution_time_seconds_bucket[5m])))'
                )}],
                'type': 'graph'
            }
        ]
    }
}

logger.info(f"Grafana dashboard created with {len(grafana_dashboard['dashboard']['panels'])} panels")
print(json.dumps(grafana_dashboard, indent=2))

### 5. Track Monitoring Metrics

In [None]:
# Build a table of simulated metric collections and persist it
def _simulated_collection():
    """Return one fabricated metric-collection record with randomly drawn fields."""
    return {
        'timestamp': datetime.now().isoformat(),
        'metric_type': np.random.choice(['counter', 'gauge', 'histogram']),
        'component': np.random.choice(['anomaly_detection', 'remediation', 'platform_health']),
        'value': np.random.uniform(0, 100),
        'alert_triggered': np.random.choice([True, False], p=[0.1, 0.9]),
        'scrape_duration_ms': np.random.randint(10, 100),
    }


# 30 simulated metric collections
monitoring_tracking = pd.DataFrame([_simulated_collection() for _ in range(30)])

# Write the table next to the other processed artefacts
tracking_file = PROCESSED_DIR.joinpath('prometheus_monitoring_tracking.parquet')
monitoring_tracking.to_parquet(tracking_file)

logger.info("Saved Prometheus monitoring tracking data")
print(monitoring_tracking.to_string())

## Validation Section

In [None]:
# Check that the earlier cells produced their artefacts, then summarise them
assert tracking_file.exists(), "Monitoring tracking file not created"
assert len(custom_metrics) > 0, "No custom metrics defined"
alert_rule_list = alert_rules['groups'][0]['rules']
assert len(alert_rule_list) > 0, "No alert rules defined"

# Share of simulated collections that raised an alert, and the mean scrape duration
triggered_flags = monitoring_tracking['alert_triggered']
alert_rate = triggered_flags.sum() / len(monitoring_tracking)
avg_scrape_time = monitoring_tracking['scrape_duration_ms'].mean()

logger.info("✅ All validations passed")
summary_lines = [
    "\nPrometheus Metrics Monitoring Summary:",
    f"  Custom Metrics Defined: {sum(len(v) for v in custom_metrics.values())}",
    f"  Alert Rules Created: {len(alert_rules['groups'][0]['rules'])}",
    f"  Dashboard Panels: {len(grafana_dashboard['dashboard']['panels'])}",
    f"  Metrics Collected: {len(monitoring_tracking)}",
    f"  Alert Trigger Rate: {alert_rate:.1%}",
    f"  Average Scrape Time: {avg_scrape_time:.0f}ms",
]
print(*summary_lines, sep="\n")

## Integration Section

This notebook integrates with:
- **Input**: Platform metrics from all components
- **Output**: Prometheus metrics, alerts, and dashboards
- **Monitoring**: Metric collection and alert triggering
- **Next**: Model performance monitoring

## Next Steps

1. Deploy custom metrics to platform
2. Proceed to `model-performance-monitoring.ipynb`
3. Monitor model accuracy and drift
4. Set up automated retraining
5. Complete Phase 7 implementation

## References

- ADR-003: Self-Healing Platform Architecture
- ADR-012: Notebook Architecture for End-to-End Workflows
- [Prometheus Documentation](https://prometheus.io/docs/)
- [Grafana Dashboards](https://grafana.com/docs/grafana/latest/dashboards/)