# Prometheus Metrics Collection for Self-Healing Platform

## Overview
This notebook demonstrates how to collect and process Prometheus metrics for AI/ML-driven anomaly detection in OpenShift environments.

## Prerequisites
- Access to OpenShift cluster with Prometheus monitoring
- PyTorch workbench environment (ADR-011)
- Persistent storage mounted at `/opt/app-root/src/data`

## Expected Outcomes
- Understand Prometheus query patterns for self-healing use cases
- Collect time-series data for anomaly detection model training
- Export processed data to persistent storage for ML consumption

## References
- ADR-007: Prometheus-Based Monitoring and Data Collection
- ADR-013: Data Collection and Preprocessing Workflows

## Setup and Configuration

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import requests
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Configure the look of every chart drawn later in the notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Report the library versions in use so a run can be reproduced later
startup_report = [
    "‚úÖ Libraries imported successfully",
    f"üìä Pandas version: {pd.__version__}",
    f"üî¢ NumPy version: {np.__version__}",
]
print("\n".join(startup_report))

## Verify Environment and Storage

Confirm that the notebook is running in the expected workbench environment, create the data directories, and check the available storage for metrics data.

In [None]:
# Locations used by the rest of the notebook
data_dir = '/opt/app-root/src/data'
models_dir = '/opt/app-root/src/models'

# Make sure the raw and processed metric folders exist (no-op when already present)
for subfolder in ('prometheus', 'processed'):
    os.makedirs(f"{data_dir}/{subfolder}", exist_ok=True)

print(f"üìÅ Data directory: {data_dir}")
print(f"üóÉÔ∏è Models directory: {models_dir}")

# Free space on the data volume: f_bavail * f_frsize, converted from bytes to GiB
free_blocks = os.statvfs(data_dir).f_bavail
fragment_size = os.statvfs(data_dir).f_frsize
free_gb = free_blocks * fragment_size / (1024**3)
print(f"üíæ Available space: {free_gb:.2f} GB")

# The presence of the Jupyter config directory is used as the workbench marker
in_workbench = os.path.exists('/opt/app-root/src/.jupyter')
print(
    "‚úÖ Running in Self-Healing Workbench environment"
    if in_workbench
    else "‚ö†Ô∏è Not in expected workbench environment"
)

## Prometheus Configuration

Configure connection to OpenShift Prometheus instance.

In [None]:
# Prometheus configuration for in-cluster access
# Since we're running in the OpenShift cluster, we can directly access Prometheus
# using service account tokens for authentication

import os

# Read service account token (automatically mounted in pod)
def get_service_account_token():
    """
    Return the pod's service account token, or None when no token file exists.

    The token is read from the standard in-pod mount path and returned with
    surrounding whitespace stripped.
    """
    token_file = '/var/run/secrets/kubernetes.io/serviceaccount/token'

    # Outside a pod the file is simply absent; signal that with None
    if not os.path.exists(token_file):
        return None

    with open(token_file, 'r') as handle:
        raw_token = handle.read()
    return raw_token.strip()

# Get CA certificate for TLS verification
def get_ca_cert_path():
    """
    Return the path of the service account CA bundle, or None if it is absent.
    """
    ca_file = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
    if os.path.exists(ca_file):
        return ca_file
    return None

# Connection settings for the cluster's Prometheus service.
# NOTE(review): 'verify_ssl' is False, so requests carrying the bearer token are
# sent without validating the server certificate. Confirm this is acceptable,
# or point 'ca_cert' at a CA bundle that signs the monitoring endpoints and
# turn verification back on.
PROMETHEUS_CONFIG = dict(
    base_url='https://prometheus-k8s.openshift-monitoring.svc.cluster.local:9091',
    timeout=30,
    max_samples=10000,
    token=get_service_account_token(),
    ca_cert=get_ca_cert_path(),
    verify_ssl=False,
)

# Same settings for the Thanos Querier endpoint (alternative source for historical data)
THANOS_CONFIG = dict(
    base_url='https://thanos-querier.openshift-monitoring.svc.cluster.local:9091',
    timeout=30,
    max_samples=10000,
    token=get_service_account_token(),
    ca_cert=get_ca_cert_path(),
    verify_ssl=False,
)

# Metric catalogue: friendly name -> PromQL expression, grouped by category.
# NOTE(review): the 'node:...' / 'pod:...' names look like recording rules —
# verify that they exist on the target cluster.
INFRASTRUCTURE_METRICS = dict(
    node_cpu_utilization='node:node_cpu_utilisation:rate5m',
    node_memory_utilization='node:node_memory_utilisation:',
    node_disk_io='node:node_disk_io_utilisation:rate5m',
    node_network_traffic='node:node_net_utilisation:rate5m',
)

APPLICATION_METRICS = dict(
    pod_cpu_usage='pod:container_cpu_usage:rate5m',
    pod_memory_usage='pod:container_memory_usage_bytes:sum',
    container_restart_count='kube_pod_container_status_restarts_total',
    http_request_duration='http_request_duration_seconds',
)

CLUSTER_METRICS = dict(
    cluster_resource_quota='kube_resourcequota',
    namespace_pod_count='kube_namespace_status_phase',
    persistent_volume_usage='kubelet_volume_stats_used_bytes',
    etcd_performance='etcd_request_duration_seconds',
)

# Print how many queries each category contributes
print("üìä Prometheus configuration loaded")
for summary_label, metric_group in (
    ("üéØ Infrastructure", INFRASTRUCTURE_METRICS),
    ("üöÄ Application", APPLICATION_METRICS),
    ("üèóÔ∏è Cluster", CLUSTER_METRICS),
):
    print(f"{summary_label} metrics: {len(metric_group)}")

## Real Prometheus Data Collection

Now let's implement real Prometheus data collection using the in-cluster service account.

In [None]:
class PrometheusClient:
    """
    Client for querying Prometheus from within the OpenShift cluster.

    Wraps a ``requests.Session`` carrying the bearer token and TLS settings
    taken from a configuration dict (``PROMETHEUS_CONFIG`` by default). The
    query methods are best-effort: on any failure they print a message and
    return ``None`` instead of raising, so callers can fall back to other data.
    """

    def __init__(self, config=None):
        """
        Build the HTTP session from ``config``.

        Args:
            config: Dict with a ``base_url`` key and optional ``token``,
                ``ca_cert``, ``verify_ssl`` and ``timeout`` keys. The
                module-level ``PROMETHEUS_CONFIG`` is used when omitted or empty.
        """
        self.config = config or PROMETHEUS_CONFIG
        self.session = requests.Session()

        # Set up authentication if token is available
        token = self.config.get('token')
        if token:
            self.session.headers.update({'Authorization': f"Bearer {token}"})

        # Configure SSL verification: use the CA bundle only when verification
        # is requested; when it is explicitly disabled, also silence urllib3's
        # per-request insecure warning.
        if self.config.get('ca_cert') and self.config.get('verify_ssl'):
            self.session.verify = self.config['ca_cert']
        elif not self.config.get('verify_ssl', True):
            self.session.verify = False
            import urllib3
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def _get(self, endpoint, params, failure_label):
        """
        GET ``/api/v1/<endpoint>`` and return the decoded JSON body.

        Shared by ``query`` and ``query_range`` so both use the same timeout
        and error handling. Returns ``None`` (after printing
        ``"<failure_label> failed: ..."``) on any request or decoding error.
        """
        url = f"{self.config['base_url']}/api/v1/{endpoint}"

        try:
            response = self.session.get(
                url,
                params=params,
                timeout=self.config.get('timeout', 30)
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            # Deliberately broad: the notebook falls back to synthetic data
            # whenever Prometheus cannot be queried, whatever the reason.
            print(f"‚ùå {failure_label} failed: {e}")
            return None

    def query(self, query_string, time=None):
        """
        Execute a PromQL instant query.

        Args:
            query_string: PromQL expression.
            time: Optional evaluation timestamp; omitted from the request when
                ``None`` so the server picks its default.

        Returns:
            The decoded JSON response, or ``None`` on failure.
        """
        params = {'query': query_string}

        # Compare against None so a legitimate falsy timestamp (0) is still sent
        if time is not None:
            params['time'] = time

        return self._get('query', params, "Prometheus query")

    def query_range(self, query_string, start_time, end_time, step='1m'):
        """
        Execute a PromQL range query.

        Args:
            query_string: PromQL expression.
            start_time: Start of the range, passed through as ``start``.
            end_time: End of the range, passed through as ``end``.
            step: Query resolution, passed through as ``step``.

        Returns:
            The decoded JSON response, or ``None`` on failure.
        """
        params = {
            'query': query_string,
            'start': start_time,
            'end': end_time,
            'step': step
        }
        return self._get('query_range', params, "Prometheus range query")

    def test_connection(self):
        """
        Test connection to Prometheus by running the ``up`` query.

        Returns:
            ``True`` when the response reports ``status == 'success'``,
            ``False`` otherwise. A short status line is printed either way.
        """
        try:
            result = self.query('up')
            if result and result.get('status') == 'success':
                print("‚úÖ Prometheus connection successful")
                return True
            else:
                print("‚ùå Prometheus connection failed - invalid response")
                return False
        except Exception as e:
            print(f"‚ùå Prometheus connection test failed: {e}")
            return False

# Build a client against the default endpoint configuration
prom_client = PrometheusClient()

# Probe the endpoint once; later cells read this flag to choose the data source
print("üîç Testing Prometheus connection...")
prometheus_available = prom_client.test_connection()

if not prometheus_available:
    print("‚ö†Ô∏è Prometheus not available, will use synthetic data")
    print("üí° This is normal in development environments")
else:
    print("üéØ Using real Prometheus data")

## Data Collection Functions

Functions to collect real metrics from Prometheus. If real data is successfully collected, we skip synthetic data generation entirely.

In [None]:
def collect_real_metrics(metric_query, duration_hours=24, step='1m'):
    """
    Collect real metrics from Prometheus as a long-format DataFrame.

    Runs a range query covering the last ``duration_hours`` hours through the
    module-level ``prom_client`` and flattens every returned series into rows.

    Args:
        metric_query: PromQL expression to evaluate.
        duration_hours: Size of the look-back window, in hours.
        step: Query resolution passed to Prometheus (e.g. ``'1m'``).

    Returns:
        DataFrame with columns ``timestamp`` (datetime), ``value`` (float) and
        ``metric_labels`` (the label dict of the originating series), or
        ``None`` when Prometheus is flagged unavailable, the query fails, or
        no samples come back.
    """
    if not prometheus_available:
        return None

    # Time range as whole Unix seconds, ending now
    end_time = datetime.now()
    start_time = end_time - timedelta(hours=duration_hours)

    result = prom_client.query_range(
        metric_query,
        int(start_time.timestamp()),
        int(end_time.timestamp()),
        step
    )

    if not result or result.get('status') != 'success':
        return None

    # Tolerate a "success" payload that lacks the expected data section
    series_list = (result.get('data') or {}).get('result') or []

    rows = []
    for series in series_list:
        metric_labels = series.get('metric', {})
        rows.extend(
            (timestamp, float(value), metric_labels)
            for timestamp, value in series.get('values', [])
        )

    if not rows:
        return None

    df = pd.DataFrame(rows, columns=['timestamp', 'value', 'metric_labels'])
    # Convert the epoch seconds in one vectorised call; converting sample by
    # sample dominates the runtime on queries that return many series.
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    print(f"    üìä Collected {len(df)} real data points from Prometheus")
    return df

## Synthetic Data Generation (Fallback)

⚠️ **This section is only used if Prometheus is unavailable or specific metrics fail to collect.**

Generate realistic synthetic metrics when Prometheus is not available.

In [None]:
def generate_synthetic_metrics(metric_name, duration_hours=24, interval_minutes=1, seed=None):
    """
    Generate synthetic time-series metrics that mimic real OpenShift behavior.

    Only used when Prometheus data is unavailable. The shape of the series is
    chosen from substrings of ``metric_name`` (checked in this order):
    ``cpu`` -> daily sine cycle plus noise, ``memory`` -> rising trend with
    occasional drops, ``restart`` -> mostly-zero counts with rare spikes,
    anything else -> noisy gentle trend.

    Args:
        metric_name: Name stored in the ``metric`` column; also selects the pattern.
        duration_hours: Length of the generated window, ending now.
        interval_minutes: Nominal spacing between samples, in minutes.
        seed: Optional integer seed for reproducible output. When ``None``
            (default) NumPy's global random state is used, as before.

    Returns:
        DataFrame with columns ``timestamp``, ``value`` and ``metric`` holding
        ``int(duration_hours * 60 / interval_minutes)`` rows.
    """
    # Calculate number of data points
    num_points = int(duration_hours * 60 / interval_minutes)

    # The np.random module and RandomState expose the same sampling functions,
    # so unseeded callers keep drawing from the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Create time index
    end_time = datetime.now()
    start_time = end_time - timedelta(hours=duration_hours)
    timestamps = pd.date_range(start=start_time, end=end_time, periods=num_points)

    name = metric_name.lower()

    if 'cpu' in name:
        # CPU usage: one sine period per day. The period is expressed in
        # samples, so it must scale with the sampling interval.
        points_per_day = 24 * 60 / interval_minutes
        base_pattern = 30 + 20 * np.sin(2 * np.pi * np.arange(num_points) / points_per_day)
        noise = rng.normal(0, 5, num_points)
        values = np.clip(base_pattern + noise, 0, 100)

    elif 'memory' in name:
        # Memory usage: gradual increase with occasional drops
        trend = np.linspace(40, 70, num_points)
        noise = rng.normal(0, 3, num_points)
        # Occasional memory cleanup events (1% of samples drop by 20)
        cleanup_events = rng.choice([0, -20], num_points, p=[0.99, 0.01])
        values = np.clip(trend + noise + cleanup_events, 10, 95)

    elif 'restart' in name:
        # Container restarts: mostly zero with occasional spikes
        values = rng.poisson(0.1, num_points)
        # Add anomalous restart bursts to 2% of the samples
        anomaly_indices = rng.choice(num_points, size=int(num_points * 0.02), replace=False)
        values[anomaly_indices] += rng.poisson(3, len(anomaly_indices))

    else:
        # Generic metric: normal distribution with trend
        trend = np.linspace(50, 60, num_points)
        noise = rng.normal(0, 10, num_points)
        values = np.clip(trend + noise, 0, 100)

    return pd.DataFrame({
        'timestamp': timestamps,
        'value': values,
        'metric': metric_name
    })

# Smoke-test the generator with a short two-hour CPU series
sample_metric = generate_synthetic_metrics('node_cpu_utilization', duration_hours=2)
sample_values = sample_metric['value']

print(f"‚úÖ Generated {len(sample_metric)} data points for sample metric")
print(f"üìä Value range: {sample_values.min():.2f} - {sample_values.max():.2f}")

# Last expression of the cell: shows the first rows in the notebook output
sample_metric.head()

## Data Collection Pipeline

Implement the main data collection pipeline for all metric categories.

In [None]:
def collect_all_metrics(duration_hours=24, use_real_data=True):
    """
    Collect every metric defined in the three metric catalogues.

    For each metric, real Prometheus data is used when it is enabled,
    Prometheus is available and the query returns rows; otherwise a synthetic
    series is generated. The data source (REAL vs SYNTHETIC) is tracked per
    metric, and the reason for each fallback is printed next to it.

    Args:
        duration_hours: Look-back window, in hours, for every metric.
        use_real_data: When False, Prometheus is never queried.

    Returns:
        Dict mapping metric name -> DataFrame, plus a ``'_metadata'`` entry of
        the form ``{'sources': {metric_name: 'REAL' | 'SYNTHETIC'}}``.
    """
    all_metrics = {}
    metrics_source = {}  # Track data source for each metric

    print("üîÑ Starting metrics collection...")
    print(f"üìä Prometheus available: {prometheus_available}")
    print(f"üìä Use real data: {use_real_data}")
    print()

    def collect_metric(metric_name, query):
        """Return one metric's DataFrame, recording where it came from."""
        if not use_real_data:
            fallback_reason = 'real data disabled'
        elif not prometheus_available:
            fallback_reason = 'Prometheus unavailable'
        else:
            # Try to get real data first
            df = collect_real_metrics(query, duration_hours)
            if df is not None and not df.empty:
                data_source = 'REAL'
                metrics_source[metric_name] = data_source
                print(f"  ‚úÖ [{data_source:8}] {metric_name}: {len(df)} data points")
                return df
            fallback_reason = 'query returned no data'

        # Fallback to synthetic data, stating why this metric is not real
        data_source = 'SYNTHETIC'
        df = generate_synthetic_metrics(metric_name, duration_hours)
        metrics_source[metric_name] = data_source
        print(f"  ‚ö†Ô∏è  [{data_source:8}] {metric_name}: {len(df)} data points ({fallback_reason})")
        return df

    # One pass per category: banner line first, then every query in it
    categories = (
        ("üèóÔ∏è Collecting infrastructure metrics...", INFRASTRUCTURE_METRICS),
        ("üöÄ Collecting application metrics...", APPLICATION_METRICS),
        ("üèóÔ∏è Collecting cluster metrics...", CLUSTER_METRICS),
    )
    for banner, metric_queries in categories:
        print(banner)
        for metric_name, query in metric_queries.items():
            all_metrics[metric_name] = collect_metric(metric_name, query)

    # Summary statistics
    recorded_sources = list(metrics_source.values())
    real_count = recorded_sources.count('REAL')
    synthetic_count = recorded_sources.count('SYNTHETIC')

    print("\nüéâ Collection complete!")
    print(f"   Total metrics: {len(all_metrics)}")
    print(f"   ‚úÖ REAL data: {real_count} metrics")
    print(f"   ‚ö†Ô∏è  SYNTHETIC data: {synthetic_count} metrics")

    if synthetic_count > 0:
        print(f"\n   ‚ö†Ô∏è  Note: {synthetic_count} metrics using synthetic data (see the reason printed next to each metric)")

    # Store source information in the returned dict for reference
    all_metrics['_metadata'] = {'sources': metrics_source}

    return all_metrics

# Collect metrics for the last 24 hours
# This will use real Prometheus data when running in the cluster; metrics that
# cannot be collected are replaced by synthetic series.
# The returned dict also holds a '_metadata' entry recording each metric's source.
metrics_data = collect_all_metrics(duration_hours=24, use_real_data=True)

In [None]:
# Display data source summary
print("\n" + "="*80)
print("üìä DATA SOURCE SUMMARY")
print("="*80)

# Extract metadata. The entry is removed so that later cells only iterate over
# DataFrames. When this cell is re-run the entry is already gone, so keep the
# metadata captured by the earlier run instead of reporting an empty summary.
if '_metadata' in metrics_data:
    metadata = metrics_data.pop('_metadata')
elif 'metadata' not in globals():
    metadata = {}
sources = metadata.get('sources', {})

# Create summary dataframe; explicit columns keep the filters below valid even
# when no sources were recorded
source_summary = pd.DataFrame(
    [
        {'Metric': metric, 'Data Source': source}
        for metric, source in sources.items()
    ],
    columns=['Metric', 'Data Source'],
)

# Display by source type
print("\nüéØ REAL DATA METRICS:")
real_metrics = source_summary[source_summary['Data Source'] == 'REAL']
if len(real_metrics) > 0:
    for metric in real_metrics['Metric']:
        print(f"  ‚úÖ {metric}")
else:
    print("  ‚ùå No real data collected")

print("\nüîÑ SYNTHETIC DATA METRICS:")
synthetic_metrics = source_summary[source_summary['Data Source'] == 'SYNTHETIC']
if len(synthetic_metrics) > 0:
    for metric in synthetic_metrics['Metric']:
        print(f"  üìä {metric}")
else:
    print("  ‚úÖ All metrics using real data!")

# Percentages are reported as 0.0 when there is nothing to summarise
total_metrics = len(source_summary)
real_pct = len(real_metrics) / total_metrics * 100 if total_metrics else 0.0
synthetic_pct = len(synthetic_metrics) / total_metrics * 100 if total_metrics else 0.0

print("\n" + "="*80)
print(f"Total Metrics: {total_metrics}")
print(f"Real Data: {len(real_metrics)} ({real_pct:.1f}%)")
print(f"Synthetic Data: {len(synthetic_metrics)} ({synthetic_pct:.1f}%)")
print("="*80)

# Display full table
print("\nüìã DETAILED BREAKDOWN:")
print(source_summary.to_string(index=False))

## Data Quality Validation

Implement quality checks as defined in ADR-013.

In [None]:
def validate_data_quality(metrics_data):
    """
    Perform data quality validation checks on collected metrics.

    Each metric DataFrame (columns ``timestamp`` and ``value``) gets three
    scores in the range 0..1:

    * completeness: ``1 - 20 * missing_ratio`` of the ``value`` column
    * consistency: ``1 -`` share of sampling intervals longer than twice the
      median interval (``1.0`` when there are fewer than two timestamps, since
      no interval can be judged)
    * accuracy: ``1 - 10 * outlier_ratio`` using the 1.5 * IQR rule

    Entries that are not DataFrames (for example a ``'_metadata'`` dict) are
    ignored. An empty DataFrame scores 0 on everything and is reported as an issue.

    Args:
        metrics_data: Dict mapping metric name -> DataFrame.

    Returns:
        Dict with ``total_metrics``, ``quality_scores`` (per metric:
        ``overall`` and ``details``), ``issues`` (list of strings) and
        ``average_quality`` (``0.0`` when no metric was validated).
    """
    # Only DataFrames are metric series; this skips bookkeeping entries
    frames = {
        name: df for name, df in metrics_data.items()
        if isinstance(df, pd.DataFrame)
    }

    quality_report = {
        'total_metrics': len(frames),
        'quality_scores': {},
        'issues': []
    }

    print("üîç Performing data quality validation...")

    for metric_name, df in frames.items():
        metric_quality = {
            'completeness': 0,
            'consistency': 0,
            'accuracy': 0
        }

        if df.empty:
            # Nothing to measure: report it instead of producing NaN ratios
            quality_report['issues'].append(f"{metric_name}: No data points")
            quality_report['quality_scores'][metric_name] = {
                'overall': 0.0,
                'details': metric_quality
            }
            continue

        # Completeness check
        missing_ratio = df['value'].isnull().sum() / len(df)
        metric_quality['completeness'] = max(0, 1 - missing_ratio * 20)  # Penalize missing values

        if missing_ratio > 0.05:
            quality_report['issues'].append(f"{metric_name}: High missing values ({missing_ratio:.2%})")

        # Consistency check (time gaps)
        time_diffs = df['timestamp'].diff().dt.total_seconds().dropna()
        if len(time_diffs) > 0:
            expected_interval = time_diffs.median()
            large_gaps = (time_diffs > expected_interval * 2).sum()
            metric_quality['consistency'] = max(0, 1 - large_gaps / len(time_diffs))
        else:
            # A single sample has no intervals, hence no observable gaps
            large_gaps = 0
            metric_quality['consistency'] = 1.0

        if large_gaps > len(time_diffs) * 0.01:
            quality_report['issues'].append(f"{metric_name}: Time gaps detected ({large_gaps} gaps)")

        # Accuracy check (outlier detection with the 1.5 * IQR fences)
        Q1 = df['value'].quantile(0.25)
        Q3 = df['value'].quantile(0.75)
        IQR = Q3 - Q1
        outliers = ((df['value'] < Q1 - 1.5 * IQR) | (df['value'] > Q3 + 1.5 * IQR)).sum()
        outlier_ratio = outliers / len(df)
        metric_quality['accuracy'] = max(0, 1 - outlier_ratio * 10)  # Penalize outliers

        if outlier_ratio > 0.05:
            quality_report['issues'].append(f"{metric_name}: High outlier ratio ({outlier_ratio:.2%})")

        # Overall quality score
        overall_score = np.mean(list(metric_quality.values()))
        quality_report['quality_scores'][metric_name] = {
            'overall': overall_score,
            'details': metric_quality
        }

    # Calculate average quality score (0.0 rather than NaN when nothing was scored)
    overall_scores = [score['overall'] for score in quality_report['quality_scores'].values()]
    avg_quality = np.mean(overall_scores) if overall_scores else 0.0
    quality_report['average_quality'] = avg_quality

    print(f"üìä Average data quality score: {avg_quality:.2f}")
    print(f"‚ö†Ô∏è Issues found: {len(quality_report['issues'])}")

    return quality_report

# Score every collected metric
quality_report = validate_data_quality(metrics_data)

# Show the overall score of the first five metrics only, to keep output short
print("\nüìã Quality Summary:")
first_scores = list(quality_report['quality_scores'].items())[:5]
for metric, scores in first_scores:
    print(f"  {metric}: {scores['overall']:.2f}")

# List at most three of the reported issues
reported_issues = quality_report['issues']
if reported_issues:
    print("\n‚ö†Ô∏è Issues to address:")
    for issue in reported_issues[:3]:
        print(f"  - {issue}")