# OpenShift Events Analysis for Self-Healing Platform

## Overview
This notebook demonstrates how to collect, process, and analyze OpenShift events for pattern recognition and anomaly detection. It integrates with the Kubernetes Python client to gather real-time cluster events and process them for the self-healing platform.

## Prerequisites
- Access to OpenShift cluster with appropriate RBAC permissions
- Kubernetes Python client installed
- Running coordination engine in the cluster
- Persistent storage for event data

## Expected Outcomes
- Understand OpenShift event structure and types
- Implement event filtering and processing pipelines
- Identify patterns in cluster events for anomaly detection
- Integrate event analysis with coordination engine

## References
- ADR-012: Notebook Architecture for End-to-End Workflows
- ADR-013: Data Collection and Preprocessing Workflows
- Kubernetes Events API Documentation

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import json
import warnings
import sys
import os
from collections import Counter, defaultdict
import re
from pathlib import Path

# Kubernetes client
try:
    from kubernetes import client, config, watch
except ImportError:
    # Client library is missing: record that so later code can branch on it.
    k8s_available = False
    print("‚ö†Ô∏è Kubernetes client not available - using simulation mode")
else:
    k8s_available = True
    print("‚úÖ Kubernetes client available")

# Setup path for utils module - works from any directory
def find_utils_path():
    """Find utils path regardless of current working directory

    Candidate directories are checked in order and the first one that
    contains ``common_functions.py`` wins:

    1. ``<module dir>/../utils`` (only when the module defines ``__file__``)
    2. ``<cwd>/notebooks/utils``
    3. ``<cwd>/../utils``
    4. three fixed container locations

    If none match, up to five directory levels starting at the current
    working directory are searched for an existing ``notebooks/utils``
    directory (this fallback does not check for ``common_functions.py``).

    Returns:
        The directory as a string, or None when nothing was found.
    """
    possible_paths = []

    # ``__file__`` is a module-level name. Calling ``dir()`` inside a function
    # only lists the function's local names, so it has to be looked up in
    # globals() instead.
    module_file = globals().get('__file__')
    if module_file:
        possible_paths.append(Path(module_file).parent.parent / 'utils')

    possible_paths.extend([
        Path.cwd() / 'notebooks' / 'utils',
        Path.cwd().parent / 'utils',
        Path('/workspace/repo/notebooks/utils'),
        Path('/opt/app-root/src/notebooks/utils'),
        Path('/opt/app-root/src/openshift-aiops-platform/notebooks/utils'),
    ])

    for p in possible_paths:
        # A missing directory cannot contain the file, so one check suffices.
        if (p / 'common_functions.py').exists():
            return str(p)

    # Fallback: search upward from cwd
    current = Path.cwd()
    for _ in range(5):
        utils_path = current / 'notebooks' / 'utils'
        if utils_path.exists():
            return str(utils_path)
        current = current.parent

    return None

# Put the utils directory first on the import path when one was located
utils_path = find_utils_path()
if not utils_path:
    print("‚ö†Ô∏è Utils path not found - will use fallback implementations")
else:
    sys.path.insert(0, utils_path)
    print(f"‚úÖ Utils path found: {utils_path}")

# Try to import common functions, with fallback
try:
    from common_functions import (
        setup_environment, print_environment_info,
        save_processed_data, load_processed_data,
        validate_data_quality
    )
    print("‚úÖ Common functions imported")
except ImportError as e:
    print(f"‚ö†Ô∏è Common functions not available: {e}")
    print("   Using minimal fallback implementations")

    # Stand-ins defined only when the shared module cannot be imported

    def setup_environment():
        """Return the default data, models and working directory locations."""
        return dict(
            data_dir='/opt/app-root/src/data',
            models_dir='/opt/app-root/src/models',
            working_dir=os.getcwd(),
        )

    def print_environment_info(env_info):
        """Print the data and models directories ('N/A' for a missing key)."""
        print(f"üìÅ Data dir: {env_info.get('data_dir', 'N/A')}")
        print(f"üìÅ Models dir: {env_info.get('models_dir', 'N/A')}")

    def save_processed_data(data, filename):
        """Write ``data`` under the processed-data directory.

        ``.parquet`` names use ``data.to_parquet`` when the object has it.
        ``.json`` names are dumped with ``json``; for mapping-like data each
        value exposing ``to_dict`` is converted first, and anything json
        cannot encode is stringified. Other names write nothing. The
        "Saved" line is printed in every case.
        """
        os.makedirs('/opt/app-root/src/data/processed', exist_ok=True)
        filepath = f'/opt/app-root/src/data/processed/{filename}'

        can_write_parquet = (
            filename.endswith('.parquet') and hasattr(data, 'to_parquet')
        )
        if can_write_parquet:
            data.to_parquet(filepath)
        elif filename.endswith('.json'):
            with open(filepath, 'w') as f:
                payload = data
                if hasattr(data, 'items'):
                    payload = {
                        key: (value.to_dict() if hasattr(value, 'to_dict') else value)
                        for key, value in data.items()
                    }
                json.dump(payload, f, default=str)
        print(f"üíæ Saved: {filepath}")

    def load_processed_data(filename):
        """Load a ``.parquet`` or ``.json`` file from the processed-data directory.

        Returns None for any other extension.
        """
        filepath = f'/opt/app-root/src/data/processed/{filename}'
        if filename.endswith('.parquet'):
            return pd.read_parquet(filepath)
        if not filename.endswith('.json'):
            return None
        with open(filepath, 'r') as f:
            return json.load(f)

    def validate_data_quality(df):
        """Placeholder check: always reports the data as valid with no issues."""
        report = {'valid': True, 'issues': []}
        return report

print("‚úÖ Libraries imported successfully")

## 2. Setup & Configuration

Initialize the Kubernetes client and configure parameters for event collection from multiple namespaces.

In [None]:
# Resolve the working directories and show them
env_info = setup_environment()
print_environment_info(env_info)

# Parameters that drive event collection and filtering
EVENTS_CONFIG = dict(
    # How long to collect events
    collection_duration_minutes=60,
    # Events per batch for processing
    batch_size=100,
    namespaces=['self-healing-platform', 'openshift-monitoring', 'default'],
    event_types=['Warning', 'Normal'],
    reasons_of_interest=[
        'Failed', 'FailedMount', 'FailedScheduling', 'Unhealthy',
        'BackOff', 'Killing', 'Created', 'Started', 'Pulled',
    ],
)

print(f"üìã Events collection configured for {EVENTS_CONFIG['collection_duration_minutes']} minutes")
print(f"üéØ Monitoring namespaces: {', '.join(EVENTS_CONFIG['namespaces'])}")

## 3. Define Event Collection Functions

Define helper functions to collect events from the Kubernetes API or generate synthetic events for testing.

In [None]:
def setup_kubernetes_client():
    """
    Set up Kubernetes client, preferring in-cluster configuration

    In-cluster configuration is tried first; when it cannot be loaded the
    local kubeconfig is tried instead.

    Returns:
        A ``client.CoreV1Api`` instance, or None when neither configuration
        source could be loaded.
    """
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt and SystemExit still propagate.
    try:
        # Try in-cluster config first
        config.load_incluster_config()
        print("‚úÖ Using in-cluster Kubernetes configuration")
    except Exception:
        try:
            # Fallback to local kubeconfig
            config.load_kube_config()
            print("‚úÖ Using local kubeconfig")
        except Exception:
            print("‚ùå Failed to load Kubernetes configuration")
            return None

    return client.CoreV1Api()

def collect_cluster_events(duration_minutes=60, namespaces=None):
    """
    Collect OpenShift events from specified namespaces

    Lists the events of each namespace through the CoreV1 API and flattens
    them into one row per event. Synthetic events are returned instead when
    the Kubernetes client is unavailable, no configuration can be loaded, or
    no events were collected at all.

    Args:
        duration_minutes: Window length handed to the synthetic generator on
            fallback. It does not limit or filter the events listed from a
            real cluster.
        namespaces: List of namespaces to monitor (['default'] when omitted)

    Returns:
        DataFrame with collected events, sorted by timestamp. For real
        cluster events the timestamps are timezone-naive UTC.
    """
    # Local import: the surrounding file imports only datetime and timedelta.
    from datetime import timezone

    if not k8s_available:
        return generate_synthetic_events(duration_minutes)

    v1 = setup_kubernetes_client()
    if v1 is None:
        return generate_synthetic_events(duration_minutes)

    events_data = []

    # Collect events from each namespace; a failure in one namespace is
    # reported and does not stop the others.
    for namespace in (namespaces or ['default']):
        print(f"üì° Collecting events from namespace: {namespace}")
        collected_before = len(events_data)

        try:
            events = v1.list_namespaced_event(namespace=namespace)

            for event in events.items:
                events_data.append({
                    # Fall back to "now" as an aware UTC value: a naive local
                    # time would be misread as UTC by the conversion below.
                    'timestamp': (
                        event.first_timestamp
                        or event.last_timestamp
                        or getattr(event, 'event_time', None)
                        or datetime.now(timezone.utc)
                    ),
                    'namespace': event.metadata.namespace,
                    'name': event.metadata.name,
                    'type': event.type,
                    'reason': event.reason,
                    'message': event.message,
                    'source_component': event.source.component if event.source else 'unknown',
                    'source_host': event.source.host if event.source else 'unknown',
                    'involved_object_kind': event.involved_object.kind,
                    'involved_object_name': event.involved_object.name,
                    'count': event.count or 1
                })

            print(f"  ‚úÖ Collected {len(events_data) - collected_before} events")

        except Exception as e:
            print(f"  ‚ö†Ô∏è Failed to collect from {namespace}: {e}")
            continue

    if not events_data:
        print("‚ö†Ô∏è No events collected, generating synthetic data")
        return generate_synthetic_events(duration_minutes)

    df = pd.DataFrame(events_data)
    # Convert timestamps to datetime, handling both timezone-aware and naive timestamps
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
    # Convert to naive UTC for consistency
    df['timestamp'] = df['timestamp'].dt.tz_localize(None)
    df = df.sort_values('timestamp')

    print(f"üéâ Successfully collected {len(df)} events")
    return df

# Probe the cluster connection once and report which data source will be used
if not k8s_available:
    print("‚ö†Ô∏è Kubernetes client not available - will use synthetic data")
else:
    v1_test = setup_kubernetes_client()
    if not v1_test:
        print("‚ö†Ô∏è Kubernetes client connection failed - will use synthetic data")
    else:
        print("üîó Kubernetes client connection successful")

## 4. Generate Synthetic Events

Generate realistic synthetic Kubernetes events to simulate cluster activity when the real API is unavailable.

In [None]:
def generate_synthetic_events(duration_minutes=60, seed=None):
    """
    Generate realistic synthetic OpenShift events for testing

    Between 50 and 199 events are drawn; about 70% are 'Normal' and 30%
    'Warning'. Timestamps are spread uniformly over the window that ends at
    the current time.

    Args:
        duration_minutes: Length in minutes of the window the events fall in
        seed: Optional seed for a private random generator, giving
            reproducible events. When None the global NumPy random state is
            used, as before.

    Returns:
        DataFrame sorted by timestamp with the columns timestamp, namespace,
        name, type, reason, message, source_component, source_host,
        involved_object_kind, involved_object_name and count. Timestamps are
        timezone-naive UTC.
    """
    # Local import: the surrounding file imports only datetime and timedelta.
    from datetime import timezone

    print("üé≠ Generating synthetic OpenShift events...")

    # The module-level functions and RandomState share the same call
    # signatures, so either can serve as the source of randomness.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Event patterns based on real OpenShift clusters
    event_patterns = {
        'Normal': {
            'Created': ['Pod', 'Service', 'ConfigMap'],
            'Started': ['Pod'],
            'Pulled': ['Pod'],
            'Scheduled': ['Pod'],
            'SuccessfulCreate': ['ReplicaSet', 'Job']
        },
        'Warning': {
            'Failed': ['Pod', 'Job'],
            'FailedScheduling': ['Pod'],
            'BackOff': ['Pod'],
            'Unhealthy': ['Pod'],
            'FailedMount': ['Pod']
        }
    }

    namespaces = ['self-healing-platform', 'openshift-monitoring', 'default', 'kube-system']
    components = ['kubelet', 'scheduler', 'controller-manager', 'default-scheduler']

    events_data = []
    # Naive UTC, so the hour of day means the same thing as for events
    # collected from a real cluster.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    start_time = now_utc - timedelta(minutes=duration_minutes)

    # Generate events with realistic patterns
    num_events = rng.randint(50, 200)  # Realistic event count

    for i in range(num_events):
        # str()/int()/float() turn NumPy scalars into plain Python values
        event_type = str(rng.choice(['Normal', 'Warning'], p=[0.7, 0.3]))
        reason = str(rng.choice(list(event_patterns[event_type])))
        object_kind = str(rng.choice(event_patterns[event_type][reason]))

        timestamp = start_time + timedelta(
            minutes=float(rng.uniform(0, duration_minutes))
        )

        events_data.append({
            'timestamp': timestamp,
            'namespace': str(rng.choice(namespaces)),
            'name': f"event-{i:04d}",
            'type': event_type,
            'reason': reason,
            'message': f"{reason} event for {object_kind.lower()}",
            'source_component': str(rng.choice(components)),
            'source_host': f"node-{rng.randint(1, 5)}",
            'involved_object_kind': object_kind,
            'involved_object_name': f"{object_kind.lower()}-{rng.randint(1000, 9999)}",
            'count': int(rng.randint(1, 5))
        })

    df = pd.DataFrame(events_data)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp')

    print(f"‚úÖ Generated {len(df)} synthetic events")
    return df

# Run the collection with the configured window and namespaces
print("üöÄ Starting event collection...")
events_df = collect_cluster_events(
    EVENTS_CONFIG['collection_duration_minutes'],
    EVENTS_CONFIG['namespaces'],
)

# Report how much was collected and what it covers
print(f"\nüìä Event Collection Summary:")
print(f"Total events: {len(events_df)}")
print(f"Time range: {events_df['timestamp'].min()} to {events_df['timestamp'].max()}")
print(f"Namespaces: {events_df['namespace'].nunique()}")
print(f"Event types: {', '.join(events_df['type'].unique())}")

## 5. Analyze Event Patterns

Analyze collected events to identify patterns, anomalies, and potential issues in the cluster.

In [None]:
def analyze_event_patterns(events_df):
    """
    Analyze patterns in OpenShift events for anomaly detection

    Note: the ``hour`` and ``day_of_week`` columns are added to the passed
    DataFrame in place.

    Args:
        events_df: Events with at least the columns ``timestamp`` (datetime),
            ``type``, ``reason``, ``namespace`` and ``involved_object_kind``

    Returns:
        dict with value-count Series under ``event_types``, ``reasons``,
        ``namespaces``, ``object_kinds``, ``hourly_distribution`` and
        ``daily_distribution``, plus ``warning_percentage`` (float, 0.0 for
        an empty frame) and ``critical_events`` (int).
    """
    print("üîç Analyzing event patterns...")

    # Event frequency analysis
    events_df['hour'] = events_df['timestamp'].dt.hour
    events_df['day_of_week'] = events_df['timestamp'].dt.day_name()

    # Pattern analysis results
    patterns = {
        'event_types': events_df['type'].value_counts(),
        'reasons': events_df['reason'].value_counts(),
        'namespaces': events_df['namespace'].value_counts(),
        'object_kinds': events_df['involved_object_kind'].value_counts(),
        'hourly_distribution': events_df['hour'].value_counts().sort_index(),
        'daily_distribution': events_df['day_of_week'].value_counts()
    }

    # Identify anomalous patterns
    warning_events = events_df[events_df['type'] == 'Warning']
    critical_reasons = ['Failed', 'FailedScheduling', 'BackOff', 'Unhealthy']
    critical_events = events_df[events_df['reason'].isin(critical_reasons)]

    # An empty frame has no warnings; avoid dividing by zero.
    total_events = len(events_df)
    if total_events:
        patterns['warning_percentage'] = len(warning_events) / total_events * 100
    else:
        patterns['warning_percentage'] = 0.0
    patterns['critical_events'] = len(critical_events)

    return patterns

def filter_events_for_anomalies(events_df, config):
    """
    Keep the events relevant to anomaly detection and score their severity

    Args:
        events_df: Events with ``type`` and ``reason`` columns
        config: Mapping providing the ``event_types`` and
            ``reasons_of_interest`` lists to keep

    Returns:
        A copy of the matching rows with an added ``severity`` column
        (reasons without a known score get 1). The input is not modified.
    """
    print("üéØ Filtering events for anomaly detection...")

    # A row must match both the allowed types and the reasons of interest
    type_matches = events_df['type'].isin(config['event_types'])
    reason_matches = events_df['reason'].isin(config['reasons_of_interest'])
    filtered_df = events_df.loc[type_matches & reason_matches].copy()

    # Severity score per reason, grouped by level (5 is the most severe)
    reasons_by_level = (
        (5, ('Failed',)),
        (4, ('FailedMount', 'FailedScheduling', 'Unhealthy')),
        (3, ('BackOff',)),
        (2, ('Killing',)),
        (1, ('Created', 'Started', 'Pulled')),
    )
    severity_map = {
        reason: level
        for level, reasons in reasons_by_level
        for reason in reasons
    }

    scores = filtered_df['reason'].map(severity_map)
    filtered_df['severity'] = scores.fillna(1)

    print(f"‚úÖ Filtered to {len(filtered_df)} relevant events")
    return filtered_df

# Analyze patterns
# Order matters: analyze_event_patterns adds 'hour' and 'day_of_week' columns
# to events_df in place, so the filtered copy made afterwards carries them too.
patterns = analyze_event_patterns(events_df)
filtered_events = filter_events_for_anomalies(events_df, EVENTS_CONFIG)

# Headline figures; the first entry of each value_counts() Series is the most
# frequent value.
print("\nüìà Event Pattern Analysis:")
print(f"Warning events: {patterns['warning_percentage']:.1f}%")
print(f"Critical events: {patterns['critical_events']}")
print(f"Most common reason: {patterns['reasons'].index[0]} ({patterns['reasons'].iloc[0]} events)")
print(f"Most active namespace: {patterns['namespaces'].index[0]} ({patterns['namespaces'].iloc[0]} events)")

## 6. Visualize Event Analysis Results

Create visualizations of event patterns, anomalies, and alerts for better understanding of cluster health.

In [None]:
# Visualize event patterns in a 2x2 dashboard
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('OpenShift Events Analysis Dashboard', fontsize=16, fontweight='bold')

# Event types distribution
# Colours are chosen per event type. value_counts() orders bars by frequency,
# so a fixed colour list would paint whichever type is most common green.
patterns['event_types'].plot(
    kind='bar',
    ax=axes[0,0],
    color=[
        {'Normal': 'green', 'Warning': 'orange'}.get(event_type, 'gray')
        for event_type in patterns['event_types'].index
    ]
)
axes[0,0].set_title('Event Types Distribution')
axes[0,0].set_xlabel('Event Type')
axes[0,0].set_ylabel('Count')
axes[0,0].tick_params(axis='x', rotation=45)

# Top reasons
patterns['reasons'].head(10).plot(kind='barh', ax=axes[0,1])
axes[0,1].set_title('Top 10 Event Reasons')
axes[0,1].set_xlabel('Count')

# Hourly distribution
patterns['hourly_distribution'].plot(kind='line', ax=axes[1,0], marker='o')
axes[1,0].set_title('Events by Hour of Day')
axes[1,0].set_xlabel('Hour')
axes[1,0].set_ylabel('Event Count')
axes[1,0].grid(True, alpha=0.3)

# Namespace distribution
patterns['namespaces'].plot(kind='pie', ax=axes[1,1], autopct='%1.1f%%')
axes[1,1].set_title('Events by Namespace')
axes[1,1].set_ylabel('')

plt.tight_layout()
plt.show()

# Save processed events data
save_processed_data(events_df, 'openshift_events_raw.parquet')
save_processed_data(filtered_events, 'openshift_events_filtered.parquet')
save_processed_data(patterns, 'event_patterns_analysis.json')

print("\nüíæ Data saved successfully:")
print("- openshift_events_raw.parquet: Raw events data")
print("- openshift_events_filtered.parquet: Filtered events for anomaly detection")
print("- event_patterns_analysis.json: Pattern analysis results")

## 7. Integration with Coordination Engine

This section demonstrates how to integrate event analysis with the self-healing coordination engine.

In [None]:
# Import MCP client for coordination engine integration
try:
    from mcp_client import get_cluster_health_client
except ImportError:
    mcp_available = False
    print("‚ö†Ô∏è MCP client not available - using simulation mode")

    # Fallback MCP client simulation
    class SimulatedMCPClient:
        """Stand-in client that answers every query with a fixed response."""

        def query_anomaly_patterns(self, data):
            """Ignore ``data`` and return a response flagged as simulated."""
            return dict(
                status='simulated',
                simulated=True,
                anomalies=[],
                message='MCP client not available - using simulation',
            )

    def get_cluster_health_client():
        """Return a new simulated client."""
        return SimulatedMCPClient()
else:
    mcp_available = True
    print("‚úÖ MCP client imported")

def send_events_to_coordination_engine(events_df, patterns):
    """
    Summarize the events and submit the summary to the coordination engine

    Args:
        events_df: Events whose row count is reported as ``total_events``
        patterns: Analysis results providing ``warning_percentage``,
            ``critical_events``, ``reasons`` and ``namespaces``

    Returns:
        The client's response, or ``{'status': 'failed', 'error': <message>}``
        when the submission raises.
    """
    print("üîó Integrating with coordination engine...")

    # Get Cluster Health MCP client
    mcp_client = get_cluster_health_client()

    # Gather the pieces of the summary
    sent_at = datetime.now().isoformat()
    total_events = len(events_df)
    warning_pct = patterns['warning_percentage']
    critical_count = patterns['critical_events']
    reason_counts = patterns['reasons']
    top_reasons = reason_counts.head(5).to_dict()
    namespace_distribution = patterns['namespaces'].to_dict()

    anomaly_indicators = {
        'high_warning_rate': warning_pct > 30,
        'critical_events_present': critical_count > 0,
        'scheduling_issues': 'FailedScheduling' in reason_counts.index
    }

    event_summary = {
        'timestamp': sent_at,
        'total_events': total_events,
        'warning_percentage': warning_pct,
        'critical_events': critical_count,
        'top_reasons': top_reasons,
        'namespace_distribution': namespace_distribution,
        'anomaly_indicators': anomaly_indicators
    }

    # Send to coordination engine; any failure becomes a 'failed' response
    try:
        if anomaly_indicators['critical_events_present']:
            severity = 'high'
        else:
            severity = 'medium'

        request = {
            'source': 'openshift-events',
            'data': event_summary,
            'severity': severity
        }
        response = mcp_client.query_anomaly_patterns(request)

        print(f"‚úÖ Event data sent to coordination engine")
        if response.get('simulated'):
            print(f"   (Using simulation mode - MCP server not available)")
        return response

    except Exception as e:
        print(f"‚ö†Ô∏è Failed to send to coordination engine: {e}")
        return {'status': 'failed', 'error': str(e)}

# Submit the filtered events together with the analysis results
coordination_response = send_events_to_coordination_engine(filtered_events, patterns)

# Closing guidance, printed one line at a time
for line in (
    "\nüéØ Next Steps:",
    "1. Review event patterns for anomalies",
    "2. Set up real-time event monitoring",
    "3. Configure alerting thresholds",
    "4. Integrate with anomaly detection models",
    "\nüìö Related Notebooks:",
    "- 02-anomaly-detection/: Use events data for ML model training",
    "- 03-self-healing-logic/: Implement event-driven remediation",
):
    print(line)