# ML Scheduler Data Exploration - Week 1
## Historical Cluster Analysis for Intelligent Pod Placement

**Objective**: Analyze 30+ days of historical cluster data to identify patterns for ML scheduler development.

**Success Criteria**:
- Identify temporal patterns in cluster usage
- Discover node/workload correlations
- Establish baseline performance metrics
- Quantify optimization opportunities

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# MLflow for experiment tracking
import mlflow
import mlflow.sklearn

# Set up plotting
# NOTE(review): the seaborn palette is set after the matplotlib style sheet is
# applied; presumably both affect the default colour cycle, so keep this order
# unless the interaction has been checked.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Banner so the notebook output records when this exploration run started.
print("Libraries imported successfully")
print(f"Data exploration started at: {datetime.now()}")

In [None]:
# Load the most recent processed training dataset into `df`.
# When no matching parquet file exists, `df` becomes an empty DataFrame so
# that the `if not df.empty:` guards in the later cells skip their analysis.
import os
import glob

data_path = "/home/jovyan/data/processed/"

# Every processed snapshot matching the naming pattern under `data_path`.
parquet_files = glob.glob(f"{data_path}ml_scheduler_training_data_*.parquet")

if not parquet_files:
    print("No processed data found. Run data collection first.")
    df = pd.DataFrame()
else:
    # "Latest" is decided by os.path.getctime.
    # NOTE(review): on Unix, ctime is the last metadata change rather than the
    # creation time — confirm this picks the intended file.
    latest_file = max(parquet_files, key=os.path.getctime)
    df = pd.read_parquet(latest_file)

    # Quick summary of what was loaded.
    print(f"Loaded dataset: {latest_file}")
    print(f"Shape: {df.shape}")
    print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f"Nodes: {df['node'].unique()}")

In [None]:
# Data quality assessment: size, missing values, time span and per-node counts.
if not df.empty:
    print("=== DATA QUALITY ASSESSMENT ===")

    total_records = len(df)
    print(f"Total records: {total_records:,}")
    print(f"Features: {len(df.columns)}")
    print(f"Nodes covered: {len(df['node'].unique())}")

    # Percentage of null values per column; only columns with gaps are shown,
    # worst first.
    missing_pct = (df.isnull().sum() / total_records) * 100
    columns_with_gaps = missing_pct[missing_pct > 0]
    print("\nMissing data by feature:")
    print(columns_with_gaps.sort_values(ascending=False))

    # Span between the first and last timestamp, as whole days plus the
    # leftover whole hours.
    first_seen = df['timestamp'].min()
    last_seen = df['timestamp'].max()
    time_diff = last_seen - first_seen
    leftover_hours = time_diff.seconds // 3600
    print(f"\nTemporal coverage: {time_diff.days} days, {leftover_hours} hours")

    # Row count per node, to spot nodes with sparse coverage.
    records_per_node = df['node'].value_counts()
    print("\nRecords per node:")
    print(records_per_node)

In [None]:
# Temporal pattern analysis: a 2x2 grid showing mean CPU/memory/load per hour
# of day and mean CPU per day of week.
if not df.empty:
    print("=== TEMPORAL PATTERN ANALYSIS ===")

    # Mean of each metric per hour of day. `hourly_cpu` is read again by the
    # analysis-summary cell (peak/low hour), so its name must stay.
    hourly_cpu = df.groupby('hour')['cpu_usage_rate'].mean()
    hourly_memory = df.groupby('hour')['memory_usage'].mean()
    hourly_load = df.groupby('hour')['load_1m'].mean()

    # Mean CPU per weekday as observed in the data.
    daily_cpu = df.groupby('day_of_week')['cpu_usage_rate'].mean()
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    # The bar chart has seven fixed slots. If the data does not contain every
    # weekday, `daily_cpu` has fewer than seven values and plotting it against
    # range(7) fails with a shape mismatch, so align it onto 0..6 first
    # (missing days become NaN and are drawn as gaps).
    # Assumes `day_of_week` is encoded as integers 0-6 with 0=Monday, matching
    # the labels used here — TODO confirm against the feature pipeline.
    weekday_slots = range(len(day_names))
    daily_cpu_by_slot = daily_cpu.reindex(weekday_slots)

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # The three hourly line panels share the same layout; only the series,
    # labels and colour differ. An empty kwargs dict keeps the default colour.
    hourly_panels = [
        (axes[0, 0], hourly_cpu, 'Average CPU Usage by Hour', 'CPU Usage Rate', {}),
        (axes[1, 0], hourly_memory, 'Average Memory Usage by Hour', 'Memory Usage',
         {'color': 'orange'}),
        (axes[1, 1], hourly_load, 'Average Load by Hour', 'Load (1min)',
         {'color': 'green'}),
    ]
    for ax, series, title, ylabel, line_kwargs in hourly_panels:
        ax.plot(series.index, series.values, marker='o', **line_kwargs)
        ax.set_title(title)
        ax.set_xlabel('Hour of Day')
        ax.set_ylabel(ylabel)
        ax.grid(True)

    # Day-of-week bar chart (top-right panel).
    weekday_ax = axes[0, 1]
    weekday_ax.bar(weekday_slots, daily_cpu_by_slot.values)
    weekday_ax.set_title('Average CPU Usage by Day of Week')
    weekday_ax.set_xlabel('Day of Week')
    weekday_ax.set_ylabel('CPU Usage Rate')
    weekday_ax.set_xticks(weekday_slots)
    weekday_ax.set_xticklabels(day_names)

    plt.tight_layout()
    plt.show()

In [None]:
# Node comparison analysis: per-node summary statistics and side-by-side bar
# charts of mean CPU, memory and load.
if not df.empty:
    print("=== NODE PERFORMANCE COMPARISON ===")

    by_node = df.groupby('node')

    # mean / std / max of each tracked metric per node, rounded for display.
    tracked_metrics = ['cpu_usage_rate', 'memory_usage', 'load_1m']
    node_stats = by_node.agg(
        {metric: ['mean', 'std', 'max'] for metric in tracked_metrics}
    ).round(4)

    print("Node performance statistics:")
    print(node_stats)

    # Per-node means used for the charts. `node_cpu_mean` is read again by
    # the analysis-summary cell (busiest / least busy node).
    node_cpu_mean = by_node['cpu_usage_rate'].mean()
    node_memory_mean = by_node['memory_usage'].mean()
    node_load_mean = by_node['load_1m'].mean()

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # One bar chart per metric; the empty kwargs dict keeps the default
    # colour for the CPU panel.
    panels = [
        (node_cpu_mean, 'Average CPU Usage by Node', 'CPU Usage Rate', {}),
        (node_memory_mean, 'Average Memory Usage by Node', 'Memory Usage',
         {'color': 'orange'}),
        (node_load_mean, 'Average Load by Node', 'Load (1min)',
         {'color': 'green'}),
    ]
    for ax, (means, title, ylabel, bar_kwargs) in zip(axes, panels):
        ax.bar(means.index, means.values, **bar_kwargs)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Correlation analysis for feature selection: heatmap of all numeric features
# plus a printed list of strongly correlated pairs.
if not df.empty:
    print("=== CORRELATION ANALYSIS ===")

    # Pairwise correlation over the numeric columns only.
    numeric_features = df.select_dtypes(include=[np.number]).columns
    correlation_matrix = df[numeric_features].corr()

    # Annotated heatmap, colour scale centred on zero.
    plt.figure(figsize=(12, 10))
    heatmap_options = dict(annot=True, cmap='coolwarm', center=0,
                           square=True, linewidths=0.5, fmt='.2f')
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Walk the strict upper triangle (row-major) so each unordered pair is
    # visited once and the diagonal is skipped; keep pairs with |r| > 0.7.
    feature_names = correlation_matrix.columns
    upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
    high_corr_pairs = [
        (feature_names[row], feature_names[col], correlation_matrix.iloc[row, col])
        for row, col in zip(upper_rows, upper_cols)
        if abs(correlation_matrix.iloc[row, col]) > 0.7
    ]

    print("\nHigh correlation pairs (>0.7):")
    for left, right, strength in high_corr_pairs:
        print(f"{left} <-> {right}: {strength:.3f}")

In [None]:
# Baseline performance metrics establishment: cluster-wide utilization stats,
# the gap to the CPU target, and an MLflow run recording both.
if not df.empty:
    print("=== BASELINE PERFORMANCE METRICS ===")

    cpu_series = df['cpu_usage_rate']
    memory_series = df['memory_usage']
    load_series = df['load_1m']

    # Current cluster utilization metrics (key order drives the print order).
    baseline_metrics = {
        'avg_cpu_utilization': cpu_series.mean(),
        'max_cpu_utilization': cpu_series.max(),
        'avg_memory_utilization': memory_series.mean(),
        'max_memory_utilization': memory_series.max(),
        'avg_load_1m': load_series.mean(),
        'max_load_1m': load_series.max(),
        'cpu_utilization_std': cpu_series.std(),
        'memory_utilization_std': memory_series.std(),
    }

    print("Current cluster baseline metrics:")
    for metric_name, metric_value in baseline_metrics.items():
        print(f"{metric_name}: {metric_value:.4f}")

    # Relative distance of the current mean CPU utilization from the target,
    # as a percentage of the current value. `optimization_potential` is read
    # again by the analysis-summary cell.
    target_cpu = 0.65  # 65% target from plan
    current_avg_cpu = baseline_metrics['avg_cpu_utilization']
    cpu_gap = current_avg_cpu - target_cpu
    optimization_potential = cpu_gap / current_avg_cpu * 100

    print("\nOptimization Analysis:")
    print(f"Current average CPU utilization: {current_avg_cpu:.1%}")
    print(f"Target CPU utilization: {target_cpu:.1%}")
    print(f"Optimization potential: {optimization_potential:.1f}%")

    # Record the baseline in its own MLflow run, with the run context
    # (when, how many rows, how many days) stored as params.
    with mlflow.start_run(run_name="baseline_analysis"):
        mlflow.log_metrics(baseline_metrics)
        mlflow.log_metric("optimization_potential_pct", optimization_potential)
        mlflow.log_param("analysis_date", datetime.now().isoformat())
        mlflow.log_param("data_points", len(df))
        covered_span = df['timestamp'].max() - df['timestamp'].min()
        mlflow.log_param("time_span_days", covered_span.days)

    print("\nBaseline metrics logged to MLflow")

In [None]:
# Resource utilization trends over time: an interactive per-node CPU trend
# chart and a day-of-week x hour CPU heatmap.
if not df.empty:
    print("=== RESOURCE UTILIZATION TRENDS ===")

    # Hourly mean of each metric, computed separately for every node.
    hourly_aggregations = dict.fromkeys(
        ['cpu_usage_rate', 'memory_usage', 'load_1m'], 'mean'
    )
    per_node_hourly = df.set_index('timestamp').groupby('node').resample('1H')
    df_hourly = per_node_hourly.agg(hourly_aggregations).reset_index()

    # One thin CPU line per node, in the order nodes first appear in `df`.
    fig = go.Figure()
    for node in df['node'].unique():
        node_data = df_hourly[df_hourly['node'] == node]
        cpu_trace = go.Scatter(
            x=node_data['timestamp'],
            y=node_data['cpu_usage_rate'],
            mode='lines',
            name=f'{node} CPU',
            line={'width': 1},
        )
        fig.add_trace(cpu_trace)

    fig.update_layout(
        title='CPU Usage Trends by Node',
        xaxis_title='Time',
        yaxis_title='CPU Usage Rate',
        height=500,
    )
    fig.show()

    # Mean CPU and memory for every (day_of_week, hour) combination.
    weekly_columns = ['cpu_usage_rate', 'memory_usage']
    weekly_patterns = (
        df.groupby(['day_of_week', 'hour'])[weekly_columns]
        .mean()
        .reset_index()
    )

    # Heatmap grid: rows are hours, columns are days, cells are mean CPU.
    cpu_heatmap = weekly_patterns.pivot(
        index='hour', columns='day_of_week', values='cpu_usage_rate'
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(cpu_heatmap, annot=True, fmt='.3f', cmap='YlOrRd')
    plt.title('CPU Usage Heatmap: Hour vs Day of Week')
    plt.xlabel('Day of Week (0=Monday)')
    plt.ylabel('Hour of Day')
    plt.show()

In [None]:
# Save analysis results and insights: print the headline findings, write them
# to a JSON file and record them in an MLflow run.
#
# Relies on notebook state from earlier cells: `hourly_cpu` (temporal pattern
# cell), `node_cpu_mean` (node comparison cell) and `optimization_potential`
# (baseline metrics cell). Run those cells first.
if not df.empty:
    print("=== ANALYSIS SUMMARY ===")

    insights = {
        'peak_hour': int(hourly_cpu.idxmax()),
        'low_hour': int(hourly_cpu.idxmin()),
        'peak_cpu_usage': float(hourly_cpu.max()),
        'low_cpu_usage': float(hourly_cpu.min()),
        # Node identifiers as stored in the 'node' column.
        # presumably strings — verify, since they are written to JSON below.
        'busiest_node': node_cpu_mean.idxmax(),
        'least_busy_node': node_cpu_mean.idxmin(),
        'cpu_variation_coefficient': float(df['cpu_usage_rate'].std() / df['cpu_usage_rate'].mean()),
        'optimization_opportunity_pct': float(optimization_potential)
    }

    print("Key insights discovered:")
    print(f"Peak usage hour: {insights['peak_hour']}:00 ({insights['peak_cpu_usage']:.1%} CPU)")
    print(f"Low usage hour: {insights['low_hour']}:00 ({insights['low_cpu_usage']:.1%} CPU)")
    print(f"Busiest node: {insights['busiest_node']}")
    print(f"Least busy node: {insights['least_busy_node']}")
    print(f"CPU variation coefficient: {insights['cpu_variation_coefficient']:.3f}")
    print(f"Optimization opportunity: {insights['optimization_opportunity_pct']:.1f}%")

    # Save insights
    import json
    insights_path = "/home/jovyan/data/analysis_insights.json"
    with open(insights_path, 'w', encoding='utf-8') as f:
        json.dump(insights, f, indent=2)

    print(f"\nInsights saved to: {insights_path}")

    # MLflow metrics must be numeric, but `insights` also carries the two node
    # names. Passing the whole dict to log_metrics is rejected, so numeric
    # entries go to metrics and everything else is logged as params.
    numeric_insights = {
        key: value for key, value in insights.items()
        if isinstance(value, (int, float)) and not isinstance(value, bool)
    }
    descriptive_insights = {
        key: value for key, value in insights.items()
        if key not in numeric_insights
    }

    # Log insights to MLflow
    with mlflow.start_run(run_name="pattern_analysis"):
        mlflow.log_metrics(numeric_insights)
        mlflow.log_params(descriptive_insights)
        mlflow.log_artifact(insights_path)

    print("Analysis completed and logged to MLflow")