In [None]:
#!/usr/bin/env python3
"""
Complete Model Comparison Script for Anomaly Detection
======================================================
This script trains and compares multiple ML models on your HDFS anomaly data.

Required files:
- anomaly_label.csv
- event_occurrence_matrix.csv
- event_traces.csv (optional; not read by this script)

Models trained:
1. Logistic Regression (baseline)
2. Random Forest
3. Gradient Boosting
4. XGBoost (if available)
5. LightGBM (if available)
6. Neural Network (MLP)

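Output:
- Performance metrics for all models (printed to the console)
- Visualizations (PNG files)
- Feature importance analysis
- Detailed classification reports (NOTE: classification_report is imported
  but not called anywhere in this script yet)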
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, precision_recall_curve, f1_score, accuracy_score,
    precision_score, recall_score, average_precision_score
)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings('ignore')

# Optional gradient-boosting backends. Each import is attempted once and the
# outcome is recorded in a module-level flag (XGB_AVAILABLE / LGB_AVAILABLE)
# that train_models() checks before adding the corresponding model.
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except ImportError:
    print("‚ö†Ô∏è  XGBoost not available. Install with: pip install xgboost")
    XGB_AVAILABLE = False

try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except ImportError:
    print("‚ö†Ô∏è  LightGBM not available. Install with: pip install lightgbm")
    LGB_AVAILABLE = False

# Global plotting defaults, set once at import time.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")


def load_and_prepare_data(anomaly_path='anomaly_label.csv',
                          event_path='event_occurrence_matrix.csv'):
    """Read the label and event-occurrence CSVs and join them on ``BlockId``.

    Args:
        anomaly_path: CSV with at least ``BlockId`` and ``Label`` columns.
        event_path: CSV with a ``BlockId`` column plus the event columns.

    Returns:
        pandas.DataFrame: every row of the event file (left join) with the
        ``Label`` column taken from the anomaly file.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("LOADING DATA")
    print(rule)

    labels = pd.read_csv(anomaly_path)
    events = pd.read_csv(event_path)

    print(f"‚úÖ Anomaly labels loaded: {labels.shape}")
    print(f"‚úÖ Event occurrence loaded: {events.shape}")

    # Left join: keep every event row even when it has no label entry.
    merged_df = pd.merge(events, labels, on='BlockId', how='left')

    # When both inputs carry a 'Label' column pandas suffixes them; keep the
    # one that came from the anomaly file (the right-hand side, '_y').
    if 'Label_x' in merged_df.columns:
        merged_df = (
            merged_df
            .drop(columns='Label_x')
            .rename(columns={'Label_y': 'Label'})
        )

    print(f"‚úÖ Merged dataset: {merged_df.shape}")
    print("\nLabel distribution:")
    print(merged_df['Label'].value_counts())

    anomaly_rows = (merged_df['Label'] == 'Anomaly').sum()
    anomaly_pct = anomaly_rows / len(merged_df) * 100
    print(f"\nAnomaly percentage: {anomaly_pct:.2f}%")

    if anomaly_pct < 1:
        print("‚ö†Ô∏è  WARNING: Very low anomaly rate - consider using SMOTE or other techniques")

    merged_df.info()

    return merged_df


def analyze_event_patterns(merged_df):
    """Print how event counts differ between normal and anomalous blocks.

    Args:
        merged_df: DataFrame with a ``Label`` column containing the strings
            ``'Normal'`` / ``'Anomaly'`` and one column per event; every
            column whose name starts with ``'E'`` is treated as an event.

    Returns:
        list: the names of the columns that were treated as event columns.
    """
    print("\n" + "="*80)
    print("EVENT PATTERN ANALYSIS")
    print("="*80)

    event_cols = [col for col in merged_df.columns if col.startswith('E')]

    normal_events = merged_df[merged_df['Label'] == 'Normal'][event_cols]
    anomaly_events = merged_df[merged_df['Label'] == 'Anomaly'][event_cols]

    # Compare per-event averages between the two classes
    comparison_df = pd.DataFrame({
        'Normal_Avg': normal_events.mean(),
        'Anomaly_Avg': anomaly_events.mean(),
        'Difference': anomaly_events.mean() - normal_events.mean()
    })
    # The small constant keeps the ratio finite for events that never occur
    # in normal blocks (it also makes those percentages very large).
    comparison_df['Percent_Change'] = (
        (comparison_df['Difference'] / (comparison_df['Normal_Avg'] + 0.0001)) * 100
    )

    print("\nüìä Top 10 Event Differences (Anomaly - Normal):")
    print(comparison_df.sort_values('Difference', ascending=False).head(10).round(2))

    # Total events per block, averaged within each class
    normal_total = normal_events.sum(axis=1).mean()
    anomaly_total = anomaly_events.sum(axis=1).mean()
    total_diff = anomaly_total - normal_total

    # Build the relative change separately so that a negative difference is
    # reported as "fewer" (not "+-x ... more") and an empty/zero baseline
    # does not produce inf/nan percentages.
    if pd.isna(normal_total) or normal_total == 0:
        relative = "n/a"
    else:
        pct = (anomaly_total / normal_total - 1) * 100
        direction = "more" if pct >= 0 else "fewer"
        relative = f"{abs(pct):.1f}% {direction}"

    print(f"\nüìà Total Events per Block:")
    print(f"   Normal:  {normal_total:.2f} events")
    print(f"   Anomaly: {anomaly_total:.2f} events")
    print(f"   Difference: {total_diff:+.2f} ({relative})")

    # Event diversity: number of distinct event types seen in a block
    normal_diversity = (normal_events > 0).sum(axis=1).mean()
    anomaly_diversity = (anomaly_events > 0).sum(axis=1).mean()

    print(f"\nüéØ Event Diversity (unique events triggered):")
    print(f"   Normal:  {normal_diversity:.2f} unique events")
    print(f"   Anomaly: {anomaly_diversity:.2f} unique events")

    return event_cols


def prepare_features(merged_df):
    """Build the numeric feature matrix and the binary target.

    Args:
        merged_df: DataFrame containing ``BlockId``, ``Label`` and the
            feature columns (optionally a ``Type`` column).

    Returns:
        tuple: ``(X, y)`` where ``X`` is the feature DataFrame with missing
        values replaced by 0 and ``y`` is 1 for ``Label == 'Anomaly'`` and
        0 for anything else.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("PREPARING FEATURES")
    print(rule)

    # Everything except the identifier and the label is a candidate feature.
    X = merged_df.drop(columns=['BlockId', 'Label'])

    if 'Type' in X.columns:
        print("Found 'Type' column - converting to binary feature")
        # NOTE(review): 21.0 is treated as "fail" here; the meaning of the
        # Type codes is not visible in this file - confirm, and check that
        # this column does not leak the label.
        type_is_fail = X['Type'].eq(21.0).astype(int)
        X = X.drop(columns='Type').assign(Type_IsFail=type_is_fail)

    X = X.fillna(0)

    # Binary target: 1 = anomaly, 0 = everything else.
    y = merged_df['Label'].eq('Anomaly').astype(int)

    column_names = list(X.columns)
    suffix = '...' if len(column_names) > 10 else ''
    print(f"\n‚úÖ Feature matrix: {X.shape}")
    print(f"‚úÖ Features: {column_names[:10]}{suffix}")

    return X, y


def split_and_scale_data(X, y, test_size=0.2, random_state=42):
    """Make a stratified train/test split and standardised feature copies.

    Args:
        X: feature matrix.
        y: binary target aligned with ``X``.
        test_size: fraction of rows held out for testing.
        random_state: seed passed to ``train_test_split``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, X_train_scaled,
        X_test_scaled, scaler)``. The scaler is fitted on the training rows
        only and then applied to both splits.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("SPLITTING AND SCALING DATA")
    print(rule)

    # stratify=y keeps the class ratio the same in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    print(f"‚úÖ Train set: {X_train.shape[0]:,} samples ({y_train.sum():,} anomalies)")
    print(f"‚úÖ Test set:  {X_test.shape[0]:,} samples ({y_test.sum():,} anomalies)")

    # Fit on the training split only so no test statistics reach the scaler.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("‚úÖ Features scaled (for LR and NN)")

    return X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, scaler


def train_models(X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled):
    """Fit every available model and score it on the test split.

    Logistic Regression and the Neural Network are trained and evaluated on
    the scaled matrices; all other models use the unscaled ones.

    Args:
        X_train, X_test: unscaled feature matrices.
        y_train, y_test: binary targets (1 = anomaly).
        X_train_scaled, X_test_scaled: standardised feature matrices.

    Returns:
        tuple: ``(results, trained_models)``. ``results`` maps model name to
        a dict of metrics plus the test-set ``predictions`` and
        ``probabilities``; ``trained_models`` maps model name to the fitted
        estimator.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TRAINING MODELS")
    print(rule)

    # Ratio of negatives to positives, used to re-weight the positive class.
    n_normal = (y_train == 0).sum()
    n_anomaly = (y_train == 1).sum()
    scale_pos_weight = n_normal / n_anomaly
    print(f"\nClass weight ratio: {scale_pos_weight:.2f} (Normal/Anomaly)")

    models = _build_models(scale_pos_weight)

    print(f"\nüöÄ Training {len(models)} models...\n")

    needs_scaling = ('Logistic Regression', 'Neural Network')
    results = {}
    trained_models = {}

    for name, model in models.items():
        print(f"Training {name}...", end=' ')

        if name in needs_scaling:
            fit_matrix, eval_matrix = X_train_scaled, X_test_scaled
        else:
            fit_matrix, eval_matrix = X_train, X_test

        model.fit(fit_matrix, y_train)

        y_pred = model.predict(eval_matrix)
        # Column 1 of predict_proba is the probability of the positive class.
        y_pred_proba = model.predict_proba(eval_matrix)[:, 1]

        scores = _score_predictions(y_test, y_pred, y_pred_proba)
        results[name] = scores
        trained_models[name] = model

        print(f"‚úÖ F1: {scores['f1']:.4f} | ROC-AUC: {scores['roc_auc']:.4f}")

    print("\n" + rule)
    print("‚úÖ ALL MODELS TRAINED!")
    print(rule)

    return results, trained_models


def _build_models(scale_pos_weight):
    """Return the name -> unfitted estimator mapping in training order."""
    models = {}

    models['Logistic Regression'] = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    )

    models['Random Forest'] = RandomForestClassifier(
        n_estimators=100,
        max_depth=15,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

    models['Gradient Boosting'] = GradientBoostingClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    )

    models['Neural Network'] = MLPClassifier(
        hidden_layer_sizes=(128, 64, 32),
        activation='relu',
        max_iter=500,
        early_stopping=True,
        random_state=42
    )

    # Optional backends are appended only when their import succeeded.
    if XGB_AVAILABLE:
        models['XGBoost'] = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            scale_pos_weight=scale_pos_weight,
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss'
        )

    if LGB_AVAILABLE:
        models['LightGBM'] = lgb.LGBMClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )

    return models


def _score_predictions(y_test, y_pred, y_pred_proba):
    """Compute the metric dict stored per model, keeping the raw outputs."""
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
        'avg_precision': average_precision_score(y_test, y_pred_proba),
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }


def compare_models(results):
    """Print a metric table, per-metric winners and an F1 ranking.

    Args:
        results: mapping of model name to a dict that contains at least the
            scalar entries ``accuracy``, ``precision``, ``recall``, ``f1``,
            ``roc_auc`` and ``avg_precision`` (extra entries such as the
            prediction arrays are ignored).

    Returns:
        pandas.DataFrame: one row per model, one float column per metric,
        rounded to 4 decimals.
    """
    print("\n" + "="*80)
    print("MODEL PERFORMANCE COMPARISON")
    print("="*80)

    metric_cols = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'avg_precision']

    # Build the table from the scalar metrics only. Each result dict also
    # carries the prediction/probability arrays; feeding the whole dict to
    # DataFrame() yields object-dtype columns, on which round() is a no-op
    # and idxmax() is rejected by pandas.
    rows = {
        name: [res[metric] for metric in metric_cols]
        for name, res in results.items()
    }
    comparison_df = (
        pd.DataFrame.from_dict(rows, orient='index', columns=metric_cols)
        .astype(float)
        .round(4)
    )

    print("\n" + comparison_df.to_string())

    # Best model for each metric
    print("\nüèÜ BEST MODELS BY METRIC:")
    for metric in comparison_df.columns:
        best_model = comparison_df[metric].idxmax()
        best_score = comparison_df[metric].max()
        print(f"  {metric.upper():15s}: {best_model:25s} ({best_score:.4f})")

    # Overall ranking
    print("\nüìà OVERALL RANKING (by F1 Score):")
    ranked = comparison_df.sort_values('f1', ascending=False)
    for i, (model, row) in enumerate(ranked.iterrows(), 1):
        # One star per 0.2 of F1, capped at five.
        stars = "‚≠ê" * min(int(row['f1'] * 5), 5)
        print(f"  {i}. {model:25s} {stars} | F1={row['f1']:.4f} | AUC={row['roc_auc']:.4f}")

    return comparison_df


def analyze_feature_importance(trained_models, X):
    """Print the ten most important features of each tree-based model.

    Args:
        trained_models: mapping of model name to fitted estimator. Only the
            entries named 'Random Forest', 'XGBoost', 'LightGBM' or
            'Gradient Boosting' are inspected (via ``feature_importances_``).
        X: feature DataFrame whose column names label the importances.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("FEATURE IMPORTANCE ANALYSIS")
    print(rule)

    tree_model_names = ('Random Forest', 'XGBoost', 'LightGBM', 'Gradient Boosting')

    for model_name, model in trained_models.items():
        if model_name not in tree_model_names:
            continue

        top_features = (
            pd.DataFrame({
                'Feature': X.columns,
                'Importance': model.feature_importances_
            })
            .sort_values('Importance', ascending=False)
            .head(10)
        )

        print(f"\nüéØ {model_name} - Top 10 Features:")
        pairs = zip(top_features['Feature'].tolist(),
                    top_features['Importance'].tolist())
        for feature, importance in pairs:
            # Text bar: 50 characters would correspond to an importance of 1.0.
            bar = "‚ñà" * int(importance * 50)
            print(f"  {feature:15s} {bar} {importance:.4f}")


def create_visualizations(results, y_test, comparison_df, trained_models, X):
    """Create and save all comparison figures as PNG files.

    Files written to the current directory: ``model_comparison.png``,
    ``roc_curves.png``, ``confusion_matrices.png`` and, when at least one
    tree-based model was trained, ``feature_importance.png``.

    Args:
        results: mapping of model name to its result dict (needs ``f1``,
            ``roc_auc``, ``predictions`` and ``probabilities``).
        y_test: true test labels.
        comparison_df: per-model metric table (rows = models).
        trained_models: mapping of model name to fitted estimator.
        X: feature DataFrame; its column names label the importances.
    """
    print("\n" + "="*80)
    print("CREATING VISUALIZATIONS")
    print("="*80)

    # 1. Model Comparison
    print("\n1. Creating model comparison chart...", end=' ')
    _plot_metric_comparison(comparison_df)
    print("‚úÖ")

    # 2. ROC Curves
    print("2. Creating ROC curves...", end=' ')
    _plot_roc_curves(results, y_test)
    print("‚úÖ")

    # 3. Confusion Matrices
    print("3. Creating confusion matrices...", end=' ')
    _plot_confusion_matrices(results, y_test)
    print("‚úÖ")

    # 4. Feature Importance (for tree-based models)
    tree_based = [m for m in trained_models.keys()
                  if m in ['Random Forest', 'XGBoost', 'LightGBM', 'Gradient Boosting']]

    if tree_based:
        print("4. Creating feature importance charts...", end=' ')
        _plot_feature_importance(trained_models, X, tree_based)
        print("‚úÖ")

    print("\n‚úÖ All visualizations saved:")
    print("   - model_comparison.png")
    print("   - roc_curves.png")
    print("   - confusion_matrices.png")
    if tree_based:
        print("   - feature_importance.png")


def _plot_metric_comparison(comparison_df):
    """Save model_comparison.png: one horizontal bar chart per metric."""
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()

    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'avg_precision']
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC', 'Avg Precision']

    for ax, metric, metric_name in zip(axes, metrics, metric_names):
        # Coerce to float so the column is plottable even if the table was
        # built with object dtype.
        data = comparison_df[metric].astype(float).sort_values(ascending=True)
        colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(data)))
        data.plot(kind='barh', ax=ax, color=colors)
        ax.set_title(f'{metric_name}', fontsize=13, fontweight='bold')
        ax.set_xlabel('Score')
        ax.set_xlim(0, 1)
        ax.grid(axis='x', alpha=0.3)

        # Value label just to the right of each bar.
        for i, v in enumerate(data.values):
            ax.text(v + 0.02, i, f'{v:.3f}', va='center', fontsize=10)

    fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')
    fig.tight_layout()
    fig.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
    plt.close(fig)


def _plot_roc_curves(results, y_test):
    """Save roc_curves.png: one ROC curve per model plus the chance line."""
    fig, ax = plt.subplots(figsize=(12, 8))
    colors = plt.cm.tab10(np.linspace(0, 1, len(results)))

    for (name, result), color in zip(results.items(), colors):
        fpr, tpr, _ = roc_curve(y_test, result['probabilities'])
        auc_score = result['roc_auc']
        ax.plot(fpr, tpr, label=f"{name} (AUC={auc_score:.3f})", linewidth=2.5, color=color)

    ax.plot([0, 1], [0, 1], 'k--', label='Random (AUC=0.500)', linewidth=2)
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curves - Model Comparison', fontsize=14, fontweight='bold')
    ax.legend(loc='lower right', fontsize=10)
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig('roc_curves.png', dpi=300, bbox_inches='tight')
    plt.close(fig)


def _plot_confusion_matrices(results, y_test):
    """Save confusion_matrices.png: one heatmap per model, three per row."""
    n_cols = 3
    # At least the original 2x3 grid; extra rows are added only when more
    # than six models are present so none is silently dropped.
    n_rows = max(2, (len(results) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows))
    axes = axes.flatten()

    for ax, (name, result) in zip(axes, results.items()):
        cm = confusion_matrix(y_test, result['predictions'])

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                    cbar_kws={'label': 'Count'},
                    xticklabels=['Normal', 'Anomaly'],
                    yticklabels=['Normal', 'Anomaly'])

        ax.set_title(f'{name}\nF1={result["f1"]:.3f}', fontsize=12, fontweight='bold')
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

    # Hide the grid cells that have no model.
    for ax in axes[len(results):]:
        ax.axis('off')

    fig.suptitle('Confusion Matrices - Model Comparison', fontsize=16, fontweight='bold')
    fig.tight_layout()
    fig.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
    plt.close(fig)


def _plot_feature_importance(trained_models, X, tree_based):
    """Save feature_importance.png: top-15 features for up to four models."""
    fig, axes = plt.subplots(2, 2, figsize=(18, 14))
    axes = axes.flatten()

    # zip() stops at the four available axes.
    for ax, model_name in zip(axes, tree_based):
        importances = trained_models[model_name].feature_importances_

        # Ascending sort + tail(15) puts the most important feature at the
        # top of the horizontal bar chart.
        importance_df = pd.DataFrame({
            'Feature': X.columns,
            'Importance': importances
        }).sort_values('Importance', ascending=True).tail(15)

        importance_df.plot(x='Feature', y='Importance', kind='barh', ax=ax,
                           legend=False, color='steelblue')
        ax.set_title(f'{model_name}\nTop 15 Features', fontsize=12, fontweight='bold')
        ax.set_xlabel('Importance')
        ax.set_ylabel('')

    # Hide unused subplots
    for ax in axes[len(tree_based):]:
        ax.axis('off')

    fig.suptitle('Feature Importance Comparison', fontsize=16, fontweight='bold')
    fig.tight_layout()
    fig.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.close(fig)


def main():
    """Run the whole pipeline: load, analyse, train, compare, plot.

    Returns:
        tuple: ``(results, trained_models, comparison_df)`` on success, or
        ``(None, None, None)`` when an input file is missing or any other
        exception occurs (the error is printed instead of being raised).
    """
    rule = "=" * 80
    print("\n" + rule)
    print("ANOMALY DETECTION: COMPREHENSIVE MODEL COMPARISON")
    print(rule)

    failure = (None, None, None)

    try:
        merged_df = load_and_prepare_data()

        # The returned column list is not needed here; the call is made for
        # its printed analysis.
        _event_cols = analyze_event_patterns(merged_df)

        X, y = prepare_features(merged_df)

        split = split_and_scale_data(X, y)
        X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, _scaler = split

        results, trained_models = train_models(
            X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled
        )

        comparison_df = compare_models(results)

        analyze_feature_importance(trained_models, X)

        create_visualizations(results, y_test, comparison_df, trained_models, X)

        print("\n" + rule)
        print("‚úÖ ANALYSIS COMPLETE!")
        print(rule)

        # The "best" model is the one with the highest F1 score.
        print("\nüìå SUMMARY:")
        winner = comparison_df['f1'].idxmax()
        winner_f1 = comparison_df.loc[winner, 'f1']
        winner_auc = comparison_df.loc[winner, 'roc_auc']

        print(f"\nüèÜ Best Overall Model: {winner}")
        print(f"   F1 Score: {winner_f1:.4f}")
        print(f"   ROC-AUC:  {winner_auc:.4f}")

        print("\nüìä Next Steps:")
        next_steps = (
            "   1. Review the generated PNG files",
            "   2. Analyze feature importance results",
            "   3. Consider hyperparameter tuning for best models",
            "   4. Examine misclassified samples for insights",
            "   5. Deploy the best model to production",
        )
        for step in next_steps:
            print(step)

        return results, trained_models, comparison_df

    except FileNotFoundError as e:
        print("\n‚ùå ERROR: Could not find required file")
        print(f"   {e}")
        print("\n   Please ensure these files are in the current directory:")
        print("   - anomaly_label.csv")
        print("   - event_occurrence_matrix.csv")
        return failure

    except Exception as e:
        # Top-level boundary: report the failure with a traceback rather
        # than letting it propagate to the caller.
        import traceback
        print(f"\n‚ùå ERROR: {e}")
        traceback.print_exc()
        return failure


# Script entry point: run the pipeline and bind main()'s three return values
# to module-level names. All three are None when main() reported an error.
if __name__ == "__main__":
    results, trained_models, comparison_df = main()


ANOMALY DETECTION: COMPREHENSIVE MODEL COMPARISON

LOADING DATA
‚úÖ Anomaly labels loaded: (575061, 2)
‚úÖ Event occurrence loaded: (575061, 32)
‚úÖ Merged dataset: (575061, 32)

Label distribution:
Label
Normal     558223
Anomaly     16838
Name: count, dtype: int64

Anomaly percentage: 2.93%

EVENT PATTERN ANALYSIS

üìä Top 10 Event Differences (Anomaly - Normal):
     Normal_Avg  Anomaly_Avg  Difference  Percent_Change
E20         0.0         0.32        0.32        63439.52
E6          0.0         0.27        0.26         5458.89
E25         0.0         0.26        0.26         5590.39
E18         0.0         0.26        0.26         5590.39
E16         0.0         0.26        0.26         5510.91
E7          0.0         0.20        0.20       202874.45
E13         0.0         0.09        0.09        86946.19
E28         0.0         0.08        0.08        62355.13
E27         0.0         0.06        0.06        54777.35
E14         0.0         0.01        0.01         9205.37

ü