# ML Pipeline: Data Breach Regulatory Action Prediction

This notebook orchestrates the complete ML pipeline:
1. Setup & Configuration
2. Data Loading & EDA
3. Data Splitting
4. Data Preprocessing
5. Model Training
6. Model Evaluation
7. Summary & Recommendations

## 1. Setup & Configuration

In [None]:
import sys
import os
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / 'src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import pipeline modules
from config import (
    RANDOM_SEED, TARGET_VARIABLE, MODELS_DIR,
    METRICS_DIR, FIGURES_DIR
)
from data_loader import DataLoader
from preprocessor import DataPreprocessor
from trainer import ModelTrainer
from evaluator import ModelEvaluator
from utils import setup_output_dirs, get_timestamp

# Seed NumPy's global RNG so anything drawing from np.random is repeatable
np.random.seed(RANDOM_SEED)

# Project helper; presumably creates the models/metrics/figures folders
# (its implementation is not visible from this notebook -- confirm in utils)
setup_output_dirs()

# Echo the active configuration so it is captured in the cell output
print(
    f"Pipeline initialized. Random seed: {RANDOM_SEED}",
    f"Target variable: {TARGET_VARIABLE}",
    sep="\n",
)

## 2. Data Loading & EDA

In [None]:
# Build the project loader and read the dataset; `df` is used as a
# pandas DataFrame by the summaries below
loader = DataLoader()
df = loader.load_dataset()

# Quick structural overview: how many columns of each dtype, and the ten
# columns with the most missing entries
dtype_counts = df.dtypes.value_counts()
top_missing = df.isnull().sum().sort_values(ascending=False).head(10)

print(f"\nDataset shape: {df.shape}")
print(f"\nData types:\n{dtype_counts}")
print(f"\nMissing values:\n{top_missing}")

In [None]:
# Run the loader's schema validation before the data is used any further
loader.validate_schema()

# Fetch the per-class counts of the target. Only the heading is printed here;
# presumably get_target_distribution() reports the numbers itself -- confirm.
print("\nTarget variable distribution:")
target_dist = loader.get_target_distribution()

# Bar chart of the class counts, drawn through the explicit Figure/Axes API.
# target_dist is consumed as a mapping (keys -> bar labels, values -> heights).
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(target_dist.keys(), target_dist.values())
ax.set_xlabel('Regulatory Action')
ax.set_ylabel('Count')
ax.set_title(f'Target Variable Distribution: {TARGET_VARIABLE}')
fig.tight_layout()

# Persist the figure first, then render it in the notebook
fig.savefig(FIGURES_DIR / 'target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Data Splitting

In [None]:
# Ask the loader for the train/test partition and have it persist the splits
X_train, X_test, y_train, y_test = loader.split_data()
loader.save_splits()

# Show the class balance of each partition's target
for split_label, split_target in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_label} set class distribution:")
    print(split_target.value_counts())

## 4. Data Preprocessing

In [None]:
# The preprocessor is fitted on the training split only; the test split is
# passed through transform() so it reuses whatever was learned during fitting
preprocessor = DataPreprocessor()
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Persist the fitted preprocessor next to the models
preprocessor.save(MODELS_DIR / 'preprocessor.pkl')

# Report the resulting matrix shapes for both splits
print(
    f"\nPreprocessed training shape: {X_train_processed.shape}",
    f"Preprocessed test shape: {X_test_processed.shape}",
    sep="\n",
)

## 5. Model Training

In [None]:
# Train every model the trainer provides on the preprocessed training data;
# the result is used as a name -> model mapping
trainer = ModelTrainer()
models = trainer.train_all_models(X_train_processed, y_train.values)

# Write each trained model to MODELS_DIR as '<name>_model.pkl'
for trained_name in models:
    models[trained_name].save(MODELS_DIR / f'{trained_name}_model.pkl')

## 6. Model Evaluation

In [None]:
# Set up the evaluator and collect the preprocessor's output feature names
evaluator = ModelEvaluator()
feature_names = preprocessor.get_feature_names()

# Build the evaluation report for all trained models on the held-out test
# split; the returned table is indexed and printed below as a DataFrame
comparison_df = evaluator.generate_report(
    models,
    X_test_processed,
    y_test.values,
    feature_names,
)

# Show the comparison table rounded to four decimals
print("\n\nFinal Model Comparison:", comparison_df.round(4), sep="\n")

## 7. Summary & Recommendations

In [None]:
# Final report: restate the dataset sizes, the models that were trained, the
# best performer by ROC-AUC, and where the artifacts were written.
rule = '=' * 60

print(f"\n{rule}")
print("PIPELINE EXECUTION SUMMARY")
print(rule)

print(f"\n✓ Data loaded: {len(df)} records")
print(f"✓ Training set: {len(X_train)} samples")
print(f"✓ Test set: {len(X_test)} samples")
print(f"✓ Features after preprocessing: {X_train_processed.shape[1]}")

# Derive the count and names from what was actually trained instead of
# hard-coding them, so the summary stays truthful if the trainer changes
trained_names = ', '.join(str(name) for name in models)
print(f"✓ Models trained: {len(models)} ({trained_names})")

# Best model = the row of the comparison table with the highest ROC-AUC
best_model = comparison_df['roc_auc'].idxmax()
best_auc = comparison_df['roc_auc'].max()

# str() guards against a non-string index label before upper-casing
print(f"\n🏆 Best Model: {str(best_model).upper()}")
print(f"   ROC-AUC: {best_auc:.4f}")
print(f"   Accuracy: {comparison_df.loc[best_model, 'accuracy']:.4f}")
print(f"   F1-Score: {comparison_df.loc[best_model, 'f1']:.4f}")

print("\n📁 Output files saved to:")
print(f"   Models: {MODELS_DIR}")
print(f"   Metrics: {METRICS_DIR}")
print(f"   Figures: {FIGURES_DIR}")

print(f"\n⏱️  Pipeline completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(rule)