# Notebook 12: Complete ML Pipeline

An end-to-end machine learning workflow, from data generation and exploration through model tuning, evaluation, and persistence.

## Learning Objectives
- Build complete ML pipelines
- Combine preprocessing and modeling
- Save and load models
- Structure code so it is ready to deploy

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_curve, roc_auc_score
)
import warnings
# Silence every warning category for the rest of the session to keep the
# cell output tidy.
# NOTE(review): this blanket filter also hides any convergence or
# deprecation warnings raised by the fits below — consider narrowing it.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator for reproducible runs
np.random.seed(42)
# Use the 'seaborn-v0_8-whitegrid' matplotlib style for all later figures
plt.style.use('seaborn-v0_8-whitegrid')

## Part 1: Data Loading and Exploration

In [None]:
# Build a synthetic binary classification problem with a roughly 70/30
# class imbalance. 12 of the 20 features are informative and 4 are
# redundant (derived from the informative ones).
dataset_params = {
    'n_samples': 2000,
    'n_features': 20,
    'n_informative': 12,
    'n_redundant': 4,
    'n_classes': 2,
    'weights': [0.7, 0.3],
    'random_state': 42,
}
X, y = make_classification(**dataset_params)

# Wrap the arrays in a DataFrame: named feature columns plus the label
feature_names = [f'feature_{idx}' for idx in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# Short textual summary: size, class balance, stats of the first features
target_counts = df['target'].value_counts()
feature_stats = df.describe().transpose().head()

print("Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"\nTarget distribution:\n{target_counts}")
print(f"\nFeature statistics:\n{feature_stats}")

In [None]:
# Two-panel overview: class balance on the left, correlation of every
# feature with the target on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
balance_ax, corr_ax = axes

# Left panel: number of samples per class
class_counts = df['target'].value_counts()
class_counts.plot(kind='bar', ax=balance_ax)
balance_ax.set_title('Target Distribution')
balance_ax.set_xlabel('Class')

# Right panel: correlation of each feature with the target, sorted
# ascending; the target's correlation with itself is dropped.
target_corr = df.corr()['target']
correlations = target_corr.drop('target').sort_values()
correlations.plot(kind='barh', ax=corr_ax)
corr_ax.set_title('Feature Correlations with Target')

plt.tight_layout()
plt.show()

## Part 2: Data Splitting

In [None]:
# Split the DataFrame back into the label column and the feature matrix
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for the final evaluation. Stratifying on y keeps
# the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

n_train, n_test = len(X_train), len(X_test)
train_class_counts = y_train.value_counts()

print(f"Training set: {n_train} samples")
print(f"Test set: {n_test} samples")
print(f"\nTraining class distribution:\n{train_class_counts}")

## Part 3: Build Pipeline

In [None]:
# Classifier: an MLP with two hidden layers (100 and 50 units), trained
# with Adam and early stopping enabled.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.001,
    max_iter=500,
    early_stopping=True,
    random_state=42,
)

# Chain scaling and the classifier so both are always fitted and applied
# together as one estimator.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', mlp),
])

# List each step name with the class of the estimator behind it
print("Pipeline steps:")
for step_name, estimator in pipeline.steps:
    print(f"  {step_name}: {type(estimator).__name__}")

In [None]:
# 5-fold cross-validated accuracy of the whole pipeline, computed on the
# training split only so the test set stays untouched.
cv_scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)

mean_accuracy, accuracy_spread = cv_scores.mean(), cv_scores.std()
print(f"CV Accuracy: {mean_accuracy:.4f} (+/- {accuracy_spread:.4f})")

## Part 4: Hyperparameter Tuning with Pipeline

In [None]:
# Search space for the MLP. Each key has the form '<step name>__<parameter>'
# so the value is routed to the 'classifier' step of the pipeline.
param_grid = dict(
    classifier__hidden_layer_sizes=[(50,), (100,), (100, 50)],
    classifier__alpha=[0.0001, 0.001, 0.01],
    classifier__learning_rate_init=[0.001, 0.01],
)

# Exhaustive search over the 3 * 3 * 2 = 18 combinations, each scored by
# 5-fold cross-validated accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print(f"\nBest CV Score: {grid_search.best_score_:.4f}")
print("\nBest Parameters:")
best_params = grid_search.best_params_
for param_name in best_params:
    print(f"  {param_name}: {best_params[param_name]}")

## Part 5: Final Evaluation

In [None]:
# Pipeline with the best parameter combination found by the grid search
best_pipeline = grid_search.best_estimator_

# Hard class labels, plus the predicted probability of class 1 (second
# column of predict_proba), which ROC-AUC needs.
y_pred = best_pipeline.predict(X_test)
y_prob = best_pipeline.predict_proba(X_test)[:, 1]

# Compute all metrics on the held-out test set first, then report them
test_accuracy = accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)
report = classification_report(y_test, y_pred)

print("Final Model Evaluation:")
print("=" * 50)
print(f"\nTest Accuracy: {test_accuracy:.4f}")
print(f"ROC-AUC: {test_roc_auc:.4f}")
print("\nClassification Report:\n")
print(report)

In [None]:
# Two-panel figure: confusion matrix on the left, ROC curve on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
cm_ax, roc_ax = axes

# Left panel: confusion matrix as an annotated heatmap of integer counts
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')

# Right panel: ROC curve with the AUC in the legend; the dashed diagonal
# is the chance-level reference line.
fpr, tpr, _ = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)
roc_label = f'ROC (AUC = {auc:.3f})'
roc_ax.plot(fpr, tpr, 'b-', lw=2, label=roc_label)
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_title('ROC Curve')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()

plt.tight_layout()
plt.show()

## Part 6: Save and Load Model

In [None]:
# Save the complete fitted pipeline (scaler + classifier) as one artifact,
# so preprocessing and model are always loaded together.
model_filename = 'best_mlp_pipeline.joblib'
joblib.dump(best_pipeline, model_filename)
print(f"Model saved to: {model_filename}")

# Load and verify: the reloaded pipeline must reproduce the in-memory
# pipeline's predictions exactly. Printing a score alone would not reveal
# a corrupted or mismatched artifact.
loaded_pipeline = joblib.load(model_filename)
predictions_match = np.array_equal(
    loaded_pipeline.predict(X_test),
    best_pipeline.predict(X_test),
)
if not predictions_match:
    raise RuntimeError(
        f"Model reloaded from {model_filename} does not reproduce the "
        "predictions of the pipeline that was saved"
    )

loaded_score = loaded_pipeline.score(X_test, y_test)
print(f"Loaded model test score: {loaded_score:.4f}")

## Part 7: Production-Ready Prediction Function

In [None]:
def predict_new_data(model_path, new_data):
    """
    Load a saved model and make predictions on new data.

    Parameters:
    -----------
    model_path : str
        Path to a model file written with ``joblib.dump``
    new_data : pandas.DataFrame or array-like
        New samples to predict. A DataFrame is passed to the model as-is.
        Anything else is converted to a DataFrame first; a flat sequence
        holding exactly one value per feature is treated as one sample.

    Returns:
    --------
    predictions : array
        Class predictions, one per sample
    probabilities : array
        Class probabilities, one row per sample

    Notes:
    ------
    ``joblib.load`` is pickle-based and can execute arbitrary code while
    loading, so ``model_path`` must point to a trusted file.
    """
    # SECURITY: pickle-based load — never use on files from untrusted sources
    model = joblib.load(model_path)

    # Convert to DataFrame if needed
    if not isinstance(new_data, pd.DataFrame):
        # Prefer the column names recorded on the fitted model so the
        # function does not depend on notebook state. Fall back to the
        # notebook-level `feature_names` when the model has none.
        columns = getattr(model, 'feature_names_in_', None)
        if columns is None:
            columns = feature_names
        columns = list(columns)

        # A flat sequence with one scalar per feature is a single sample.
        # Without this, pandas would read it as one column of many rows and
        # fail on the column count. With a single feature the flat form is
        # already read correctly, so it is left alone.
        is_flat_sample = (
            len(columns) > 1
            and hasattr(new_data, '__len__')
            and len(new_data) == len(columns)
            and all(np.isscalar(value) for value in new_data)
        )
        if is_flat_sample:
            new_data = [list(new_data)]

        new_data = pd.DataFrame(new_data, columns=columns)

    # Predict
    predictions = model.predict(new_data)
    probabilities = model.predict_proba(new_data)

    return predictions, probabilities

# Try the helper on the first five rows of the held-out test set
sample = X_test.head(5)
preds, probs = predict_new_data(model_filename, sample)

# One line per sample: predicted class and the class probabilities
print("Sample Predictions:")
for idx, (label, class_probs) in enumerate(zip(preds, probs)):
    print(f"  Sample {idx}: Class {label}, Prob: {class_probs}")

## Part 8: Complete Workflow Summary

In [None]:
# Workflow recap as data: one (heading, bullet lines) pair per stage, in the
# order the stages are carried out.
workflow_stages = (
    ("1. DATA LOADING", (
        "   • Load data from source",
        "   • Initial exploration and visualization",
        "   • Check for missing values and outliers",
    )),
    ("2. DATA SPLITTING", (
        "   • Train/test split (80/20 typical)",
        "   • Stratify for imbalanced classes",
        "   • Keep test set untouched until final evaluation",
    )),
    ("3. PIPELINE CONSTRUCTION", (
        "   • Preprocessing (scaling, encoding)",
        "   • Feature selection (optional)",
        "   • Model",
    )),
    ("4. HYPERPARAMETER TUNING", (
        "   • GridSearchCV or RandomizedSearchCV",
        "   • Cross-validation",
        "   • Select best parameters",
    )),
    ("5. FINAL EVALUATION", (
        "   • Test set performance",
        "   • Multiple metrics",
        "   • Confusion matrix and ROC curve",
    )),
    ("6. MODEL PERSISTENCE", (
        "   • Save with joblib",
        "   • Version control model artifacts",
        "   • Document model parameters",
    )),
)

print("Complete ML Pipeline Workflow:")
print("=" * 60)
for stage_heading, stage_bullets in workflow_stages:
    # Every stage is preceded by a blank line, then heading and bullets
    print()
    print(stage_heading)
    for bullet_line in stage_bullets:
        print(bullet_line)

## Summary

Congratulations! You've completed the ML Learning curriculum.

### What You've Learned

**Foundations (Notebooks 1-3)**
- Data simulation and generation
- Preprocessing and feature engineering
- Model evaluation metrics

**Classical Models (Notebooks 4-6)**
- Linear models and regularization
- Tree and ensemble methods
- SVM and kernel methods

**Neural Networks (Notebooks 7-9)**
- MLP fundamentals
- Hyperparameter space exploration
- Waveform prediction with regression

**Advanced Topics (Notebooks 10-12)**
- Automated hyperparameter tuning
- Model comparison and selection
- Complete ML pipelines

### Next Steps
1. Apply these techniques to real datasets
2. Explore deep learning (TensorFlow/PyTorch)
3. Learn specialized domains (NLP, Computer Vision)
4. Study MLOps and model deployment

### Resources
- scikit-learn documentation: https://scikit-learn.org
- Kaggle competitions for practice
- UCI ML Repository for datasets

Happy Learning!