**6. Error Analysis:**
   - 6.1 Analyzing Misclassifications
      - Identifying patterns in misclassified instances
      - Confusion matrix analysis
      - Error rate by class

   - 6.2 Bias-Variance Tradeoff
      - Learning curves analysis
      - Bias-variance decomposition

   - 6.3 Residual Analysis (for regression)
      - Residual plots
      - Q-Q plots
      - Heteroscedasticity check

   - 6.4 Cross-validation Insights
      - K-fold CV score distribution
      - Out-of-fold predictions analysis

   - 6.5 Feature Importance in Errors
      - SHAP values for misclassifications
      - Feature importance for error cases

**7. Model Persistence:**
   - 7.1 Saving Models
      - Pickle serialization
      - Joblib serialization
      - TensorFlow SavedModel format
      - ONNX format

   - 7.2 Loading Models
      - Deserializing saved models
      - Versioning loaded models

   - 7.3 Model Versioning
      - Version control for models (e.g., DVC, MLflow)
      - Model metadata tracking

   - 7.4 Model Registry
      - Centralized model storage
      - Model lifecycle management

**8. Model Deployment:**
   - 8.1 API Development
      - Flask API
      - FastAPI
      - Django REST framework

   - 8.2 Containerization
      - Docker containerization
      - Docker-compose for multi-container apps

   - 8.3 Cloud Deployment
      - AWS SageMaker
      - Google Cloud AI Platform
      - Azure Machine Learning

   - 8.4 Serverless Deployment
      - AWS Lambda
      - Google Cloud Functions
      - Azure Functions

   - 8.5 Edge Deployment
      - TensorFlow Lite
      - ONNX Runtime

**9. Monitoring and Maintenance:**
   - 9.1 Logging
      - Application logging
      - Model prediction logging
      - Error logging

   - 9.2 Performance Monitoring
      - Model accuracy tracking
      - Prediction latency monitoring
      - Resource utilization monitoring

   - 9.3 Data Drift Detection
      - Feature distribution monitoring
      - Concept drift detection
      - Outlier detection in new data

   - 9.4 Automated Alerts
      - Performance degradation alerts
      - Data quality alerts
      - System health alerts

   - 9.5 Model Updating
      - Incremental learning
      - Periodic retraining
      - A/B testing for model updates

   - 9.6 Feedback Loop Implementation
      - User feedback collection
      - Ground truth acquisition
      - Continuous learning pipeline

**10. Advanced Techniques:**
   - 10.1 Automated Machine Learning (AutoML)
      - Auto-sklearn
      - TPOT
      - H2O AutoML
      - Google Cloud AutoML

____________________________________________________________________________


**6. Error Analysis**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.inspection import permutation_importance
import shap

# 6.1 Analyzing Misclassifications
def analyze_misclassifications(y_true, y_pred, X):
    """Summarise classification errors with a confusion matrix and per-class error rates.

    Prints the number of misclassified rows, then shows two figures: a
    confusion-matrix heatmap and a bar chart of the error rate of each true
    class. Both figures are labelled with the actual class values.

    Args:
        y_true: Ground-truth labels (1-D, array-like).
        y_pred: Predicted labels, positionally aligned with ``y_true``.
        X: Feature rows aligned with the labels; only used to count the
            misclassified rows.
    """
    true_labels = np.asarray(y_true)
    predicted_labels = np.asarray(y_pred)

    # Build the error mask once instead of re-evaluating the comparison.
    error_mask = true_labels != predicted_labels
    misclassified = X[error_mask]
    print(f"Number of misclassifications: {len(misclassified)}")

    # Sorted union of observed labels; passed explicitly so the matrix rows
    # and the tick labels below are guaranteed to be in the same order.
    labels = np.unique(np.concatenate([true_labels, predicted_labels]))

    # Confusion matrix analysis
    cm = confusion_matrix(true_labels, predicted_labels, labels=labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    # Error rate by class
    support = cm.sum(axis=1)
    correct = np.diag(cm)
    # A class that only appears in y_pred has zero true samples; leave its
    # rate as NaN instead of triggering a 0/0 runtime warning.
    error_rate = np.full(len(labels), np.nan)
    has_support = support > 0
    error_rate[has_support] = 1 - correct[has_support] / support[has_support]

    positions = np.arange(len(labels))
    plt.figure(figsize=(10, 6))
    plt.bar(positions, error_rate)
    plt.xticks(positions, labels)
    plt.title('Error Rate by Class')
    plt.xlabel('Class')
    plt.ylabel('Error Rate')
    plt.show()

# 6.2 Bias-Variance Tradeoff
def plot_learning_curve(estimator, X, y, cv=5):
    """Plot training and cross-validation scores against training-set size.

    Scores are computed with ``learning_curve`` at five training sizes
    spaced evenly between 10% and 100% of the data. Each curve is drawn as
    the per-size mean with a shaded band of one standard deviation.

    Args:
        estimator: Estimator passed through to ``learning_curve``.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to ``learning_curve``.
    """
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5))

    # (mean, std, colour, legend label) for each curve, training first.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (val_scores, "g", "Cross-validation score"),
        )
    ]

    plt.figure(figsize=(10, 6))
    # Shaded bands first, then the lines on top of them.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)
    plt.title("Learning Curve")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.show()

# 6.3 Residual Analysis (for regression)
def residual_analysis(y_true, y_pred):
    """Show three diagnostic figures for the residuals of a regression model.

    The figures are, in order: residuals against predictions, a normal Q-Q
    plot of the residuals, and absolute residuals against predictions (a
    visual heteroscedasticity check).

    Args:
        y_true: Observed target values; must support ``y_true - y_pred``.
        y_pred: Predicted target values.
    """
    errors = y_true - y_pred

    def _scatter_vs_prediction(values, title, y_label, zero_line=False):
        # One scatter figure with predictions on the x axis.
        plt.figure(figsize=(10, 6))
        plt.scatter(y_pred, values)
        plt.title(title)
        plt.xlabel('Predicted values')
        plt.ylabel(y_label)
        if zero_line:
            plt.axhline(y=0, color='r', linestyle='--')
        plt.show()

    # Residual plot
    _scatter_vs_prediction(errors, 'Residual Plot', 'Residuals', zero_line=True)

    # Q-Q plot
    from scipy import stats
    _, qq_axes = plt.subplots(figsize=(10, 6))
    stats.probplot(errors, dist="norm", plot=qq_axes)
    qq_axes.set_title("Q-Q plot")
    plt.show()

    # Heteroscedasticity check
    _scatter_vs_prediction(np.abs(errors), 'Heteroscedasticity Check', 'Absolute residuals')

# 6.4 Cross-validation Insights
def cv_insights(estimator, X, y, cv=5):
    """Visualise cross-validation scores and out-of-fold predictions.

    Shows a histogram of the per-fold scores, then a scatter plot of the
    out-of-fold predictions against the true targets.

    Args:
        estimator: Estimator to cross-validate.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to scikit-learn.
    """
    from sklearn.model_selection import cross_val_predict
    from sklearn.model_selection import cross_val_score

    # K-fold CV score distribution
    fold_scores = cross_val_score(estimator, X, y, cv=cv)
    plt.figure(figsize=(10, 6))
    plt.hist(fold_scores, bins=10)
    plt.title('K-fold CV Score Distribution')
    plt.xlabel('Score')
    plt.ylabel('Frequency')
    plt.show()

    # Out-of-fold predictions analysis
    held_out_predictions = cross_val_predict(estimator, X, y, cv=cv)
    plt.figure(figsize=(10, 6))
    plt.scatter(y, held_out_predictions)
    plt.title('Out-of-fold Predictions vs True Values')
    plt.xlabel('True Values')
    plt.ylabel('OOF Predictions')
    plt.show()

# 6.5 Feature Importance in Errors
def feature_importance_in_errors(estimator, X, y_true, y_pred):
    """Explain which features drive the model's misclassifications.

    Restricts the data to the rows where ``y_true != y_pred``, shows a SHAP
    summary plot for those rows, then shows a horizontal bar chart of the
    permutation importance computed on the same rows.

    Args:
        estimator: Fitted model supported by ``shap.Explainer`` and
            ``permutation_importance``.
        X: Feature matrix (DataFrame or array) supporting boolean-mask row
            selection.
        y_true: Ground-truth labels, aligned with ``X``.
        y_pred: Predicted labels, aligned with ``X``.

    Returns:
        None. If there are no misclassified rows a message is printed and
        nothing is plotted.
    """
    # Compute the mask once and reuse it for both X and y.
    error_mask = y_true != y_pred
    if not np.any(error_mask):
        # Neither explainer can work with an empty sample.
        print("No misclassified samples; nothing to explain.")
        return

    X_errors = X[error_mask]
    y_errors = y_true[error_mask]

    # SHAP values for misclassifications
    explainer = shap.Explainer(estimator)
    shap_values = explainer(X_errors)
    shap.summary_plot(shap_values, X_errors)

    # Feature importance for error cases
    # NOTE(review): if y_pred came from this estimator with accuracy scoring,
    # the baseline score on these rows is 0, so importances will be <= 0.
    # Confirm that this is the intended reading.
    error_importance = permutation_importance(estimator, X_errors, y_errors)
    sorted_idx = error_importance.importances_mean.argsort()

    # Use column names when X is a DataFrame, generic names otherwise.
    columns = getattr(X, "columns", None)
    if columns is not None:
        feature_names = np.asarray(columns)[sorted_idx]
    else:
        feature_names = [f"feature {i}" for i in sorted_idx]

    positions = range(len(sorted_idx))
    plt.figure(figsize=(10, 6))
    plt.barh(positions, error_importance.importances_mean[sorted_idx])
    plt.yticks(positions, feature_names)
    plt.title('Feature Importance for Error Cases')
    plt.show()

# Example usage:
# analyze_misclassifications(y_true, y_pred, X)
# plot_learning_curve(estimator, X, y)
# residual_analysis(y_true, y_pred)
# cv_insights(estimator, X, y)
# feature_importance_in_errors(estimator, X, y_true, y_pred)

  from .autonotebook import tqdm as notebook_tqdm


**7. Model Persistence**

In [4]:
import pickle
import joblib
import tensorflow as tf
import onnx
import onnxruntime as ortz

# 7.1 Saving Models
def save_model_pickle(model, filename):
    """Serialise ``model`` to ``filename`` with pickle.

    Args:
        model: Any picklable object.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(model)

def save_model_joblib(model, filename):
    """Persist ``model`` to ``filename`` using joblib.

    Args:
        model: Object to serialise.
        filename: Destination path handed to ``joblib.dump``.
    """
    destination = filename
    joblib.dump(model, destination)

def save_model_tf(model, directory):
    """Export ``model`` in TensorFlow SavedModel format.

    Args:
        model: Object accepted by ``tf.saved_model.save``.
        directory: Export directory for the SavedModel.
    """
    export_dir = directory
    tf.saved_model.save(model, export_dir)

def save_model_onnx(model, filename):
    """Write an ONNX model to ``filename``.

    Args:
        model: Model passed unchanged to ``onnx.save``.
        filename: Destination path.
    """
    destination = filename
    onnx.save(model, destination)

# 7.2 Loading Models
def load_model_pickle(filename):
    """Load and return the object pickled in ``filename``.

    Unpickling can execute arbitrary code, so only use this on files from a
    trusted source.

    Args:
        filename: Path to a pickle file.

    Returns:
        The deserialised object.
    """
    with open(filename, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

def load_model_joblib(filename):
    """Load and return the object stored in ``filename`` by joblib.

    Args:
        filename: Path handed to ``joblib.load``.

    Returns:
        The deserialised object.
    """
    restored = joblib.load(filename)
    return restored

def load_model_tf(directory):
    """Load a TensorFlow SavedModel from ``directory``.

    Args:
        directory: SavedModel directory.

    Returns:
        Whatever ``tf.saved_model.load`` returns for that directory.
    """
    restored = tf.saved_model.load(directory)
    return restored

def load_model_onnx(filename):
    """Load an ONNX model from ``filename``.

    Args:
        filename: Path handed to ``onnx.load``.

    Returns:
        The object returned by ``onnx.load``.
    """
    restored = onnx.load(filename)
    return restored

# 7.3 Model Versioning (using MLflow)
import mlflow

def log_model_mlflow(model, artifact_path):
    """Log a scikit-learn model as an MLflow artifact.

    Args:
        model: Model handed to ``mlflow.sklearn.log_model``.
        artifact_path: Artifact path under which the model is logged.
    """
    sklearn_flavor = mlflow.sklearn
    sklearn_flavor.log_model(model, artifact_path)

def load_model_mlflow(model_uri):
    """Load a scikit-learn model from MLflow.

    Args:
        model_uri: MLflow model URI handed to ``mlflow.sklearn.load_model``.

    Returns:
        The loaded model.
    """
    restored = mlflow.sklearn.load_model(model_uri)
    return restored

# 7.4 Model Registry (using MLflow)
def register_model_mlflow(model, name):
    """Log a scikit-learn model to the active run and register it by name.

    The model is logged under the artifact path ``"model"`` and the
    resulting ``runs:/<run_id>/model`` URI is registered as ``name``.

    Args:
        model: Model handed to ``mlflow.sklearn.log_model``.
        name: Name to register the model under.

    Returns:
        The value returned by ``mlflow.register_model``, so callers can
        inspect the registered version. Previously this returned None.
    """
    mlflow.sklearn.log_model(model, "model")
    run_id = mlflow.active_run().info.run_id
    return mlflow.register_model(f"runs:/{run_id}/model", name)

# Example usage:
# save_model_pickle(model, 'model.pkl')
# model = load_model_pickle('model.pkl')
# log_model_mlflow(model, "best_model")
# registered_model = register_model_mlflow(model, "production_model")

**8. Model Deployment**

In [None]:
# 8.1 API Development (using Flask)
from flask import Flask, request, jsonify

app = Flask(__name__)
# Load the fitted pipeline once at import time rather than on every request.
model = joblib.load('./output/pipeline_feature.pkl')


@app.route('/predict', methods=['POST'])
def predict():
    """Return model predictions for the JSON payload of a POST request.

    The parsed JSON body is passed unchanged to ``model.predict``, so it
    must already have the shape the pipeline expects.

    Returns:
        A JSON response ``{"prediction": [...]}``, or a 400 response with an
        ``error`` message when the body is missing or is not valid JSON.
    """
    # silent=True yields None instead of raising, so the error reply below
    # is consistent for every malformed request.
    data = request.get_json(silent=True)
    if data is None:
        return jsonify({'error': 'Request body must be valid JSON'}), 400

    # Preprocess data and make prediction
    prediction = model.predict(data)
    return jsonify({'prediction': prediction.tolist()})


# Run the app
if __name__ == '__main__':
    import os

    # The Werkzeug debugger allows arbitrary code execution, so debug mode is
    # opt-in via FLASK_DEBUG=1 rather than always on.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')

# 8.2 Containerization (Dockerfile example)
# Dockerfile
# FROM python:3.10.1-slim-buster
# WORKDIR /app
# COPY requirements.txt .
# RUN pip install -r requirements.txt
# COPY . .
# CMD ["python", "app.py"]

# 8.3 Cloud Deployment (AWS SageMaker example)
import sagemaker
from sagemaker.sklearn import SKLearn

# One session is created and shared by the role lookup and the estimator.
# The original created it but never passed it on.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

# Training job definition: runs train.py on one ml.m5.xlarge instance using
# the scikit-learn 0.23-1 framework container.
sklearn_estimator = SKLearn(
    entry_point='train.py',
    role=role,
    instance_type='ml.m5.xlarge',
    framework_version='0.23-1',
    sagemaker_session=sagemaker_session,
)

# Start training; the 'train' channel points at the S3 prefix with the data.
sklearn_estimator.fit({'train': 's3://bucket/path/to/train/data'})

# 8.4 Serverless Deployment (AWS Lambda example)
import json
import pickle

# Cache of the unpickled model, filled on first use so that later
# invocations in the same process skip reading and unpickling the file.
_LAMBDA_MODEL = None


def _get_lambda_model():
    """Load ``model.pkl`` on first use and return the cached model afterwards."""
    global _LAMBDA_MODEL
    if _LAMBDA_MODEL is None:
        # The pickle is assumed to be packaged with the function and trusted;
        # never unpickle data that comes from a request.
        with open('model.pkl', 'rb') as f:
            _LAMBDA_MODEL = pickle.load(f)
    return _LAMBDA_MODEL


def lambda_handler(event, context):
    """Handle one invocation: parse the JSON body and return a prediction.

    Args:
        event: Mapping with a ``'body'`` key holding a JSON string that
            describes a single sample.
        context: Runtime context object (unused).

    Returns:
        A dict with ``statusCode`` 200 and a JSON ``body`` of the form
        ``{"prediction": [...]}``, or ``statusCode`` 400 and an ``error``
        message when the body is missing or is not valid JSON.
    """
    # Parse the input
    try:
        data = json.loads(event['body'])
    except (KeyError, TypeError, ValueError) as exc:
        # Missing body, non-string body, or malformed JSON.
        return {
            'statusCode': 400,
            'body': json.dumps({'error': f'Invalid request body: {exc}'})
        }

    # Make prediction (the sample is wrapped in a list to form a batch of one)
    prediction = _get_lambda_model().predict([data])

    return {
        'statusCode': 200,
        'body': json.dumps({'prediction': prediction.tolist()})
    }

# 8.5 Edge Deployment (TensorFlow Lite example)
import tensorflow as tf

# Convert the in-scope `model` to a TensorFlow Lite flatbuffer.
# NOTE(review): `from_keras_model` expects a tf.keras model; confirm that
# `model` is one at this point, since earlier cells bind `model` to other objects.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Write the converted model to disk for deployment on the edge device.
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)

**9. Monitoring and Maintenance**

In [None]:
import logging
from sklearn.metrics import accuracy_score
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataDriftTab, CatTargetDriftTab
from evidently.pipeline.column_mapping import ColumnMapping

# 9.1 Logging
logging.basicConfig(filename='app.log', level=logging.INFO)

def log_prediction(input_data, prediction):
    """Record one model input and its prediction at INFO level.

    Args:
        input_data: The input given to the model.
        prediction: The model's output for that input.
    """
    entry = f"Input: {input_data}, Prediction: {prediction}"
    logging.info(entry)

# 9.2 Performance Monitoring
def monitor_performance(y_true, y_pred):
    """Compute the accuracy of ``y_pred`` against ``y_true`` and log it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The accuracy score, so it can be passed on (for example to a
        threshold check). Previously this returned None.
    """
    accuracy = accuracy_score(y_true, y_pred)
    logging.info(f"Model accuracy: {accuracy}")
    return accuracy

# 9.3 Data Drift Detection (using Evidently)
def detect_data_drift(reference_data, current_data, column_mapping,
                      output_path="data_drift_report.html"):
    """Build an Evidently drift dashboard and save it as an HTML report.

    The dashboard has a data-drift tab and a categorical-target-drift tab.

    Args:
        reference_data: Baseline dataset.
        current_data: New dataset to compare against the baseline.
        column_mapping: Evidently column mapping passed to ``calculate``.
        output_path: Where to save the HTML report. Defaults to the path
            that used to be hard-coded.
    """
    dashboard = Dashboard(tabs=[DataDriftTab(), CatTargetDriftTab()])
    dashboard.calculate(reference_data, current_data, column_mapping=column_mapping)
    dashboard.save(output_path)

# 9.4 Automated Alerts
def send_alert(message):
    """Emit an alert by printing ``message`` with an ``ALERT:`` prefix.

    Placeholder implementation: a production system would route this to an
    email service or messaging platform instead of standard output.

    Args:
        message: Text of the alert.
    """
    alert_line = f"ALERT: {message}"
    print(alert_line)

def check_performance_threshold(accuracy, threshold=0.8):
    """Send an alert when ``accuracy`` is strictly below ``threshold``.

    Args:
        accuracy: Measured model accuracy.
        threshold: Minimum acceptable accuracy.
    """
    # Written as a negated '<' (not '>=') so that values which compare False
    # both ways, such as NaN, still produce no alert.
    if not accuracy < threshold:
        return
    alert_text = f"Model accuracy ({accuracy}) is below threshold ({threshold})"
    send_alert(alert_text)

# 9.5 Model Updating
from sklearn.model_selection import train_test_split

def retrain_model(model, X, y, test_size=0.2, random_state=None):
    """Refit ``model`` on a fresh train split and log its hold-out score.

    Args:
        model: Estimator with ``fit`` and ``score``; it is refitted in place.
        X: Feature matrix.
        y: Target values.
        test_size: Fraction of the data held out for scoring. Defaults to
            the value that used to be hard-coded.
        random_state: Seed for the split; pass an int for reproducible
            retraining runs. The default of None keeps the original
            unseeded behaviour.

    Returns:
        The refitted model (the same object that was passed in).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    logging.info(f"Model retrained. New accuracy: {accuracy}")
    return model

# 9.6 Feedback Loop Implementation
def collect_feedback(prediction, actual):
    """Record a prediction together with its observed outcome at INFO level.

    Placeholder implementation: a production system would store the pair in
    a database rather than only logging it.

    Args:
        prediction: What the model predicted.
        actual: The ground truth observed afterwards.
    """
    entry = f"Feedback collected. Prediction: {prediction}, Actual: {actual}"
    logging.info(entry)

# Example usage:
# log_prediction(input_data, prediction)
# monitor_performance(y_true, y_pred)
# detect_data_drift(reference_data, current_data, column_mapping)
# check_performance_threshold(accuracy)
# model = retrain_model(model, X_new, y_new)
# collect_feedback(prediction, actual)

**10. AutoML**

In [None]:
# 10.1 Automated Machine Learning (AutoML)

# Auto-sklearn
from autosklearn.classification import AutoSklearnClassifier

def autosklearn_classification(X, y, time_left_for_this_task=3600):
    """Fit an auto-sklearn classifier on ``X`` and ``y`` and return it.

    Args:
        X: Training features.
        y: Training labels.
        time_left_for_this_task: Value forwarded to the
            ``AutoSklearnClassifier`` argument of the same name.

    Returns:
        The fitted ``AutoSklearnClassifier``.
    """
    search_settings = {
        'time_left_for_this_task': time_left_for_this_task,
        'per_run_time_limit': 300,
        'ensemble_size': 50,
    }
    automl = AutoSklearnClassifier(**search_settings)
    automl.fit(X, y)
    return automl

# TPOT
from tpot import TPOTClassifier

def tpot_classification(X, y, generations=100, population_size=100):
    """Fit a TPOT classifier on ``X`` and ``y`` and return it.

    Args:
        X: Training features.
        y: Training labels.
        generations: Forwarded to ``TPOTClassifier``.
        population_size: Forwarded to ``TPOTClassifier``.

    Returns:
        The fitted ``TPOTClassifier``.
    """
    search_settings = {
        'generations': generations,
        'population_size': population_size,
        'verbosity': 2,
    }
    tpot = TPOTClassifier(**search_settings)
    tpot.fit(X, y)
    return tpot

# H2O AutoML
import h2o
from h2o.automl import H2OAutoML

def h2o_automl(X, y, max_runtime_secs=3600):
    """Run H2O AutoML on a pandas feature frame and target series.

    Args:
        X: pandas DataFrame of predictors.
        y: pandas Series with the response; if it has no name the column is
            called ``"target"``.
        max_runtime_secs: Time budget forwarded to ``H2OAutoML``.

    Returns:
        The trained ``H2OAutoML`` object.
    """
    # Imported here because no pandas import is visible at module level.
    import pandas as pd

    h2o.init()

    # Keep the response name in its own variable instead of rebinding `y`,
    # and fall back to a fixed name for an unnamed Series.
    target = y.name if y.name is not None else "target"

    # Convert data to H2OFrame
    train = h2o.H2OFrame(pd.concat([X, y.rename(target)], axis=1))

    # Identify predictors and response
    predictors = [column for column in train.columns if column != target]

    # NOTE(review): H2O decides between regression and classification from
    # the response column type. For classification with numeric labels the
    # column probably needs `train[target].asfactor()`. Confirm with the caller.

    # Run AutoML
    aml = H2OAutoML(max_runtime_secs=max_runtime_secs, seed=1)
    aml.train(x=predictors, y=target, training_frame=train)

    return aml

# Google Cloud AutoML (this would typically be done through the Google Cloud Console or using their Python client library)
from google.cloud import automl_v1beta1 as automl

def google_automl_tables(project_id, compute_region, dataset_display_name, target_column_name,
                         train_budget_milli_node_hours,
                         gcs_source='gs://your-bucket/your-data.csv',
                         model_display_name='your_model_name'):
    """Create a dataset, import data, and train an AutoML Tables model.

    NOTE(review): the calls below follow the ``TablesClient`` API of
    ``google.cloud.automl_v1beta1`` as I recall it (``gcs_input_uris``,
    ``set_target_column``, positional model display name). Verify them
    against the installed client version.

    Args:
        project_id: Google Cloud project id.
        compute_region: Region for the AutoML resources.
        dataset_display_name: Display name of the dataset to create.
        target_column_name: Display name of the column to predict.
        train_budget_milli_node_hours: Training budget.
        gcs_source: Cloud Storage URI of the CSV data to import. Defaults to
            the placeholder that used to be hard-coded.
        model_display_name: Display name of the model to create. Defaults to
            the placeholder that used to be hard-coded.

    Returns:
        The trained model returned by the training operation.
    """
    client = automl.TablesClient(project=project_id, region=compute_region)

    # Create a dataset
    dataset = client.create_dataset(dataset_display_name)

    # Import data (assuming you've already uploaded your data to Google Cloud Storage).
    # The import is a long-running operation; wait for it so the column
    # specs exist before the target column is selected.
    import_operation = client.import_data(dataset=dataset, gcs_input_uris=gcs_source)
    import_operation.result()

    # Tell AutoML which column to predict.
    client.set_target_column(dataset=dataset, column_spec_display_name=target_column_name)

    # Create a model
    training_operation = client.create_model(
        model_display_name,
        dataset=dataset,
        train_budget_milli_node_hours=train_budget_milli_node_hours,
    )

    # Wait for model to finish training
    model = training_operation.result()

    print(f"Model training completed. Model name: {model.display_name}")
    return model

# Utility functions for working with AutoML results

def get_best_model_autosklearn(automl):
    """Return the ``model_id`` of the best-ranked model in an auto-sklearn run.

    ``show_models()`` may return either a mapping of per-model info dicts
    (each with ``'rank'`` and ``'model_id'`` keys) or a DataFrame-like table
    with those columns; both are supported.

    Args:
        automl: Fitted object exposing ``show_models()``.

    Returns:
        The ``model_id`` of the model with the lowest rank value.

    Raises:
        ValueError: If ``show_models()`` reports no models.
    """
    models = automl.show_models()

    # Table-like result: keep the original sort-and-take-first behaviour.
    if hasattr(models, 'sort_values'):
        return models.sort_values('rank')['model_id'].iloc[0]

    if not models:
        raise ValueError("show_models() returned no models")

    # Mapping result: pick the entry with the lowest rank.
    best = min(models.values(), key=lambda info: info['rank'])
    return best['model_id']

def get_best_pipeline_tpot(tpot):
    """Return the ``fitted_pipeline_`` attribute of a TPOT object.

    Args:
        tpot: Object exposing ``fitted_pipeline_``.

    Returns:
        The value of ``tpot.fitted_pipeline_``.
    """
    best_pipeline = getattr(tpot, 'fitted_pipeline_')
    return best_pipeline

def get_best_model_h2o(aml):
    """Return the ``leader`` attribute of an H2O AutoML object.

    Args:
        aml: Object exposing ``leader``.

    Returns:
        The value of ``aml.leader``.
    """
    leader_model = getattr(aml, 'leader')
    return leader_model

def evaluate_automl_model(model, X_test, y_test):
    """Print the accuracy and classification report of ``model`` on test data.

    Args:
        model: Fitted object with a ``predict`` method.
        X_test: Test features.
        y_test: Test labels.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report

    predicted = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predicted)
    per_class_report = classification_report(y_test, predicted)

    print(f"Accuracy: {test_accuracy}")
    print("Classification Report:")
    print(per_class_report)

# Example usage of AutoML
# X, y = load_your_data()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Auto-sklearn
# automl = autosklearn_classification(X_train, y_train)
# best_model_autosklearn = get_best_model_autosklearn(automl)
# evaluate_automl_model(automl, X_test, y_test)

# # TPOT
# tpot_model = tpot_classification(X_train, y_train)
# best_pipeline_tpot = get_best_pipeline_tpot(tpot_model)
# evaluate_automl_model(tpot_model, X_test, y_test)

# # H2O AutoML
# aml = h2o_automl(X_train, y_train)
# best_model_h2o = get_best_model_h2o(aml)
# # Note: Evaluation for H2O models would be done differently, using H2O's built-in methods

# # Google Cloud AutoML
# # Note: This would be run in a Google Cloud environment
# model = google_automl_tables(project_id, compute_region, dataset_display_name, target_column_name, train_budget_milli_node_hours)
# # Evaluation would be done using Google Cloud's evaluation methods
    
    