<img src="../media/LandingPage-Header-RED-CENTRE.jpg" alt="Notebook Banner" style="width:100%; height:auto; display:block; margin-left:auto; margin-right:auto;">

# End to End Customer Churn Prediction Pipeline

This Jupyter Notebook provides a comprehensive demonstration of a Machine Learning pipeline for **Customer Churn Prediction**. It encompasses the entire lifecycle from raw data ingestion to model deployment and evaluation.

The notebook showcases:

* **Data Loading**: Functions for efficiently loading the raw customer churn dataset.
* **Data Preprocessing & Transformation**: Steps for cleaning, handling missing values, encoding categorical features, scaling numerical features, and splitting data into training and testing sets.
* **Model Definition & Training**: Implementation of a churn prediction model, including its training on the prepared data.
* **Model Evaluation**: Calculation and reporting of key classification metrics to assess model performance.
* **Model & Log Saving**: Persistence of the trained model and its performance metrics for future use and traceability.

Initially, a single, comprehensive script is presented, followed by a demonstration of how its functionalities can be **refactored into modular components** (e.g., `config.py`, `data_loader.py`, `preprocessing.py`) for better organization, reusability, and maintainability.

Original code: 

In [None]:
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
from typing import Optional, Any, cast 

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import joblib

# ──────────────────────────────────────────────────
# STATELESS HELPER FUNCTIONS
# ──────────────────────────────────────────────────

def load_churn_dataset(filepath: str) -> pd.DataFrame:
    """
    Loads the Customer Churn dataset from a CSV file.

    After reading, prints a short summary of the frame: row and column
    counts, the column names, and the number of missing values per column.

    Args:
        filepath (str): Path to the CSV file.

    Returns:
        pd.DataFrame: The raw churn dataset.

    Raises:
        RuntimeError: If reading or summarising the file fails. The original
            exception is attached as ``__cause__``.
    """
    try:
        df = pd.read_csv(filepath)
        print(f"Dataset loaded from CSV: {filepath}")
        print(f"Raw dataset: {len(df)} samples, {len(df.columns)} features")
        print(f"Available columns: {list(df.columns)}")
        print(f"Missing values per column:\n{df.isnull().sum()}")

        return df

    except Exception as e:
        # Chain explicitly so the underlying pandas/OS error is reported as
        # the direct cause instead of "during handling of the above...".
        raise RuntimeError(f"Failed to load churn dataset: {e}") from e

def clean_churn_data(
    df: pd.DataFrame, 
    target_column: str, 
    numeric_columns: list[str], 
    categorical_columns: list[str]
) -> pd.DataFrame:
    """
    Cleans the churn dataset by validating required columns and basic data type handling.

    The input frame is not modified. The result keeps only the target,
    numeric and categorical columns (in that order) and gains a
    ``churn_binary`` column holding the target mapped from 'Yes'/'No' to 1/0.
    If a ``TotalCharges`` column is among the kept columns it is converted
    to numeric, with unparseable entries becoming NaN.

    Args:
        df (pd.DataFrame): Raw dataset.
        target_column (str): Target column that must be present.
        numeric_columns (list[str]): Numeric feature columns.
        categorical_columns (list[str]): Categorical feature columns.

    Returns:
        pd.DataFrame: Cleaned dataset.

    Raises:
        ValueError: If any required column is missing, or if the target
            column contains anything other than 'Yes' or 'No' (including
            missing values).
    """
    print(f"Initial data shape: {df.shape}")
    
    # Validate that required columns are present
    all_required_columns = [target_column] + numeric_columns + categorical_columns
    missing_columns = [col for col in all_required_columns if col not in df.columns]
    
    if missing_columns:
        raise ValueError(f"Missing required columns in dataset: {missing_columns}")
    
    print(f"All required columns found: {all_required_columns}")
    
    # Keep only the required columns; the copy keeps the caller's frame untouched
    df_clean = df[all_required_columns].copy()
    print(f"Kept only required columns: {list(df_clean.columns)}")
    
    print(f"Missing values before cleaning:\n{df_clean.isnull().sum()}")
    
    # Convert TotalCharges to numeric (it might be stored as string)
    if 'TotalCharges' in df_clean.columns:
        df_clean['TotalCharges'] = pd.to_numeric(df_clean['TotalCharges'], errors='coerce')
        print(f"Converted TotalCharges to numeric. New missing values: {df_clean['TotalCharges'].isnull().sum()}")
    
    # Clean target variable - standardize Yes/No to 1/0.
    # Series.map turns every label outside the mapping into NaN; fail loudly
    # here instead of handing NaN targets to the split/training steps.
    churn_binary = df_clean[target_column].map({'Yes': 1, 'No': 0})
    unmapped = churn_binary.isna()
    if unmapped.any():
        unexpected = df_clean.loc[unmapped, target_column].unique().tolist()
        raise ValueError(
            f"Unexpected values in target column '{target_column}': {unexpected}. "
            "Expected only 'Yes' or 'No'."
        )
    df_clean['churn_binary'] = churn_binary
    print(f"Target variable distribution:\n{df_clean['churn_binary'].value_counts()}")
    
    print(f"Final data shape after cleaning: {df_clean.shape}")
    
    return df_clean

def build_preprocessing_pipeline(numeric_features: list[str], categorical_features: list[str]) -> ColumnTransformer:
    """
    Assembles the (unfitted) feature preprocessor used ahead of the classifier.

    Numeric columns are median-imputed and then standardised; categorical
    columns are imputed with the most frequent value and then one-hot
    encoded into a dense array, with categories unseen at fit time ignored.
    Columns not named in either list are dropped.

    Args:
        numeric_features (list[str]): Names of the numerical feature columns.
        categorical_features (list[str]): Names of the categorical feature columns.

    Returns:
        ColumnTransformer: The configured, not yet fitted, transformer.
    """
    # Numeric branch: fill gaps first so the scaler never sees NaN.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    # Categorical branch: fill gaps, then expand each category into its own column.
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', one_hot),
    ]

    branches = [
        ('num', Pipeline(steps=numeric_steps), numeric_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ]
    return ColumnTransformer(transformers=branches, remainder='drop')

def transform_features(
    df: pd.DataFrame,
    target_column: str,
    numeric_columns: list[str], 
    categorical_columns: list[str]
) -> tuple[pd.DataFrame, ColumnTransformer, LabelEncoder]:
    """
    Prepares the cleaned frame for modelling and builds the feature preprocessor.

    A copy of ``df`` gets a ``target_encoded`` column (a copy of
    ``churn_binary``) and metadata in ``.attrs``: the target mapping and
    names, the feature lists, and the preprocessor itself. Only the requested
    feature columns that actually exist in the frame are used.

    Args:
        df (pd.DataFrame): Cleaned dataset; must contain ``churn_binary``.
        target_column (str): Target column name (not read by this function).
        numeric_columns (list[str]): Numeric feature columns.
        categorical_columns (list[str]): Categorical feature columns.

    Returns:
        Tuple[pd.DataFrame, ColumnTransformer, LabelEncoder]:
            - Copy of the dataset with the encoded target and metadata attrs
            - Unfitted preprocessing pipeline for the features
            - A fresh, unfitted LabelEncoder (placeholder; the target is already 0/1)

    Raises:
        ValueError: If ``churn_binary`` is not a column of ``df``.
    """
    # Guard first: nothing below makes sense without the binary target.
    if 'churn_binary' not in df.columns:
        raise ValueError("Churn binary column not found in dataset")

    df_transformed = df.copy()

    # The target is already 0/1, so the encoder is only returned for
    # interface consistency and is never fitted here.
    label_encoder = LabelEncoder()
    df_transformed['target_encoded'] = df_transformed['churn_binary']

    churn_mapping = {0: 'No Churn', 1: 'Churn'}
    df_transformed.attrs.update({
        'target_mapping': churn_mapping,
        'target_names': ['No Churn', 'Churn'],
    })
    print(f"Target encoding - Churn mapping: {churn_mapping}")

    # Restrict the requested features to columns that are really present.
    present = df_transformed.columns
    available_numeric = [name for name in numeric_columns if name in present]
    available_categorical = [name for name in categorical_columns if name in present]
    print(f"Available numeric features: {available_numeric}")
    print(f"Available categorical features: {available_categorical}")

    preprocessor = build_preprocessing_pipeline(available_numeric, available_categorical)

    # Stash the feature layout on the frame so later steps can read it back.
    all_features = available_numeric + available_categorical
    df_transformed.attrs.update({
        'feature_columns': all_features,
        'numeric_features': available_numeric,
        'categorical_features': available_categorical,
        'preprocessor': preprocessor,
    })

    print(f"Features for modeling: {all_features}")
    print(f"Preprocessing pipeline created with {len(available_numeric)} numeric and {len(available_categorical)} categorical features")

    return df_transformed, preprocessor, label_encoder

def split_features_and_target(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """
    Separates a transformed frame into its feature matrix and target vector.

    The feature column names are read from ``df.attrs['feature_columns']``;
    the target is the ``target_encoded`` column. Both results are copies.

    Args:
        df (pd.DataFrame): Transformed dataset carrying the feature list in its attrs.

    Returns:
        Tuple[pd.DataFrame, pd.Series]: Features (X) and target (y).

    Raises:
        ValueError: If the attrs hold no feature list, or a listed feature
            is not a column of ``df``.
    """
    selected = df.attrs.get('feature_columns', [])
    if not selected:
        raise ValueError("No feature columns found in dataset attributes")

    # Every advertised feature has to exist as a real column.
    absent = list(filter(lambda name: name not in df.columns, selected))
    if absent:
        raise ValueError(f"Missing required features: {absent}")

    X = df[selected].copy()
    y = df['target_encoded'].copy()

    for line in (
        f"Features shape: {X.shape}",
        f"Target shape: {y.shape}",
        f"Features used: {list(X.columns)}",
    ):
        print(line)

    return X, y

def stratified_split(
    X: pd.DataFrame, 
    y: pd.Series, 
    test_size: float = 0.25, 
    seed: int = 42
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Produces a train/test partition stratified on the target.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target; also used as the stratification key.
        test_size (float): Proportion of the data held out for testing.
        seed (int): Random seed passed to the splitter for reproducibility.

    Returns:
        Tuple: X_train, X_test, y_train, y_test
    """
    split_options = {
        "test_size": test_size,
        "random_state": seed,
        "stratify": y,
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
    # Returned as a tuple, matching the annotated return type.
    return X_train, X_test, y_train, y_test

def compute_classification_metrics(
    y_true: np.ndarray, 
    y_pred: np.ndarray, 
    target_names: Optional[list[str]] = None
) -> dict[str, Any]:
    """
    Evaluates binary predictions against the true labels.

    Precision, recall and F1 are computed with ``average='binary'``. The
    confusion matrix is returned as nested lists and the classification
    report as formatted text.

    Args:
        y_true (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.
        target_names (Optional[list[str]]): Display names of the target classes
            for the classification report.

    Returns:
        Dict[str, Any]: Keys ``accuracy``, ``precision``, ``recall``,
        ``f1_score``, ``confusion_matrix`` and ``classification_report``.
    """
    results: dict[str, Any] = {"accuracy": accuracy_score(y_true, y_pred)}

    # The three positive-class scores share the same call signature.
    binary_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    for key, scorer in binary_scorers:
        results[key] = scorer(y_true, y_pred, average='binary')

    # Plain nested lists keep the matrix JSON-serialisable.
    results["confusion_matrix"] = confusion_matrix(y_true, y_pred).tolist()
    results["classification_report"] = classification_report(
        y_true,
        y_pred,
        target_names=target_names,
        output_dict=False,
    )
    return results

def report_classification_metrics(metrics: dict[str, Any]) -> None:
    """
    Prints a formatted summary of the classification metrics.

    Missing scalar metrics are shown as ``nan``. The confusion matrix is
    printed row by row when it is a list; otherwise a note about the
    unexpected or missing value is printed instead.

    Args:
        metrics (Dict[str, Any]): Model evaluation metrics.
    """
    rule = "=" * 40
    not_available = float('nan')
    lines = ["\n" + rule, "CLASSIFICATION METRICS", rule]

    # Scalar scores: fall back to NaN so an incomplete dict does not raise KeyError.
    scalar_rows = (
        ("Accuracy : ", 'accuracy'),
        ("Precision: ", 'precision'),
        ("Recall   : ", 'recall'),
        ("F1-Score : ", 'f1_score'),
    )
    for prefix, key in scalar_rows:
        lines.append(f"{prefix}{metrics.get(key, not_available):.4f}")

    lines.append("\nConfusion Matrix:")
    matrix = metrics.get('confusion_matrix')
    if matrix is None:
        lines.append("   Confusion matrix data not available.")
    elif isinstance(matrix, list):
        lines.extend(f"   {row}" for row in matrix)
    else:
        lines.append(f"   Unexpected format for confusion matrix: {matrix}")

    print("\n".join(lines))

# ──────────────────────────────────────────────────
# STATEFUL MODEL CLASS
# ──────────────────────────────────────────────────

class ChurnPredictionModel:
    """
    Customer churn prediction model using configurable classifier with preprocessing pipeline.

    The classifier and its preprocessing step are combined into a single
    scikit-learn ``Pipeline`` (``self.pipe``), so fitting and predicting
    always apply the same preprocessing.
    """
    
    def __init__(
        self, 
        classifier=None,
        preprocessor=None,
        random_state: int = 42
    ):
        """
        Initialize the churn prediction pipeline.
        
        Args:
            classifier: Scikit-learn classifier instance. If None, uses a
                LogisticRegression with balanced class weights, max_iter=1000
                and the given random_state.
            preprocessor: Scikit-learn transformer placed before the classifier.
                If None, a StandardScaler is used as the only preprocessing step.
            random_state (int): Random seed for reproducibility. Only applied
                to the default classifier; a provided classifier is used as is.
        """
        self.random_state = random_state
        
        # Use provided classifier or default to LogisticRegression
        if classifier is None:
            self.classifier = LogisticRegression(
                random_state=self.random_state,
                max_iter=1000,
                class_weight='balanced'  # Handle potential class imbalance
            )
        else:
            self.classifier = classifier
        
        # Use provided preprocessor or create simple scaling pipeline
        if preprocessor is None:
            self.pipe = Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', self.classifier)
            ])
        else:
            self.pipe = Pipeline([
                ('preprocessor', preprocessor),
                ('classifier', self.classifier)
            ])
    
    def fit(self, X: pd.DataFrame, y: pd.Series) -> 'ChurnPredictionModel':
        """
        Fit the model to training data.
        
        Args:
            X (pd.DataFrame): Training features.
            y (pd.Series): Training target.
            
        Returns:
            ChurnPredictionModel: Self for method chaining.
        """
        print(f"Training model with {len(X)} samples and {len(X.columns)} features...")
        
        # Fits the preprocessing step and the classifier together
        self.pipe.fit(X, y)
        
        print("Model trained successfully!")
        print(f"Features used: {list(X.columns)}")
        
        return self
    
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Make predictions on input data.
        
        Args:
            X (pd.DataFrame): Input features.
            
        Returns:
            np.ndarray: Predicted class labels.
        """
        return cast(np.ndarray, self.pipe.predict(X))
    
    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict class probabilities.
        
        Args:
            X (pd.DataFrame): Input features.
            
        Returns:
            np.ndarray: Predicted class probabilities.
        """
        return cast(np.ndarray, self.pipe.predict_proba(X))
    
    def save(self, filepath: str) -> None:
        """
        Save the trained model to a file.

        The whole pipeline (preprocessing plus classifier) is dumped with
        joblib. The parent directory is created if it does not exist.
        
        Args:
            filepath (str): Full path to save the model.
        """
        directory = os.path.dirname(filepath)
        # An empty dirname means "current directory": nothing to create.
        if directory:
            os.makedirs(directory, exist_ok=True)
        
        joblib.dump(self.pipe, filepath)
        print(f"Model saved to {filepath}")
    
    def save_run_log(
        self,
        directory: str,
        metrics: dict[str, Any],
        dataset_info: dict[str, Any],
        log_filename: str = "churn_model_run_log.json"
    ) -> None:
        """
        Append model configuration and performance metrics to a JSON log file.

        The log file holds a JSON list with one entry per run. A missing or
        unparseable file starts a new list. If the file holds valid JSON that
        is not a list, that content is kept as the first entry instead of
        causing a failure.
        
        Args:
            directory (str): Directory where the JSON file will be stored.
                Created if missing; an empty string means the current directory.
            metrics (Dict[str, Any]): Evaluation metrics to save.
            dataset_info (Dict[str, Any]): Information about the dataset used.
            log_filename (str): Name of the log file inside ``directory``.
        """
        if directory:
            os.makedirs(directory, exist_ok=True)
        
        run_info = {
            "timestamp": datetime.now().isoformat(),
            "model_class": "ChurnPredictionModel",
            "classifier": str(type(self.classifier).__name__),
            "dataset": "Customer Churn",
            "dataset_info": dataset_info,
            "parameters": {
                "random_state": self.random_state,
                "classifier_params": self.classifier.get_params()
            },
            "metrics": metrics
        }
        
        log_file = os.path.join(directory, log_filename)
        
        # Load existing logs or start a new list
        try:
            with open(log_file, "r", encoding="utf-8") as f:
                logs = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError):
            logs = []
        
        # A previous writer may have stored a single object rather than a
        # list; wrap it so .append works and the old content is preserved.
        if not isinstance(logs, list):
            logs = [logs]
        
        logs.append(run_info)
        
        # Save updated logs; default=str stringifies values json cannot encode
        with open(log_file, "w", encoding="utf-8") as f:
            json.dump(logs, f, indent=4, default=str)
        
        print(f"Run log saved to {log_file}")

# ──────────────────────────────────────────────────
# MAIN ORCHESTRATOR
# ──────────────────────────────────────────────────

def main() -> None:
    """
    Main function to orchestrate the entire ML pipeline:
    1. Load and prepare data
    2. Split data
    3. Train model
    4. Evaluate performance
    5. Save model and logs

    Input data is read from ``data/WA_Fn-UseC_-Telco-Customer-Churn.csv`` and
    outputs are written to ``saved_models/``, both relative to this file's
    directory. When ``__file__`` is not defined (for example in a notebook
    cell), the current working directory is used as the base instead.

    Raises:
        Exception: Any failure in a pipeline step is reported on stdout and
            then re-raised unchanged.
    """
    print("Starting Customer Churn Prediction Pipeline...")
    print("="*60)
    
    try:
        # Define column structure based on the churn dataset
        target_column = 'Churn'  # Target variable
        numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
        categorical_columns = [
            'gender', 'SeniorCitizen', 'Partner', 'Dependents',
            'PhoneService', 'MultipleLines', 'InternetService',
            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
            'TechSupport', 'StreamingTV', 'StreamingMovies',
            'Contract', 'PaperlessBilling', 'PaymentMethod'
        ]
        
        # Resolve paths relative to this file. `__file__` does not exist when
        # the code runs in a notebook cell or interactive session, so fall
        # back to the working directory there instead of raising NameError.
        try:
            script_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            script_dir = os.getcwd()
        data_file_path = os.path.join(script_dir, "data", "WA_Fn-UseC_-Telco-Customer-Churn.csv")

        # Load and prepare data
        print("\n1. Loading dataset...")
        df_raw = load_churn_dataset(data_file_path)

        print("\n2. Cleaning data...")
        df_clean = clean_churn_data(df_raw, target_column, numeric_columns, categorical_columns)
        
        print("\n3. Transforming features...")
        # The returned label encoder is a placeholder and is not needed here.
        df_transformed, preprocessor, _ = transform_features(df_clean, target_column, numeric_columns, categorical_columns)
        
        # Extract dataset information (stored alongside the metrics in the run log)
        dataset_info = {
            "total_samples": len(df_transformed),
            "n_features": len(df_transformed.attrs.get('feature_columns', [])),
            "target_mapping": df_transformed.attrs.get('target_mapping', {}),
            "target_names": df_transformed.attrs.get('target_names', []),
            "churn_distribution": df_transformed['target_encoded'].value_counts().to_dict(),
            "feature_columns": df_transformed.attrs.get('feature_columns', []),
            "numeric_features": df_transformed.attrs.get('numeric_features', []),
            "categorical_features": df_transformed.attrs.get('categorical_features', [])
        }
        
        # Split features and target
        print("\n4. Preparing features and target...")
        X, y = split_features_and_target(df_transformed)
        
        # Split into train/test
        print("\n5. Splitting data...")
        X_train, X_test, y_train, y_test = stratified_split(X, y, test_size=0.25, seed=42)
        print(f"Training set: {len(X_train)} samples")
        print(f"Test set: {len(X_test)} samples")
        print(f"Train churn rate: {y_train.mean():.3f}")
        print(f"Test churn rate: {y_test.mean():.3f}")
        
        # Initialize and train model; the preprocessor is fitted on the
        # training split only, as part of the model's pipeline.
        print("\n6. Training model...")
        model = ChurnPredictionModel(preprocessor=preprocessor, random_state=42)
        model.fit(X_train, y_train)
        
        # Make predictions
        print("\n7. Making predictions...")
        y_pred = model.predict(X_test)
        
        # Evaluate model
        print("\n8. Evaluating model...")
        metrics = compute_classification_metrics(
            y_test.to_numpy(),  # Use .to_numpy() for explicit np.ndarray conversion
            y_pred, 
            target_names=dataset_info['target_names']
        )
        
        # Report results
        report_classification_metrics(metrics)
        
        # Save model and logs into a "saved_models" directory next to the script
        print("\n9. Saving model and logs...")
        saved_models_dirname = "saved_models"
        model_dir_path = os.path.join(script_dir, saved_models_dirname)
        model_file_path = os.path.join(model_dir_path, "churn_prediction_model_v1.joblib")
        
        model.save(model_file_path)
        model.save_run_log(model_dir_path, metrics, dataset_info)
        
        print(f"\n{'='*60}")
        print("Pipeline completed successfully!")
        print(f"Model accuracy: {metrics['accuracy']:.4f}")
        print(f"Model saved to: {model_file_path}")
        
    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate.
        print(f"\nERROR: Pipeline failed with exception: {e}")
        raise

# ──────────────────────────────────────────────────
# ENTRY POINT
# ──────────────────────────────────────────────────

# Run the pipeline only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()

## Refactoring Process

### config.py

Central definition of parameters and paths.

This script stores configuration variables and constants used across the ML project. It includes definitions for column names, test sizes, random states, and file names for models and logs, and paths to datasets.

* **Column Definitions**: Defines `TARGET_COLUMN` for the target variable 'Churn', and lists `NUMERIC_COLUMNS` and `CATEGORICAL_COLUMNS` for the churn dataset.
* **Test and Random State Constants**: Specifies `TEST_SIZE` as 0.25 and `RANDOM_STATE` as 42 for reproducibility.
* **File and Directory Names**: Defines filenames for the `MODEL_FILENAME` ("churn_prediction_model_v1.joblib") and `LOG_FILENAME` ("churn_model_run_log.json"), as well as the `MODEL_STORE_DIR` ("model_store").
* **Data Paths**: Sets up directory names (`DATA_DIR_NAME`, `RAW_DATA_DIR_NAME`) and the `DATASET_FILENAME` ("WA_Fn-UseC_-Telco-Customer-Churn.csv") for dataset locations.


In [None]:
"""
config.py

This script stores configuration variables and constants used across the ML project.
It includes definitions for column names, test sizes, random states, and file names
for models and logs, and paths to datasets.
"""
import os

# --- Dataset schema (Telco customer churn) ---
TARGET_COLUMN = "Churn"  # label column holding the churn outcome
NUMERIC_COLUMNS = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
]
CATEGORICAL_COLUMNS = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]

# --- Train/test split settings ---
TEST_SIZE = 0.25      # fraction of rows held out for testing
RANDOM_STATE = 42     # fixed seed for reproducibility

# --- Output artefacts: file and directory names ---
MODEL_FILENAME = "churn_prediction_model_v1.joblib"
LOG_FILENAME = "churn_model_run_log.json"
MODEL_STORE_DIR = "model_store"

# --- Input data location (names are relative to the project root) ---
DATA_DIR_NAME = "data"
RAW_DATA_DIR_NAME = "raw"
DATASET_FILENAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

### data_loader.py

Reading and basic cleaning of raw data, plus any initial splitting (e.g., sampling on large datasets).

This script is responsible for loading the raw customer churn dataset. It contains functions to read the data from a specified file path.

* **`load_churn_dataset(filepath: str) -> pd.DataFrame`**: This function loads the Customer Churn dataset from a specified CSV `filepath`.
    * **Functionality**: It uses `pandas.read_csv` to load the data.
    * **Output**: Prints information about the loaded dataset including shape, columns, and missing values. Returns a pandas DataFrame.
    * **Error Handling**: Includes a `try-except` block to catch any exceptions during file loading and raises a `RuntimeError`.


In [None]:
"""
data_loader.py

This script is responsible for loading the raw customer churn dataset.
It contains functions to read the data from a specified file path.
"""
import pandas as pd

# Assuming interfaces.py exists and defines DataLoader
# from src.interfaces import DataLoader # Not needed for static type checking in runtime if not inheriting

def load_churn_dataset(filepath: str) -> pd.DataFrame:
    """
    Loads the Customer Churn dataset from a CSV file.

    After reading, prints a short summary of the frame: row and column
    counts, the column names, and the number of missing values per column.

    Args:
        filepath (str): Path to the CSV file.

    Returns:
        pd.DataFrame: The raw churn dataset.

    Raises:
        RuntimeError: If reading or summarising the file fails. The original
            exception is attached as ``__cause__``.
    """
    try:
        df = pd.read_csv(filepath)
        print(f"Dataset loaded from CSV: {filepath}")
        print(f"Raw dataset: {len(df)} samples, {len(df.columns)} features")
        print(f"Available columns: {list(df.columns)}")
        print(f"Missing values per column:\n{df.isnull().sum()}")

        return df

    except Exception as e:
        # Chain explicitly so the underlying pandas/OS error is reported as
        # the direct cause instead of "during handling of the above...".
        raise RuntimeError(f"Failed to load churn dataset: {e}") from e

### preprocessing.py

This script contains functions for data cleaning and preprocessing. It handles tasks such as converting data types, encoding categorical variables, scaling numerical features, and splitting the data into training and testing sets.

* **`clean_churn_data(...)` Function**:
    * **Purpose**: Cleans the raw churn dataset, ensuring required columns are present and handling basic data type conversions.
    * **Functionality**: Validates column presence, converts `TotalCharges` to numeric (handling coerce errors), and converts the target `Yes`/`No` to binary `1`/`0`. Selects only relevant columns for the output.
    * **Inputs**: Raw DataFrame, target column name, lists of numeric and categorical column names.
    * **Outputs**: Cleaned DataFrame.
    * **Error Handling**: Raises `ValueError` if required columns are missing.

* **`build_preprocessing_pipeline(...)` Function**:
    * **Purpose**: Constructs a `ColumnTransformer` for comprehensive preprocessing of numerical and categorical features.
    * **Functionality**: Defines pipelines for numeric (median imputation, standard scaling) and categorical (most frequent imputation, one-hot encoding) transformations. Combines these using `ColumnTransformer`.
    * **Inputs**: Lists of numerical and categorical feature names.
    * **Outputs**: Configured `ColumnTransformer`.

* **`transform_features(...)` Function**:
    * **Purpose**: Encodes the target variable and applies/fits the preprocessing pipeline to the features.
    * **Functionality**: Ensures the target variable is encoded (assumes `churn_binary` is already present) and stores target mapping. Builds and fits the `ColumnTransformer` on the relevant features. Stores feature information in DataFrame attributes.
    * **Inputs**: Cleaned DataFrame, target column name, lists of numeric and categorical column names.
    * **Outputs**: Tuple containing the DataFrame with encoded target and the fitted `ColumnTransformer`.
    * **Error Handling**: Raises `ValueError` if `churn_binary` column is not found.

* **`split_features_and_target(...)` Function**:
    * **Purpose**: Separates the DataFrame into feature (X) and target (y) DataFrames/Series.
    * **Functionality**: Retrieves feature columns from DataFrame attributes and ensures they are present.
    * **Inputs**: Transformed DataFrame.
    * **Outputs**: Tuple of features `X` (DataFrame) and target `y` (Series).
    * **Error Handling**: Raises `ValueError` if feature columns are not found in attributes or if any required features are missing.

* **`stratified_split(...)` Function**:
    * **Purpose**: Splits data into training and testing sets while preserving the proportion of the target variable (stratification).
    * **Functionality**: Uses `sklearn.model_selection.train_test_split` with `stratify=y`.
    * **Inputs**: Features `X`, target `y`, test size, and random seed.
    * **Outputs**: Tuple of `X_train`, `X_test`, `y_train`, `y_test`.

In [None]:
"""
preprocessing.py

This script contains functions for data cleaning and preprocessing.
It handles tasks such as converting data types, encoding categorical variables,
scaling numerical features, and splitting the data into training and testing sets.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder


def clean_churn_data(
    df: pd.DataFrame,
    target_column: str,
    numeric_columns: list[str],
    categorical_columns: list[str]
) -> pd.DataFrame:
    """
    Cleans the churn dataset by validating required columns and basic data type handling.

    The input frame is not modified. The result contains the numeric and
    categorical feature columns followed by ``churn_binary`` (the target
    mapped from 'Yes'/'No' to 1/0); the original target column is not
    returned. If a ``TotalCharges`` column is among the required columns it
    is converted to numeric, with unparseable entries becoming NaN.

    Args:
        df (pd.DataFrame): Raw dataset.
        target_column (str): Target column that must be present.
        numeric_columns (list[str]): Numeric feature columns.
        categorical_columns (list[str]): Categorical feature columns.

    Returns:
        pd.DataFrame: Cleaned dataset.

    Raises:
        ValueError: If any required column is missing, or if the target
            column contains anything other than 'Yes' or 'No' (including
            missing values).
    """
    print(f"Initial data shape: {df.shape}")

    # Validate that required columns are present
    all_required_columns = [target_column] + numeric_columns + categorical_columns
    missing_columns = [col for col in all_required_columns if col not in df.columns]

    if missing_columns:
        raise ValueError(f"Missing required columns in dataset: {missing_columns}")

    print(f"All required columns found: {all_required_columns}")

    # Keep only the required columns; the copy keeps the caller's frame untouched
    df_clean = df[all_required_columns].copy()
    print(f"Kept only required columns: {list(df_clean.columns)}")

    print(f"Missing values before cleaning:\n{df_clean.isnull().sum()}")

    # Convert TotalCharges to numeric (it might be stored as string)
    if 'TotalCharges' in df_clean.columns:
        df_clean['TotalCharges'] = pd.to_numeric(df_clean['TotalCharges'], errors='coerce')
        print(f"Converted TotalCharges to numeric. New missing values: {df_clean['TotalCharges'].isnull().sum()}")

    # Clean target variable - standardize Yes/No to 1/0.
    # Series.map turns every label outside the mapping into NaN; fail loudly
    # here instead of handing NaN targets to the split/training steps.
    churn_binary = df_clean[target_column].map({'Yes': 1, 'No': 0})
    unmapped = churn_binary.isna()
    if unmapped.any():
        unexpected = df_clean.loc[unmapped, target_column].unique().tolist()
        raise ValueError(
            f"Unexpected values in target column '{target_column}': {unexpected}. "
            "Expected only 'Yes' or 'No'."
        )
    df_clean['churn_binary'] = churn_binary
    print(f"Target variable distribution:\n{df_clean['churn_binary'].value_counts()}")

    # Final output: features plus the binary target, without the raw target column
    final_output_columns = numeric_columns + categorical_columns + ['churn_binary']
    df_clean = df_clean[final_output_columns].copy()

    print(f"Final columns in returned DataFrame: {list(df_clean.columns)}")

    print(f"Final data shape after cleaning: {df_clean.shape}")

    return df_clean

def build_preprocessing_pipeline(numeric_features: list[str], categorical_features: list[str]) -> ColumnTransformer:
    """
    Assembles the ColumnTransformer used to preprocess model features.

    Numeric columns are imputed with the median and then standardized.
    Categorical columns are imputed with the most frequent value and then
    one-hot encoded (handle_unknown='ignore', dense output). Columns that
    belong to neither group are dropped (remainder='drop').

    Args:
        numeric_features (list[str]): Names of the numeric feature columns.
        categorical_features (list[str]): Names of the categorical feature columns.

    Returns:
        ColumnTransformer: The configured, not yet fitted, transformer.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]

    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numeric_features),
            ('cat', Pipeline(steps=categorical_steps), categorical_features),
        ],
        remainder='drop',
    )

def transform_features(
    df: pd.DataFrame,
    target_column: str,
    numeric_columns: list[str],
    categorical_columns: list[str]
) -> tuple[pd.DataFrame, ColumnTransformer]:
    """
    Adds the encoded target column and builds the fitted feature preprocessor.

    The input must already contain a 'churn_binary' column; it is copied into
    'target_encoded' unchanged. Feature lists, the target mapping and the
    preprocessor itself are recorded in the returned DataFrame's ``attrs``.

    Args:
        df (pd.DataFrame): Cleaned dataset containing 'churn_binary'.
        target_column (str): Target column name (not used by this function).
        numeric_columns (list[str]): Numeric feature columns.
        categorical_columns (list[str]): Categorical feature columns.

    Returns:
        tuple[pd.DataFrame, ColumnTransformer]:
            - Copy of the dataset with 'target_encoded' and metadata in attrs
            - Preprocessing transformer, fitted on the available feature
              columns unless the feature frame is empty

    Raises:
        ValueError: If 'churn_binary' is not a column of ``df``.
    """
    if 'churn_binary' not in df.columns:
        raise ValueError("Churn binary column not found in dataset")

    result = df.copy()

    # The target is already 0/1, so it is carried over without re-encoding.
    result['target_encoded'] = result['churn_binary']

    label_names = {0: 'No Churn', 1: 'Churn'}
    result.attrs['target_mapping'] = label_names
    result.attrs['target_names'] = ['No Churn', 'Churn']
    print(f"Target encoding - Churn mapping: {label_names}")

    # Only keep the requested features that actually exist in the frame.
    usable_numeric = [name for name in numeric_columns if name in result.columns]
    usable_categorical = [name for name in categorical_columns if name in result.columns]
    print(f"Available numeric features: {usable_numeric}")
    print(f"Available categorical features: {usable_categorical}")

    preprocessor = build_preprocessing_pipeline(usable_numeric, usable_categorical)

    model_features = usable_numeric + usable_categorical
    fit_frame = result[model_features]
    if not fit_frame.empty:  # nothing to fit on when there are no features/rows
        preprocessor.fit(fit_frame)

    # Downstream steps read these entries back from attrs.
    result.attrs.update({
        'feature_columns': model_features,
        'numeric_features': usable_numeric,
        'categorical_features': usable_categorical,
        'preprocessor': preprocessor,
    })

    print(f"Features for modeling: {model_features}")
    print(f"Preprocessing pipeline created with {len(usable_numeric)} numeric and {len(usable_categorical)} categorical features")

    return result, preprocessor

def split_features_and_target(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """
    Separates a transformed dataset into its feature matrix and target vector.

    The feature column names are read from ``df.attrs['feature_columns']``;
    the target is taken from the 'target_encoded' column.

    Args:
        df (pd.DataFrame): Transformed dataset.

    Returns:
        tuple[pd.DataFrame, pd.Series]: Copies of the features (X) and target (y).

    Raises:
        ValueError: If attrs holds no feature columns, or if any listed
            feature column is absent from ``df``.
    """
    wanted = df.attrs.get('feature_columns', [])
    if not wanted:
        raise ValueError("No feature columns found in dataset attributes")

    absent = [name for name in wanted if name not in df.columns]
    if absent:
        raise ValueError(f"Missing required features: {absent}")

    features = df[wanted].copy()
    target = df['target_encoded'].copy()

    for message in (
        f"Features shape: {features.shape}",
        f"Target shape: {target.shape}",
        f"Features used: {list(features.columns)}",
    ):
        print(message)

    return features, target

def stratified_split(
    X: pd.DataFrame,
    y: pd.Series,
    test_size: float = 0.25,
    seed: int = 42
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Produces a train/test split that is stratified on the target.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target; also used as the stratification labels.
        test_size (float): Proportion of the data assigned to the test set.
        seed (int): Random seed passed to the splitter for reproducibility.

    Returns:
        Tuple: X_train, X_test, y_train, y_test (in that order).
    """
    split_options = {
        'test_size': test_size,
        'random_state': seed,
        'stratify': y,
    }
    train_X, test_X, train_y, test_y = train_test_split(X, y, **split_options)
    return train_X, test_X, train_y, test_y

### model.py

Model training, inference, and saving logic.

This script defines the `ChurnPredictionModel` class, which encapsulates the machine learning model (e.g., Logistic Regression) and its associated preprocessing pipeline. It also includes functions for computing and reporting classification metrics, and for logging model run details.

* **`ChurnPredictionModel` Class**:
    * **`__init__(self, classifier=None, preprocessor=None, random_state: int = 42)`**: Initializes the model pipeline. If no classifier or preprocessor is provided, it defaults to `LogisticRegression` and `StandardScaler`, respectively.
    * **`fit(self, X: pd.DataFrame, y: pd.Series) -> 'ChurnPredictionModel'`**: Fits the machine learning pipeline to the training data.
    * **`predict(self, X: pd.DataFrame) -> np.ndarray`**: Makes class label predictions on new data.
    * **`predict_proba(self, X: pd.DataFrame) -> np.ndarray`**: Predicts class probabilities on new data.
    * **`save(self, filepath: str) -> None`**: Saves the trained model (pipeline) to a file using `joblib`. It also creates the directory if it doesn't exist.
    * **`log_run(self, directory: str, metrics: Dict[str, Any], dataset_info: Dict[str, Any], log_filename: str = "churn_model_run_log.json") -> None`**: Saves model configuration, performance metrics, and dataset information to a JSON log file. It appends to an existing log or creates a new one.

* **Helper Functions**:
    * **`compute_classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, target_names: Optional[list[str]] = None) -> dict[str, Any]`**: Computes standard classification metrics (accuracy, precision, recall, f1-score, confusion matrix, classification report) for binary classification.
    * **`report_classification_metrics(metrics: dict[str, Any]) -> None`**: Prints a formatted report of the classification metrics.


In [None]:
"""
model.py

This script defines the ChurnPredictionModel class, which encapsulates
the machine learning model (e.g., Logistic Regression) and its associated
preprocessing pipeline. It also includes functions for computing and reporting
classification metrics, and for logging model run details.
"""
import pandas as pd
import numpy as np
import joblib
import os
import json
from datetime import datetime
from typing import Any, cast, Optional
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.preprocessing import StandardScaler # Import for fallback in __init__


class ChurnPredictionModel:
    """
    Customer churn prediction model using configurable classifier with preprocessing pipeline.

    The instance keeps three attributes: ``random_state`` (the seed),
    ``classifier`` (the estimator used as the final pipeline step) and
    ``pipe`` (the scikit-learn Pipeline that is fitted, used for prediction
    and persisted by ``save``).
    """

    def __init__(
        self,
        classifier=None,
        preprocessor=None,
        random_state: int = 42
    ):
        """
        Initialize the churn prediction pipeline.

        Args:
            classifier: Scikit-learn classifier instance. If None, uses a
                LogisticRegression with max_iter=1000 and balanced class weights.
            preprocessor: Scikit-learn preprocessing step. If None, uses StandardScaler only.
            random_state (int): Random seed for reproducibility.
        """
        self.random_state = random_state
        self.classifier = classifier if classifier is not None else LogisticRegression(
            random_state=self.random_state,
            max_iter=1000,
            class_weight='balanced'
        )

        # The first pipeline step is the caller's preprocessor, or a plain
        # scaler as a fallback when none was supplied.
        if preprocessor is not None:
            first_step = ('preprocessor', preprocessor)
        else:
            first_step = ('scaler', StandardScaler())
        self.pipe = Pipeline([first_step, ('classifier', self.classifier)])

    def fit(self, X: pd.DataFrame, y: pd.Series) -> 'ChurnPredictionModel':
        """
        Fit the model to training data.

        Args:
            X (pd.DataFrame): Training features.
            y (pd.Series): Training target.

        Returns:
            ChurnPredictionModel: Self for method chaining.
        """
        print(f"Training model with {len(X)} samples and {len(X.columns)} features...")

        # Fits every pipeline step (preprocessing and classifier) on X, y.
        self.pipe.fit(X, y)

        print("Model trained successfully!")
        print(f"Features used: {list(X.columns)}")

        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Make predictions on input data.

        Args:
            X (pd.DataFrame): Input features.

        Returns:
            np.ndarray: Predicted class labels.
        """
        return cast(np.ndarray, self.pipe.predict(X))

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict class probabilities.

        Args:
            X (pd.DataFrame): Input features.

        Returns:
            np.ndarray: Predicted class probabilities.
        """
        return cast(np.ndarray, self.pipe.predict_proba(X))

    def save(self, filepath: str) -> None:
        """
        Save the trained pipeline to a file with joblib.

        The parent directory is created when it does not exist yet.

        Args:
            filepath (str): Full path to save the model.
        """
        directory = os.path.dirname(filepath)
        if directory:
            # exist_ok makes a separate existence check unnecessary.
            os.makedirs(directory, exist_ok=True)

        joblib.dump(self.pipe, filepath)
        print(f"Model saved to {filepath}")

    @staticmethod
    def _read_previous_runs(log_file: str) -> list[Any]:
        """Return the run entries already stored in ``log_file`` (empty if none)."""
        try:
            with open(log_file, "r", encoding="utf-8") as f:
                previous = json.load(f)
        except FileNotFoundError:
            return []
        except json.JSONDecodeError as exc:
            # The file is about to be rewritten; say so instead of silently
            # dropping whatever it contained.
            print(f"Warning: existing log {log_file} is not valid JSON ({exc}); starting a new log.")
            return []

        if isinstance(previous, list):
            return previous
        # A single non-list document is preserved as the first entry rather
        # than failing on .append().
        return [previous]

    def log_run(
        self,
        directory: str,
        metrics: dict[str, Any],
        dataset_info: dict[str, Any],
        log_filename: str = "churn_model_run_log.json"
    ) -> None:
        """
        Append model configuration and performance metrics to a JSON log file.

        The log file holds a JSON list with one entry per run. A missing file
        starts a new list; a file with invalid JSON is reported and replaced;
        a file whose top-level value is not a list is kept as the first entry.

        Args:
            directory (str): Directory where the JSON file will be stored.
                An empty string writes relative to the current directory.
            metrics (dict[str, Any]): Evaluation metrics to save.
            dataset_info (dict[str, Any]): Information about the dataset used.
            log_filename (str): Name of the log file.
        """
        if directory:
            os.makedirs(directory, exist_ok=True)

        run_info = {
            "timestamp": datetime.now().isoformat(),
            "model_class": self.__class__.__name__,
            "classifier": str(type(self.classifier).__name__),
            "dataset": "Customer Churn",
            "dataset_info": dataset_info,
            "parameters": {
                "random_state": self.random_state,
                "classifier_params": self.classifier.get_params()
            },
            "metrics": metrics
        }

        log_file = os.path.join(directory, log_filename)

        logs = self._read_previous_runs(log_file)
        logs.append(run_info)

        # default=str stringifies values json cannot encode natively.
        with open(log_file, "w", encoding="utf-8") as f:
            json.dump(logs, f, indent=4, default=str)

        print(f"Run log saved to {log_file}")


def compute_classification_metrics(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    target_names: Optional[list[str]] = None
) -> dict[str, Any]:
    """
    Evaluates binary-classification predictions against the true labels.

    Args:
        y_true (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.
        target_names (Optional[list[str]]): Class names used in the text report.

    Returns:
        dict[str, Any]: Keys 'accuracy', 'precision', 'recall', 'f1_score'
            (scores with average='binary'), 'confusion_matrix' (nested list)
            and 'classification_report' (text, since output_dict=False).
    """
    binary_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )

    results: dict[str, Any] = {"accuracy": accuracy_score(y_true, y_pred)}
    for key, scorer in binary_scorers:
        results[key] = scorer(y_true, y_pred, average='binary')

    # .tolist() turns the matrix into plain nested lists.
    results["confusion_matrix"] = confusion_matrix(y_true, y_pred).tolist()
    results["classification_report"] = classification_report(
        y_true,
        y_pred,
        target_names=target_names,
        output_dict=False,
    )
    return results

def report_classification_metrics(metrics: dict[str, Any]) -> None:
    """
    Prints a formatted summary of classification metrics to stdout.

    Scalar metrics missing from the dictionary are shown as ``nan``; a missing
    or non-list confusion matrix is reported with an explanatory line.

    Args:
        metrics (dict[str, Any]): Model evaluation metrics.
    """
    rule = "=" * 40
    missing = float('nan')

    lines = ["\n" + rule, "CLASSIFICATION METRICS", rule]

    # One template per scalar metric, in display order.
    scalar_rows = (
        ("Accuracy : {:.4f}", 'accuracy'),
        ("Precision: {:.4f}", 'precision'),
        ("Recall   : {:.4f}", 'recall'),
        ("F1-Score : {:.4f}", 'f1_score'),
    )
    lines.extend(template.format(metrics.get(key, missing)) for template, key in scalar_rows)

    lines.append("\nConfusion Matrix:")
    matrix = metrics.get('confusion_matrix')
    if matrix is None:
        lines.append("  Confusion matrix data not available.")
    elif isinstance(matrix, list):
        lines.extend(f"  {row}" for row in matrix)
    else:
        lines.append(f"  Unexpected format for confusion matrix: {matrix}")

    print("\n".join(lines))

### pipeline.py

Orchestrating pipeline stages in sequence.

This script defines and orchestrates the end-to-end machine learning pipeline for customer churn prediction. It encapsulates the sequential steps from data loading and preprocessing to model training and evaluation.

* **`run_churn_pipeline(...)` Function**: This is the main function that runs the entire ML pipeline.
    * **Data Loading**: Uses `load_churn_dataset` to load raw data.
    * **Data Preprocessing**: Calls `clean_churn_data` for cleaning, `transform_features` for feature engineering and scaling, `split_features_and_target` to separate features and target, and `stratified_split` for splitting data into training and testing sets while preserving target distribution.
    * **Model Training**: Initializes and trains a `ChurnPredictionModel` instance.
    * **Prediction and Evaluation**: Makes predictions on the test set and computes and reports classification metrics using `compute_classification_metrics` and `report_classification_metrics`.
    * **Model and Log Saving**: Saves the trained model using `model.save` and logs the run details and metrics using `model.log_run`.
    * **Output**: Returns the trained model instance and the evaluation metrics.


In [None]:

"""
pipeline.py

This script defines and orchestrates the end-to-end machine learning pipeline
for customer churn prediction. It encapsulates the sequential steps from
data loading and preprocessing to model training and evaluation.
"""
import pandas as pd
import os
from typing import Any, Dict, Tuple

from src.data_loader import load_churn_dataset
from src.preprocessing import clean_churn_data, transform_features, split_features_and_target, stratified_split
from src.model import ChurnPredictionModel, compute_classification_metrics, report_classification_metrics


def run_churn_pipeline(
    data_file_path: str,
    target_column: str,
    numeric_columns: list[str],
    categorical_columns: list[str],
    test_size: float,
    random_state: int,
    model_dir_path: str,
    model_filename: str,
    log_filename: str
) -> Tuple[ChurnPredictionModel, Dict[str, Any]]:
    """
    Executes every stage of the churn prediction pipeline in order:
    load, clean, transform, split, train, predict, evaluate, save and log.

    Args:
        data_file_path (str): Path to the raw dataset CSV file.
        target_column (str): Name of the target column.
        numeric_columns (list[str]): List of numeric feature column names.
        categorical_columns (list[str]): List of categorical feature column names.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int): Random seed used for the split and the model.
        model_dir_path (str): Directory where the trained model and logs will be saved.
        model_filename (str): Name of the file to save the trained model.
        log_filename (str): Name of the file to save the run logs.

    Returns:
        Tuple[ChurnPredictionModel, Dict[str, Any]]:
            - The trained ChurnPredictionModel instance.
            - A dictionary containing the evaluation metrics.
    """
    print("\n1. Loading dataset...")
    raw_frame = load_churn_dataset(data_file_path)

    print("\n2. Cleaning data...")
    cleaned_frame = clean_churn_data(raw_frame, target_column, numeric_columns, categorical_columns)

    print("\n3. Transforming features...")
    df_transformed, preprocessor = transform_features(
        cleaned_frame, target_column, numeric_columns, categorical_columns
    )

    # Metadata recorded by the transformation step, gathered for the run log.
    frame_attrs = df_transformed.attrs
    feature_names = frame_attrs.get('feature_columns', [])
    dataset_info = {
        "total_samples": len(df_transformed),
        "n_features": len(feature_names),
        "target_mapping": frame_attrs.get('target_mapping', {}),
        "target_names": frame_attrs.get('target_names', []),
        "churn_distribution": df_transformed['target_encoded'].value_counts().to_dict(),
        "feature_columns": feature_names,
        "numeric_features": frame_attrs.get('numeric_features', []),
        "categorical_features": frame_attrs.get('categorical_features', [])
    }

    print("\n4. Preparing features and target...")
    X, y = split_features_and_target(df_transformed)

    print("\n5. Splitting data...")
    X_train, X_test, y_train, y_test = stratified_split(X, y, test_size=test_size, seed=random_state)
    for summary in (
        f"Training set: {len(X_train)} samples",
        f"Test set: {len(X_test)} samples",
        f"Train churn rate: {y_train.mean():.3f}",
        f"Test churn rate: {y_test.mean():.3f}",
    ):
        print(summary)

    print("\n6. Training model...")
    model = ChurnPredictionModel(preprocessor=preprocessor, random_state=random_state)
    model.fit(X_train, y_train)

    print("\n7. Making predictions...")
    y_pred = model.predict(X_test)

    print("\n8. Evaluating model...")
    metrics = compute_classification_metrics(
        y_test.to_numpy(),
        y_pred,
        target_names=dataset_info['target_names']
    )
    report_classification_metrics(metrics)

    print("\n9. Saving model and logs...")
    model_file_path = os.path.join(model_dir_path, model_filename)
    log_file_path = os.path.join(model_dir_path, log_filename)
    model.save(model_file_path)
    model.log_run(model_dir_path, metrics, dataset_info, log_filename=log_filename)

    print(f"Model saved to: {model_file_path}")
    print(f"Run log saved to: {log_file_path}")

    return model, metrics

### main.py

Entry point for triggering the pipeline.

This is the main entry point for the Customer Churn Prediction ML pipeline. It orchestrates the entire process by calling the main pipeline function and handling overall execution flow.

* **Purpose**: Serves as the primary script to initiate and manage the execution of the customer churn prediction machine learning pipeline.
* **`main(output_base_dir: Optional[Path] = None) -> None` Function**:
    * **Initialization**: Prints a starting message and constructs file paths for data and model artifacts, defaulting to the script's directory or using a provided `output_base_dir`.
    * **Pipeline Execution**: Calls `run_churn_pipeline` (imported from `src.pipeline`) with all necessary configuration parameters defined in `src.config`.
    * **Logging and Error Handling**: Prints a success message and the final model accuracy upon successful completion. If any exception is raised during pipeline execution, an error message is printed to `stderr` and the exception is re-raised.
* **Entry Point (`if __name__ == "__main__":`)**: Ensures that the `main()` function is called when the script is executed directly.


In [None]:
"""
main.py

This is the main entry point for the Customer Churn Prediction ML pipeline.
It orchestrates the entire process by calling the main pipeline function
and handling overall execution flow.
"""
from typing import Optional
from pathlib import Path
import os
import sys
from src.pipeline import run_churn_pipeline # Import the new pipeline function
from src.config import (
    TARGET_COLUMN, NUMERIC_COLUMNS, CATEGORICAL_COLUMNS, TEST_SIZE,
    RANDOM_STATE, MODEL_FILENAME, LOG_FILENAME, DATA_DIR_NAME,
    RAW_DATA_DIR_NAME, DATASET_FILENAME, MODEL_STORE_DIR)


def main(output_base_dir: Optional[Path] = None) -> None:
    """
    Runs the churn prediction pipeline with the settings from the config module.

    Args:
        output_base_dir (Path, optional): The base directory where
                                         data and model artifacts should be
                                         read from/saved to. If None,
                                         defaults to the script's directory.

    Raises:
        Exception: Any error raised while running the pipeline is reported on
            stderr and then re-raised unchanged.
    """
    print("Starting Customer Churn Prediction Pipeline...")
    print("="*60)

    try:
        # An explicit base directory (e.g. for tests) wins over the script's own folder.
        base_path = (
            output_base_dir
            if output_base_dir is not None
            else Path(os.path.dirname(os.path.abspath(__file__)))
        )

        dataset_path = base_path / DATA_DIR_NAME / RAW_DATA_DIR_NAME / DATASET_FILENAME
        artifacts_dir = base_path / MODEL_STORE_DIR

        pipeline_arguments = {
            'data_file_path': str(dataset_path),
            'target_column': TARGET_COLUMN,
            'numeric_columns': NUMERIC_COLUMNS,
            'categorical_columns': CATEGORICAL_COLUMNS,
            'test_size': TEST_SIZE,
            'random_state': RANDOM_STATE,
            'model_dir_path': str(artifacts_dir),
            'model_filename': MODEL_FILENAME,
            'log_filename': LOG_FILENAME,
        }
        _trained_model, evaluation_metrics = run_churn_pipeline(**pipeline_arguments)

        print(f"\n{'='*60}")
        print("Pipeline completed successfully!")
        print(f"Final Model Accuracy: {evaluation_metrics['accuracy']:.4f}")

    except Exception as e:
        # Report on stderr, then let the original exception propagate.
        print(f"ERROR: Pipeline failed with exception: {e}", file=sys.stderr)
        raise

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()