In [1]:
import os
import tempfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    log_loss,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import mlflow
import mlflow.sklearn
import mlflow.xgboost

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from catboost import CatBoostClassifier
import xgboost as xgb


In [None]:
# Point MLflow at the SQLite file named in the URI and select the experiment
# that all runs in this notebook are recorded under.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "churn-predictor"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
# Read the dataset and discard its first column, selected by position.
df = pd.read_csv('/.../dataset.csv').iloc[:, 1:]

In [None]:
# Re-type every object-dtype column as a pandas categorical in one pass.
object_columns = df.select_dtypes('object').columns
df = df.astype({name: 'category' for name in object_columns})

# 'Churn' is the target; every remaining column is a feature.
y = df['Churn']
X = df.drop(columns='Churn')

In [None]:
# Enable MLflow autologging for scikit-learn.
# NOTE(review): the objectives visible in this notebook train through
# xgb.train and CatBoostClassifier rather than a scikit-learn estimator, and
# they log their parameters and metrics by hand — confirm this call is needed.
mlflow.sklearn.autolog()


def objective(params):
    """Hyperopt objective for XGBoost: train one booster and score it.

    Parameters
    ----------
    params : dict
        One sample from the search space. It must contain ``n_splits`` and
        ``test_size`` (forwarded to ``StratifiedShuffleSplit``) plus
        ``num_boost_round`` and ``max_depth`` (both cast to ``int``). Every
        other key is handed to ``xgb.train`` as a booster parameter.

    Returns
    -------
    dict
        ``{'loss': <held-out log-loss>, 'status': STATUS_OK}`` on success, or
        ``{'loss': inf, 'status': STATUS_FAIL}`` if any step raised.

    Notes
    -----
    Reads the module-level ``X`` and ``y``. Every call opens its own MLflow
    run and logs the parameters, the trained booster (as a JSON artifact) and
    the evaluation metrics to it. The same held-out split drives early
    stopping and the reported metrics.
    """
    try:
        # Split and training-length settings travel through the search space
        # next to the booster hyperparameters, so peel them off first.
        n_splits = int(params['n_splits'])
        test_size = params['test_size']
        num_boost_round = int(params['num_boost_round'])

        non_booster_keys = ('n_splits', 'test_size', 'num_boost_round')
        xgb_params = {k: v for k, v in params.items() if k not in non_booster_keys}

        xgb_params['max_depth'] = int(xgb_params['max_depth'])  # quniform yields floats
        xgb_params['objective'] = 'binary:logistic'  # binary classification
        xgb_params['eval_metric'] = 'logloss'  # metric watched by early stopping

        # NOTE(review): only the first generated split is consumed, so
        # `n_splits` probably has no influence on the result — confirm.
        sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)
        train_index, test_index = next(sss.split(X, y))
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        with mlflow.start_run():
            mlflow.set_tag("model", "xgboost")
            mlflow.log_params(xgb_params)
            # Also record the settings that are not booster parameters so the
            # run can be reproduced from the MLflow record alone.
            mlflow.log_params({
                'n_splits': n_splits,
                'test_size': test_size,
                'num_boost_round': num_boost_round,
            })

            # enable_categorical lets XGBoost consume pandas 'category' columns.
            dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
            dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

            booster = xgb.train(
                params=xgb_params,
                dtrain=dtrain,
                num_boost_round=num_boost_round,
                evals=[(dtest, 'validation')],
                early_stopping_rounds=50,
                verbose_eval=False,
            )

            # Write the JSON model into a throw-away directory: nothing is left
            # behind in the working directory even if a later step raises, and
            # the artifact keeps the name "xgboost_model/xgboost_model.json".
            with tempfile.TemporaryDirectory() as tmp_dir:
                model_path = os.path.join(tmp_dir, "xgboost_model.json")
                booster.save_model(model_path)
                mlflow.log_artifact(model_path, artifact_path="xgboost_model")

            # xgb.train hands back the booster from the LAST round, which can be
            # up to `early_stopping_rounds` rounds past the best one. Score the
            # best iteration so the loss reported to Hyperopt is the one early
            # stopping selected. (0, 0) means "use every tree".
            best_iteration = getattr(booster, 'best_iteration', None)
            if best_iteration is not None:
                iteration_range = (0, int(best_iteration) + 1)
                mlflow.log_metric("best_iteration", int(best_iteration))
            else:
                iteration_range = (0, 0)

            y_pred_proba = booster.predict(dtest, iteration_range=iteration_range)

            # Hard class labels at the 0.5 probability threshold.
            y_pred = (y_pred_proba > 0.5).astype(int)

            logloss = log_loss(y_test, y_pred_proba)
            auc = roc_auc_score(y_test, y_pred_proba)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)

            mlflow.log_metric("log_loss", logloss)
            mlflow.log_metric("auc", auc)
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)

        return {'loss': logloss, 'status': STATUS_OK}

    except Exception as e:
        # Deliberate best-effort: a failing configuration is reported to
        # Hyperopt as a failed trial instead of aborting the whole search.
        print(f"Error: {e}")
        return {'loss': float('inf'), 'status': STATUS_FAIL}


def objective_catboost(params):
    """Hyperopt objective for CatBoost: train one classifier and score it.

    Parameters
    ----------
    params : dict
        One sample from the search space. ``n_splits`` and ``test_size`` are
        forwarded to ``StratifiedShuffleSplit``; ``iterations``, ``depth``,
        ``learning_rate`` and ``l2_leaf_reg`` configure the classifier. Any
        other key present in ``params`` is ignored.

    Returns
    -------
    dict
        ``{'loss': <held-out log-loss>, 'status': STATUS_OK}``.

    Notes
    -----
    Reads the module-level ``X`` and ``y``. Every call opens its own MLflow
    run and logs the effective parameters, the fitted model and the
    evaluation metrics to it. Exceptions are not caught here.
    """
    n_splits = params['n_splits']
    test_size = params['test_size']

    # NOTE(review): only the first generated split is consumed, so `n_splits`
    # probably has no influence on the result — confirm.
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)
    train_index, test_index = next(sss.split(X, y))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # CatBoost is told which columns are categorical by name.
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    # Only these sampled values are actually handed to the model. Logging
    # exactly this set keeps the MLflow record truthful about what was trained.
    model_params = {
        'iterations': int(params['iterations']),
        'depth': int(params['depth']),
        'learning_rate': params['learning_rate'],
        'l2_leaf_reg': params['l2_leaf_reg'],
    }

    with mlflow.start_run():
        mlflow.set_tag("model", "catboost")
        mlflow.log_params({**model_params, 'n_splits': n_splits, 'test_size': test_size})

        model = CatBoostClassifier(random_seed=42, verbose=0, **model_params)
        model.fit(X_train, y_train, cat_features=categorical_features)

        # Store the fitted model with the run.
        mlflow.catboost.log_model(model, "catboost_model")

        y_pred_proba = model.predict_proba(X_test)[:, 1]  # second probability column
        y_pred = model.predict(X_test)

        logloss = log_loss(y_test, y_pred_proba)
        auc = roc_auc_score(y_test, y_pred_proba)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        mlflow.log_metric("log_loss", logloss)
        mlflow.log_metric("auc", auc)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)

    return {'loss': logloss, 'status': STATUS_OK}



# Function to preprocess data by encoding categorical features
def preprocess_data(X):
    """Return a copy of ``X`` with its categorical columns encoded as numbers.

    Columns of dtype ``object`` or ``category`` are handled as follows:

    * exactly two distinct non-null values: replaced in place by integer
      codes 0/1, assigned in sorted order of the two values. Missing entries
      stay missing (the column then becomes float).
    * any other number of distinct values: replaced by one-hot columns from
      ``pd.get_dummies(..., drop_first=True)``, appended after the remaining
      columns in the order the source columns appeared.

    All other columns pass through unchanged and ``X`` itself is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame to encode.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.
    """
    X_encoded = X.copy()
    categorical_cols = X_encoded.select_dtypes(include=['object', 'category']).columns

    multi_class_cols = []
    for col in categorical_cols:
        if X_encoded[col].nunique() == 2:
            # Going through object dtype makes the codes follow the sorted
            # values rather than whatever category order the column carries.
            codes, _ = pd.factorize(X_encoded[col].astype(object), sort=True)
            if (codes < 0).any():
                # factorize marks missing entries with -1; keep them missing
                # instead of inventing a third class for a binary feature.
                X_encoded[col] = pd.Series(codes, index=X_encoded.index).where(codes >= 0)
            else:
                X_encoded[col] = codes
        else:
            multi_class_cols.append(col)

    # One get_dummies call for all multi-class columns: same column layout as
    # encoding them one at a time, without rebuilding the frame per column.
    if multi_class_cols:
        X_encoded = pd.get_dummies(X_encoded, columns=multi_class_cols, drop_first=True)

    return X_encoded

# Hyperopt search space sampled for each XGBoost trial.
search_space = dict(
    # Settings for StratifiedShuffleSplit
    n_splits=hp.choice('n_splits', [5, 10]),
    test_size=hp.choice('test_size', [0.2, 0.3]),
    # Tree depth: whole numbers from 3 to 10 (delivered as floats)
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    # Learning rate (eta), log-uniform between exp(-5) and exp(0) = 1
    learning_rate=hp.loguniform('learning_rate', -5, 0),
    # Upper bound on boosting rounds, in steps of 50 from 100 to 1000
    num_boost_round=hp.quniform('num_boost_round', 100, 1000, 50),
    # Minimum summed instance weight (Hessian) required in a child
    min_child_weight=hp.uniform('min_child_weight', 1, 10),
    # Fraction of training rows sampled per tree
    subsample=hp.uniform('subsample', 0.5, 1.0),
    # Fraction of columns sampled per tree
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    # Minimum loss reduction needed to make a further split
    gamma=hp.uniform('gamma', 0, 5),
    # L2 and L1 regularisation strengths, log-uniform between exp(-3) and exp(3)
    reg_lambda=hp.loguniform('reg_lambda', -3, 3),
    reg_alpha=hp.loguniform('reg_alpha', -3, 3),
)

# Hyperopt search space sampled for each CatBoost trial.
# NOTE(review): the CatBoost objective visible in this notebook reads only
# n_splits, test_size, iterations, depth, learning_rate and l2_leaf_reg; the
# other keys are sampled but not handed to the model. 'RMSE' and 'MAE' also
# look like regression losses — confirm they are valid for a classifier before
# wiring `loss_function` in.
search_space_catboost = dict(
    # Settings for StratifiedShuffleSplit
    n_splits=hp.choice('n_splits', [5, 10]),
    test_size=hp.choice('test_size', [0.2, 0.3]),
    # Integer-valued draws (scope.int casts the quniform float)
    iterations=scope.int(hp.quniform('iterations', 100, 1000, 50)),
    depth=scope.int(hp.quniform('depth', 4, 12, 1)),
    # Log-uniform draws: exp(-5)..1 and 1..exp(3)
    learning_rate=hp.loguniform('learning_rate', -5, 0),
    l2_leaf_reg=hp.loguniform('l2_leaf_reg', 0, 3),
    bagging_temperature=hp.uniform('bagging_temperature', 0, 1),
    border_count=scope.int(hp.quniform('border_count', 32, 254, 1)),
    rsm=hp.uniform('rsm', 0.5, 1),
    grow_policy=hp.choice('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
    sampling_frequency=hp.choice('sampling_frequency', ['PerTree', 'PerTreeLevel']),
    od_wait=scope.int(hp.quniform('od_wait', 20, 100, 10)),
    loss_function=hp.choice('loss_function', ['Logloss', 'CrossEntropy', 'RMSE', 'MAE']),
    scale_pos_weight=scope.int(hp.quniform('scale_pos_weight', 1, 5, 1)),
)

In [None]:
# CatBoost: 50 TPE-guided evaluations of objective_catboost.
catboost_trials = Trials()
best_result_catboost = fmin(
    objective_catboost,
    search_space_catboost,
    algo=tpe.suggest,
    max_evals=50,
    trials=catboost_trials,
)

In [None]:
# XGBoost: 50 TPE-guided evaluations of objective; per-trial results are
# collected in `trials`.
trials = Trials()
best_result = fmin(objective, search_space, algo=tpe.suggest, max_evals=50, trials=trials)


In [None]:
# Close any MLflow run that is still active. Both objectives in this notebook
# open their runs inside `with mlflow.start_run():` blocks, which should
# already end them on exit, so this looks like a safety net — confirm.
mlflow.end_run()