In [2]:
# Import libraries

import pandas as pd 
import pycaret as pyc

In [3]:
# Load the data

data = pd.read_csv("data/Customer-Churn.csv")
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
target = 'Churn'
type = 'classification'

In [5]:
data[target].value_counts(normalize=True)

Churn
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64

In [6]:
# Initialize the setup
from pycaret.classification import *
clf1 = setup(data, target = target, session_id = 123,
             normalize = True,
             transformation = True,
             feature_selection=True,
            #  ignore_low_variance = True,
             remove_multicollinearity = True,
             multicollinearity_threshold = 0.95,
             fix_imbalance = True,
            #  fix_imbalance_method = 'SMOTE',
             log_experiment = "mlflow",
             verbose = False)

[LightGBM] [Info] Number of positive: 3622, number of negative: 3622
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6617
[LightGBM] [Info] Number of data points in the train set: 7244, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [None]:
# Get preproccessed data


In [7]:
# Compare Models
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.7357,0.533,0.7357,0.5657,0.6359,0.0438,0.046,0.138
lr,Logistic Regression,0.7347,0.8097,0.7347,0.5398,0.6223,0.0,0.0,0.838
knn,K Neighbors Classifier,0.7347,0.5,0.7347,0.5398,0.6223,0.0,0.0,0.828
nb,Naive Bayes,0.7347,0.5,0.7347,0.5398,0.6223,0.0,0.0,0.139
dt,Decision Tree Classifier,0.7347,0.5,0.7347,0.5398,0.6223,0.0,0.0,0.138
svm,SVM - Linear Kernel,0.7347,0.4933,0.7347,0.5398,0.6223,0.0,0.0,0.147
ridge,Ridge Classifier,0.7347,0.8097,0.7347,0.5398,0.6223,0.0,0.0,0.138
rf,Random Forest Classifier,0.7347,0.5234,0.7347,0.5398,0.6223,0.0,0.0,0.172
ada,Ada Boost Classifier,0.7347,0.5,0.7347,0.5398,0.6223,0.0,0.0,0.14
gbc,Gradient Boosting Classifier,0.7347,0.4999,0.7347,0.5398,0.6223,0.0,0.0,0.193




In [13]:
# Finetune best model
from pycaret.classification import tune_model

tuned_best, tuner = tune_model(best, return_tuner = True, search_library="optuna" )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7363,0.8225,0.7363,0.5421,0.6245,0.0,0.0
1,0.7363,0.8138,0.7363,0.5421,0.6245,0.0,0.0
2,0.7343,0.7761,0.7343,0.5392,0.6218,0.0,0.0
3,0.7343,0.8135,0.7343,0.5392,0.6218,0.0,0.0
4,0.7343,0.8025,0.7343,0.5392,0.6218,0.0,0.0
5,0.7343,0.7895,0.7343,0.5392,0.6218,0.0,0.0
6,0.7343,0.8352,0.7343,0.5392,0.6218,0.0,0.0
7,0.7343,0.7997,0.7343,0.5392,0.6218,0.0,0.0
8,0.7343,0.83,0.7343,0.5392,0.6218,0.0,0.0
9,0.7343,0.813,0.7343,0.5392,0.6218,0.0,0.0


[I 2024-08-12 09:35:15,425] Searching the best hyperparameters using 4930 samples...
[I 2024-08-12 09:37:43,771] Finished hyperparameter search!


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).




In [14]:
# Display the Tuner. 

tuner

In [9]:
# !mlflow ui

^C


In [15]:
final_model = create_model(tuned_best)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7363,0.5,0.7363,0.5421,0.6245,0.0,0.0
1,0.7363,0.5,0.7363,0.5421,0.6245,0.0,0.0
2,0.7343,0.5,0.7343,0.5392,0.6218,0.0,0.0
3,0.7343,0.5,0.7343,0.5392,0.6218,0.0,0.0
4,0.7343,0.5,0.7343,0.5392,0.6218,0.0,0.0
5,0.7343,0.5,0.7343,0.5392,0.6218,0.0,0.0
6,0.7343,0.5,0.7343,0.5392,0.6218,0.0,0.0
7,0.7343,0.5,0.7343,0.5392,0.6218,0.0,0.0
8,0.7444,0.8301,0.7444,0.798,0.7575,0.4381,0.4598
9,0.7343,0.5,0.7343,0.5392,0.6218,0.0,0.0




In [16]:
# Dashboard 

from explainerdashboard import ClassifierExplainer, ExplainerDashboard

In [19]:
dashboard(final_model, display_format="inline")

Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')
Building ExplainerDashboard..
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...


  0%|          | 0/2113 [00:00<?, ?it/s]

Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating pred_percentiles...
Calculating predictions...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard inline (terminate it with ExplainerDashboard.terminate(8050))
