In [36]:
import pandas as pd 
from pycaret.classification import (setup, compare_models, 
                                    finalize_model, save_model, create_model,
                                    load_model, create_app)

SEED = 3141596

## Set up PyCaret for reproducibility

In [2]:
data = pd.read_csv('../data/external/raw_data.csv')

data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
def convert2float(value : str) -> float:
    if not value:
        return float(value)
    return None 

In [4]:
data['TotalCharges']= data['TotalCharges'].apply(convert2float)
data['SeniorCitizen'] = data['SeniorCitizen'].apply(lambda x : 'Yes' if x else 'No')

In [27]:
s = setup(data=data, 
          target='Churn', 
          ignore_features=['customerID'], 
          numeric_features=['tenure', 'MonthlyCharges', 'TotalCharges'],
          categorical_features=['gender', 'SeniorCitizen', 'Partner', 'Dependents', 
                                'PhoneService', 'MultipleLines', 'InternetService', 
                                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                                'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                                'PaperlessBilling', 'PaymentMethod'],
          ordinal_features=[],
          fix_imbalance=None, 
          session_id=SEED)

Unnamed: 0,Description,Value
0,Session id,3141596
1,Target,Churn
2,Target type,Binary
3,Target mapping,"No: 0, Yes: 1"
4,Original data shape,"(5517, 21)"
5,Transformed data shape,"(5517, 26)"
6,Transformed train set shape,"(3861, 26)"
7,Transformed test set shape,"(1656, 26)"
8,Ignore features,1
9,Ordinal features,13


In [30]:
s._fxs['Ordinal'] = dict()

In [33]:
from pprint import pprint 

for data_type in ['Ordinal', 'Categorical', 'Numerical']:
    pprint(s._fxs[data_type])
    print()

{}

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

[]



## Train and save a model if not done so:

In [34]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7723,0.0,0.5655,0.6712,0.6122,0.453,0.4574,0.09
lr,Logistic Regression,0.7718,0.8218,0.5793,0.6648,0.6177,0.4564,0.4596,0.487
lda,Linear Discriminant Analysis,0.7713,0.8185,0.5891,0.6593,0.6211,0.4583,0.4605,0.116
gbc,Gradient Boosting Classifier,0.7669,0.8188,0.5492,0.6638,0.5998,0.4376,0.4424,0.153
ada,Ada Boost Classifier,0.7661,0.819,0.563,0.6561,0.6049,0.4404,0.4437,0.12
lightgbm,Light Gradient Boosting Machine,0.7576,0.8027,0.5435,0.6422,0.5875,0.4178,0.4215,0.126
rf,Random Forest Classifier,0.7459,0.7909,0.4996,0.6288,0.5555,0.3811,0.3868,0.152
et,Extra Trees Classifier,0.7337,0.7614,0.4914,0.6008,0.5391,0.355,0.3594,0.167
svm,SVM - Linear Kernel,0.7319,0.0,0.4974,0.6788,0.5034,0.3448,0.3871,0.101
nb,Naive Bayes,0.6996,0.8015,0.8096,0.5187,0.6319,0.3983,0.4271,0.105


In [37]:
best = create_model('lr')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7984,0.853,0.6016,0.7184,0.6549,0.5141,0.5181
1,0.7539,0.7996,0.5246,0.6337,0.574,0.4031,0.4067
2,0.7591,0.8163,0.561,0.6389,0.5974,0.4265,0.4284
3,0.7798,0.799,0.5366,0.7021,0.6083,0.4589,0.4669
4,0.7409,0.8028,0.6179,0.5891,0.6032,0.411,0.4113
5,0.772,0.8301,0.5447,0.6768,0.6036,0.4462,0.4514
6,0.7902,0.8224,0.5935,0.7019,0.6432,0.496,0.4995
7,0.7876,0.8366,0.5772,0.703,0.6339,0.4863,0.491
8,0.7668,0.8335,0.6423,0.632,0.6371,0.4654,0.4654
9,0.7694,0.8249,0.5935,0.6518,0.6213,0.4561,0.4571


In [38]:
finalize_model(best)

In [39]:
save_model(best, '../models/model-0.1.0')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/var/folders/j_/kccjl38d66g4wrjfqs2q1xkc0000gn/T/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['tenure', 'MonthlyCharges',
                                              'TotalCharges'],
                                     transformer=SimpleImputer(add_indicator=Fal...
                                                               handle_missing='return_nan',
                                                               handle_unknown='value',
                                                               return_df=True,
                                                               use_cat_names=True,
                                                               

## Load a saved model

In [40]:
best = load_model('../models/model-0.1.0')

Transformation Pipeline and Model Successfully Loaded


## Create a web app

In [41]:
create_app(best)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


