In [11]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import FunctionTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.pipeline import Pipeline

In [12]:
df = pd.read_csv("prepared_data_to_ml.csv")
df.head(20)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,Female,No,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,Male,No,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,Female,No,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,Female,No,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,Male,No,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7010 entries, 0 to 7009
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7010 non-null   object 
 1   SeniorCitizen     7010 non-null   object 
 2   Partner           7010 non-null   object 
 3   Dependents        7010 non-null   object 
 4   tenure            7010 non-null   int64  
 5   PhoneService      7010 non-null   object 
 6   MultipleLines     7010 non-null   object 
 7   InternetService   7010 non-null   object 
 8   OnlineSecurity    7010 non-null   object 
 9   OnlineBackup      7010 non-null   object 
 10  DeviceProtection  7010 non-null   object 
 11  TechSupport       7010 non-null   object 
 12  StreamingTV       7010 non-null   object 
 13  StreamingMovies   7010 non-null   object 
 14  Contract          7010 non-null   object 
 15  PaperlessBilling  7010 non-null   object 
 16  PaymentMethod     7010 non-null   object 


In [14]:
# split the data
X = df.drop('Churn', axis=1)
y = df['Churn'] #target

y = y.map({'Yes': 1, 'No': 0})
categorical_vars = X.select_dtypes(include='object').columns
numerical_vars = X.select_dtypes(include=['int64', 'float64']).columns

# transformer for numerical data in pipeline 
log_transformer = FunctionTransformer(np.log1p, validate=True)

# list of models
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=20)),
    ('SVC', SVC(kernel='linear', C=1, probability=True)), # add 
    ('GaussianNB', GaussianNB()),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=300,
                                                      max_depth=20,
                                                      max_features='sqrt',
                                                      n_jobs=-1,
                                                      random_state=42)),
    ('XGBClassifier', XGBClassifier(n_estimators=60,
                                    learning_rate=0.05,
                                    max_depth=6,
                                    gamma=0.1,
                                    random_state=42,
                                    n_jobs=-1)),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=42,
                                                      max_depth=4,
                                                      min_samples_leaf=8))
]

# transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('log_total_charges', log_transformer, ['TotalCharges']), # log for TotalCharges
        ('one_hot', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_vars), # encoding for caregorical variables
        ('num', MinMaxScaler(), numerical_vars) # scaling for numerical variables
    ]
)

# list of cross-validation results 
results = []

cv = StratifiedKFold(n_splits=5, shuffle=True)

# metrics
scoring = ['roc_auc', 'precision', 'recall', 'f1']

# pipeline for each model
for name, model in models:
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('balance', RandomUnderSampler()),
        ('model', model)
    ])

    # cross-validation
    scores = cross_validate(pipe, X, y, cv=cv, scoring=scoring)

    # results to list
    row = {'model': name}
    for metric in scoring:
        row[f'{metric}'] = np.mean(scores[f'test_{metric}'])
        row[f'{metric}_std'] = np.std(scores[f'test_{metric}'])
    results.append(row)

results_df = pd.DataFrame(results).sort_values('f1', ascending=False)


In [15]:
print(results_df)

                    model   roc_auc  roc_auc_std  precision  precision_std  \
0     Logistic Regression  0.849124     0.006936   0.520060       0.017733   
5           XGBClassifier  0.840625     0.007357   0.510520       0.017312   
2                     SVC  0.844005     0.006217   0.499671       0.021199   
4  RandomForestClassifier  0.826031     0.013777   0.509938       0.018644   
6  DecisionTreeClassifier  0.815484     0.009586   0.495152       0.017579   
1    KNeighborsClassifier  0.828796     0.008823   0.485819       0.008725   
3              GaussianNB  0.823061     0.016650   0.425572       0.008116   

     recall  recall_std        f1    f1_std  
0  0.807211    0.020933  0.632268  0.014159  
5  0.791056    0.009822  0.620345  0.012888  
2  0.815278    0.024697  0.618926  0.012990  
4  0.761989    0.017387  0.610783  0.015901  
6  0.793743    0.022155  0.609521  0.014015  
1  0.810989    0.007413  0.607571  0.006792  
3  0.885833    0.012038  0.574913  0.009398  
