In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import FunctionTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import classification_report, roc_auc_score

from imblearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("prepared_data_to_ml.csv")
df.head(20)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,Female,No,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,Male,No,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,Female,No,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,Female,No,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,Male,No,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7010 entries, 0 to 7009
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7010 non-null   object 
 1   SeniorCitizen     7010 non-null   object 
 2   Partner           7010 non-null   object 
 3   Dependents        7010 non-null   object 
 4   tenure            7010 non-null   int64  
 5   PhoneService      7010 non-null   object 
 6   MultipleLines     7010 non-null   object 
 7   InternetService   7010 non-null   object 
 8   OnlineSecurity    7010 non-null   object 
 9   OnlineBackup      7010 non-null   object 
 10  DeviceProtection  7010 non-null   object 
 11  TechSupport       7010 non-null   object 
 12  StreamingTV       7010 non-null   object 
 13  StreamingMovies   7010 non-null   object 
 14  Contract          7010 non-null   object 
 15  PaperlessBilling  7010 non-null   object 
 16  PaymentMethod     7010 non-null   object 


In [4]:
# split the data
X = df.drop('Churn', axis=1)
y = df['Churn'] #target
y = y.map({'Yes': 1, 'No': 0})

# training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42, stratify=y) 

categorical_vars = X.select_dtypes(include='object').columns
numerical_vars = X.select_dtypes(include=['int64', 'float64']).columns

# transformer for numerical data in pipeline 
log_transformer = FunctionTransformer(np.log1p, validate=True)

# list of models
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=20)),
    ('SVC', SVC(kernel='linear', C=1, probability=True)), # add 
    ('GaussianNB', GaussianNB()),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=300,
                                                      max_depth=20,
                                                      max_features='sqrt',
                                                      n_jobs=-1,
                                                      random_state=42)),
    ('XGBClassifier', XGBClassifier(n_estimators=60,
                                    learning_rate=0.05,
                                    max_depth=6,
                                    gamma=0.1,
                                    random_state=42,
                                    n_jobs=-1)),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=42,
                                                      max_depth=4,
                                                      min_samples_leaf=8))
]


# function to evaluate model
def evaluate_model(pipe, X_test, y_test, name):
    y_pred = pipe.predict(X_test)
    if hasattr(pipe, 'predict_proba'):
        y_pred_proba = pipe.predict_proba(X_test)[:, 1]
    elif hasattr(pipe, 'decision_function'):
        y_pred_proba = pipe.decision_function(X_test)
    else:
        y_pred_proba = y_pred

    report = classification_report(y_test, y_pred, output_dict=True)
    auc = roc_auc_score(y_test, y_pred_proba)

    return {
        'model': name,
        'roc_auc': auc,
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1': report['1']['f1-score']
    }

# transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('log_total_charges', log_transformer, ['TotalCharges']), # log for TotalCharges
        ('one_hot', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_vars), # encoding for caregorical variables
        ('num', MinMaxScaler(), numerical_vars) # scaling for numerical variables
    ]
)

# list of results 
results = []

# pipeline for each model
for name, model in models:
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('balance', RandomUnderSampler(random_state=42)),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    results.append(evaluate_model(pipe, X_test, y_test, name))


results_df = pd.DataFrame(results).sort_values('f1', ascending=False)

In [5]:
results_df

Unnamed: 0,model,roc_auc,precision,recall,f1
0,Logistic Regression,0.854485,0.535844,0.791741,0.63913
5,XGBClassifier,0.845116,0.515982,0.81149,0.630844
2,SVC,0.853679,0.514286,0.807899,0.628492
4,RandomForestClassifier,0.833024,0.528977,0.770197,0.627193
6,DecisionTreeClassifier,0.821668,0.506787,0.804309,0.62179
1,KNeighborsClassifier,0.834611,0.492489,0.824057,0.616521
3,GaussianNB,0.822928,0.42432,0.895871,0.57588
