In [32]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the churn snapshot workbook, given as a relative path.
data_dir = 'scripts/data'
workbook_name = 'Telco_Customer_Churn_Snapshot.xlsx'
filepath = f'{data_dir}/{workbook_name}'

In [36]:
# Load the worksheet named 'T' from the workbook at `filepath` into the working frame.
df = pd.read_excel(io=filepath, sheet_name='T')

In [37]:
# Show the column labels of the loaded frame.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'Snapshot_Date'],
      dtype='object')

In [38]:
# Encode the churn label as an integer: 'No' -> 0, 'Yes' -> 1.
churn_codes = {'No': 0, 'Yes': 1}

# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on the already-encoded 0/1 column would wipe the target.
# Only apply the mapping while the original string labels are still present.
if df['Churn'].isin(list(churn_codes)).any():
    df['Churn'] = df['Churn'].map(churn_codes)

In [39]:
# Columns handled as two-level features by the label-encoding loop.
binary_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
]

In [40]:
# Replace each column listed in binary_cols with integer codes, one freshly
# fitted LabelEncoder per column.
for column_name in binary_cols:
    encoder = LabelEncoder()

    # Learn the distinct values of this column, then swap them for their codes.
    encoder.fit(df[column_name])
    df[column_name] = encoder.transform(df[column_name])

In [41]:
# Display the frame in its current state (the notebook renders a truncated preview).
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Snapshot_Date
0,7590-VHVEG,0,0,1,0,1,0,No phone service,DSL,No,...,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0,2024-01-06
1,5575-GNVDE,1,0,0,0,34,1,No,DSL,Yes,...,No,No,No,One year,0,Mailed check,56.95,1889.50,0,2024-01-06
2,3668-QPYBK,1,0,0,0,2,1,No,DSL,Yes,...,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1,2024-01-06
3,7795-CFOCW,1,0,0,0,45,0,No phone service,DSL,Yes,...,Yes,No,No,One year,0,Bank transfer (automatic),42.30,1840.75,0,2024-01-06
4,9237-HQITU,0,0,0,0,2,1,No,Fiber optic,No,...,No,No,No,Month-to-month,1,Electronic check,70.70,151.65,1,2024-01-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,1,0,1,1,24,1,Yes,DSL,Yes,...,Yes,Yes,Yes,One year,1,Mailed check,84.80,1990.50,0,2024-01-06
7039,2234-XADUH,0,0,1,1,72,1,Yes,Fiber optic,No,...,No,Yes,Yes,One year,1,Credit card (automatic),103.20,7362.90,0,2024-01-06
7040,4801-JZAZL,0,0,1,1,11,0,No phone service,DSL,Yes,...,No,No,No,Month-to-month,1,Electronic check,29.60,346.45,0,2024-01-06
7041,8361-LTMKD,1,1,1,0,4,1,Yes,Fiber optic,No,...,No,No,No,Month-to-month,1,Mailed check,74.40,306.60,1,2024-01-06


In [42]:
# Categorical columns that are passed to the one-hot encoder.
ohe_cols = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

In [43]:
# Hold out 20% of the rows, stratified on Churn, with a fixed seed.
# The full frame (Churn column included) is passed as the feature table here;
# the non-feature columns are dropped from X_train / X_test in a later cell.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['Churn'],
    test_size=0.2,
    random_state=42,
    stratify=df['Churn'],
)

In [44]:
# Columns carried over as-is next to the one-hot encoded ones. The list still
# contains the identifier, the snapshot date and the Churn target.
other_cols = [
    'customerID',
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'PaperlessBilling',
    'MonthlyCharges',
    'TotalCharges',
    'Churn',
    'Snapshot_Date',
]

In [45]:
# One-hot encoder for the multi-level categorical columns: drop='first' leaves
# out the first level of every column, sparse_output=False yields a dense array.
ohe = OneHotEncoder(drop='first', sparse_output=False)

# Learn the categories from the training rows only, then reuse them on the test rows.
X_train_cat = ohe.fit_transform(X_train[ohe_cols])
X_test_cat = ohe.transform(X_test[ohe_cols])

# Wrap the encoded arrays in frames that keep the original row index, so the
# column-wise concat lines up row for row.
encoded_names = ohe.get_feature_names_out(ohe_cols)
X_train_cat_df = pd.DataFrame(X_train_cat, index=X_train.index, columns=encoded_names)
X_test_cat_df = pd.DataFrame(X_test_cat, index=X_test.index, columns=encoded_names)

# Encoded columns first, pass-through columns after them.
# NOTE: X_train / X_test are rebound here without the raw categorical columns,
# so this cell cannot be re-run without re-running the split first.
X_train = pd.concat([X_train_cat_df, X_train[other_cols]], axis=1)
X_test = pd.concat([X_test_cat_df, X_test[other_cols]], axis=1)


In [46]:
# Strip the identifier, the snapshot date and the target from both splits.
non_feature_cols = ['customerID', 'Snapshot_Date', 'Churn']
X_train = X_train.drop(columns=non_feature_cols)
X_test = X_test.drop(columns=non_feature_cols)

In [50]:
# Rescale the numeric columns one at a time. Each scaler is fitted on the
# training split only and then applied to both splits.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
for col in numeric_cols:
    mmscaler = MinMaxScaler()
    mmscaler.fit(X_train[[col]])
    X_train[[col]] = mmscaler.transform(X_train[[col]])
    X_test[[col]] = mmscaler.transform(X_test[[col]])

In [52]:
# Train a logistic regression with scikit-learn's default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Score the held-out split.
y_pred = lr.predict(X_test)

# Compute all metrics first, then report them.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)  # binary target, so the default average='binary' applies
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("\nClassification Report:\n", report)

Accuracy: 0.7991483321504613
F1 Score: 0.5916305916305916

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.64      0.55      0.59       374

    accuracy                           0.80      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409



In [62]:
 # List the feature columns of the training matrix.
 X_train.columns

Index(['MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [68]:
# Class probabilities for the training rows; keep the column at index 1,
# i.e. the probability of the second entry in lr.classes_.
train_class_probabilities = lr.predict_proba(X_train)
prob = train_class_probabilities[:, 1]

In [71]:
# Work on an independent copy. A plain assignment would only alias X_train, so
# the 'prob' column added to this frame afterwards would also appear in X_train
# and break any later lr.predict / lr.predict_proba call on it (the columns
# would no longer match the ones the model was fitted on).
oot_new_churn = X_train.copy()

In [72]:
# Store the predicted probabilities next to the features in a 'prob' column.
oot_new_churn['prob'] = prob

In [74]:
# Show only the rows whose predicted probability is above 0.7.
high_prob_mask = oot_new_churn['prob'] > 0.7
oot_new_churn[high_prob_mask]

Unnamed: 0,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,prob
1056,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0.069444,1,1,0.613852,0.047497,0.722748
2582,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0,0,0,1,0.166667,1,1,0.865969,0.146883,0.708426
478,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0.069444,1,1,0.774290,0.052206,0.704776
1236,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0.041667,1,1,0.583458,0.026299,0.747721
4453,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,0,1,0,0.138889,1,1,0.798206,0.119490,0.798050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0.041667,1,1,0.764325,0.035395,0.752507
1308,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0.222222,1,1,0.679621,0.147499,0.744831
313,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,1,0,0.069444,1,1,0.507723,0.039333,0.703598
4314,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,1,0,0,0.333333,1,1,0.825610,0.280968,0.722818
