In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
df = pd.read_csv('../data/telco_clean.csv')
df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn_binary,gender_Female,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,1,0,1,29.85,29.85,0,True,...,True,False,False,True,False,False,False,False,True,False
1,0,0,0,34,1,0,56.95,1889.5,0,False,...,True,False,False,False,True,False,False,False,False,True
2,0,0,0,2,1,1,53.85,108.15,1,False,...,True,False,False,True,False,False,False,False,False,True
3,0,0,0,45,0,0,42.3,1840.75,0,False,...,True,False,False,False,True,False,True,False,False,False
4,0,0,0,2,1,1,70.7,151.65,1,True,...,True,False,False,True,False,False,False,False,True,False


In [4]:
y = df['Churn_binary']

X = df.drop(columns=['Churn_binary'])

In [5]:
X.shape, y.shape

((7043, 41), (7043,))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

y.value_counts(normalize=True)

Churn_binary
0    0.73463
1    0.26537
Name: proportion, dtype: float64

In [15]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit on training data and transform both sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
log_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)

# X_train_scaled
log_model.fit(X_train_scaled, y_train)

In [10]:
y_pred = log_model.predict(X_test)
y_prob = log_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1035
           1       0.50      0.79      0.61       374

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.80      0.74      0.75      1409

ROC AUC: 0.8421090702420626


In [12]:
feature_importance = pd.Series(
    log_model.coef_[0],
    index=X.columns
).sort_values()

feature_importance

Contract_Two year                         -0.736455
InternetService_DSL                       -0.304866
PhoneService                              -0.235126
Dependents                                -0.228194
MultipleLines_No                          -0.222337
OnlineSecurity_Yes                        -0.216005
TechSupport_Yes                           -0.182817
PaymentMethod_Credit card (automatic)     -0.129098
PaymentMethod_Bank transfer (automatic)   -0.128595
StreamingMovies_No                        -0.086449
OnlineBackup_Yes                          -0.083987
OnlineSecurity_No internet service        -0.081114
InternetService_No                        -0.081114
OnlineBackup_No internet service          -0.081114
StreamingTV_No internet service           -0.081114
DeviceProtection_No internet service      -0.081114
StreamingMovies_No internet service       -0.081114
TechSupport_No internet service           -0.081114
PaymentMethod_Mailed check                -0.080371
StreamingTV_

In [13]:
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced'
)

rf.fit(X_train, y_train)

In [14]:
rf_pred = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, rf_pred))
print("ROC AUC:", roc_auc_score(y_test, rf_prob))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.63      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.70      1409
weighted avg       0.77      0.79      0.78      1409

ROC AUC: 0.8189103309307912


## Business Recommendations

- Customers with short tenure are at highest churn risk
- Month-to-month contracts significantly increase churn likelihood
- High monthly charges combined with low tenure signal early churn
- Fiber optic users exhibit elevated churn and should be targeted with retention offers

### Recommended Actions
- Incentivize long-term contracts early in the customer lifecycle
- Offer pricing adjustments or bundled discounts for high-risk segments
- Prioritize retention campaigns within the first 3–6 months

## Model Selection Summary

- Logistic Regression provides interpretability and clear business insight
- Random Forest improves predictive performance
- Final recommendation: deploy Logistic Regression for strategy, Random Forest for monitoring

In [19]:
import joblib
# This saves the model and the feature names to your computer
joblib.dump(log_model, 'telco_model.pkl')
joblib.dump(X.columns, 'feature_names.pkl')

['feature_names.pkl']