In [9]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split


In [3]:
df = pd.read_csv("Telco-Customer-Churn.csv")

In [5]:
df = df.drop('customerID', axis=1)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


In [6]:
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']

for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})


In [7]:
df = pd.get_dummies(
    df,
    columns=[
        'gender', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaymentMethod'
    ],
    drop_first=True
)


In [10]:
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [13]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [14]:
y_pred = log_reg.predict(X_test)
y_prob = log_reg.predict_proba(X_test)[:, 1]


In [15]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.80      0.80      1409



In [16]:
confusion_matrix(y_test, y_pred)


array([[925, 110],
       [165, 209]])

In [17]:
roc_auc_score(y_test, y_prob)


0.8427884987987289

In [18]:
coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

coefficients.head(10)


Unnamed: 0,Feature,Coefficient
11,InternetService_Fiber optic,0.763947
28,PaymentMethod_Electronic check,0.392163
5,PaperlessBilling,0.367545
10,MultipleLines_Yes,0.278865
24,StreamingMovies_Yes,0.209727
22,StreamingTV_Yes,0.207274
9,MultipleLines_No phone service,0.177581
0,SeniorCitizen,0.139491
29,PaymentMethod_Mailed check,0.076342
1,Partner,0.023326


In [19]:
threshold = 0.3
y_pred_custom = (y_prob >= threshold).astype(int)


In [20]:
print(classification_report(y_test, y_pred_custom))
confusion_matrix(y_test, y_pred_custom)


              precision    recall  f1-score   support

           0       0.90      0.75      0.82      1035
           1       0.52      0.76      0.62       374

    accuracy                           0.75      1409
   macro avg       0.71      0.76      0.72      1409
weighted avg       0.80      0.75      0.77      1409



array([[778, 257],
       [ 90, 284]])

In [21]:
for t in [0.2, 0.3, 0.4, 0.5]:
    y_temp = (y_prob >= t).astype(int)
    recall = classification_report(y_test, y_temp, output_dict=True)['1']['recall']
    print(f"Threshold {t}: Recall = {recall:.3f}")


Threshold 0.2: Recall = 0.856
Threshold 0.3: Recall = 0.759
Threshold 0.4: Recall = 0.668
Threshold 0.5: Recall = 0.559


In [22]:
from sklearn.ensemble import RandomForestClassifier


In [23]:
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced'
)

rf.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
rf_pred = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, rf_pred))
confusion_matrix(y_test, rf_pred)
roc_auc_score(y_test, rf_prob)



              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.64      0.49      0.56       374

    accuracy                           0.79      1409
   macro avg       0.74      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409



0.8256284068304529

In [26]:
importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

importances.head(10)


Unnamed: 0,Feature,Importance
7,TotalCharges,0.175935
3,tenure,0.164604
6,MonthlyCharges,0.152518
26,Contract_Two year,0.058048
28,PaymentMethod_Electronic check,0.039851
11,InternetService_Fiber optic,0.039154
25,Contract_One year,0.029617
14,OnlineSecurity_Yes,0.028371
8,gender_Male,0.02567
20,TechSupport_Yes,0.02464


# ==============================
# Business Translation: Risk Segmentation
# ==============================


In [27]:
risk_df = X_test.copy()

risk_df['Churn_Probability'] = rf_prob  # or y_prob if Logistic chosen
risk_df['Actual_Churn'] = y_test.values

def risk_bucket(p):
    if p >= 0.7:
        return 'High Risk'
    elif p >= 0.4:
        return 'Medium Risk'
    else:
        return 'Low Risk'

risk_df['Risk_Segment'] = risk_df['Churn_Probability'].apply(risk_bucket)


In [28]:
risk_df['Risk_Segment'].value_counts()
pd.crosstab(risk_df['Risk_Segment'], risk_df['Actual_Churn'], normalize='index')


Actual_Churn,0,1
Risk_Segment,Unnamed: 1_level_1,Unnamed: 2_level_1
High Risk,0.259542,0.740458
Low Risk,0.861829,0.138171
Medium Risk,0.492647,0.507353


In [30]:
risk_df.to_csv("churn_scored_customers.csv", index=False)
