In [2]:
import pandas as pd
df = pd.read_csv('customer_churn_dataset.csv')

# Data overview

In [7]:
# Show first 5 rows
print("Head",df.head())
# Show the shape of the DataFrame
print("shape:", df.shape)
# Show last 5 rows
print("Tail",df.tail())
# Display summary statistics
print("Describe",df.describe())
# Display information about the DataFrame
print("Info",df.info())
# Display 
print("Is Null",df.isnull().sum())
# Display number of unique values in each column
print("Nunique",df.nunique())

Head    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract

In [None]:
# Fix TotalCharges
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

In [9]:
# convert 'Churn' column to binary
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

In [10]:
# Drop customerID column
df = df.drop('customerID', axis=1)

In [11]:
#Identify categorical columns (CatBoost needs this)
cat_cols = df.select_dtypes(include='object').columns.tolist()

In [14]:
#✅ STEP 3 — Train/Test Split
from sklearn.model_selection import train_test_split

X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [15]:
#✅ STEP 4 — Train CatBoost Model (no encoding needed)
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=100
)

model.fit(
    X_train, y_train,
    cat_features=cat_cols,
    eval_set=(X_test, y_test)
)


0:	test: 0.8282811	best: 0.8282811 (0)	total: 61ms	remaining: 30.4s
100:	test: 0.8638711	best: 0.8653229 (57)	total: 5.85s	remaining: 23.1s
200:	test: 0.8640639	best: 0.8653229 (57)	total: 11.2s	remaining: 16.7s
300:	test: 0.8628541	best: 0.8653229 (57)	total: 17.6s	remaining: 11.7s
400:	test: 0.8615072	best: 0.8653229 (57)	total: 24s	remaining: 5.93s
499:	test: 0.8592946	best: 0.8653229 (57)	total: 30.4s	remaining: 0us

bestTest = 0.8653229062
bestIteration = 57

Shrink model to first 58 iterations.


<catboost.core.CatBoostClassifier at 0x2dc903e7ed0>

In [16]:
#✅ STEP 5 — Evaluate the Model
from sklearn.metrics import classification_report, roc_auc_score

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1036
           1       0.68      0.55      0.61       373

    accuracy                           0.81      1409
   macro avg       0.76      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

ROC-AUC: 0.8653229062076246
