In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.shape

(7043, 21)

In [5]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
#drop customer id not useful for prediction
df.drop('customerID', axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
df.dropna(inplace=True)

In [9]:
# Encode binary categorical columns
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0})

In [10]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,1,0,0,0,34,1,No,DSL,Yes,No,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,1,0,0,0,2,1,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,1,0,0,0,45,0,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,0,0,0,0,2,1,No,Fiber optic,No,No,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [11]:
# Only apply get_dummies to relevant categorical columns
categorical_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
                    'Contract', 'PaymentMethod']

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [12]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,1,29.85,29.85,0,...,False,False,False,False,False,False,False,False,True,False
1,1,0,0,0,34,1,0,56.95,1889.5,0,...,False,False,False,False,False,True,False,False,False,True
2,1,0,0,0,2,1,1,53.85,108.15,1,...,False,False,False,False,False,False,False,False,False,True
3,1,0,0,0,45,0,0,42.3,1840.75,0,...,True,False,False,False,False,True,False,False,False,False
4,0,0,0,0,2,1,1,70.7,151.65,1,...,False,False,False,False,False,False,False,False,True,False


In [13]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [14]:
x = df.drop('Churn', axis=1)
y = df['Churn']

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [16]:
from sklearn.model_selection import train_test_split , RandomizedSearchCV
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=.02,random_state=42)

In [17]:
# Define your feature names manually, same as in Streamlit
feature_names = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges',
                 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No',
                 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
                 'OnlineBackup_No internet service', 'OnlineBackup_Yes',
                 'DeviceProtection_No internet service', 'DeviceProtection_Yes',
                 'TechSupport_No internet service', 'TechSupport_Yes',
                 'StreamingTV_No internet service', 'StreamingTV_Yes',
                 'StreamingMovies_No internet service', 'StreamingMovies_Yes',
                 'Contract_One year', 'Contract_Two year',
                 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check',
                 'PaymentMethod_Mailed check']

# Convert x_train and x_test to DataFrames with labeled columns
x_train = pd.DataFrame(x_train, columns=feature_names)
x_test = pd.DataFrame(x_test, columns=feature_names)

In [18]:
print(type(x_train))
print(x_train.columns.tolist())  # this should NOT throw an error

<class 'pandas.core.frame.DataFrame'>
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


In [19]:

weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

xgb_model = XGBClassifier(
    scale_pos_weight=weight,
    eval_metric='logloss'
)

xgb_model.fit(x_train, y_train)


NameError: name 'XGBClassifier' is not defined

In [None]:
rsearch = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist,
                              scoring='roc_auc', n_iter=30, cv=3, verbose=1, n_jobs=-1)
rsearch.fit(x_train, y_train)

best_xgb = rsearch.best_estimator_

In [None]:
# Define stacked model with XGBoost, Logistic Regression as final
from sklearn.ensemble import  StackingClassifier
stack_model = StackingClassifier(
    estimators=[('xgb', best_xgb)],
    final_estimator=LogisticRegression(max_iter=1000)
)

In [None]:
stack_model.fit(x_train, y_train)

In [None]:
import joblib
joblib.dump(best_xgb, "xgb_churnn_model.pkl")

In [None]:
# Predictions
y_pred = stack_model.predict(x_test)
y_prob = stack_model.predict_proba(x_test)[:, 1]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score , accuracy_score 
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))
print("Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")

In [None]:
import seaborn as sns
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.title(f'Confusion Matrix')
plt.show()

In [None]:
plt.figure(figsize=(12, 8))
plot_importance(best_xgb, max_num_features=15, importance_type='gain')
plt.title("Top 15 Important Features")
plt.show()

In [None]:
import matplotlib.pyplot as plt

y_proba = best_xgb.predict_proba(x_test)[:, 1]  # Probabilities of churn

plt.hist(y_proba, bins=30, edgecolor='black')
plt.title('Distribution of Churn Probabilities')
plt.xlabel('Churn Probability')
plt.ylabel('Number of Customers')
plt.show()

In [None]:
from sklearn.metrics import f1_score
import numpy as np

y_probs = best_xgb.predict_proba(x_test)[:, 1]

thresholds = np.arange(0.2, 0.6, 0.01)
scores = [f1_score(y_test, y_probs > t) for t in thresholds]

best_t = thresholds[np.argmax(scores)]
print(f"Best threshold for F1 score: {best_t:.2f}")


In [None]:
from sklearn.metrics import accuracy_score

thresholds = np.arange(0.2, 0.8, 0.01)
accuracies = [accuracy_score(y_test, y_probs > t) for t in thresholds]

import matplotlib.pyplot as plt
plt.plot(thresholds, accuracies)
plt.xlabel("Threshold")
plt.ylabel("Accuracy")
plt.title("Accuracy vs Threshold")
plt.grid()
plt.show()


In [None]:
print(best_xgb.feature_names_in_)

In [None]:
classification_report(y_test, y_pred)