In [3]:
import pandas as pd
import numpy as np
import xgboost 

In [4]:
# Load the cleaned churn dataset; the path is relative to the notebook's
# working directory.
churn_csv_path = "../data/cleaned/cleaned_churn.csv"
df = pd.read_csv(churn_csv_path)

In [5]:
# Predictors: every column except the target.
X = df.drop("Churn", axis=1)

# Target: encode the churn label as 1 ("Yes") / 0 ("No").
y = df["Churn"].map({"Yes": 1, "No": 0})

# Series.map converts any label that is not a key of the mapping (including
# missing values) to NaN without raising. Fail loudly here instead of letting
# NaN targets reach the stratified split and the model fits further down.
unexpected_labels = df.loc[y.isna(), "Churn"].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'Churn' (expected 'Yes'/'No'): {unexpected_labels!r}"
    )


In [6]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each category, so a k-level column becomes k-1 indicator columns.
X = pd.get_dummies(data=X, drop_first=True)


In [7]:
# Display the encoded feature matrix to check the result of the one-hot
# encoding step (pandas truncates the rendering of large frames).
X

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.70,151.65,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,True,True,True,True,False,True,...,False,True,False,True,True,False,True,False,False,True
7039,0,72,103.20,7362.90,False,True,True,True,False,True,...,False,True,False,True,True,False,True,True,False,False
7040,0,11,29.60,346.45,False,True,True,False,True,False,...,False,False,False,False,False,False,True,False,True,False
7041,1,4,74.40,306.60,True,True,False,True,False,True,...,False,False,False,False,False,False,True,False,False,True


In [8]:
# Names of the int64/float64 columns; these are the ones standardised later.
# The one-hot indicator columns are boolean in the output above, so they are
# not selected here.
numeric_dtypes = ["int64", "float64"]
numeric_cols = X.select_dtypes(include=numeric_dtypes).columns

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the churn
# rate the same in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# X_train / X_test are row subsets produced by train_test_split. Take explicit
# copies before writing columns into them: depending on the pandas version,
# assigning into a frame derived from another frame can raise
# SettingWithCopyWarning, and a copy makes the ownership unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaling statistics on the training rows only, then reuse them for the
# test rows so that no information from the test set leaks into preprocessing.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


In [17]:
from sklearn.linear_model import LogisticRegression

# Churners (class 1) are the minority class, so errors on them are weighted
# three times as heavily as errors on non-churners (class 0).
churn_class_weights = {0: 1, 1: 3}

model = LogisticRegression(class_weight=churn_class_weights, max_iter=1000)
model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,"{0: 1, 1: 3}"
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [18]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the logistic regression on the held-out rows.
y_pred = model.predict(X_test)
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)


              precision    recall  f1-score   support

           0       0.91      0.70      0.79      1035
           1       0.49      0.80      0.61       374

    accuracy                           0.73      1409
   macro avg       0.70      0.75      0.70      1409
weighted avg       0.80      0.73      0.74      1409



In [13]:
from sklearn.ensemble import RandomForestClassifier

# Class weights for the forest: churners (class 1) count twice as much as
# non-churners (class 0).
weights = {0: 1, 1: 2}

# 200 trees with a fixed seed so the fit is repeatable.
rf = RandomForestClassifier(class_weight=weights, n_estimators=200, random_state=42)
rf.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# Evaluate the random forest on the held-out rows.
y_pred = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred)
print(rf_report)

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.64      0.49      0.56       374

    accuracy                           0.79      1409
   macro avg       0.73      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409



In [15]:
from xgboost import XGBClassifier

# scale_pos_weight=3 up-weights the positive (churn) class to offset the class
# imbalance; log loss is used as the evaluation metric.
xgb_params = {"scale_pos_weight": 3, "eval_metric": "logloss"}

xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [16]:
# Evaluate the XGBoost classifier on the held-out rows.
y_pred = xgb.predict(X_test)
xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.87      0.78      0.82      1035
           1       0.53      0.67      0.59       374

    accuracy                           0.75      1409
   macro avg       0.70      0.73      0.71      1409
weighted avg       0.78      0.75      0.76      1409



## Of the three models, logistic regression gave the highest recall on the churn class (0.80, versus 0.67 for XGBoost and 0.49 for random forest)
### Therefore, we use logistic regression for churn prediction and for ranking feature importance


In [19]:
# Pair every feature with its fitted logistic-regression coefficient and add
# the absolute value as a ranking key. Sign gives the direction of the effect
# on the churn log-odds; magnitude gives its strength. Only the int64/float64
# columns were standardised, while the indicator columns were left unscaled, so
# compare magnitudes across the two groups with care.
importance = pd.DataFrame(
    {"feature": X_train.columns, "coefficient": model.coef_[0]}
).assign(abs_coeff=lambda frame: frame["coefficient"].abs())

# Ten features with the largest absolute coefficients.
importance.sort_values(by="abs_coeff", ascending=False).head(10)


Unnamed: 0,feature,coefficient,abs_coeff
25,Contract_Two year,-1.43466,1.43466
10,InternetService_Fiber optic,1.423475,1.423475
1,tenure,-1.157128,1.157128
2,MonthlyCharges,-0.797279,0.797279
24,Contract_One year,-0.72624,0.72624
23,StreamingMovies_Yes,0.506533,0.506533
3,TotalCharges,0.495568,0.495568
21,StreamingTV_Yes,0.478169,0.478169
28,PaymentMethod_Electronic check,0.402435,0.402435
9,MultipleLines_Yes,0.363493,0.363493
