In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

# Read the cleaned churn dataset and report its (rows, columns) dimensions.
churn_csv_path = "../data/clean_churn.csv"
df = pd.read_csv(churn_csv_path)
df.shape

(7032, 22)

**Statistical Questions**
- Is customer churn associated with contract type?
- Do customers who churn pay higher monthly charges?
- Which factors have statistically significant relationships with churn?

In [2]:
# Cross-tabulate contract type (rows) against the churn flag (columns)
# to obtain the observed counts used by the chi-square test.
contigency_contract = pd.crosstab(index=df["Contract"], columns=df["Churn"])
contigency_contract

Churn,0,1
Contract,Unnamed: 1_level_1,Unnamed: 2_level_1
Month-to-month,2220,1655
One year,1306,166
Two year,1637,48


In [3]:
# Chi-square test of independence on the contract-by-churn contingency table.
contract_test = stats.chi2_contingency(contigency_contract)
chi2, p_value, dof, expected = contract_test
chi2, p_value

(np.float64(1179.5458287339445), np.float64(7.326182186265472e-257))

The chi-square test shows a statistically significant association between contract type and churn (p < 0.001), indicating that churn rates differ across contract types.

In [4]:
# Chi-square test of independence between payment method and churn:
# build the observed-count table, run the test, and show statistic and p-value.
contigency_payment = pd.crosstab(index=df["PaymentMethod"], columns=df["Churn"])
payment_test = stats.chi2_contingency(contigency_payment)
chi2, p_value, _, _ = payment_test
chi2, p_value

(np.float64(645.4299001234638), np.float64(1.4263098511063342e-139))

Payment method is significantly associated with churn (p < 0.001), with electronic check users exhibiting higher churn rates than users of other payment methods.

In [5]:
# Split monthly charges by churn outcome (1 = churned, 0 = not churned)
# and compare the two group means.
churn_flag = df["Churn"]
charges_churn = df.loc[churn_flag.eq(1), "MonthlyCharges"]
charges_no_churn = df.loc[churn_flag.eq(0), "MonthlyCharges"]
charges_churn.mean(), charges_no_churn.mean()

(np.float64(74.44133226324237), np.float64(61.307408483439865))

In [6]:
# Two-sample t-test on monthly charges (churned vs. non-churned).
# equal_var=False means the two groups are not assumed to share a variance.
welch_result = stats.ttest_ind(a=charges_churn, b=charges_no_churn, equal_var=False)
t_stat, p_value = welch_result
t_stat, p_value

(np.float64(18.34091879095257), np.float64(2.6573571445160277e-72))

Customers who churn have significantly higher monthly charges on average (p < 0.001), suggesting price sensitivity plays a role in churn behavior.

In [7]:
# Logistic regression of churn on tenure and the two charge variables.

# Columns used by the model: three numeric predictors plus the target.
predictors = ["tenure", "MonthlyCharges", "TotalCharges"]
target = "Churn"
features = predictors + [target]

# Work on a copy so the model inputs are decoupled from the source frame.
df_logit = df[features].copy()

# Design matrix: the predictors with an intercept ("const") column added.
X = sm.add_constant(df_logit[predictors])
y = df_logit[target]

# Fit by maximum likelihood and display the full regression summary.
logit_model = sm.Logit(y, X)
result = logit_model.fit()

result.summary()

Optimization terminated successfully.
         Current function value: 0.453372
         Iterations 7


0,1,2,3
Dep. Variable:,Churn,No. Observations:,7032.0
Model:,Logit,Df Residuals:,7028.0
Method:,MLE,Df Model:,3.0
Date:,"Sun, 11 Jan 2026",Pseudo R-squ.:,0.217
Time:,15:35:36,Log-Likelihood:,-3188.1
converged:,True,LL-Null:,-4071.7
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.5988,0.117,-13.628,0.000,-1.829,-1.369
tenure,-0.0671,0.005,-12.297,0.000,-0.078,-0.056
MonthlyCharges,0.0302,0.002,17.585,0.000,0.027,0.034
TotalCharges,0.0001,6.14e-05,2.361,0.018,2.47e-05,0.000


In [8]:
# Odds ratios: exponentiate the fitted logit coefficients.
odds_ratios = result.params.pipe(np.exp)
odds_ratios

const             0.202133
tenure            0.935088
MonthlyCharges    1.030660
TotalCharges      1.000145
dtype: float64

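- Higher tenure significantly reduces the odds of churn (odds ratio ≈ 0.935 per additional unit of tenure, holding the other predictors fixed)
- Higher monthly charges increase the likelihood of churn (odds ratio ≈ 1.031 per additional unit of monthly charge)
- Total charges, which may capture long-term customer value effects, show a small but statistically significant positive association with churn (odds ratio ≈ 1.0001 per unit, p ≈ 0.018)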

In [9]:
# Collect coefficients, odds ratios and p-values into a single table,
# one row per model term.
summary_columns = {
    "Coefficient": result.params,
    "Odds Ratio": odds_ratios,
    "p-value": result.pvalues,
}
summary_df = pd.DataFrame(summary_columns)

summary_df


Unnamed: 0,Coefficient,Odds Ratio,p-value
const,-1.598827,0.202133,2.7355679999999998e-42
tenure,-0.067114,0.935088,9.400316e-35
MonthlyCharges,0.0302,1.03066,3.229078e-69
TotalCharges,0.000145,1.000145,0.01820978


### Key Statistical Findings
- Contract type and payment method are significantly associated with churn
- Customers who churn pay higher monthly charges on average
- Longer customer tenure significantly reduces churn probability
- Pricing and early-stage customer experience are critical churn drivers