# Telecom Churn EDA

This notebook explores the telecom churn dataset and visualizes key patterns.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide display defaults: white-grid seaborn figures and at most
# 20 columns shown when pandas renders a frame.
sns.set_theme(style="whitegrid")
pd.options.display.max_columns = 20

# Load the raw churn extract (relative path, resolved against the kernel's
# working directory) and preview the first rows.
data_path = "../data/raw/telecom_churn.csv"
df = pd.read_csv(filepath_or_buffer=data_path)
df.head()

## Dataset Summary
This section captures quick stats for numeric and categorical fields to understand distribution and missing values.

In [None]:
# Split the frame by dtype family and describe each side, one row per column.
numeric_cols = df.select_dtypes(include="number")
categorical_cols = df.select_dtypes(exclude="number")
numeric_summary = numeric_cols.describe().transpose()
categorical_summary = categorical_cols.describe().transpose()

# Null count per column, largest first.
missing_values = df.isna().sum().sort_values(ascending=False)

# Show up to ten rows of each summary, then only the columns that contain nulls.
display(
    numeric_summary.head(10),
    categorical_summary.head(10),
    missing_values.loc[missing_values > 0],
)

## Churn Distribution
Visualise the churn balance to understand class imbalance before modeling.

In [None]:
# Bar chart of customer counts per churn label, used to gauge class imbalance.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="Churn Label", data=df, palette="muted", ax=ax)
ax.set_title("Churn Label Distribution")
ax.set_xlabel("Churn Label")
ax.set_ylabel("Customer Count")
# Annotate every bar with its count. Depending on the seaborn version, the
# bars may be spread over several containers (one per category) rather than
# a single one, so labelling only ax.containers[0] can leave bars unlabelled.
for container in ax.containers:
    ax.bar_label(container)
plt.tight_layout()
plt.show()

## Contract Type vs. Churn
Longer commitments typically lead to lower churn; validate this intuition with the data.

In [None]:
# Count customers for every (contract, churn label) pair and spread the churn
# labels into columns; pairs that never occur are filled with 0.
contract_churn = (
    df.groupby(["Contract", "Churn Label"])
    .size()
    .unstack("Churn Label")
    .fillna(0)
)
# Turn the counts into within-contract proportions by dividing each row by
# its own total.
row_totals = contract_churn.sum(axis=1)
contract_share = contract_churn.div(row_totals, axis=0)

# Left panel: raw counts. Right panel: share of each churn label per contract.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
count_ax, share_ax = axes
sns.heatmap(contract_churn, annot=True, fmt=".0f", cmap="Blues", ax=count_ax)
count_ax.set_title("Customers per Contract / Churn Label")
sns.heatmap(contract_share, annot=True, fmt=".0%", cmap="Greens", ax=share_ax)
share_ax.set_title("Churn Share within Contract")
plt.tight_layout()
plt.show()

## Monthly Charges vs. Churn
Higher monthly charges often correlate with churn—check density differences across classes.

In [None]:
# One filled density curve of monthly charges per churn label.
# common_norm=False normalises each label's curve on its own, so the curve
# shapes can be compared even when the classes differ in size.
fig, ax = plt.subplots(figsize=(8, 4))
sns.kdeplot(
    data=df,
    x="Monthly Charges",
    hue="Churn Label",
    fill=True,
    common_norm=False,
    alpha=0.4,
    ax=ax,
)
ax.set(title="Monthly Charges Density by Churn Label")
plt.tight_layout()
plt.show()

## Tenure Segmentation
Bucket tenure into cohorts to highlight early-life churn risks.

In [None]:
# Lower edges (in months) of the tenure cohorts. The final cohort is
# open-ended so customers at or beyond the last edge are still counted.
tenure_edges = [0, 6, 12, 24, 36, 48, 60]
# Labels read "lo-hi" with an inclusive upper month, e.g. "0-5", "6-11", ...,
# "60+". NOTE(review): this wording assumes tenure is recorded in whole
# months - confirm against the data dictionary.
tenure_labels = [
    f"{lo}-{hi - 1}" for lo, hi in zip(tenure_edges[:-1], tenure_edges[1:])
] + [f"{tenure_edges[-1]}+"]

# right=False makes every bin closed on the left and open on the right:
# [0, 6), [6, 12), ... With a finite top edge of 72, a tenure of exactly
# 72 months matched no bin, became NaN and was silently dropped by the
# groupby below. An infinite top edge keeps those customers in the last
# cohort. include_lowest is not needed because bins are already left-closed.
tenure_bins = pd.cut(
    df["Tenure Months"],
    bins=tenure_edges + [float("inf")],
    right=False,
    labels=tenure_labels,
)

# Customer count per (cohort, churn label). observed=False is passed
# explicitly so cohorts with no customers still appear with a count of 0,
# whatever the installed pandas default is.
tenure_churn = (
    df.assign(tenure_bin=tenure_bins)
    .groupby(["tenure_bin", "Churn Label"], observed=False)
    .size()
    .reset_index(name="count")
)

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(data=tenure_churn, x="tenure_bin", y="count", hue="Churn Label", ax=ax)
ax.set_title("Churn Counts by Tenure Cohort")
ax.set_xlabel("Tenure (months)")
ax.set_ylabel("Customer Count")
# Rotate the cohort labels on this axes explicitly instead of relying on
# pyplot's notion of the "current" axes.
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()
plt.show()

## Churn Rate by State
Highlight states with elevated churn to prioritise retention campaigns.

In [None]:
# Per-state mean of "Churn Value" and number of rows, highest mean first.
# NOTE(review): treating the mean as a churn rate assumes "Churn Value" is
# coded 0/1 - confirm against the data dictionary.
churn_by_state = df.groupby("State")["Churn Value"]
state_churn = churn_by_state.agg(churn_rate="mean", customers="count").sort_values(
    "churn_rate", ascending=False
)
# Keep the ten highest-rate states and expose "State" as a regular column.
top_states = state_churn.head(10).reset_index()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=top_states, x="churn_rate", y="State", palette="Reds", ax=ax)
ax.set(
    title="Top 10 States by Churn Rate",
    xlabel="Churn Rate",
    ylabel="State",
)
# Print each bar's rate as a percentage with one decimal place.
for bars in ax.containers:
    ax.bar_label(bars, fmt="{:.1%}")
plt.tight_layout()
plt.show()

## Key Takeaways
- Early-tenure customers (<12 months) contribute a disproportionate share of churn.
- Month-to-month contracts and higher monthly charges align with higher churn rates.
- Texas and Florida show elevated churn in the latest refresh, suggesting regional campaigns.