In [None]:
import sys
from pathlib import Path

# Make the project root (the parent of the current working directory) importable.
# The membership check keeps this cell idempotent: re-running it does not stack
# duplicate entries onto sys.path.
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
import pandas as pd

from src.preprocessing import preprocess_data
from src.config import RAW_DATA_PATH

# Load the raw data and run the project's preprocessing pipeline.
df = preprocess_data(RAW_DATA_PATH)

# Fail fast if preprocessing did not produce the columns the rest of this
# notebook reads (clustering features plus the churn label); otherwise the
# problem only surfaces later as a KeyError far from its cause.
required_columns = {"tenure", "MonthlyCharges", "TotalCharges", "service_count", "Churn"}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"preprocess_data output is missing columns: {sorted(missing_columns)}"

df.head()

In [None]:
# Columns used as inputs to the clustering model
features = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
    "service_count",
]

# Feature matrix: every row of df, restricted to the selected columns
X = df.loc[:, features]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: learn the scaling parameters from X, then apply
# them to the same data. The fitted scaler is kept for later reuse.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Candidate cluster counts for the elbow method. Defined once so that the
# fitting step and the plot's x-axis can never drift out of sync.
k_values = range(2, 8)

# Fit one K-Means model per candidate k (fixed seed) and record its inertia,
# i.e. the sum of squared distances of samples to their closest cluster centre.
inertia = [
    KMeans(n_clusters=k, random_state=42).fit(X_scaled).inertia_
    for k in k_values
]

fig, ax = plt.subplots()
ax.plot(k_values, inertia, marker="o")
ax.set_xticks(list(k_values))  # k is discrete: tick only the values actually fitted
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method")
plt.show()

In [None]:
# Show the raw inertia values (one per candidate k) behind the elbow plot
inertia

In [None]:
# Fit the final 4-cluster model with a fixed seed and store each row's cluster
# label on the frame as "segment".
# NOTE(review): n_init is left at the scikit-learn default, which differs
# between library versions - consider pinning it if the segment labels must be
# reproducible across environments.
kmeans = KMeans(n_clusters=4, random_state=42).fit(X_scaled)
df["segment"] = kmeans.labels_

In [None]:
# Per-segment mean of each clustering feature, used to characterise the segments
profile_columns = ["tenure", "MonthlyCharges", "TotalCharges", "service_count"]
df.groupby("segment")[profile_columns].agg("mean")

In [None]:
# Contingency table: rows are segments, columns are Churn values, cells are row counts
pd.crosstab(index=df["segment"], columns=df["Churn"])

In [None]:
segment_col, churn_col = df["segment"], df["Churn"]

# Absolute counts per (segment, churn) pair, with descriptive column labels
count_table = pd.crosstab(segment_col, churn_col).set_axis(
    ["No_Count", "Yes_Count"], axis="columns"
)

# Same table normalised within each segment and scaled so every row sums to 100
percent_table = (
    pd.crosstab(segment_col, churn_col, normalize="index")
    .mul(100)
    .set_axis(["No_%", "Yes_%"], axis="columns")
)

# Counts and percentages side by side, one row per segment
combined_table = pd.concat([count_table, percent_table], axis=1)
combined_table

In [None]:
# Number of rows assigned to each segment, listed in segment-label order
segment_sizes = df["segment"].value_counts()
segment_sizes.sort_index()

In [None]:
# Visualize segment distribution by Churn status


def plot_churn_by_segment(table, ylabel, title, label_format):
    """Draw a stacked bar chart of a segment-by-churn table with in-bar labels.

    Both charts in this cell (absolute counts and row percentages) share the
    same layout, so the drawing logic lives here once instead of being
    copy-pasted per chart.

    Parameters
    ----------
    table : pandas.DataFrame
        One row per segment. Columns are stacked in order; the first is drawn
        green and labelled "No" in the legend, the second red and labelled "Yes".
    ylabel : str
        Y-axis label.
    title : str
        Chart title.
    label_format : str
        ``str.format`` template applied to each bar segment's height to build
        its annotation text, e.g. ``"{:.0f}"`` or ``"{:.1f}%"``.

    Returns
    -------
    tuple
        The ``(fig, ax)`` pair the chart was drawn on.
    """
    # Ensure consistent figure size
    fig, ax = plt.subplots(figsize=(8, 5))

    table.sort_index().plot(
        kind="bar",
        stacked=True,
        color=["#2ca02c", "#d62728"],
        ax=ax,
    )
    ax.set_xlabel("Segment")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend(["No", "Yes"], title="Churn")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

    # Write each value at the centre of its bar segment; zero-height segments
    # are skipped. The text is always passed to annotate() as a string.
    for bar in ax.patches:
        height = bar.get_height()
        if height > 0:
            ax.annotate(
                label_format.format(height),
                (bar.get_x() + bar.get_width() / 2, bar.get_y() + height / 2),
                ha="center",
                va="center",
                color="white",
                fontsize=9,
            )

    fig.tight_layout()
    return fig, ax


# Stacked bar: segments split by Churn (Yes/No) - absolute counts
fig, ax = plot_churn_by_segment(
    count_table,
    ylabel="Customer Count",
    title="Customer Count by Segment and Churn",
    label_format="{:.0f}",
)
plt.show()

# Stacked bar: segments split by Churn (Yes/No) - Percentage
fig, ax = plot_churn_by_segment(
    percent_table,
    ylabel="Percentage",
    title="Churn Distribution by Segment (%)",
    label_format="{:.1f}%",
)
plt.show()


In [None]:
# Total monthly revenue per segment: sum of MonthlyCharges over each segment's rows
revenue_by_segment = df.groupby("segment")["MonthlyCharges"].sum()

fig, ax = plt.subplots(figsize=(8, 5))

revenue_by_segment.sort_index().plot(
    kind="bar",
    color="#1f77b4",
    ax=ax
)

ax.set_xlabel("Segment")
ax.set_ylabel("Total Monthly Revenue")
ax.set_title("Total Monthly Revenue by Segment")
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

# Label each bar with its total just above the bar. The ".0f" format rounds to
# the nearest whole unit; int() would truncate and could understate a total by
# almost one unit.
for bar in ax.patches:
    total = bar.get_height()
    ax.annotate(
        f"{total:.0f}",
        (bar.get_x() + bar.get_width() / 2, total),
        ha="center",
        va="bottom",
        fontsize=9
    )

plt.tight_layout()
plt.show()

In [None]:
# Mean MonthlyCharges within each segment
avg_monthly_revenue = df.groupby("segment")["MonthlyCharges"].mean()

fig, ax = plt.subplots(figsize=(8, 5))
avg_monthly_revenue.sort_index().plot.bar(color="#2E86C1", ax=ax)

ax.set(
    xlabel="Segment",
    ylabel="Average Monthly Revenue",
    title="Average Monthly Revenue per Customer by Segment",
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

# Write each bar's value (one decimal place) just above its top edge
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f"{bar_top:.1f}",
        (bar_centre, bar_top),
        ha="center",
        va="bottom",
        fontsize=9,
    )

fig.tight_layout()
plt.show()

<b>Segment-Based Churn Insights</b>
<ul><li>
<b>Segment 2 (High-Value At Risk)</b> shows the highest churn rate (~44%), despite relatively high monthly charges and moderate service adoption. This represents the most critical retention segment.
</li><li>
<b>Segment 3 (New & Low Engagement)</b> has moderate churn (~29%), driven by low tenure and low service adoption. Early lifecycle intervention is important for this group.
</li><li>
<b>Segment 1 (Premium Loyal)</b> generates high revenue with relatively low churn (~15%), making it a valuable and stable segment that still warrants proactive loyalty engagement.
</li><li>
<b>Segment 0 (Stable Low-Value Loyal)</b> exhibits extremely low churn (~4%), indicating strong long-term retention despite lower revenue contribution.
</li></ul>

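The most dangerous segment is not the newest or lowest-value group but the mid-tenure, high-spend segment (Segment 2), which combines meaningful revenue with a high churn probability. Note that the segment numbers are the K-Means labels assigned above (k=4, random_state=42); they may change if the clustering is re-run with different settings, in which case these descriptions must be re-checked against the segment profile table.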
