In [None]:
import os

# Root directory of the project; every sub-folder below is created under it.
BASE_DIR = "/content/drive/MyDrive/churn-intelligence"

# One sub-directory per kind of project artifact.
folders = ["data", "notebooks", "models", "plots", "reports"]

# exist_ok=True keeps this cell safe to re-run once the tree already exists.
for folder_name in folders:
    folder_path = os.path.join(BASE_DIR, folder_name)
    os.makedirs(folder_path, exist_ok=True)

print("Project folders ready at:", BASE_DIR)


In [None]:
import kagglehub
import shutil
import os

# Download the dataset; the returned value is the local directory that is listed below.
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

# Locate the CSV explicitly: os.listdir() returns entries in arbitrary order,
# so files[0] is not guaranteed to be the data file if anything else is present.
files = os.listdir(path)
print("Downloaded files:", files)

csv_candidates = sorted(name for name in files if name.lower().endswith(".csv"))
if not csv_candidates:
    raise FileNotFoundError(
        f"No CSV file found in downloaded dataset at {path!r}: {files}"
    )

csv_file = csv_candidates[0]  # expected: WA_Fn-UseC_-Telco-Customer-Churn.csv

src = os.path.join(path, csv_file)
dst = os.path.join(BASE_DIR, "data", csv_file)

shutil.copy(src, dst)

print("Dataset copied to Drive:", dst)


In [None]:
import pandas as pd

# Location of the raw churn CSV inside the project's data folder.
DATA_PATH = "/content/drive/MyDrive/churn-intelligence/data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df  = pd.read_csv(DATA_PATH)
# Quick look at the loaded frame: first rows, (rows, columns), and the
# number of records per value of the "Churn" target column.
print(df.head())
print(df.shape)
print(df["Churn"].value_counts())

In [None]:
# Summary of columns: dtypes and non-null counts.
df.info()


In [None]:
# Convert TotalCharges to a numeric dtype. errors="coerce" turns any value
# that cannot be parsed as a number into NaN instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")


In [None]:
# Number of TotalCharges entries that are NaN (i.e. missing or coerced by the conversion).
df["TotalCharges"].isna().sum()


In [None]:
# Class balance of the target: normalize=True yields fractions, scaled here to percentages.
df["Churn"].value_counts(normalize=True) * 100


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tenure histograms split by Churn value, each with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x="tenure", hue="Churn", bins=30, kde=True, ax=ax)
ax.set_title("Tenure vs Churn")
plt.show()


In [None]:
# Distribution of MonthlyCharges for each Churn value, as side-by-side boxes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="Churn", y="MonthlyCharges", ax=ax)
ax.set_title("Monthly Charges vs Churn")
plt.show()


In [None]:
# Customer counts per contract type, with one bar per Churn value.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x="Contract", hue="Churn", ax=ax)
plt.xticks(rotation=20)  # tilt the category labels on the x axis
ax.set_title("Contract Type vs Churn")
plt.show()


In [None]:
PLOT_DIR = "/content/drive/MyDrive/churn-intelligence/plots"

# Re-draw the contract/churn count plot and write it to disk instead of showing it.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x="Contract", hue="Churn", data=df, ax=ax)
plt.xticks(rotation=20)
# Give the saved image the same title as the on-screen version so the PNG
# can be understood without the notebook next to it.
ax.set_title("Contract Type vs Churn")
# bbox_inches="tight" fits the saved area to the drawn content, so the
# rotated tick labels are not cut off at the figure edge.
fig.savefig(f"{PLOT_DIR}/contract_vs_churn.png", bbox_inches="tight")
# Close this specific figure so it is neither displayed nor kept in memory.
plt.close(fig)


In [None]:
# Work on a copy so the feature-engineering steps below do not modify `df`.
df_fe = df.copy()


In [None]:
# Encode the target as integers: "Yes" -> 1, "No" -> 0.
# Series.map leaves any value that is not a key of the dict as NaN.
df_fe["Churn"] = df_fe["Churn"].map({"Yes": 1, "No": 0})


In [None]:
def tenure_group(t):
    """Return the tenure bucket label for the tenure value ``t``.

    Upper bounds are inclusive: ``t <= 12`` gives "0-1 year", ``t <= 24``
    gives "1-2 years", ``t <= 48`` gives "2-4 years". Any value that matches
    none of these comparisons gives "4+ years".
    """
    # (inclusive upper bound, label), checked in ascending order.
    buckets = ((12, "0-1 year"), (24, "1-2 years"), (48, "2-4 years"))
    for upper_bound, label in buckets:
        if t <= upper_bound:
            return label
    return "4+ years"

# Label every row with its tenure bucket (element-wise call of tenure_group).
df_fe["tenure_group"] = df_fe["tenure"].map(tenure_group)


In [None]:
# Average amount billed per unit of tenure: TotalCharges / tenure.
# A plain copy of MonthlyCharges would only add a duplicate, perfectly
# collinear column to the feature matrix.
# to_numeric(errors="coerce") is a no-op on an already numeric column and
# keeps this cell working if TotalCharges is still text.
total_charges = pd.to_numeric(df_fe["TotalCharges"], errors="coerce")
# Mask tenure == 0 to NaN so the division never divides by zero.
months_active = df_fe["tenure"].where(df_fe["tenure"] > 0)
# Where the ratio is undefined (zero tenure or missing TotalCharges),
# fall back to the current MonthlyCharges value.
df_fe["average_monthly_spend"] = (total_charges / months_active).fillna(
    df_fe["MonthlyCharges"]
)

# What the customer would have paid in total at the current monthly rate.
df_fe["total_spend_estimate"] = df_fe["MonthlyCharges"] * df_fe["tenure"]



In [None]:
# Remove the customerID column from the frame that feeds the model.
# Re-assigning (instead of inplace=True) together with errors="ignore" makes
# the cell idempotent: running it a second time no longer raises KeyError.
df_fe = df_fe.drop(columns=["customerID"], errors="ignore")

In [None]:
# Preview the first rows of the engineered frame.
df_fe.head()

In [None]:
# Column names of the engineered frame at this point.
df_fe.columns

In [None]:
# Separate the target vector (Churn) from the feature matrix (everything else).
Y = df_fe["Churn"]
X = df_fe.drop(columns=["Churn"])

In [None]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other dtype as numerical.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

print("Categorical columns:", cat_cols)
print("Numerical columns:", num_cols)

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so k categories become k-1 indicator columns.
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [None]:
# (rows, columns) of the encoded feature matrix.
X_encoded.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=Y keeps the class
# proportions of the target the same in both parts, and random_state fixes
# the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_encoded,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

In [None]:
# Sanity check of the split: sizes of both parts, and the mean of the 0/1
# target in each (the two means should be close because the split is stratified).
print(X_train.shape, X_test.shape)
print(Y_train.mean(), Y_test.mean())


In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
# The ten training columns with the most missing values, largest count first.
X_train.isna().sum().sort_values(ascending=False).head(10)


In [None]:
# Learn the per-column medians from the training split only, once, before
# anything is filled. Both splits are then imputed with exactly these values
# (no statistics are taken from the test split).
train_medians = X_train.median()

X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [None]:
# Logistic regression trained on the features as they are at this point
# (imputed, not scaled). fit() returns the estimator itself, so construction
# and training are chained into one expression.
model = LogisticRegression(max_iter=1000).fit(X_train, Y_train)

# Class predictions for the held-out rows.
Y_pred = model.predict(X_test)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only.
scaler = StandardScaler().fit(X_train)

# Apply that one fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Train logistic regression on the standardized training features;
# `model` now refers to this estimator.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, Y_train)

# Class predictions for the standardized held-out rows.
Y_pred = model.predict(X_test_scaled)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluate the class predictions held in Y_pred against the test labels.
# confusion_matrix puts true classes on rows and predicted classes on columns.
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_pred))


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the predictions held in Y_pred.
print(classification_report(Y_test, Y_pred))


In [None]:
# Predicted probability of churn for every test row. predict_proba returns
# one column per class; column index 1 is taken as class 1 (churn).
# NOTE(review): this assumes model.classes_ is [0, 1] — confirm.
Y_prob = model.predict_proba(X_test_scaled)[:, 1]


In [None]:
# Custom decision threshold: a row is labelled churn (1) when its predicted
# churn probability is at least `threshold`, otherwise 0.
threshold = 0.35
Y_pred_035 = (Y_prob >= threshold).astype(int)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Evaluate the predictions made with the custom decision threshold.
# The heading is built from `threshold` so it always states the value that
# was actually applied, rather than a separately typed literal.
print(f"Confusion Matrix (threshold={threshold}):")
print(confusion_matrix(Y_test, Y_pred_035))

print("\nClassification Report:")
print(classification_report(Y_test, Y_pred_035))


In [None]:
# Recall of the churn class (label 1) at several decision thresholds.
for t in (0.5, 0.4, 0.35, 0.3):
    preds = (Y_prob >= t).astype(int)
    cm = confusion_matrix(Y_test, preds)
    # Row 1 of the matrix holds the actual churners:
    # column 0 = missed (false negatives), column 1 = caught (true positives).
    false_neg = cm[1, 0]
    true_pos = cm[1, 1]
    recall = true_pos / (false_neg + true_pos)
    print(f"Threshold {t}: Recall = {recall:.2f}")


In [None]:
import joblib

# Directory that receives the serialized model artifacts.
MODEL_DIR = "/content/drive/MyDrive/churn-intelligence/models"

# Serialize the estimator currently bound to `model`.
model_path = f"{MODEL_DIR}/logistic_model.pkl"
joblib.dump(model, model_path)


In [None]:
# Serialize the fitted scaler next to the model so the same scaling can be
# re-applied to inputs later.
joblib.dump(scaler, f"{MODEL_DIR}/scaler.pkl")


In [None]:
import json

# Store the decision threshold and the ordered feature names next to the
# model artifacts (presumably for rebuilding model inputs later — confirm
# against the consumer of this file).
metadata = {
    # Use the `threshold` variable that was evaluated earlier instead of a
    # re-typed literal, so the saved value cannot drift from the tested one.
    "threshold": threshold,
    "features": list(X_encoded.columns),
}

with open(f"{MODEL_DIR}/model_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f)
