In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import matplotlib.pyplot as plt
import seaborn as sns
import joblib


In [None]:
# Read the raw churn table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="churn_dataset.csv")

# Show the first five rows as the cell output.
df.head(5)


In [None]:
# Cleaning and encoding.
# Work on a copy so the frame object produced by the load step is not
# mutated in place.
df = df.copy()

# Drop the customerID column; errors="ignore" tolerates the column already
# being absent (for example when this cell is re-run).
df = df.drop(columns=["customerID"], errors="ignore")

# binary Yes/No columns (EXPLICIT)
binary_cols = [
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
]

# Encode the Yes/No features and the Churn target as 1/0. Any value other
# than "Yes"/"No" becomes NaN here and is imputed by the mode fill below.
# Columns that are already numeric are skipped: pushing 1/0 through the
# Yes/No dictionary a second time would turn every value into NaN, which
# silently destroyed the data (including the target) when the cell was re-run.
yes_no_map = {"Yes": 1, "No": 0}
for col in binary_cols + ["Churn"]:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(yes_no_map)

# Ensure numeric columns are numeric; entries that cannot be parsed
# become NaN (errors="coerce").
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# SeniorCitizen sanity: force an integer dtype.
# NOTE(review): this runs before imputation, so it will raise if the column
# contains missing values -- confirm the column is always complete.
df["SeniorCitizen"] = df["SeniorCitizen"].astype(int)

# Handle missing values.
# Assign the filled Series back instead of calling fillna(inplace=True) on
# the df["TotalCharges"] selection: the chained in-place form emits a
# FutureWarning on recent pandas and does not update `df` at all once
# Copy-on-Write is active.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# Every remaining gap is filled with the most frequent value of its column.
# NOTE(review): this also imputes the Churn target and uses statistics from
# the full dataset before the train/test split -- confirm that is intended.
df = df.fillna(df.mode().iloc[0])

df.info()


In [None]:
# Separate the Churn target from the predictor columns.
y = df["Churn"]
X = df.drop(columns="Churn")

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
# Columns routed to the scaler.
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges", "SeniorCitizen"]

# Columns passed through unchanged; this is the same list object as binary_cols.
binary_features = binary_cols

# Columns routed to the one-hot encoder. The order here fixes the order of
# the encoded output columns, so keep it stable.
categorical_features = [
    "gender",
    "MultipleLines", "InternetService",
    "Contract", "PaymentMethod",
    "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport",
    "StreamingTV", "StreamingMovies",
]


In [None]:
# Column-wise preprocessing; output columns follow the order of the entries below.
preprocess = ColumnTransformer(
    [
        # standardize the numeric columns
        ("num", StandardScaler(), numeric_features),
        # forward the binary columns untouched
        ("bin", "passthrough", binary_features),
        # one-hot encode the categorical columns, ignoring categories not seen at fit time
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


In [None]:
# Chain the preprocessing step with a random-forest classifier so the same
# transformations are applied at fit time and at predict time.
model = Pipeline(
    [
        ("preprocess", preprocess),
        (
            "clf",
            # 200 trees, fixed seed, n_jobs=-1 to use all available cores
            RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42),
        ),
    ]
)

# Fit on the training split; the fitted pipeline is the cell output.
model.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out split.
pred = model.predict(X_test)

# Report overall accuracy, then the per-class classification report.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=pred),
    sep="\n",
)


In [None]:
# Recover a readable name for every column of the transformed feature matrix
# and pair it with the forest's importance score.

# fitted preprocessing step of the pipeline
preprocessor = model.named_steps["preprocess"]

# numeric names
num_features = numeric_features

# binary names
bin_features = binary_features

# categorical names from the fitted one-hot encoder (one name per category level)
ohe = preprocessor.named_transformers_["cat"]
cat_features = ohe.get_feature_names_out(categorical_features).tolist()

# final feature list (ORDER MATTERS): names are concatenated as
# numeric, binary, categorical and must line up with the model's input columns
feature_names = num_features + bin_features + cat_features

# feature importances from the fitted classifier
importances = model.named_steps["clf"].feature_importances_

# Safety check: fail right here with a clear message if names and scores do
# not line up, instead of only printing two numbers to compare by eye.
assert len(feature_names) == len(importances), (
    f"feature name count ({len(feature_names)}) does not match "
    f"importance count ({len(importances)})"
)
print(len(feature_names),len(importances))


In [None]:
# Build the (feature, importance) table and keep the 15 highest-scoring rows.
fi = (
    pd.DataFrame({"feature": feature_names, "importance": importances})
    .sort_values("importance", ascending=False)
    .head(15)
)

# Horizontal bar chart of the retained features.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=fi, x="importance", y="feature", ax=ax)
ax.set_title("Top 15 Important Features")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()


In [None]:
# Smoke test on the first row of X; the list index keeps a one-row
# DataFrame, which is the input shape the pipeline expects.
sample = X.iloc[[0]]
predicted_label = model.predict(sample)
predicted_proba = model.predict_proba(sample)

# Show the predicted label and the class probabilities as the cell output.
predicted_label, predicted_proba

In [None]:
# Persist the fitted pipeline to disk.
joblib.dump(value=model, filename="churn_model.pkl")

# Confirm where the model was written.
print("Model saved as churn_model.pkl")