<a href="https://colab.research.google.com/github/ananyascodehq/customer-churn-prediction-app/blob/main/notebooks/churn_app_deployement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:
import kagglehub

# Kaggle dataset handle in "<owner>/<dataset>" form.
DATASET_HANDLE = "blastchar/telco-customer-churn"

# Fetch the dataset and keep the directory that kagglehub reports back;
# later cells can locate the CSV relative to it.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'telco-customer-churn' dataset.
Path to dataset files: /kaggle/input/telco-customer-churn


In [None]:
# Locate the CSV inside the directory kagglehub returned (`path`, set in the
# download cell) instead of hard-coding the Colab cache location, so the
# notebook also runs where the dataset is cached somewhere else.
df = pd.read_csv(f"{path}/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [None]:
# customerID is useless for learning patterns, so drop it.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
df = df.drop(columns=["customerID"], errors="ignore")

# Coerce TotalCharges to numeric; entries that cannot be parsed become NaN and
# are then replaced with 0. The fill is chained onto the converted Series and
# assigned back to the column, because `df[col].fillna(..., inplace=True)`
# operates on an intermediate object and stops working in pandas 3.0.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(0, inplace=True)


In [None]:
# Separate the label from the features and encode it as 1 (churned) / 0 (stayed).
target_column = "Churn"
y = df[target_column].map({"Yes": 1, "No": 0})
X = df.drop(columns=[target_column])

In [None]:
# Partition the feature columns by dtype: int64/float64 columns are scaled,
# object (string) columns are one-hot encoded by the preprocessor.
numeric_frame = X.select_dtypes(include=["int64", "float64"])
categorical_frame = X.select_dtypes(include=["object"])

numeric_cols = numeric_frame.columns
categorical_cols = categorical_frame.columns

In [None]:
# Standardize the numeric columns.
numeric_step = ("num", StandardScaler(), numeric_cols)

# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): with drop="first" and handle_unknown="ignore" together, a
# category unseen during fit is presumably encoded as all zeros, i.e. the same
# as the dropped baseline level -- confirm this is acceptable for serving.
categorical_step = (
    "cat",
    OneHotEncoder(drop="first", handle_unknown="ignore"),
    categorical_cols,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on the label so both partitions keep the
# same churn rate.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Logistic-regression churn model: preprocess -> SMOTE oversampling -> classifier.
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42
)

logistic_pipeline = ImbPipeline(steps=[
    # Pipelines do not clone their steps, so passing `preprocessor` directly
    # would make every pipeline built from it share (and refit) the very same
    # ColumnTransformer object. clone() gives this pipeline its own unfitted
    # copy with identical parameters.
    ("preprocessor", clone(preprocessor)),
    ("smote", SMOTE(random_state=42)),
    ("classifier", log_reg)
])

logistic_pipeline.fit(X_train, y_train)

In [None]:
# Random-forest churn model: preprocess -> SMOTE oversampling -> classifier.
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf_pipeline = ImbPipeline(steps=[
    # Pipelines do not clone their steps, so passing `preprocessor` directly
    # would make every pipeline built from it share (and refit) the very same
    # ColumnTransformer object. clone() gives this pipeline its own unfitted
    # copy with identical parameters.
    ("preprocessor", clone(preprocessor)),
    ("smote", SMOTE(random_state=42)),
    ("classifier", rf)
])

rf_pipeline.fit(X_train, y_train)


In [None]:
# Serialize each fitted pipeline (preprocessing + model) to its own pickle file
# in the current working directory.
artifacts = {
    "logistic_pipeline.pkl": logistic_pipeline,
    "rf_pipeline.pkl": rf_pipeline,
}

for filename, fitted_pipeline in artifacts.items():
    with open(filename, "wb") as handle:
        pickle.dump(fitted_pipeline, handle)


In [None]:
# Smoke-test both pipelines end to end on a single raw (un-preprocessed) row.
sample = X_test.iloc[[0]]  # raw input, kept as a one-row DataFrame

fitted_models = {
    "logistic": logistic_pipeline,
    "random_forest": rf_pipeline,
}

# A notebook cell only renders its final expression, so the individual
# predict / predict_proba calls used to be computed and thrown away except for
# the last one. Collect every result into one table instead.
pd.DataFrame(
    [
        {
            "model": label,
            "predicted_class": int(model.predict(sample)[0]),
            # Column 1 of predict_proba is the probability of class 1 (churn).
            "churn_probability": float(model.predict_proba(sample)[0, 1]),
        }
        for label, model in fitted_models.items()
    ]
).set_index("model")


array([[0.985, 0.015]])

In [None]:
# Offer the pickled pipelines as browser downloads when running in Colab.
# google.colab only exists inside the Colab runtime, so guard the import to
# keep "Run All" from crashing on this last cell in any other environment.
try:
    from google.colab import files
except ImportError:
    files = None
    print("google.colab is not available; the .pkl files remain in the working directory.")

if files is not None:
    for artifact in ("logistic_pipeline.pkl", "rf_pipeline.pkl"):
        files.download(artifact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>