In [None]:
import pandas as pd
import snowflake.snowpark as snowpark

from matplotlib import pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold

from scikitplot.metrics import (
    plot_roc_curve,
    plot_lift_curve,
    plot_precision_recall,
    plot_ks_statistic,
    plot_confusion_matrix
)

from warnings import filterwarnings
from snowflake.snowpark.context import get_active_session
# Obtain the Snowpark session through get_active_session(); presumably this is
# the session the hosting notebook environment has already opened - confirm.
session = get_active_session()
# Silence all warnings raised from this point on in the notebook run.
filterwarnings("ignore")


In [None]:
# Get a handle on the churn table, then pull it into a pandas DataFrame.
churn_table = session.table("tasty_bytes.analytics.customer_churn")
df = churn_table.toPandas()

# Show the first rows as the cell output.
df.head()

In [None]:
# Label column, kept as a one-element list so `y` stays a DataFrame.
target_col = ["CHURN"]

# Columns carried into X. CUSTOMER_ID is included here and dropped again
# right before the estimator is fitted.
cols = [
    "CUSTOMER_ID",
    "LTV", "AVG_TICKET",
    "QUANTITY_UNIQUE_LOCATION", "QUANTITY_UNIQUE_TRUCKS",
    "TOTAL_QUANTITY_PRODUCTS", "MIN_QUANTITY_PRODUCTS", "MAX_QUANTITY_PRODUCTS",
    "TOTAL_UNIT_PRICE", "MIN_UNIT_PRICE", "MAX_UNIT_PRICE",
]

# Column-wise selection of the inputs and the label from the loaded frame.
X = df.loc[:, cols]
y = df.loc[:, target_col]

In [None]:
# Two shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Only the final fold produced by the splitter is kept as the train/test split.
train_index, test_index = list(skf.split(X, y))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Class counts for the full data, then the test part, then the train part.
for frame in (df, y_test, y_train):
    print(frame["CHURN"].value_counts())

In [None]:
# Peek at the first two rows of the training inputs.
X_train.iloc[:2]

In [None]:
# Build a class-balanced training set by undersampling: keep every CHURN == 1
# row and only the first N CHURN == 0 rows, where N is the sum of the CHURN
# column (assumes CHURN is coded 0/1 - confirm).
churn_flag = y_train["CHURN"]

X_train_b = pd.concat([
    X_train[churn_flag == 0].head(churn_flag.sum()),
    X_train[churn_flag == 1],
], axis=0)

# Pick the labels by X_train_b's own index so that row i of y_train_b is the
# label of row i of X_train_b. A boolean `isin` mask would keep y_train's
# original row order, which differs from the "CHURN == 0 first, CHURN == 1
# after" order produced by the concat above; estimators match rows by
# position, so that ordering would pair features with the wrong labels.
# Assumes the index labels are unique (presumably a default RangeIndex from
# the table load - confirm).
y_train_b = y_train.loc[X_train_b.index]

# NOTE(review): the model-fitting cell trains on X_train / y_train, not on
# this balanced pair - confirm which one is intended.
y_train_b["CHURN"].value_counts()

In [None]:
# Feature matrices without the identifier column, computed once per split
# instead of re-dropping the column for every call.
features_train = X_train.drop(columns=["CUSTOMER_ID"])
features_test = X_test.drop(columns=["CUSTOMER_ID"])

# A fixed random_state makes the forest reproducible across Restart & Run All;
# 42 matches the seed used for the stratified split.
est = RandomForestClassifier(n_estimators=5, max_depth=50, random_state=42)

# Pass the target as a 1-D Series: fitting on a one-column DataFrame makes
# scikit-learn issue a shape-conversion warning, which the global warning
# filter would hide.
est.fit(features_train, y_train["CHURN"])

# Class-probability estimates for each split.
train_probas = est.predict_proba(features_train)
test_probas = est.predict_proba(features_test)

# Hard class predictions for each split.
train_pred = est.predict(features_train)
test_pred = est.predict(features_test)

In [None]:
# Training split: draw each probability-based diagnostic in turn.
for draw_curve in (
    plot_roc_curve,
    plot_lift_curve,
    plot_precision_recall,
    plot_ks_statistic,
):
    draw_curve(y_train, train_probas)

# The confusion matrix takes the hard predictions and is the cell's final expression.
plot_confusion_matrix(y_train, train_pred)

In [None]:
# Test split: draw each probability-based diagnostic in turn.
for draw_curve in (
    plot_roc_curve,
    plot_lift_curve,
    plot_precision_recall,
    plot_ks_statistic,
):
    draw_curve(y_test, test_probas)

# The confusion matrix takes the hard predictions and is the cell's final expression.
plot_confusion_matrix(y_test, test_pred)