In [0]:
# Install imbalanced-learn (utilities for imbalanced data).
# NOTE(review): none of the visible cells import `imblearn`, and the version is
# not pinned — confirm the install is still needed and pin it if so.
%pip install imbalanced-learn


In [0]:
# Restart the Python process.
# NOTE(review): presumably so the package installed in the previous cell becomes importable — confirm.
%restart_python

In [0]:
# COMMAND ----------
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# COMMAND ----------
# Read the cleaned stage-2 feature table through Spark and show its schema.
FEATURE_TABLE = "cross_sell_insurance.01_feature_staging.stage2_clean_feature_table"
df_spark = spark.table(FEATURE_TABLE)
print("Spark schema:")
df_spark.printSchema()

# Collect the whole table into a pandas DataFrame for scikit-learn.
df = df_spark.toPandas()
print("Pandas shape:", df.shape)

# Last expression of the cell: preview the first rows.
df.head()



In [0]:
# 2. Define the target, check class imbalance, and make the train–test split

# COMMAND ----------
TARGET_COL = "is_target_customer"

# The label is the target column; the features are every other column.
# NOTE(review): only the target is removed here, so any identifier column in the
# table stays in X — confirm that is intended.
y = df[TARGET_COL]
X = df.drop(columns=TARGET_COL)

print("Class distribution (count & %):")
counts = y.value_counts()
pct = y.value_counts(normalize=True).mul(100)
class_summary = pd.DataFrame({"count": counts, "percentage": pct.round(4)})
display(class_summary)

# Absolute class sizes and the negative-to-positive ratio
# (infinite when there are no positive rows).
total = y.shape[0]
pos = y.eq(1).sum()
neg = y.eq(0).sum()
ratio = np.inf if pos == 0 else neg / pos

print(f"\nTotal samples : {total}")
print(f"Class 0 (non-target): {neg} ({neg/total*100:.4f}%)")
print(f"Class 1 (target)    : {pos} ({pos/total*100:.4f}%)")
print(f"Imbalance ratio (neg:pos) ≈ {ratio:.2f} : 1")

# Stratified 80/20 split so both parts keep the same class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nTrain shape:", X_train.shape)
print("Test shape :", X_test.shape)



In [0]:
# 3. Identify the numeric and categorical feature columns
# COMMAND ----------
# Numeric: columns with a number / bool dtype
numeric_cols = X_train.select_dtypes(include=["number", "bool"]).columns.tolist()

# Categorical: object / category columns
categorical_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Columns matched by neither selector (for example datetime columns) end up in
# neither list, so the preprocessor never sees them. Surface them instead of
# letting them disappear silently.
handled_cols = set(numeric_cols) | set(categorical_cols)
ignored_cols = [col for col in X_train.columns if col not in handled_cols]

print("Numeric cols (first 20):", numeric_cols[:20])
print("Total numeric cols:", len(numeric_cols))
print("Categorical cols:", categorical_cols)
print("Total categorical cols:", len(categorical_cols))
if ignored_cols:
    print("WARNING - columns in neither list (not used as features):", ignored_cols)



In [0]:
# 4. Preprocessing pipeline (numeric: Impute → Scale; categorical: Impute → OHE)

# COMMAND ----------
from sklearn.impute import SimpleImputer

# Numeric branch: fill missing values with 0, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the string '0', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="0")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its branch; columns in neither list are dropped.
column_branches = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocess = ColumnTransformer(transformers=column_branches, remainder="drop")


In [0]:
# 5. Logistic Regression pipeline + cross-validation (no undersampling)

# COMMAND ----------
# class_weight='balanced' is used to handle the extreme class imbalance.
logreg_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(
        max_iter=800,
        class_weight="balanced",
        n_jobs=-1,
        solver="lbfgs",
    )),
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate ROC-AUC and PR-AUC (average precision) in a single cross-validation
# pass. Two separate cross_val_score calls would refit the pipeline on every
# fold twice; the folds are identical either way because `cv` has a fixed seed.
logreg_cv_results = cross_validate(
    logreg_pipeline,
    X_train, y_train,
    scoring=["roc_auc", "average_precision"],
    cv=cv,
    n_jobs=-1,
)

# Per-fold score arrays, under the same names the rest of the notebook uses.
roc_scores = logreg_cv_results["test_roc_auc"]
pr_scores = logreg_cv_results["test_average_precision"]

print("LOGISTIC REGRESSION (class_weight='balanced')")
print(f"ROC-AUC CV mean ± std: {roc_scores.mean():.4f} ± {roc_scores.std():.4f}")
print(f"PR-AUC  CV mean ± std: {pr_scores.mean():.4f} ± {pr_scores.std():.4f}")



In [0]:
# 6. Fit the final Logistic Regression on the full training split and evaluate on the test split
# COMMAND ----------
logreg_pipeline.fit(X_train, y_train)

# Second probability column, then hard labels at the default 0.5 cut-off.
y_proba_log = logreg_pipeline.predict_proba(X_test)[:, 1]
y_pred_log_default = (y_proba_log >= 0.5).astype(int)

print("=== Logistic Regression – Test Evaluation (threshold=0.5) ===")

# Threshold-free ranking metrics computed from the probabilities.
logreg_test_roc_auc = roc_auc_score(y_test, y_proba_log)
logreg_test_pr_auc = average_precision_score(y_test, y_proba_log)
print("ROC-AUC :", logreg_test_roc_auc)
print("PR-AUC  :", logreg_test_pr_auc)

# Threshold-dependent metrics computed from the hard labels.
logreg_test_report = classification_report(y_test, y_pred_log_default, digits=4)
print("\nClassification report:", logreg_test_report, sep="\n")

logreg_test_confusion = confusion_matrix(y_test, y_pred_log_default)
print("Confusion matrix:", logreg_test_confusion, sep="\n")


In [0]:
# 7. Decision Tree pipeline + cross-validation (also without undersampling)
# COMMAND ----------
# NOTE(review): this pipeline is given the same `preprocess` object as the
# logistic-regression pipeline, so fitting one pipeline presumably refits the
# shared transformer as well — harmless while both are fitted on X_train, but
# confirm before fitting them on different data.
tree_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", DecisionTreeClassifier(
        max_depth=5,
        min_samples_leaf=50,
        class_weight="balanced",
        random_state=42,
    )),
])

# Evaluate ROC-AUC and PR-AUC (average precision) in a single cross-validation
# pass. Two separate cross_val_score calls would refit the pipeline on every
# fold twice; the folds are identical either way because `cv` has a fixed seed.
tree_cv_results = cross_validate(
    tree_pipeline,
    X_train, y_train,
    scoring=["roc_auc", "average_precision"],
    cv=cv,
    n_jobs=-1,
)

# Per-fold score arrays, under the same names the rest of the notebook uses.
tree_roc_scores = tree_cv_results["test_roc_auc"]
tree_pr_scores = tree_cv_results["test_average_precision"]

print("DECISION TREE (class_weight='balanced')")
print(f"ROC-AUC CV mean ± std: {tree_roc_scores.mean():.4f} ± {tree_roc_scores.std():.4f}")
print(f"PR-AUC  CV mean ± std: {tree_pr_scores.mean():.4f} ± {tree_pr_scores.std():.4f}")


In [0]:
# 8. Fit the final Decision Tree and evaluate on the test split
# COMMAND ----------
tree_pipeline.fit(X_train, y_train)

# Second probability column, then hard labels at the default 0.5 cut-off.
y_proba_tree = tree_pipeline.predict_proba(X_test)[:, 1]
y_pred_tree_default = (y_proba_tree >= 0.5).astype(int)

print("=== Decision Tree – Test Evaluation (threshold=0.5) ===")

# Threshold-free ranking metrics computed from the probabilities.
tree_test_roc_auc = roc_auc_score(y_test, y_proba_tree)
tree_test_pr_auc = average_precision_score(y_test, y_proba_tree)
print("ROC-AUC :", tree_test_roc_auc)
print("PR-AUC  :", tree_test_pr_auc)

# Threshold-dependent metrics computed from the hard labels.
tree_test_report = classification_report(y_test, y_pred_tree_default, digits=4)
print("\nClassification report:", tree_test_report, sep="\n")

tree_test_confusion = confusion_matrix(y_test, y_pred_tree_default)
print("Confusion matrix:", tree_test_confusion, sep="\n")


In [0]:
# 9. Inspect the Logistic Regression coefficients as feature importance
# COMMAND ----------
# Take the post-preprocessing feature names from the fitted ColumnTransformer
# itself rather than stitching them together by hand. This keeps names aligned
# with the columns the classifier actually received, and it also works when one
# of the branches received no columns (the hand-built version called
# get_feature_names_out on a one-hot encoder that was never fitted in that case).
logreg_preprocess = logreg_pipeline.named_steps["preprocess"]
logreg_output_names = logreg_preprocess.get_feature_names_out()

# Output names carry a "<branch>__" prefix ("num__" / "cat__"); strip it so the
# table shows the same plain names as before.
num_feature_names = [
    name.split("__", 1)[1] for name in logreg_output_names if name.startswith("num__")
]
cat_feature_names = [
    name.split("__", 1)[1] for name in logreg_output_names if name.startswith("cat__")
]

# Numeric names first, then one-hot names: the branch order of the preprocessor.
all_feature_names = np.array(num_feature_names + cat_feature_names, dtype=object)

# Coefficient row for the positive class.
coefs = logreg_pipeline.named_steps["clf"].coef_[0]

# Fail loudly rather than show a misaligned table.
assert len(all_feature_names) == len(coefs), (
    f"{len(all_feature_names)} feature names vs {len(coefs)} coefficients"
)

coef_df = pd.DataFrame({
    "feature": all_feature_names,
    "coefficient": coefs,
}).sort_values("coefficient", ascending=False)

print("Top positive features (mendorong ke class 1):")
display(coef_df.head(20))

print("Top negative features (mendorong ke class 0):")
display(coef_df.tail(20))


In [0]:
# Feature importance from the trained decision tree

# COMMAND ----------

# Take the post-preprocessing feature names from the fitted ColumnTransformer
# itself rather than stitching them together by hand. This keeps names aligned
# with the columns the classifier actually received, and it also works when one
# of the branches received no columns (the hand-built version called
# get_feature_names_out on a one-hot encoder that was never fitted in that case).
tree_preprocess = tree_pipeline.named_steps["preprocess"]
tree_output_names = tree_preprocess.get_feature_names_out()

# Output names carry a "<branch>__" prefix ("num__" / "cat__"); strip it so the
# table shows the same plain names as before.
num_feature_names = [
    name.split("__", 1)[1] for name in tree_output_names if name.startswith("num__")
]
cat_feature_names = [
    name.split("__", 1)[1] for name in tree_output_names if name.startswith("cat__")
]

# Numeric names first, then one-hot names: the branch order of the preprocessor.
all_feature_names = np.array(num_feature_names + cat_feature_names, dtype=object)

# Importances reported by the fitted tree
tree_clf = tree_pipeline.named_steps["clf"]
importances = tree_clf.feature_importances_

# Fail loudly rather than show a misaligned table.
assert len(all_feature_names) == len(importances), (
    f"{len(all_feature_names)} feature names vs {len(importances)} importances"
)

# Build a DataFrame sorted from most to least important
tree_fi_df = pd.DataFrame({
    "feature": all_feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

display(tree_fi_df.head(20))


In [0]:
# Horizontal bar chart of the most important decision-tree features.
top_n = 20
top_fi = tree_fi_df.head(top_n)

fig, ax = plt.subplots(figsize=(10, 8))
# Reverse the rows so the most important feature is drawn at the top.
ax.barh(top_fi["feature"][::-1], top_fi["importance"][::-1])
# Take the count from the rows actually plotted so the title stays correct
# when top_n changes or fewer features exist.
ax.set_title(f"Top {len(top_fi)} Feature Importances (Decision Tree)")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()

In [0]:
# COMMAND ----------
# Fit both pipelines on the training split.
# NOTE(review): both pipelines were already fitted on the same X_train / y_train
# in the earlier test-evaluation cells, so this refit looks redundant — confirm
# before removing.
logreg_pipeline.fit(X_train, y_train)
tree_pipeline.fit(X_train, y_train)


In [0]:
# COMMAND ----------
# Test-set probabilities (second predict_proba column) from each fitted pipeline.
# NOTE(review): the same two arrays are already computed in the earlier
# test-evaluation cells — this recomputation looks redundant; confirm.
y_proba_log, y_proba_tree = (
    fitted_pipeline.predict_proba(X_test)[:, 1]
    for fitted_pipeline in (logreg_pipeline, tree_pipeline)
)


In [0]:
# COMMAND ----------
from sklearn.metrics import roc_auc_score, average_precision_score

# Test-set ROC-AUC and PR-AUC for each candidate, keyed by model name.
test_probabilities = {
    "logistic": y_proba_log,
    "tree": y_proba_tree,
}

metrics = {
    model_name: {
        "roc_auc": roc_auc_score(y_test, proba),
        "pr_auc": average_precision_score(y_test, proba),
    }
    for model_name, proba in test_probabilities.items()
}

metrics


In [0]:
# COMMAND ----------
# Pick the winner on cross-validated PR-AUC (training data only). Selecting on
# the test-set scores in `metrics` would make those same scores an optimistic
# estimate for the chosen model, and they are reported for it afterwards.
cv_pr_auc = {
    "logistic": pr_scores.mean(),
    "tree": tree_pr_scores.mean(),
}

# On a tie max() returns the first key, i.e. "logistic".
best_model_name = max(cv_pr_auc, key=cv_pr_auc.get)
best_model_name


In [0]:
# COMMAND ----------
# Resolve the winning name to its pipeline; any name other than "logistic"
# falls back to the decision-tree pipeline.
best_model = {"logistic": logreg_pipeline}.get(best_model_name, tree_pipeline)

best_model


In [0]:
# Fit the selected pipeline on the training split.
# NOTE(review): `best_model` is one of the two pipelines already fitted on the
# same X_train / y_train in earlier cells, so this refit looks redundant — confirm.
best_model.fit(X_train, y_train)

In [0]:
# COMMAND ----------
import numpy as np
import pandas as pd

# Load the full table for scoring
df_spark_full = spark.table("cross_sell_insurance.01_feature_staging.stage2_clean_feature_table")
df_full = df_spark_full.toPandas()

TARGET_COL = "is_target_customer"

# X_all = every column except the target
X_all = df_full.drop(columns=TARGET_COL)

# Use the first known ID column that exists in the table (adjust the candidates
# if the real name is known).
# NOTE(review): training also dropped only the target, so an ID column found
# here was part of the training features too — confirm that is intended.
id_candidates = ("client_id", "customer_id", "accountid", "customer_number")
id_col = next((cand for cand in id_candidates if cand in df_full.columns), None)

# No known ID column: fall back to a positional row number.
if id_col is None:
    id_col = "row_id"
    df_full[id_col] = np.arange(len(df_full))

print("ID column used:", id_col)

# Predicted probability for every row (second predict_proba column)
proba_all = best_model.predict_proba(X_all)[:, 1]

# Decision threshold (adjustable).
# NOTE(review): the test-evaluation cells use 0.5, not 0.2 — confirm which
# cut-off is intended for the flag.
threshold = 0.2
flag_all = (proba_all >= threshold).astype(int)

# Scoring result: ID, score, and thresholded flag
scored_df = df_full[[id_col]].assign(
    bebas_aksi_score=proba_all,
    bebas_aksi_flag=flag_all,
)

scored_df.head()


In [0]:
# Save the scoring result to a new Spark table

# COMMAND ----------
SCORED_TABLE = "cross_sell_insurance.01_feature_staging.stage3_bebas_aksi_scored"

# Convert the pandas result to Spark and overwrite the target table.
scored_spark = spark.createDataFrame(scored_df)
scored_writer = scored_spark.write.mode("overwrite")
scored_writer.saveAsTable(SCORED_TABLE)

# Read the table back through Spark and show the first 10 rows as a check
spark.table(SCORED_TABLE).show(10)


In [0]:
import mlflow
import mlflow.sklearn

# NOTE(review): the experiment path embeds a personal user folder — consider
# moving it to a shared or configurable location.
mlflow.set_experiment("/Users/u2600038142@gmail.com/bebas_aksi_model")   # optional

# Log the selected pipeline together with its two test-set metrics.
with mlflow.start_run():
    mlflow.sklearn.log_model(best_model, "bebas_aksi_model")
    selected_metrics = metrics[best_model_name]
    for metric_name in ("roc_auc", "pr_auc"):
        mlflow.log_metric(metric_name, selected_metrics[metric_name])