In [0]:
%pip install xgboost

In [0]:
%restart_python

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [0]:
# Load Dataset
# Read the feature table by its fully qualified name into a Spark DataFrame.
source_table = "olist.gold.gld_ml_features"
df_spark = spark.table(source_table)

In [0]:
# Handle Null Values
# Per-column replacement values; columns not named here are left as they are.
# NOTE(review): "recency_days" and "frequency" are used as features later but
# are not filled here — presumably they are never null; confirm upstream.
null_replacements = {
    "avg_review_score": 3,
    "avg_delivery_days": 7,
    "avg_delivery_delay": 0,
}
df_spark = df_spark.fillna(null_replacements)

In [0]:
# Convert Spark to Pandas
# The pandas frame is what the scikit-learn / xgboost cells below train on.
# NOTE(review): toPandas() materialises the whole table in driver memory —
# assumes the feature table is small enough for that; confirm.
df = df_spark.toPandas()

In [0]:
# Required Columns
# Target column the classifiers are trained to predict.
label_col = "high_value_customer"

# Input columns selected from the pandas frame to build the feature matrix.
feature_cols = [
    "recency_days", "frequency",
    "avg_review_score", "avg_delivery_days", "avg_delivery_delay",
]

In [0]:
# Train/Test Split
# X holds the feature columns, y the label column.
X = df[feature_cols]
y = df[label_col]

# Hold out 20% of the rows for evaluation. The seed is fixed so that every
# model below is scored on the same hold-out rows and reruns of the notebook
# reproduce the same ROC AUC values (the estimators are seeded with 42 too).
# NOTE(review): consider stratify=y if the label is imbalanced — not enabled
# here because the class distribution is not visible from this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [0]:
with mlflow.start_run(run_name="Logistic_Regression"):
    # Train: fit() returns the estimator itself, so build and fit in one step.
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

    # Evaluate: score the hold-out set with the second predict_proba column
    # (assumed to be the positive class of a binary 0/1 label) and compute AUC.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

    # Track: model tag, evaluation metric, then the fitted estimator artifact.
    mlflow.log_param(key="model", value="logistic_regression")
    mlflow.log_metric(key="roc_auc", value=auc)
    mlflow.sklearn.log_model(model, "model")




In [0]:
with mlflow.start_run(run_name="Random_Forest"):

    # Hyperparameters live in one dict that both builds the estimator and is
    # logged to the run, so the tracked values always match the fitted model.
    rf_params = {
        "n_estimators": 200,
        "max_depth": 10,
        "random_state": 42,
    }

    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Second predict_proba column is used as the score for ROC AUC
    # (assumed to be the positive class of a binary 0/1 label).
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)

    mlflow.log_param("model", "random_forest")
    # Record the hyperparameters so the run can be compared and reproduced
    # from the tracking data alone, as the XGBoost run already does.
    mlflow.log_params(rf_params)
    mlflow.log_metric("roc_auc", auc)

    mlflow.sklearn.log_model(model, "model")




In [0]:
with mlflow.start_run(run_name="Gradient_Boosting"):
    # Seeded gradient-boosting classifier with otherwise default settings;
    # fit() returns the estimator, so construction and training are chained.
    model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

    # Hold-out scores: second predict_proba column, then ROC AUC against y_test.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

    # Record the model tag and metric on the active run, then store the model.
    mlflow.log_param(key="model", value="gradient_boosting")
    mlflow.log_metric(key="roc_auc", value=auc)
    mlflow.sklearn.log_model(model, "model")




In [0]:
with mlflow.start_run(run_name="XGBoost"):

    # Single source of truth for the hyperparameters: the same dict configures
    # the estimator and is logged, so the values recorded in MLflow cannot
    # drift from the values the model was actually trained with.
    xgb_params = {
        "n_estimators": 300,
        "max_depth": 6,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "random_state": 42,
    }

    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)

    # Second predict_proba column is used as the score for ROC AUC
    # (assumed to be the positive class of a binary 0/1 label).
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)

    mlflow.log_param("model", "xgboost")
    # Logs every hyperparameter above (previously only three were recorded,
    # as separately typed literals).
    mlflow.log_params(xgb_params)

    mlflow.log_metric("roc_auc", auc)

    mlflow.xgboost.log_model(model, artifact_path="model")


  self.get_booster().save_model(fname)
