In [None]:
# 🚀 CREDIT RISK CLASSIFICATION TRAINING - FINAL PIPELINE VERSION

%pip install scikit-learn pyyaml

import mlflow
import yaml
import numpy as np
import pandas as pd
import warnings
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession
from pyspark.ml.linalg import VectorUDT

# Suppress every warning category for the rest of the run so that library
# chatter does not drown the progress messages below.
warnings.filterwarnings("ignore")

# Start-up banner: the title framed by two 80-character rules.
print(
    "=" * 80,
    "üöÄ CREDIT RISK CLASSIFICATION TRAINING PIPELINE",
    "=" * 80,
    sep="\n",
)

# -----------------------------------------
# 1. Load pipeline config
# -----------------------------------------
# Read the YAML config with an explicit encoding so that parsing does not
# depend on the platform's default locale. safe_load only builds plain
# Python objects (no arbitrary object construction).
with open("pipeline_config.yml", "r", encoding="utf-8") as f:
    pipeline_cfg = yaml.safe_load(f)

# An empty file makes safe_load return None; fail with a clear message
# instead of an opaque "NoneType is not subscriptable" further down.
if not isinstance(pipeline_cfg, dict):
    raise ValueError(
        "pipeline_config.yml must contain a top-level mapping, "
        f"got {type(pipeline_cfg).__name__}"
    )

_experiment_cfg = pipeline_cfg["experiment"]
_data_cfg = pipeline_cfg["data"]
_split_cfg = _data_cfg["split"]

# Model and experiment settings.
MODEL_TYPE = pipeline_cfg["model"]["type"]
EXPERIMENT_NAME = _experiment_cfg["name"]
MODEL_ARTIFACT_PATH = _experiment_cfg["artifact_path"]

# Data settings. Despite its name, PREPROCESSED_TABLE is read from
# data.input_table, i.e. the RAW table that the pipeline preprocesses itself.
PREPROCESSED_TABLE = _data_cfg["input_table"]
FEATURES = _data_cfg["features"]
LABEL_COL = _data_cfg["label"]

# Train/test split settings.
TEST_SIZE = _split_cfg["test_size"]
RANDOM_STATE = _split_cfg["random_state"]
STRATIFY = _split_cfg["stratify"]

print("‚úÖ Config loaded\n")


# -----------------------------------------
# 2. Load raw data
# -----------------------------------------
# getOrCreate() hands back the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("CreditRiskTraining")
    .getOrCreate()
)

print(f"üì¶ Loading RAW data ‚Üí {PREPROCESSED_TABLE}")

# Materialize the whole table as a pandas DataFrame.
df = (
    spark.read
    .table(PREPROCESSED_TABLE)
    .toPandas()
)

# Feature matrix (configured feature columns) and target column.
X, y = df[FEATURES], df[LABEL_COL]


# -----------------------------------------
# 3. Build preprocessing + model pipeline
# -----------------------------------------

# A feature is categorical when its dtype is object, a pandas string dtype,
# or a pandas Categorical. Testing only `dtype == "object"` would send
# string- and category-typed columns to the numeric branch, where scaling
# text values fails.
categorical_cols = [
    col
    for col in X.columns
    if pd.api.types.is_object_dtype(X[col].dtype)
    or pd.api.types.is_string_dtype(X[col].dtype)
    or isinstance(X[col].dtype, pd.CategoricalDtype)
]

# Everything that is not categorical is treated as numeric; the original
# column order is preserved.
_categorical_lookup = set(categorical_cols)
numeric_cols = [col for col in X.columns if col not in _categorical_lookup]

print(f"üîß Detected {len(categorical_cols)} categorical features")
print(f"üîß Detected {len(numeric_cols)} numerical features")

# Column-wise preprocessing: one-hot encode the categorical columns
# (handle_unknown="ignore" keeps transform from raising on categories not
# seen during fit) and standardize the numeric columns.
encoding_step = (
    "categorical",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_cols,
)
scaling_step = ("numeric", StandardScaler(), numeric_cols)
preprocessor = ColumnTransformer(transformers=[encoding_step, scaling_step])

# Classifier seeded from the configured random state.
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)

# Single estimator that carries preprocessing and model together, so the
# same transformations are applied at training and at inference time.
full_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", rf_model),
    ]
)


# -----------------------------------------
# 4. Train model
# -----------------------------------------
# Stratify on the label only when the config enables it.
if STRATIFY:
    stratify_option = y
else:
    stratify_option = None

# Hold out TEST_SIZE of the rows for evaluation; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=stratify_option,
)

# Tracking goes to the Databricks workspace, the registry URI is set to
# "databricks-uc", and runs are grouped under the configured experiment.
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")
mlflow.set_experiment(EXPERIMENT_NAME)

# Fit, evaluate and log the full pipeline inside a single MLflow run.
with mlflow.start_run():

    print("‚öôÔ∏è Training model...")
    full_pipeline.fit(X_train, y_train)

    # Hard class predictions plus the score taken from column 1 of
    # predict_proba, both on the held-out split.
    # NOTE(review): column 1 and the default binary averaging below assume a
    # two-class label — confirm against the label column.
    y_pred = full_pipeline.predict(X_test)
    y_proba = full_pipeline.predict_proba(X_test)[:, 1]

    # Held-out evaluation metrics; zero_division=0 is passed so that
    # precision/recall/F1 yield 0 when their denominator is empty.
    metrics = {
        "test_accuracy": accuracy_score(y_test, y_pred),
        "test_precision": precision_score(y_test, y_pred, zero_division=0),
        "test_recall": recall_score(y_test, y_pred, zero_division=0),
        "test_f1": f1_score(y_test, y_pred, zero_division=0),
        "test_roc_auc": roc_auc_score(y_test, y_proba),
    }

    # Log all metrics in one batched call rather than one call per metric.
    mlflow.log_metrics(metrics)

    # The input schema is inferred from the training frame; the output schema
    # is inferred from the predictions computed above, which avoids running
    # a second predict pass over the whole training set just for its dtype.
    signature = infer_signature(X_train, y_pred)

    # NOTE(review): log_model is called without registered_model_name, so
    # this presumably stores the model as a run artifact only, despite the
    # message below — confirm whether registry registration is intended.
    print("üì¶ Saving model pipeline to MLflow registry")
    mlflow.sklearn.log_model(
        sk_model=full_pipeline,
        artifact_path=MODEL_ARTIFACT_PATH,
        signature=signature
    )

print("\nüéâ Training completed successfully!")
print("üöÄ Model saved as full preprocessing + classifier pipeline")
