In [None]:
# 🚀 CREDIT RISK CLASSIFICATION TRAINING - FIXED VERSION

%pip install scikit-learn pyyaml

import os
import tempfile
import time
import warnings
from datetime import datetime

import mlflow
import numpy as np
import pandas as pd
import yaml
from mlflow.models.signature import infer_signature
from pyspark.ml.linalg import VectorUDT
from pyspark.sql import SparkSession
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Pipeline banner: the title framed by two 80-character rules, one per line.
print(
    "=" * 80,
    "üöÄ CREDIT RISK CLASSIFICATION TRAINING PIPELINE",
    "=" * 80,
    sep="\n",
)
 
# ----- Load pipeline configuration -----
# Reads pipeline_config.yml and exposes its settings as module-level
# constants. Any failure (missing file, missing key, invalid YAML) is
# reported on stdout and then re-raised so the run stops here.

print("\nüìã Step 1: Loading pipeline configuration...")

try:
    with open("pipeline_config.yml", "r") as f:
        pipeline_cfg = yaml.safe_load(f)

    # Model identity settings (not referenced in the visible part of this file).
    _model_cfg = pipeline_cfg["model"]
    MODEL_TYPE = _model_cfg["type"]
    CATALOG = _model_cfg["catalog"]
    SCHEMA = _model_cfg["schema"]
    BASE_NAME = _model_cfg["base_name"]

    # MLflow experiment settings.
    _experiment_cfg = pipeline_cfg["experiment"]
    EXPERIMENT_NAME = _experiment_cfg["name"]
    MODEL_ARTIFACT_PATH = _experiment_cfg["artifact_path"]
    RUN_NAME_PREFIX = _experiment_cfg["run_name_prefix"]

    # Input table; the label column name is fixed here rather than configured.
    _data_cfg = pipeline_cfg["data"]
    PREPROCESSED_TABLE = _data_cfg["preprocessed_table"]
    LABEL_COL = "label"

    # Train/test split settings.
    _split_cfg = _data_cfg["split"]
    TEST_SIZE = _split_cfg["test_size"]
    RANDOM_STATE = _split_cfg["random_state"]
    STRATIFY = _split_cfg["stratify"]

    # Classification metric settings.
    METRICS_CONFIG = pipeline_cfg["metrics"]["classification"]
    PRIMARY_METRIC = METRICS_CONFIG["primary_metric"]
    DIRECTION = METRICS_CONFIG["direction"]
    TRACKED_METRICS = METRICS_CONFIG["tracked_metrics"]
    THRESHOLD_METRICS = METRICS_CONFIG["threshold_metrics"]

    print("‚úÖ Pipeline configuration loaded successfully!")

except Exception as e:
    # Surface the problem in the cell output, then let the original error propagate.
    print(f"‚ùå ERROR loading pipeline configuration: {e}")
    raise

print("=" * 80)
 
# ----- Load experiment configurations -----

def load_experiment_configs(path="config.yml"):
    """Parse the experiment definitions YAML file.

    Args:
        path: Location of the YAML file; defaults to ``config.yml``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents.
    """
    print("\nüìÑ Step 2: Loading experiment configurations...")

    with open(path, "r") as handle:
        return yaml.safe_load(handle)
 
# ----- Convert PySpark Vector to array -----

def vector_to_array(v):
    """Return ``v`` as a NumPy array.

    Objects exposing a ``toArray`` method are converted through it; anything
    else is handed to ``np.array``.
    """
    if not hasattr(v, "toArray"):
        return np.array(v)
    return v.toArray()
 
# ----- Load preprocessed Delta table -----

def load_preprocessed_data(spark):
    """Load the preprocessed Delta table into NumPy feature and label arrays.

    Args:
        spark: Spark session used to read ``PREPROCESSED_TABLE``.

    Returns:
        Tuple ``(X, y)``. ``X`` is built from the ``features`` column, one
        row per record converted with ``vector_to_array``; ``y`` holds the
        values of the ``LABEL_COL`` column.
    """
    print("\nüì¶ Loading PREPROCESSED data...")

    # Project down to the two columns used below before collecting:
    # toPandas() brings everything it is given back to the driver.
    df = (
        spark.read.format("delta")
        .table(PREPROCESSED_TABLE)
        .select("features", LABEL_COL)
    )
    df_pd = df.toPandas()

    X = np.array([vector_to_array(row) for row in df_pd["features"]])
    # Use the shared constant rather than a second hard-coded "label".
    y = df_pd[LABEL_COL].values

    return X, y
 
# ----- Train a single experiment -----

def _evaluate_classifier(model, X_test, y_test):
    """Score a fitted classifier on the held-out split.

    Returns a dict of ``test_*`` metrics. Precision, recall and F1 use
    scikit-learn's default binary averaging, and ROC AUC is computed from
    the second ``predict_proba`` column, so binary labels are assumed.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    return {
        "test_accuracy": accuracy_score(y_test, y_pred),
        "test_precision": precision_score(y_test, y_pred, zero_division=0),
        "test_recall": recall_score(y_test, y_pred, zero_division=0),
        "test_f1": f1_score(y_test, y_pred, zero_division=0),
        "test_roc_auc": roc_auc_score(y_test, y_proba),
    }


def _log_feature_importance(model, run_id):
    """Log feature importances as an MLflow artifact and, best-effort, to Delta.

    Must be called while the MLflow run identified by ``run_id`` is active.
    """
    importance = pd.DataFrame({
        "feature_index": range(len(model.feature_importances_)),
        "importance": model.feature_importances_,
    }).sort_values("importance", ascending=False)

    # Stage the CSV in a throwaway directory so repeated runs do not leave
    # (or overwrite) a file in the working directory. The artifact keeps the
    # same file name as before.
    with tempfile.TemporaryDirectory() as tmp_dir:
        csv_path = os.path.join(tmp_dir, "feature_importance.csv")
        importance.to_csv(csv_path, index=False)
        mlflow.log_artifact(csv_path)

    # The Delta copy is deliberately best-effort: a missing table setting or
    # a write failure is reported but must not fail the training run.
    try:
        table_name = pipeline_cfg["tables"]["feature_importance"]

        importance["run_id"] = run_id
        importance["timestamp"] = datetime.now()

        # Resolve the session here instead of relying on a global ``spark``
        # that only exists when the __main__ block has run.
        session = SparkSession.builder.getOrCreate()
        session.createDataFrame(importance) \
            .write.format("delta") \
            .mode("append") \
            .saveAsTable(table_name)

        print(f"üìå Feature importance saved ‚Üí {table_name}")

    except Exception as e:
        print(f"‚ö† Feature importance Delta logging failed: {e}")


def train_single_experiment(X, y, params, run_name):
    """Train and evaluate one random-forest configuration inside an MLflow run.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y: Labels aligned with ``X``.
        params: Keyword arguments for ``RandomForestClassifier``. The mapping
            is not modified. Any ``random_state`` entry is replaced by the
            pipeline-level ``RANDOM_STATE``; ``None`` means no overrides.
        run_name: Name given to the MLflow run.

    Returns:
        Tuple ``(run_id, metrics_dict)`` where ``metrics_dict`` maps the
        ``test_*`` metric names to their values on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=y if STRATIFY else None,
    )

    # Copy before editing: the dict belongs to the caller's experiment config
    # and must still be intact if the same config is used again.
    model_params = dict(params or {})
    # The pipeline seed always wins, so record it as the effective value and
    # keep what MLflow shows identical to what the model is built with.
    model_params["random_state"] = RANDOM_STATE

    with mlflow.start_run(run_name=run_name) as run:
        run_id = run.info.run_id

        mlflow.log_params(model_params)

        model = RandomForestClassifier(**model_params)
        model.fit(X_train, y_train)

        metrics_dict = _evaluate_classifier(model, X_test, y_test)
        mlflow.log_metrics(metrics_dict)

        signature = infer_signature(X_train, model.predict(X_train))

        mlflow.sklearn.log_model(
            model,
            artifact_path=MODEL_ARTIFACT_PATH,
            signature=signature,
            registered_model_name=None,
        )

        if hasattr(model, "feature_importances_"):
            _log_feature_importance(model, run_id)

        return run_id, metrics_dict
 
# ----- MAIN EXECUTION -----

if __name__ == "__main__":
    # Configure MLflow tracking, registry and the target experiment.
    mlflow.set_tracking_uri("databricks")
    mlflow.set_registry_uri("databricks-uc")
    mlflow.set_experiment(EXPERIMENT_NAME)

    spark = SparkSession.builder.appName("CreditRiskTraining").getOrCreate()

    # Load the training data once; every experiment reuses the same arrays.
    X, y = load_preprocessed_data(spark)
    config = load_experiment_configs()

    # One MLflow run per entry under the "experiments" key; each entry
    # supplies a "name" and a "params" mapping.
    results = []
    for exp in config["experiments"]:
        run_label = f"{RUN_NAME_PREFIX}_{exp['name']}"
        run_id, metrics = train_single_experiment(X, y, exp["params"], run_label)
        results.append(dict(name=exp["name"], run_id=run_id, metrics=metrics))

    print("\nüéâ Training completed successfully!")
