In [None]:
# 🚀 CREDIT RISK CLASSIFICATION TRAINING - FINAL COMPATIBLE VERSION

%pip install scikit-learn pyyaml

import mlflow
import time
import yaml
import numpy as np
import pandas as pd
import warnings
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Opening banner: the title framed by two 80-character rules.
banner_rule = "=" * 80
print(
    banner_rule,
    "üöÄ CREDIT RISK CLASSIFICATION - CONFIG DRIVEN TRAINING",
    banner_rule,
    sep="\n",
)

# -----------------------------------------
# 1️⃣ Load pipeline config (pipeline_config.yml)
# -----------------------------------------
# Decode explicitly as UTF-8 (the usual encoding for YAML files) so parsing
# does not depend on the platform's locale default.
with open("pipeline_config.yml", "r", encoding="utf-8") as f:
    pipeline_cfg = yaml.safe_load(f)

# Flatten the nested settings into module-level constants. Every key below is
# required: a missing one raises KeyError here, before any data is loaded.
# NOTE(review): MODEL_TYPE is read but not referenced in the visible part of
# this file -- the training loop always builds a RandomForestClassifier.
MODEL_TYPE = pipeline_cfg["model"]["type"]
EXPERIMENT_NAME = pipeline_cfg["experiment"]["name"]
MODEL_ARTIFACT_PATH = pipeline_cfg["experiment"]["artifact_path"]
RAW_INPUT_TABLE = pipeline_cfg["data"]["input_table"]
FEATURES = pipeline_cfg["data"]["features"]
LABEL_COL = pipeline_cfg["data"]["label"]
RUN_NAME_PREFIX = pipeline_cfg["experiment"]["run_name_prefix"]

print("‚úÖ Pipeline config loaded.")

# -----------------------------------------
# 2️⃣ Load experiment config (config.yml)
# -----------------------------------------
# Decode explicitly as UTF-8 (the usual encoding for YAML files) so parsing
# does not depend on the platform's locale default.
with open("config.yml", "r", encoding="utf-8") as f:
    experiment_cfg = yaml.safe_load(f)

# One entry per training run; the training loop reads each entry's "name"
# and "params" keys.
EXPERIMENT_LIST = experiment_cfg["experiments"]
print(f"üîç Loaded {len(EXPERIMENT_LIST)} experiment variations.")

# -----------------------------------------
# 3️⃣ Load Raw Data (with Label Encoding)
# -----------------------------------------
spark = SparkSession.builder.appName("CreditRiskTraining").getOrCreate()

print(f"üì¶ Loading RAW data ‚Üí {RAW_INPUT_TABLE}")
# toPandas() collects the whole table into a single in-memory pandas frame.
df = spark.read.table(RAW_INPUT_TABLE).toPandas()

X = df[FEATURES]
y = df[LABEL_COL]

# Label conversion: string labels are encoded as "yes" -> 1, "no" -> 0.
# Labels that are already non-object (e.g. numeric) are left untouched.
if y.dtype == "object":
    encoded = y.map({"yes": 1, "no": 0})

    # Series.map yields NaN for anything outside the mapping; report those
    # values explicitly instead of letting astype(int) fail with an opaque
    # "cannot convert non-finite values" error.
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Label column {LABEL_COL!r} contains values other than "
            f"'yes'/'no': {y[unmapped].unique().tolist()!r}"
        )

    y = encoded.astype(int)

# -----------------------------------------
# 4️⃣ Setup Preprocessing Pipeline
# -----------------------------------------
# Partition the feature columns by dtype: object columns are treated as
# categorical, every other column as numeric.
categorical_cols = []
numeric_cols = []
for column in X.columns:
    if X[column].dtype == "object":
        categorical_cols.append(column)
    else:
        numeric_cols.append(column)

# One-hot encode categoricals (categories unseen during fit are ignored at
# transform time, dense output) and standardise the numeric columns.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
standard_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", one_hot_encoder, categorical_cols),
        ("numeric", standard_scaler, numeric_cols),
    ],
    remainder='passthrough',
)

# -----------------------------------------
# 5️⃣ Setup MLflow Tracking
# -----------------------------------------
# Point tracking at the Databricks workspace and the registry at
# "databricks-uc", then select the experiment named in the pipeline config.
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")
mlflow.set_experiment(EXPERIMENT_NAME)

# Hold-out split driven by the data.split section of the pipeline config.
split_cfg = pipeline_cfg["data"]["split"]

# Stratify on the label only when the config asks for it.
if split_cfg["stratify"]:
    stratify_option = y
else:
    stratify_option = None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_cfg["test_size"],
    stratify=stratify_option,
    random_state=split_cfg["random_state"],
)

# Collects one (run name, metrics dict) tuple per completed experiment.
results = []

# -----------------------------------------
# 6️⃣ Run All Experiments (With all metrics)
# -----------------------------------------
# Each entry of EXPERIMENT_LIST becomes one MLflow run: fit the
# preprocessing + random-forest pipeline, score it on the hold-out split,
# then log parameters, metrics and the fitted pipeline.

for exp in EXPERIMENT_LIST:

    exp_name = f"{RUN_NAME_PREFIX}_{exp['name']}"

    # Copy so that popping random_state never mutates the loaded config. An
    # entry with no "params" mapping trains with the estimator's defaults.
    params = dict(exp.get("params") or {})

    # random_state is passed explicitly, so it must not also be in **params.
    config_random_state = params.pop("random_state", 42)

    rf_model = RandomForestClassifier(random_state=config_random_state, **params)

    # NOTE(review): `preprocessor` is the same object in every pipeline and is
    # refit in place on each iteration (Pipeline does not clone its steps).
    full_pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("model", rf_model)
    ])

    with mlflow.start_run(run_name=exp_name):

        # Log hyperparameters first so a run that fails while fitting still
        # records the configuration that caused the failure.
        mlflow.log_params({**params, "random_state": config_random_state})

        # ⏱ TRAIN TIME -- perf_counter is monotonic, unlike time.time(),
        # so the measured duration is immune to system clock adjustments.
        train_start = time.perf_counter()
        full_pipeline.fit(X_train, y_train)
        train_time_sec = round(time.perf_counter() - train_start, 4)

        # Training accuracy (predictions are reused below for the signature).
        train_pred = full_pipeline.predict(X_train)
        train_accuracy = accuracy_score(y_train, train_pred)

        # 🔍 TEST predictions; only the predict() call is timed.
        inf_start = time.perf_counter()
        y_pred = full_pipeline.predict(X_test)
        inference_time_sec = round(time.perf_counter() - inf_start, 4)

        # Column 1 of predict_proba is the probability of the second class,
        # used as the score for ROC AUC.
        y_proba = full_pipeline.predict_proba(X_test)[:, 1]

        metrics = {
            "test_accuracy": accuracy_score(y_test, y_pred),
            "test_precision": precision_score(y_test, y_pred),
            "test_recall": recall_score(y_test, y_pred),
            "test_f1": f1_score(y_test, y_pred),
            "test_roc_auc": roc_auc_score(y_test, y_proba),

            # Fit-quality and timing diagnostics
            "train_accuracy": train_accuracy,
            "train_time": train_time_sec,
            "inference_time": inference_time_sec
        }

        # Log all metrics in a single batched call.
        mlflow.log_metrics(metrics)

        # Log model with preprocessing. The signature reuses train_pred
        # instead of running a second full prediction pass over X_train.
        signature = infer_signature(X_train, train_pred)

        mlflow.sklearn.log_model(
            sk_model=full_pipeline,
            artifact_path=MODEL_ARTIFACT_PATH,
            signature=signature,
            input_example=X_train.head(5)
        )

        print(f"   ‚úÖ {exp_name} logged successfully with metrics: {metrics}")

        results.append((exp_name, metrics))

# -----------------------------------------
# 7️⃣ Summary
# -----------------------------------------
# Closing banner: a blank line, then the message framed by two rules.
closing_rule = "=" * 80
print(f"\n{closing_rule}")
print("üéâ TRAINING COMPLETED SUCCESSFULLY!")
print(closing_rule)

