# In [None]:
### TRAINING_script (FIXED - Hyperparameter Grid from YAML)

import mlflow
import time
import yaml
import sys
import os
import numpy as np
import pandas as pd
import warnings
from datetime import datetime
from itertools import product

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession

# ✅ Import evaluation logging functions
from evaluation import create_eval_table_if_not_exists, log_run_to_table

# Silence all Python warnings for the rest of the script.
warnings.filterwarnings("ignore")

print("=" * 80)
print("üöÄ CREDIT RISK TRAINING - MULTI MODEL MODE")
print("=" * 80)

# 🔥 LOAD CONFIG FILES
# Both files are read explicitly as UTF-8 instead of relying on the platform's
# default encoding. yaml.safe_load returns None for an empty document, so fall
# back to an empty dict: a missing section then surfaces as a KeyError naming
# that section rather than a TypeError on None.

with open("pipeline_config.yml", "r", encoding="utf-8") as f:
    pipeline_cfg = yaml.safe_load(f) or {}

with open("experiments_config.yml", "r", encoding="utf-8") as f:
    experiments_cfg = yaml.safe_load(f) or {}

# 🔥 GET MODELS TO TRAIN (No Git Variables / Widgets)

def get_models_to_train():
    """Return the names of every model configured in experiments_config.yml.

    Reads the module-level ``experiments_cfg`` mapping and lists the keys of
    its ``models`` section, in the order they appear in the mapping.

    Returns:
        list: The model-type names found under ``models``.

    Raises:
        ValueError: If the ``models`` section is missing, empty, or null.
    """
    # A bare `models:` key (or an empty config file) parses to None in YAML.
    # Treat that the same as "no models" so the caller receives the ValueError
    # it handles instead of an AttributeError raised on None.
    models_section = (experiments_cfg or {}).get("models") or {}
    available_models = list(models_section.keys())

    if not available_models:
        raise ValueError("‚ùå No models defined in experiments_config.yml")

    print(f"‚úÖ Training ALL models: {available_models}")
    return available_models

# Resolve the list of models before any data is loaded. A configuration error
# is printed and the notebook is ended through dbutils with a failure message.
try:
    MODELS_TO_TRAIN = get_models_to_train()
except ValueError as config_error:
    print(config_error)
    dbutils.notebook.exit("FAILED: Invalid MODELS_TO_TRAIN configuration")
else:
    print(f"\nüìã Models to train: {MODELS_TO_TRAIN}\n")

# 🔥 PIPELINE SETTINGS

# Experiment-level settings: MLflow experiment base name and artifact path.
_experiment_cfg = pipeline_cfg["experiment"]
BASE_EXPERIMENT_NAME = _experiment_cfg["name"]
MODEL_ARTIFACT_PATH = _experiment_cfg["artifact_path"]

# Data-level settings: source table, feature columns and label column.
_data_cfg = pipeline_cfg["data"]
RAW_INPUT_TABLE = _data_cfg["input_table"]
FEATURES = _data_cfg["features"]
LABEL_COL = _data_cfg["label"]

# Prefix used when naming each MLflow run.
RUN_NAME_PREFIX = _experiment_cfg["run_name_prefix"]

# 🔥 LOAD DATA

# Read the input table through Spark and convert it to a pandas DataFrame.
spark = SparkSession.builder.appName("CreditRiskTraining").getOrCreate()
df = spark.read.table(RAW_INPUT_TABLE).toPandas()

X = df[FEATURES]
y = df[LABEL_COL]

# Object-dtype labels are encoded as "yes" -> 1 and "no" -> 0.
if y.dtype == "object":
    _encoded_labels = y.map({"yes": 1, "no": 0})
    # Series.map yields NaN for every value outside the mapping, which would
    # otherwise only surface as an opaque integer-cast error. Fail here with
    # the offending label values instead.
    _unmapped = _encoded_labels.isna()
    if _unmapped.any():
        _unexpected = y[_unmapped].unique().tolist()
        raise ValueError(
            f"Label column {LABEL_COL!r} contains values other than "
            f"'yes'/'no': {_unexpected}"
        )
    y = _encoded_labels.astype(int)

# 🔥 PREPROCESSING

# Partition the feature columns in one pass: object-dtype columns are treated
# as categorical, every other column as numeric.
categorical_cols, numeric_cols = [], []
for _col in X.columns:
    _bucket = categorical_cols if X[_col].dtype == "object" else numeric_cols
    _bucket.append(_col)

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and standardise the numeric ones.
_one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", _one_hot, categorical_cols),
        ("numeric", _scaler, numeric_cols),
    ]
)

# 🔥 TRAIN-TEST SPLIT

# Split settings come from the `data.split` section of the pipeline config.
_split_cfg = pipeline_cfg["data"]["split"]

# Stratify on the label only when the config asks for it.
stratify_option = y if _split_cfg["stratify"] else None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=_split_cfg["test_size"],
    stratify=stratify_option,
    random_state=_split_cfg["random_state"],
)

# 🔥 MLflow SETUP

# Point MLflow tracking at the "databricks" URI and the model registry at the
# "databricks-uc" URI (Unity Catalog registry per the MLflow docs; confirm
# this matches the target workspace).
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")

# Trainable estimator classes, keyed by the model-type name used in
# experiments_config.yml. The training loop skips any configured model type
# that has no entry here.
MODEL_CLASSES = {
    "random_forest": RandomForestClassifier
}

# ✅ MODEL TYPE TO SHORT NAME MAPPING
# Explicit abbreviations; model types not listed here get an acronym derived
# from their name.
MODEL_SHORT_NAMES = {"random_forest": "RF"}

def get_model_short_name(model_type):
    """Return the short tag for *model_type*.

    An entry in ``MODEL_SHORT_NAMES`` wins. Otherwise the tag is built from
    the upper-cased first letter of each non-empty ``_``-separated word,
    e.g. ``"gradient_boosting"`` -> ``"GB"``.
    """
    try:
        return MODEL_SHORT_NAMES[model_type]
    except KeyError:
        pass
    # Empty segments (from doubled or leading/trailing underscores) are skipped.
    return "".join(part[0].upper() for part in model_type.split("_") if part)

# ✅ CREATE PARAM GRID FROM YAML hyperparameters

def _as_candidate_list(value):
    """Return *value* as the list of candidate settings for one parameter."""
    # A string is iterable, but in a config it is one setting ("gini"), not a
    # sequence of single-character settings.
    if isinstance(value, (str, bytes)):
        return [value]
    try:
        return list(value)
    except TypeError:
        # Non-iterable scalar (int, float, bool, None): a single fixed choice.
        return [value]

def generate_param_combinations(hyperparam_dict: dict) -> list:
    """Expand a hyperparameter grid into one dict per combination.

    Converts:
      {"a":[1,2], "b":[3,4]}
    Into:
      [{"a":1,"b":3}, {"a":1,"b":4}, {"a":2,"b":3}, {"a":2,"b":4}]

    A value that is not a collection of candidates (a string, bytes, or a
    non-iterable scalar such as an int or None) is treated as a single fixed
    choice, so ``{"criterion": "gini"}`` yields ``[{"criterion": "gini"}]``
    rather than one combination per character.

    Args:
        hyperparam_dict: Maps each parameter name to its candidate values, or
            to a single fixed value.

    Returns:
        A list of ``{param_name: value}`` dicts in Cartesian-product order,
        with the last key varying fastest. An empty input yields ``[{}]``.
    """
    keys = list(hyperparam_dict)
    candidate_lists = [_as_candidate_list(hyperparam_dict[key]) for key in keys]
    return [dict(zip(keys, values)) for values in product(*candidate_lists)]

# 🔥 TRAIN LOOP

# For every configured model type: select a model-specific MLflow experiment,
# expand the YAML hyperparameter grid, then train, evaluate and log one MLflow
# run per hyperparameter combination.
for MODEL_TYPE in MODELS_TO_TRAIN:

    if MODEL_TYPE not in MODEL_CLASSES:
        print(f"‚ö†Ô∏è  Skipping {MODEL_TYPE} - model class not found")
        continue

    if MODEL_TYPE not in experiments_cfg["models"]:
        print(f"‚ö†Ô∏è  Skipping {MODEL_TYPE} - not in experiments_config.yml")
        continue

    # Each model type gets its own experiment: "<base name>_<short tag>".
    model_short = get_model_short_name(MODEL_TYPE)
    MODEL_EXPERIMENT_NAME = f"{BASE_EXPERIMENT_NAME}_{model_short}"

    print(f"\n{'='*80}")
    print(f"üî¨ Setting experiment: {MODEL_EXPERIMENT_NAME}")
    print(f"{'='*80}")

    mlflow.set_experiment(MODEL_EXPERIMENT_NAME)

    ModelClass = MODEL_CLASSES[MODEL_TYPE]

    # Read the hyperparameter grid from YAML. A model entry with no body
    # (`random_forest:`) parses to None, so normalise it to an empty mapping
    # and let the "no hyperparameters" skip below handle it.
    model_cfg = experiments_cfg["models"][MODEL_TYPE] or {}
    hyperparams = model_cfg.get("hyperparameters") or {}

    if not hyperparams:
        print(f"‚ö†Ô∏è No hyperparameters found for {MODEL_TYPE}. Skipping...")
        continue

    PARAM_COMBINATIONS = generate_param_combinations(hyperparams)

    print(f"üéØ Training {MODEL_TYPE.upper()} - {len(PARAM_COMBINATIONS)} hyperparameter combinations\n")

    # Ensure the evaluation table exists before this model's runs start
    # (called once per model type; presumably a no-op when it already exists).
    create_eval_table_if_not_exists()

    for idx, params in enumerate(PARAM_COMBINATIONS, start=1):

        exp_name = f"{RUN_NAME_PREFIX}_{MODEL_TYPE}_run_{idx}"

        with mlflow.start_run(run_name=exp_name) as run:

            model = ModelClass(**params)

            pipeline = Pipeline([
                ("preprocessing", preprocessor),
                ("model", model)
            ])

            # perf_counter is a monotonic, high-resolution clock, so the
            # measured durations are unaffected by system clock adjustments.
            start = time.perf_counter()
            pipeline.fit(X_train, y_train)
            train_time = round(time.perf_counter() - start, 4)

            train_pred = pipeline.predict(X_train)
            train_accuracy = accuracy_score(y_train, train_pred)

            start_inf = time.perf_counter()
            y_pred = pipeline.predict(X_test)
            inference_time = round(time.perf_counter() - start_inf, 4)

            # Positive-class probabilities, when the estimator provides them.
            if hasattr(pipeline.named_steps["model"], "predict_proba"):
                y_proba = pipeline.predict_proba(X_test)[:, 1]
            else:
                y_proba = None

            metrics = {
                "test_accuracy": accuracy_score(y_test, y_pred),
                "test_precision": precision_score(y_test, y_pred),
                "test_recall": recall_score(y_test, y_pred),
                "test_f1": f1_score(y_test, y_pred),
                "train_accuracy": train_accuracy,
                "train_time": train_time,
                "inference_time": inference_time
            }

            if y_proba is not None:
                metrics["test_roc_auc"] = roc_auc_score(y_test, y_proba)

            # Log all metrics in one batched call.
            mlflow.log_metrics(metrics)

            # Log params
            mlflow.log_params(params)
            mlflow.log_param("model_type", MODEL_TYPE)
            mlflow.log_param("experiment_name", MODEL_EXPERIMENT_NAME)

            # Reuse the training-set predictions computed above instead of
            # predicting on the whole training set a second time.
            signature = infer_signature(X_train, train_pred)

            mlflow.sklearn.log_model(
                pipeline,
                artifact_path=MODEL_ARTIFACT_PATH,
                signature=signature,
                input_example=X_train.head(5)
            )

            # Record the run in the evaluation table via the helper imported
            # from the `evaluation` module.
            log_run_to_table(
                model_name=exp_name,
                model_type=MODEL_TYPE,
                run_id=run.info.run_id,
                experiment_name=MODEL_EXPERIMENT_NAME,
                hyperparams=params,
                metrics=metrics
            )

            print(f"   ‚úÖ {exp_name}")

print("\n" + "=" * 80)
print("üéâ ALL MODELS TRAINING COMPLETED!")
print("=" * 80)
