# In [None]:
### TRAINING_script (ZIPPED Hyperparameter Runs from YAML + Evaluation Logging in same file)

import mlflow
import time
import yaml
import json
import numpy as np
import pandas as pd
import warnings
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Silence every warning category for the rest of the script.
warnings.filterwarnings("ignore")

print("=" * 80)
print("üöÄ CREDIT RISK TRAINING - MULTI MODEL MODE (ZIPPED PARAMS + EVAL LOGGING)")
print("=" * 80)

# ---------------------- LOAD CONFIG FILES ----------------------
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open("pipeline_config.yml", "r", encoding="utf-8") as f:
    pipeline_cfg = yaml.safe_load(f)

with open("experiments_config.yml", "r", encoding="utf-8") as f:
    experiments_cfg = yaml.safe_load(f)

# ---------------------- INIT SPARK ----------------------
spark = SparkSession.builder.appName("CreditRiskTraining").getOrCreate()

# ---------------------- EVALUATION CONFIG (FROM pipeline_config.yml) ----------------------
# Name of the table that receives one row per logged run.
EVAL_TABLE = pipeline_cfg["tables"]["evaluation_log"]
# Metric names that are kept when a run is written to EVAL_TABLE.
TRACKED_METRICS = pipeline_cfg["metrics"]["classification"]["tracked_metrics"]

# A `duplicate_handling:` key with no value parses as None; treat that the
# same as a missing key instead of failing on `.get` below.
DUPLICATE_CFG = pipeline_cfg.get("tables", {}).get("duplicate_handling")
if DUPLICATE_CFG is None:
    DUPLICATE_CFG = {}
# Duplicate detection is on unless the config explicitly disables it.
DUPLICATE_ENABLED = DUPLICATE_CFG.get("enabled", True)

print(f"‚úÖ Evaluation Log Table: {EVAL_TABLE}")
print(f"‚úÖ Tracked Metrics: {TRACKED_METRICS}")
print(f"‚úÖ Duplicate Handling Enabled: {DUPLICATE_ENABLED}")

# ---------------------- EVALUATION TABLE FUNCTIONS ----------------------
def create_eval_table_if_not_exists():
    """Create the Delta table named by EVAL_TABLE unless it already exists.

    Runs a single ``CREATE TABLE IF NOT EXISTS`` statement through the
    module-level Spark session and prints a confirmation line afterwards.
    """
    # Columns: run identity, creation time, and the hyperparameters/metrics
    # payloads stored as strings.
    ddl = f"""
    CREATE TABLE IF NOT EXISTS {EVAL_TABLE} (
        model_name STRING,
        model_type STRING,
        run_id STRING,
        experiment_name STRING,
        created_timestamp TIMESTAMP,
        hyperparameters STRING,
        metrics STRING
    )
    USING DELTA
    """
    spark.sql(ddl)
    print(f"‚úÖ Evaluation table ready: {EVAL_TABLE}")


def is_duplicate(model_type: str, experiment_name: str, hyper_json: str) -> bool:
    """Report whether an equivalent run is already stored in EVAL_TABLE.

    A run counts as a duplicate when a row with the same ``model_type``,
    ``experiment_name`` and ``hyperparameters`` string already exists.

    Returns:
        True if at least one matching row is found. False when duplicate
        handling is disabled, when no row matches, or when the lookup raises
        (the error is printed and the check is skipped).
    """
    if not DUPLICATE_ENABLED:
        return False

    try:
        same_run = (
            (col("model_type") == model_type)
            & (col("experiment_name") == experiment_name)
            & (col("hyperparameters") == hyper_json)
        )
        matches = spark.read.table(EVAL_TABLE).where(same_run)
        # One matching row is enough, so cap the scan before counting.
        found = matches.limit(1).count()
        return found > 0
    except Exception as e:
        # Best effort: a failed lookup must not block logging the run.
        print(f"‚ö†Ô∏è Duplicate check skipped (table read error): {e}")
        return False


def log_run_to_table(model_name, model_type, run_id, experiment_name, hyperparams, metrics):
    """Append one evaluation row for a finished run to EVAL_TABLE.

    Args:
        model_name: Value stored in the ``model_name`` column.
        model_type: Model family key stored in ``model_type``.
        run_id: Run identifier stored in ``run_id``.
        experiment_name: Experiment name stored in ``experiment_name``.
        hyperparams: Mapping of hyperparameters; stored as a key-sorted
            JSON string so equal mappings serialise identically.
        metrics: Mapping of metric name to value; only names listed in
            TRACKED_METRICS are stored (missing ones become ``null``).

    The row is skipped when ``is_duplicate`` reports that the same
    model_type / experiment_name / hyperparameters combination exists.
    """
    # Local import: the file's import block only brings in `datetime`.
    from datetime import timezone

    filtered_metrics = {k: metrics.get(k) for k in TRACKED_METRICS}

    hyper_json = json.dumps(hyperparams, sort_keys=True)
    metrics_json = json.dumps(filtered_metrics, sort_keys=True)

    if is_duplicate(model_type, experiment_name, hyper_json):
        print(f"‚ö†Ô∏è Duplicate row detected. Skipping insert for run_id={run_id}")
        return

    row = [{
        "model_name": model_name,
        "model_type": model_type,
        "run_id": run_id,
        "experiment_name": experiment_name,
        # Timezone-aware UTC instant. A naive `datetime.utcnow()` value is
        # interpreted by PySpark as driver-local time, which stores the wrong
        # instant whenever the driver is not running in UTC.
        "created_timestamp": datetime.now(timezone.utc),
        "hyperparameters": hyper_json,
        "metrics": metrics_json
    }]

    df = spark.createDataFrame(row)
    df.write.format("delta").mode("append").saveAsTable(EVAL_TABLE)

    print(f"‚úÖ Logged evaluation row for run_id={run_id}")

# ---------------------- GET MODELS TO TRAIN ----------------------
def get_models_to_train():
    """Return the model keys declared under ``models`` in experiments_cfg.

    Returns:
        List of model-type keys, in the order they appear in the config.

    Raises:
        ValueError: If the ``models`` section is missing, null, or empty.
    """
    # `models:` with no value parses as None in YAML; `or {}` routes that case
    # to the ValueError below instead of an AttributeError on None.
    models_cfg = experiments_cfg.get("models") or {}
    available_models = list(models_cfg)
    if not available_models:
        raise ValueError("‚ùå No models defined in experiments_config.yml")

    print(f"‚úÖ Training ALL models: {available_models}")
    return available_models

MODELS_TO_TRAIN = get_models_to_train()

# ---------------------- PIPELINE SETTINGS ----------------------
BASE_EXPERIMENT_NAME = pipeline_cfg["experiment"]["name"]
MODEL_ARTIFACT_PATH = pipeline_cfg["experiment"]["artifact_path"]
RAW_INPUT_TABLE = pipeline_cfg["data"]["input_table"]
FEATURES = pipeline_cfg["data"]["features"]
LABEL_COL = pipeline_cfg["data"]["label"]
RUN_NAME_PREFIX = pipeline_cfg["experiment"]["run_name_prefix"]

# ---------------------- LOAD DATA ----------------------
# The whole input table is collected to the driver as a pandas DataFrame.
df = spark.read.table(RAW_INPUT_TABLE).toPandas()

X = df[FEATURES]
y = df[LABEL_COL]

# Object-typed labels are expected to be "yes"/"no" and are encoded as 1/0.
if y.dtype == "object":
    label_map = {"yes": 1, "no": 0}
    encoded = y.map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        # Any other value maps to NaN and would otherwise fail in the integer
        # cast with an opaque message; report the offending values instead.
        unexpected = sorted({repr(v) for v in y[unmapped].unique()})
        raise ValueError(
            f"Label column '{LABEL_COL}' contains values outside "
            f"{sorted(label_map)}: {unexpected}"
        )
    y = encoded.astype(int)

# ---------------------- PREPROCESSING ----------------------
# Object-dtype columns are one-hot encoded; every other column is scaled.
categorical_cols = [c for c in X.columns if X[c].dtype == "object"]
numeric_cols = [c for c in X.columns if c not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
        ("numeric", StandardScaler(), numeric_cols)
    ]
)

# ---------------------- TRAIN-TEST SPLIT ----------------------
# Stratify on the label only when the config asks for it.
stratify_option = y if pipeline_cfg["data"]["split"]["stratify"] else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=pipeline_cfg["data"]["split"]["test_size"],
    stratify=stratify_option,
    random_state=pipeline_cfg["data"]["split"]["random_state"]
)

# ---------------------- MLFLOW SETUP ----------------------
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")

# Maps a model key from experiments_config.yml to its estimator class.
# Keys that are missing here are skipped by the training loop.
MODEL_CLASSES = {
    "random_forest": RandomForestClassifier
}

# Explicit abbreviations; model types not listed here get one derived from
# their name by get_model_short_name.
MODEL_SHORT_NAMES = {"random_forest": "RF"}


def get_model_short_name(model_type):
    """Return a short tag for ``model_type``.

    A registered abbreviation in MODEL_SHORT_NAMES wins. Otherwise the tag is
    built from the upper-cased first letter of each non-empty
    underscore-separated part of the name (e.g. ``gradient_boosting`` -> ``GB``).
    """
    try:
        return MODEL_SHORT_NAMES[model_type]
    except KeyError:
        pass

    initials = (part[0].upper() for part in model_type.split("_") if part)
    return "".join(initials)

# ---------------------- ZIPPED PARAM GENERATOR ----------------------
def generate_param_combinations(hyperparam_dict: dict):
    """Build hyperparameter sets by pairing list values position by position.

    Combination ``i`` takes element ``i`` from every list, so the number of
    combinations equals the length of the shortest list; surplus values in
    longer lists are ignored.

    Args:
        hyperparam_dict: Mapping of parameter name to a list of candidate
            values. An empty or falsy mapping yields no combinations.

    Returns:
        List of dicts, one per combination, with keys in mapping order.

    Raises:
        ValueError: If any value in the mapping is not a list.
    """
    if not hyperparam_dict:
        return []

    # Validate everything up front so a bad entry fails before any pairing.
    for k, v in hyperparam_dict.items():
        if isinstance(v, list):
            continue
        raise ValueError(f"‚ùå Hyperparameter '{k}' must be a list. Found: {type(v)}")

    names = list(hyperparam_dict)
    # zip stops at the shortest list, which gives the "zipped" truncation.
    return [dict(zip(names, picks)) for picks in zip(*hyperparam_dict.values())]

# ---------------------- CREATE EVAL TABLE ONCE ----------------------
create_eval_table_if_not_exists()

# ---------------------- TRAIN LOOP ----------------------
# One MLflow experiment per model type, one MLflow run per zipped
# hyperparameter combination.
for MODEL_TYPE in MODELS_TO_TRAIN:

    if MODEL_TYPE not in MODEL_CLASSES:
        print(f"‚ö†Ô∏è  Skipping {MODEL_TYPE} - model class not found")
        continue

    if MODEL_TYPE not in experiments_cfg["models"]:
        print(f"‚ö†Ô∏è  Skipping {MODEL_TYPE} - not in experiments_config.yml")
        continue

    model_short = get_model_short_name(MODEL_TYPE)
    MODEL_EXPERIMENT_NAME = f"{BASE_EXPERIMENT_NAME}_{model_short}"

    print(f"\n{'='*80}")
    print(f"üî¨ Setting experiment: {MODEL_EXPERIMENT_NAME}")
    print(f"{'='*80}")

    mlflow.set_experiment(MODEL_EXPERIMENT_NAME)

    ModelClass = MODEL_CLASSES[MODEL_TYPE]

    # A model key with no body parses as None in YAML; treat it as a model
    # without hyperparameters rather than failing on `.get`.
    model_cfg = experiments_cfg["models"][MODEL_TYPE] or {}
    hyperparams = model_cfg.get("hyperparameters", {})
    if not hyperparams:
        print(f"‚ö†Ô∏è No hyperparameters found for {MODEL_TYPE}. Skipping...")
        continue

    PARAM_COMBINATIONS = generate_param_combinations(hyperparams)

    print(f"üéØ Training {MODEL_TYPE.upper()} - {len(PARAM_COMBINATIONS)} runs (ZIPPED MODE)\n")

    for idx, params in enumerate(PARAM_COMBINATIONS, start=1):

        exp_name = f"{RUN_NAME_PREFIX}_{MODEL_TYPE}_run_{idx}"

        with mlflow.start_run(run_name=exp_name) as run:

            model = ModelClass(**params)

            pipeline = Pipeline([
                ("preprocessing", preprocessor),
                ("model", model)
            ])

            # Train. perf_counter is a monotonic clock, so the measured
            # duration is not affected by system clock adjustments.
            start = time.perf_counter()
            pipeline.fit(X_train, y_train)
            train_time = round(time.perf_counter() - start, 4)

            # Training-set predictions: used for train accuracy and reused
            # further down for the model signature.
            train_pred = pipeline.predict(X_train)
            train_accuracy = accuracy_score(y_train, train_pred)

            # Inference on the held-out split (only predict() is timed).
            start_inf = time.perf_counter()
            y_pred = pipeline.predict(X_test)
            inference_time = round(time.perf_counter() - start_inf, 4)

            # Probability of the second class column, when the estimator
            # exposes predict_proba; needed for ROC AUC.
            if hasattr(pipeline.named_steps["model"], "predict_proba"):
                y_proba = pipeline.predict_proba(X_test)[:, 1]
            else:
                y_proba = None

            metrics = {
                "test_accuracy": accuracy_score(y_test, y_pred),
                "test_precision": precision_score(y_test, y_pred),
                "test_recall": recall_score(y_test, y_pred),
                "test_f1": f1_score(y_test, y_pred),
                "train_accuracy": train_accuracy,
                "train_time": train_time,
                "inference_time": inference_time
            }

            if y_proba is not None:
                metrics["test_roc_auc"] = roc_auc_score(y_test, y_proba)

            # Log all metrics in a single call instead of one call per metric.
            mlflow.log_metrics(metrics)

            # Log hyperparameters plus identifying params.
            mlflow.log_params(params)
            mlflow.log_param("model_type", MODEL_TYPE)
            mlflow.log_param("experiment_name", MODEL_EXPERIMENT_NAME)

            # Log the fitted pipeline. The signature reuses train_pred rather
            # than predicting over the training set a second time.
            signature = infer_signature(X_train, train_pred)

            mlflow.sklearn.log_model(
                pipeline,
                artifact_path=MODEL_ARTIFACT_PATH,
                signature=signature,
                input_example=X_train.head(5)
            )

            # Store the evaluation row in the Delta table after training;
            # log_run_to_table skips the insert for duplicate configurations.
            log_run_to_table(
                model_name=exp_name,
                model_type=MODEL_TYPE,
                run_id=run.info.run_id,
                experiment_name=MODEL_EXPERIMENT_NAME,
                hyperparams=params,
                metrics=metrics
            )

            print(f"   ‚úÖ {exp_name} | params={params}")

print("\n" + "=" * 80)
print("üéâ ALL MODELS TRAINING COMPLETED!")
print("=" * 80)
