In [None]:
### TRAINING_script

%pip install scikit-learn pyyaml xgboost
dbutils.library.restartPython()

import mlflow
import time
import yaml
import sys
import os
import numpy as np
import pandas as pd
import warnings
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession

# Silence every warning category for the whole run so noisy library warnings
# (threadpoolctl in particular) do not clutter the job output.
warnings.filterwarnings("ignore")

# NOTE: `category` must be a Warning subclass. Passing AttributeError here is
# rejected by warnings.filterwarnings() and aborts the script before any
# training starts, so the module-scoped filter is registered without it.
warnings.filterwarnings("ignore", module="threadpoolctl")

# Keep threadpoolctl's logger quiet below ERROR level as well.
import logging
logging.getLogger("threadpoolctl").setLevel(logging.ERROR)

# Opening banner for the job log: an 80-character rule above and below the
# title, emitted by one print call with newline separators.
print(
    "=" * 80,
    "üöÄ CREDIT RISK TRAINING - MULTI MODEL MODE",
    "=" * 80,
    sep="\n",
)

# 🔥 LOAD CONFIG FILES

# Read both YAML configuration files from the current working directory.
# They are opened explicitly as UTF-8 so parsing does not depend on the
# platform's default encoding. An empty file makes yaml.safe_load return None;
# that is normalised to an empty dict so later lookups fail with a clear
# missing-key / "no models defined" error instead of an error on None.
with open("pipeline_config.yml", "r", encoding="utf-8") as f:
    pipeline_cfg = yaml.safe_load(f) or {}

with open("experiments_config.yml", "r", encoding="utf-8") as f:
    experiments_cfg = yaml.safe_load(f) or {}

# 🔹 ENSURE WIDGET EXISTS (REQUIRED FOR DATABRICKS JOBS)
# Creating the widget is best-effort: if it cannot be created (for example
# because `dbutils` is not defined in this session) the script keeps going,
# but the reason is reported instead of being silently discarded.
try:
    dbutils.widgets.text(
        "MODELS_TO_TRAIN",
        "",
        "Models to Train"
    )
except Exception as widget_error:
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt and
    # SystemExit still propagate.
    print(f"MODELS_TO_TRAIN widget was not created: {widget_error!r}")

# 🔥 GET MODELS TO TRAIN (Git / Job Variable)

def get_models_to_train():
    """
    Resolve the list of model names to train in this run.

    The request is read from the ``MODELS_TO_TRAIN`` Databricks widget. When
    the widget cannot be read, or holds an empty value, the ``MODELS_TO_TRAIN``
    environment variable is used instead. The value is either the keyword
    ``all`` (case-insensitive) or a comma-separated list of model names, each
    of which must be a key under ``models`` in ``experiments_cfg``.

    Returns:
        list: Requested model names in the order given, without duplicates.
        For ``all``, every model key found in ``experiments_cfg``.

    Raises:
        ValueError: If no models are configured, if ``MODELS_TO_TRAIN`` is
            empty or a placeholder (``none`` / ``null`` / ``undefined``), or
            if a requested name is not configured.
    """
    # `or {}` also covers a `models:` key that is present but empty (None).
    available_models = list(experiments_cfg.get("models") or {})

    if not available_models:
        raise ValueError("‚ùå No models defined in experiments_config.yml")

    # Widget first. Any failure here (dbutils not defined, widget missing)
    # simply means the environment variable is consulted instead.
    value = None
    try:
        value = dbutils.widgets.get("MODELS_TO_TRAIN")
    except Exception:
        value = None
    else:
        print(f"üìå MODELS_TO_TRAIN from Widget: '{value}'")

    # An existing widget with an empty value must not hide a value supplied
    # through the environment.
    if not value or not value.strip():
        value = os.getenv("MODELS_TO_TRAIN", "")
        print(f"üìå MODELS_TO_TRAIN from ENV: '{value}'")

    # Clean the value
    if value:
        value = value.strip()

    # Empty values and literal placeholder strings all count as "not set".
    if not value or value.lower() in ("none", "null", "undefined"):
        raise ValueError(
            f"‚ùå MODELS_TO_TRAIN is not set!\n"
            f"   Available models in experiments_config.yml: {available_models}\n"
            f"   \n"
            f"   Set via Git CI/CD:\n"
            f"   - GitHub: Set variable MODELS_TO_TRAIN='random_forest,xgboost'\n"
            f"   - GitLab: Set CI/CD variable MODELS_TO_TRAIN='random_forest,xgboost'\n"
            f"   - Databricks Job: Pass as parameter\n"
            f"   \n"
            f"   Special keywords:\n"
            f"   - 'all' = train all available models\n"
            f"   \n"
            f"   Current value received: '{value}'"
        )

    # "all" selects every configured model.
    if value.lower() == "all":
        print(f"‚úÖ Training ALL models: {available_models}")
        return available_models

    # Split on commas, drop blanks, and drop repeated names (keeping the first
    # occurrence) so the same model is not trained twice.
    models = list(dict.fromkeys(m.strip() for m in value.split(",") if m.strip()))

    if not models:
        raise ValueError(f"‚ùå No valid models found in MODELS_TO_TRAIN='{value}'")

    # Every requested name must exist in experiments_config.yml.
    invalid_models = [m for m in models if m not in available_models]

    if invalid_models:
        raise ValueError(
            f"‚ùå Invalid model names: {invalid_models}\n"
            f"   Available in experiments_config.yml: {available_models}\n"
            f"   \n"
            f"   You tried to train: {models}\n"
            f"   Check your Git variable or experiments_config.yml"
        )

    print(f"‚úÖ Training selected models: {models}")
    return models

# Resolve the model list early, before any data loading or MLflow operations,
# so an invalid MODELS_TO_TRAIN setting stops the notebook right away.
# NOTE(review): confirm that dbutils.notebook.exit() is reported to the job
# scheduler as a failure; if it is not, consider re-raising here instead.
try:
    MODELS_TO_TRAIN = get_models_to_train()
except ValueError as config_error:
    print(config_error)
    dbutils.notebook.exit("FAILED: Invalid MODELS_TO_TRAIN configuration")
else:
    print(f"\nüìã Models to train: {MODELS_TO_TRAIN}\n")

# 🔥 PIPELINE SETTINGS

# Copy the experiment- and data-related entries of the pipeline config into
# module-level constants used by the rest of the script.
_experiment_cfg = pipeline_cfg["experiment"]
BASE_EXPERIMENT_NAME = _experiment_cfg["name"]
MODEL_ARTIFACT_PATH = _experiment_cfg["artifact_path"]

_data_cfg = pipeline_cfg["data"]
RAW_INPUT_TABLE = _data_cfg["input_table"]
FEATURES = _data_cfg["features"]
LABEL_COL = _data_cfg["label"]

RUN_NAME_PREFIX = _experiment_cfg["run_name_prefix"]

# 🔥 LOAD DATA

# Read the whole input table through Spark and convert it to a pandas
# DataFrame for scikit-learn.
spark = SparkSession.builder.appName("CreditRiskTraining").getOrCreate()
df = spark.read.table(RAW_INPUT_TABLE).toPandas()

X = df[FEATURES]
y = df[LABEL_COL]

# Object-typed labels are encoded as yes -> 1, no -> 0. Any other value
# (including a missing label) maps to NaN and would make the integer cast fail
# with an opaque error, so the offending values are reported explicitly.
if y.dtype == "object":
    encoded_labels = y.map({"yes": 1, "no": 0})
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unexpected = sorted({repr(v) for v in y[unmapped].unique()})
        raise ValueError(
            f"Label column '{LABEL_COL}' contains values other than "
            f"'yes'/'no': {unexpected}"
        )
    y = encoded_labels.astype(int)

# 🔥 PREPROCESSING

# Partition the feature columns by dtype: object-typed columns are one-hot
# encoded, every other column is standardised.
categorical_cols = []
numeric_cols = []
for column in X.columns:
    if X[column].dtype == "object":
        categorical_cols.append(column)
    else:
        numeric_cols.append(column)

# Categories not seen during fit are ignored at transform time, and the
# encoder returns a dense array.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", one_hot, categorical_cols),
        ("numeric", scaler, numeric_cols),
    ]
)

# 🔥 TRAIN-TEST SPLIT

# Split settings (stratify flag, test fraction, seed) come from the
# data.split section of the pipeline config.
split_cfg = pipeline_cfg["data"]["split"]

# Stratify on the label only when the config asks for it.
if split_cfg["stratify"]:
    stratify_option = y
else:
    stratify_option = None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_cfg["test_size"],
    stratify=stratify_option,
    random_state=split_cfg["random_state"],
)

# 🔥 MLflow SETUP

# Point MLflow at the "databricks" tracking URI and the "databricks-uc"
# registry URI, then select the experiment that receives every run below.
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")

print(f"üî¨ Setting MLflow experiment: {BASE_EXPERIMENT_NAME}")
mlflow.set_experiment(BASE_EXPERIMENT_NAME)
print("‚úÖ Experiment set successfully", end="\n\n")

# Maps the model names used in MODELS_TO_TRAIN / experiments_config.yml to
# the estimator class that implements them.
MODEL_CLASSES = dict(
    random_forest=RandomForestClassifier,
    xgboost=XGBClassifier,
    logistic_regression=LogisticRegression,
)

# 🔥 TRAIN LOOP

# For every requested model type, run each configured experiment as its own
# MLflow run: fit preprocessing + model, evaluate on the train and test
# splits, then log metrics, parameters and the fitted pipeline.
for MODEL_TYPE in MODELS_TO_TRAIN:

    # Defensive guards: a requested model needs both an estimator class and an
    # entry in experiments_config.yml; otherwise it is skipped with a notice.
    if MODEL_TYPE not in MODEL_CLASSES:
        print(f"‚ö†Ô∏è  Skipping {MODEL_TYPE} - model class not found")
        continue

    if MODEL_TYPE not in experiments_cfg["models"]:
        print(f"‚ö†Ô∏è  Skipping {MODEL_TYPE} - not in experiments_config.yml")
        continue

    ModelClass = MODEL_CLASSES[MODEL_TYPE]
    EXPERIMENT_LIST = experiments_cfg["models"][MODEL_TYPE]["experiments"]

    print(f"\n{'='*80}")
    print(f"üéØ Training {MODEL_TYPE.upper()} - {len(EXPERIMENT_LIST)} experiments")
    print(f"{'='*80}")

    for exp in EXPERIMENT_LIST:

        exp_name = f"{RUN_NAME_PREFIX}_{MODEL_TYPE}_{exp['name']}"
        # Copy the configured hyper-parameters; an experiment with a missing
        # or empty `params` entry is trained with the estimator's defaults.
        params = dict(exp.get("params") or {})

        with mlflow.start_run(run_name=exp_name):

            model = ModelClass(**params)

            pipeline = Pipeline([
                ("preprocessing", preprocessor),
                ("model", model)
            ])

            # perf_counter() is a monotonic clock intended for measuring
            # durations; it is unaffected by system clock adjustments.
            start = time.perf_counter()
            pipeline.fit(X_train, y_train)
            train_time = round(time.perf_counter() - start, 4)

            # Training-set predictions are used for train accuracy and, further
            # down, as the output example for the model signature.
            train_pred = pipeline.predict(X_train)
            train_accuracy = accuracy_score(y_train, train_pred)

            start_inf = time.perf_counter()
            y_pred = pipeline.predict(X_test)
            inference_time = round(time.perf_counter() - start_inf, 4)

            # ROC AUC needs scores; take column 1 of predict_proba when the
            # estimator provides it.
            if hasattr(pipeline.named_steps["model"], "predict_proba"):
                y_proba = pipeline.predict_proba(X_test)[:, 1]
            else:
                y_proba = None

            metrics = {
                "test_accuracy": accuracy_score(y_test, y_pred),
                "test_precision": precision_score(y_test, y_pred),
                "test_recall": recall_score(y_test, y_pred),
                "test_f1": f1_score(y_test, y_pred),
                "train_accuracy": train_accuracy,
                "train_time": train_time,
                "inference_time": inference_time
            }

            if y_proba is not None:
                metrics["test_roc_auc"] = roc_auc_score(y_test, y_proba)

            # Log all metrics in a single call.
            mlflow.log_metrics(metrics)

            # For logistic regression, also record the iteration count
            # (maximum over n_iter_).
            model_step = pipeline.named_steps["model"]
            if MODEL_TYPE == "logistic_regression" and hasattr(model_step, "n_iter_"):
                mlflow.log_metric("lr_n_iterations", int(np.max(model_step.n_iter_)))

            mlflow.log_params(params)
            mlflow.log_param("model_type", MODEL_TYPE)
            mlflow.log_param("experiment_name", BASE_EXPERIMENT_NAME)

            # Reuse the predictions computed above instead of running a second
            # full predict over the training set.
            signature = infer_signature(X_train, train_pred)

            mlflow.sklearn.log_model(
                pipeline,
                artifact_path=MODEL_ARTIFACT_PATH,
                signature=signature,
                input_example=X_train.head(5)
            )

            print(f"   ‚úÖ {exp_name}")

# Closing banner: a blank line, then the completion message framed by two
# 80-character rules, emitted by one print call with newline separators.
closing_rule = "=" * 80
print(
    "\n" + closing_rule,
    "üéâ ALL MODELS TRAINING COMPLETED!",
    closing_rule,
    sep="\n",
)