In [None]:
# 🚀 CREDIT RISK TRAINING - MULTI MODEL SUPPORT

%pip install scikit-learn pyyaml xgboost

import mlflow
import time
import yaml
import sys
import os  # ‚úÖ NEW: required for Git / Env variables
import numpy as np
import pandas as pd
import warnings
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession

# Suppress every warning category for the remainder of the run.
# NOTE(review): this also hides any warnings emitted by the training
# libraries used below - confirm that is intended.
warnings.filterwarnings("ignore")

print("=" * 80)
print("üöÄ CREDIT RISK TRAINING - MULTI MODEL MODE")
print("=" * 80)

# ==========================================
# LOAD BOTH CONFIG FILES
# ==========================================
# pipeline_config.yml    -> read below via its "experiment" and "data" keys
# experiments_config.yml -> read below via its "models" key
# Both paths are relative to the current working directory.

print("\nüìã Loading configuration files...")

# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, so the files parse the same way on every machine.
with open("pipeline_config.yml", "r", encoding="utf-8") as f:
    pipeline_cfg = yaml.safe_load(f)

with open("experiments_config.yml", "r", encoding="utf-8") as f:
    experiments_cfg = yaml.safe_load(f)

print("‚úÖ Configs loaded successfully")

# ==========================================
# GET MODELS TO TRAIN (Databricks Job Safe)
# ==========================================
# The job passes a comma-separated list of model type names through the
# "MODELS_TO_TRAIN" widget, e.g. "random_forest,xgboost".

try:
    MODELS_TO_TRAIN_ENV = dbutils.widgets.get("MODELS_TO_TRAIN")
except Exception:
    # Covers both a missing widget and `dbutils` not being defined at all
    # (NameError), without swallowing KeyboardInterrupt / SystemExit the way
    # a bare `except:` would.
    MODELS_TO_TRAIN_ENV = None

if not MODELS_TO_TRAIN_ENV:
    raise ValueError("‚ùå MODELS_TO_TRAIN job parameter is not set")

# Strip whitespace and drop blank entries so a trailing or doubled comma does
# not produce an empty model name.
MODELS_TO_TRAIN = [
    name.strip() for name in MODELS_TO_TRAIN_ENV.split(",") if name.strip()
]

if not MODELS_TO_TRAIN:
    # The parameter was set but held only separators / whitespace.
    raise ValueError("MODELS_TO_TRAIN job parameter contains no model names")

print(f"\n‚úÖ Models to train: {MODELS_TO_TRAIN}")

# ==========================================
# 1. Extract Pipeline Settings (SAME AS BEFORE)
# ==========================================
# Pull the values used by the rest of the script out of pipeline_cfg once,
# grouped by the config section they come from.

_experiment_section = pipeline_cfg["experiment"]
_data_section = pipeline_cfg["data"]

# "experiment" section: MLflow experiment name, artifact path for logged
# models, and the prefix applied to every run name.
EXPERIMENT_NAME = _experiment_section["name"]
MODEL_ARTIFACT_PATH = _experiment_section["artifact_path"]
RUN_NAME_PREFIX = _experiment_section["run_name_prefix"]

# "data" section: source table, feature column list and label column.
RAW_INPUT_TABLE = _data_section["input_table"]
FEATURES = _data_section["features"]
LABEL_COL = _data_section["label"]

# ==========================================
# 2. Load Raw Data (SAME AS BEFORE)
# ==========================================
# Read the whole input table through Spark and collect it into a pandas
# DataFrame on the driver, then split it into features (X) and label (y).

spark = SparkSession.builder.appName("CreditRiskTraining").getOrCreate()
df = spark.read.table(RAW_INPUT_TABLE).toPandas()

X = df[FEATURES]
y = df[LABEL_COL]

# Object-dtype labels are encoded as "yes" -> 1, "no" -> 0. Any other value
# (including different casing or nulls) is reported explicitly instead of
# surfacing later as an opaque NaN-to-int cast error.
if y.dtype == "object":
    label_codes = y.map({"yes": 1, "no": 0})
    unmapped = label_codes.isna()
    if unmapped.any():
        bad_values = sorted(y[unmapped].astype(str).unique().tolist())
        raise ValueError(
            f"Label column {LABEL_COL!r} contains values other than "
            f"'yes'/'no' (showing up to 10): {bad_values[:10]}"
        )
    y = label_codes.astype(int)

# ==========================================
# 3. Preprocessing (SAME AS BEFORE)
# ==========================================
# Split the feature columns by dtype: object-dtype columns are treated as
# categorical, every other column as numeric.

categorical_cols = []
numeric_cols = []
for _column in X.columns:
    if X[_column].dtype == "object":
        categorical_cols.append(_column)
    else:
        numeric_cols.append(_column)

# Categorical columns are one-hot encoded (categories unseen during fit are
# ignored at transform time, dense output); numeric columns are standardized.
_one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
_standard_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", _one_hot_encoder, categorical_cols),
        ("numeric", _standard_scaler, numeric_cols),
    ]
)

# ==========================================
# 4. Train-Test Split (SAME AS BEFORE)
# ==========================================
# All split settings come from pipeline_cfg["data"]["split"].

_split_cfg = pipeline_cfg["data"]["split"]

# Stratify on the label only when the config enables it.
if _split_cfg["stratify"]:
    stratify_option = y
else:
    stratify_option = None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=_split_cfg["test_size"],
    stratify=stratify_option,
    random_state=_split_cfg["random_state"],
)

# ==========================================
# 5. MLflow Setup (SAME AS BEFORE)
# ==========================================
# Tracking URI "databricks", registry URI "databricks-uc" (presumably the
# Unity Catalog model registry - verify), and the experiment named in
# pipeline_config.yml.

mlflow.set_tracking_uri(uri="databricks")
mlflow.set_registry_uri(uri="databricks-uc")
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# ==========================================
# MODEL CLASS MAPPING
# ==========================================
# Maps the model type names accepted in MODELS_TO_TRAIN to the estimator
# class that is instantiated for each experiment.

MODEL_CLASSES = dict(
    random_forest=RandomForestClassifier,
    xgboost=XGBClassifier,
    logistic_regression=LogisticRegression,
)

# ==========================================
# 6. TRAIN EACH MODEL TYPE (MAIN LOOP)
# ==========================================
# For every requested model type, each entry under
# experiments_cfg["models"][<model type>]["experiments"] is trained and
# logged as its own MLflow run. Model types that are unsupported or have no
# configuration are skipped and reported in the final summary.

trained_models = []   # model types whose experiment loop ran to completion
skipped_models = []   # model types skipped as unsupported or unconfigured

for MODEL_TYPE in MODELS_TO_TRAIN:

    print(f"\n{'='*80}")
    print(f"üöÄ Training Model: {MODEL_TYPE.upper()}")
    print(f"{'='*80}")

    if MODEL_TYPE not in MODEL_CLASSES:
        print(f"‚ùå Unsupported model type: {MODEL_TYPE}")
        skipped_models.append(MODEL_TYPE)
        continue

    if MODEL_TYPE not in experiments_cfg["models"]:
        print(f"‚ùå No experiments found for {MODEL_TYPE} in experiments_config.yml")
        skipped_models.append(MODEL_TYPE)
        continue

    ModelClass = MODEL_CLASSES[MODEL_TYPE]
    model_config = experiments_cfg["models"][MODEL_TYPE]
    EXPERIMENT_LIST = model_config["experiments"]

    print(f"üîç Running {len(EXPERIMENT_LIST)} experiments for {MODEL_TYPE}")

    for exp in EXPERIMENT_LIST:
        exp_name = f"{RUN_NAME_PREFIX}_{MODEL_TYPE}_{exp['name']}"
        # A `params:` key left empty in YAML loads as None; treat that as
        # "use the estimator defaults" instead of failing on None.copy().
        params = dict(exp["params"] or {})

        with mlflow.start_run(run_name=exp_name):

            # Log the configuration first so a run that fails during
            # construction or fitting still records what was attempted.
            mlflow.log_params(params)
            mlflow.log_param("model_type", MODEL_TYPE)

            model = ModelClass(**params)

            pipeline = Pipeline([
                ("preprocessing", preprocessor),
                ("model", model)
            ])

            # Wall-clock time of fitting preprocessing + model together.
            start = time.time()
            pipeline.fit(X_train, y_train)
            train_time = round(time.time() - start, 4)

            train_pred = pipeline.predict(X_train)
            train_accuracy = accuracy_score(y_train, train_pred)

            # Wall-clock time of predicting the whole test set.
            start_inf = time.time()
            y_pred = pipeline.predict(X_test)
            inference_time = round(time.time() - start_inf, 4)

            # Second column of predict_proba is used as the score for ROC AUC.
            y_proba = pipeline.predict_proba(X_test)[:, 1]

            metrics = {
                "test_accuracy": accuracy_score(y_test, y_pred),
                "test_precision": precision_score(y_test, y_pred),
                "test_recall": recall_score(y_test, y_pred),
                "test_f1": f1_score(y_test, y_pred),
                "test_roc_auc": roc_auc_score(y_test, y_proba),
                "train_accuracy": train_accuracy,
                "train_time": train_time,
                "inference_time": inference_time
            }

            # One batched call instead of one call per metric.
            mlflow.log_metrics(metrics)

            # Reuse the training predictions computed above rather than
            # running predict over the training set a second time.
            signature = infer_signature(X_train, train_pred)

            mlflow.sklearn.log_model(
                pipeline,
                artifact_path=MODEL_ARTIFACT_PATH,
                signature=signature,
                input_example=X_train.head(5)
            )

            print(f"   ‚úÖ {exp_name} completed")

    trained_models.append(MODEL_TYPE)

    print(f"\n{'='*80}")
    print(f"üéâ {MODEL_TYPE.upper()} TRAINING COMPLETED!")
    print(f"{'='*80}")

# ==========================================
# FINAL SUMMARY
# ==========================================
# Report only the model types that were actually trained; skipped ones are
# listed separately so they are not mistaken for successful runs.

print("\n" + "=" * 80)
print("üéâ ALL MODELS TRAINING COMPLETED!")
print("=" * 80)
print(f"‚úÖ Trained models: {', '.join(trained_models) or 'none'}")
if skipped_models:
    print(f"Skipped models (unsupported or not configured): {', '.join(skipped_models)}")
print("=" * 80)
