In [None]:
# 🚀 CREDIT RISK CLASSIFICATION TRAINING - FIXED VERSION

%pip install scikit-learn pyyaml

import mlflow
import yaml
import numpy as np
import pandas as pd
import warnings
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession

# Suppress every warning category for the remainder of the run.
warnings.filterwarnings("ignore")

print("=" * 80)
print("üöÄ CREDIT RISK CLASSIFICATION - CONFIG DRIVEN TRAINING")
print("=" * 80)

# -----------------------------------------
# 1. Load pipeline config (pipeline_config.yml)
# -----------------------------------------
# Open with an explicit UTF-8 encoding: without it, open() falls back to the
# platform locale encoding, which can mis-decode non-ASCII values in the YAML.
with open("pipeline_config.yml", "r", encoding="utf-8") as f:
    pipeline_cfg = yaml.safe_load(f)

# Model and experiment settings. A missing key raises KeyError right here,
# before any data is loaded.
MODEL_TYPE = pipeline_cfg["model"]["type"]
EXPERIMENT_NAME = pipeline_cfg["experiment"]["name"]
MODEL_ARTIFACT_PATH = pipeline_cfg["experiment"]["artifact_path"]

# Data source: input table, feature column names and label column name.
RAW_INPUT_TABLE = pipeline_cfg["data"]["input_table"]
FEATURES = pipeline_cfg["data"]["features"]
LABEL_COL = pipeline_cfg["data"]["label"]

# Train/test split settings, passed to train_test_split further down.
TEST_SIZE = pipeline_cfg["data"]["split"]["test_size"]
RANDOM_STATE = pipeline_cfg["data"]["split"]["random_state"]
STRATIFY = pipeline_cfg["data"]["split"]["stratify"]

print("‚úÖ Pipeline config loaded.")

# -----------------------------------------
# 2. Load experiment config (config.yml)
# -----------------------------------------
# Explicit UTF-8 so the file is decoded the same way on every platform.
with open("config.yml", "r", encoding="utf-8") as f:
    experiment_cfg = yaml.safe_load(f)

EXPERIMENT_LIST = experiment_cfg["experiments"]

# Fail fast: the training loop iterates this value and reads exp["name"] from
# each entry, so it must be a non-empty list. Without this check an empty or
# null value only surfaces later as an unrelated TypeError / KeyError.
if not isinstance(EXPERIMENT_LIST, list) or not EXPERIMENT_LIST:
    raise ValueError(
        "config.yml must define a non-empty 'experiments' list; "
        f"got {EXPERIMENT_LIST!r}"
    )

print(f"üîç Loaded {len(EXPERIMENT_LIST)} experiment variations.")

# -----------------------------------------
# 3. Load raw data (with label encoding)
# -----------------------------------------
spark = SparkSession.builder.appName("CreditRiskTraining").getOrCreate()

print(f"üì¶ Loading RAW data ‚Üí {RAW_INPUT_TABLE}")
# Pull the whole table into a pandas DataFrame for scikit-learn.
df = spark.read.table(RAW_INPUT_TABLE).toPandas()

X = df[FEATURES]
y = df[LABEL_COL]

# Convert textual labels ("yes" / "no") to numeric 1 / 0 for training.
# Covers both object-dtype columns and pandas string-dtype columns.
if y.dtype == "object" or pd.api.types.is_string_dtype(y):
    label_map = {"yes": 1, "no": 0}
    # Normalise case and surrounding whitespace so "Yes" / " no " still map.
    encoded = y.astype(str).str.strip().str.lower().map(label_map)

    # Anything outside the mapping (including missing labels) becomes NaN;
    # report it explicitly instead of failing inside the integer cast.
    unmapped = encoded.isna()
    if unmapped.any():
        bad_values = list(y[unmapped].unique()[:5])
        raise ValueError(
            f"Label column {LABEL_COL!r} has {int(unmapped.sum())} value(s) "
            f"that are not 'yes'/'no'; examples: {bad_values!r}"
        )

    y = encoded.astype(int)

# -----------------------------------------
# 4. Set up the preprocessing pipeline (operates on raw data)
# -----------------------------------------
# Bucket every feature column in a single pass: object-dtype columns are
# treated as categorical, everything else as numeric.
categorical_cols, numeric_cols = [], []
for column in X.columns:
    if X[column].dtype == "object":
        categorical_cols.append(column)
    else:
        numeric_cols.append(column)

print(f"üîß Categorical Features: {len(categorical_cols)} ‚Üí {categorical_cols}")
print(f"üîß Numeric Features: {len(numeric_cols)} ‚Üí {numeric_cols}")

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and standardise the numeric columns.
column_steps = [
    (
        "categorical",
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        categorical_cols,
    ),
    ("numeric", StandardScaler(), numeric_cols),
]

# remainder="passthrough" forwards any column not listed above unchanged.
preprocessor = ColumnTransformer(
    transformers=column_steps,
    remainder="passthrough",
)

# -----------------------------------------
# 5. Train a model for each configured experiment
# -----------------------------------------
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")
mlflow.set_experiment(EXPERIMENT_NAME)

# Split once, outside the loop, so every experiment is trained and scored on
# exactly the same rows.
stratify_option = y if STRATIFY else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=stratify_option, random_state=RANDOM_STATE
)

# Collected as (experiment name, metrics dict) pairs for the summary section.
results = []

for exp in EXPERIMENT_LIST:

    exp_name = exp["name"]
    # Work on a copy so the pop() below never mutates the loaded config. An
    # experiment without a "params" entry falls back to estimator defaults.
    params = dict(exp.get("params") or {})

    # random_state is passed explicitly to the estimator, so remove it from
    # params to avoid a duplicate keyword argument; default to the split seed.
    config_random_state = params.pop("random_state", RANDOM_STATE)

    print(f"\n‚öôÔ∏è Training experiment: {exp_name}")
    print(f"   Parameters: {params}")

    rf_model = RandomForestClassifier(random_state=config_random_state, **params)

    # Complete pipeline: preprocessing followed by the classifier.
    full_pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("model", rf_model)
    ])

    with mlflow.start_run(run_name=exp_name):

        # Fit on the raw feature frame; the pipeline does the preprocessing.
        full_pipeline.fit(X_train, y_train)

        # Hard predictions, plus the probability column at index 1 for ROC-AUC.
        y_pred = full_pipeline.predict(X_test)
        y_proba = full_pipeline.predict_proba(X_test)[:, 1]

        # ----- Metrics -----
        metrics = {
            "test_accuracy": accuracy_score(y_test, y_pred),
            "test_precision": precision_score(y_test, y_pred, zero_division=0),
            "test_recall": recall_score(y_test, y_pred, zero_division=0),
            "test_f1": f1_score(y_test, y_pred, zero_division=0),
            "test_roc_auc": roc_auc_score(y_test, y_proba),
        }

        # Log all metrics in a single call.
        mlflow.log_metrics(metrics)

        # Log hyperparameters, including the seed that was actually used.
        mlflow.log_params(params)
        mlflow.log_param("random_state", config_random_state)

        # Signature: raw feature frame in, predicted labels out. Reuse the
        # test-set predictions for the output schema instead of running a
        # second prediction pass over the entire training set.
        signature = infer_signature(X_train, y_pred)

        # Log the FULL pipeline (preprocessing + model).
        mlflow.sklearn.log_model(
            sk_model=full_pipeline,
            artifact_path=MODEL_ARTIFACT_PATH,
            signature=signature,
            input_example=X_train.head(5)  # example rows of raw input
        )

        print(f"   ‚úÖ {exp_name} ‚Üí F1: {metrics['test_f1']:.4f}, "
              f"Recall: {metrics['test_recall']:.4f}, "
              f"ROC-AUC: {metrics['test_roc_auc']:.4f}")

        results.append((exp_name, metrics))

# -----------------------------------------
# 6. Summary of all experiments
# -----------------------------------------
# Without any results there is no "test_f1" column to rank by, so stop with a
# clear message before announcing success.
if not results:
    raise RuntimeError(
        "No experiments were trained; check the 'experiments' list in config.yml."
    )

print("\n" + "=" * 80)
print("üéâ TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 80)
print("\nüìä EXPERIMENT RESULTS SUMMARY:\n")

# One row per experiment: its name followed by its metric columns.
results_df = pd.DataFrame([
    {"Experiment": name, **scores}
    for name, scores in results
])

print(results_df.to_string(index=False))

# Find the best model by F1 score. idxmax() returns an index *label*, so it
# must be looked up with .loc rather than the positional .iloc.
best_idx = results_df["test_f1"].idxmax()
best_model = results_df.loc[best_idx]

print("\n" + "=" * 80)
print(f"üèÜ BEST MODEL: {best_model['Experiment']}")
print(f"   F1 Score: {best_model['test_f1']:.4f}")
print(f"   Recall: {best_model['test_recall']:.4f}")
print(f"   ROC-AUC: {best_model['test_roc_auc']:.4f}")
print("=" * 80)

print("\n‚úÖ All models logged to MLflow and can accept RAW data!")
print("‚úÖ Models ready for serving endpoint deployment!")