# 05 - MLflow Experiment Tracking

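This notebook demonstrates how to track experiments with MLflow: it trains a baseline random forest, logs its parameters, metrics, and model, and then queries the recorded runs.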

## Key Features
- **Experiment Tracking**: Log parameters, metrics, and artifacts
- **Model Registry**: Version and manage models
- **Reproducibility**: Track all training runs

In [None]:
import sys
sys.path.insert(0, '..')

import mlflow
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from src.models.mlflow_tracking import (
    setup_mlflow,
    MLflowExperimentTracker,
    log_training_run,
    get_best_model,
)
from src.models.risk_model import FEATURE_COLUMNS, prepare_data

## Setup MLflow

In [None]:
# Initialise MLflow through the project helper and report the resulting
# configuration. The helper's return value is used as the experiment ID.
experiment_id = setup_mlflow()
print(
    f"Experiment ID: {experiment_id}",
    f"Tracking URI: {mlflow.get_tracking_uri()}",
    sep="\n",
)

## Train Model with Tracking

In [None]:
# Read the feature table and hand it to the project helper, which returns
# the train/test splits together with a scaler object.
features_path = '../data/features.parquet'
df = pd.read_parquet(features_path)
X_train, X_test, y_train, y_test, scaler = prepare_data(df)

# Show the split sizes as a quick sanity check before training.
print(
    f"Training set: {len(X_train)} samples",
    f"Test set: {len(X_test)} samples",
    sep="\n",
)

In [None]:
# Train with MLflow tracking
with MLflowExperimentTracker("rf_baseline", tags={"model_type": "random_forest"}) as tracker:
    # Log every hyperparameter that determines the fitted model. The seed is
    # part of the logged set so the run can be reproduced from its tracked
    # parameters alone.
    params = {
        "n_estimators": 100,
        "max_depth": 10,
        "class_weight": "balanced",
        "random_state": 42,
    }
    tracker.log_params(params)

    # Train model. n_jobs only controls parallelism, so it is passed
    # separately and not logged as a hyperparameter.
    model = RandomForestClassifier(**params, n_jobs=-1)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    # Column 1 of predict_proba is the probability of model.classes_[1];
    # assumes a binary target whose positive class is the second label - TODO confirm.
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    accuracy = model.score(X_test, y_test)

    # Log metrics
    tracker.log_metrics({"auc": auc, "accuracy": accuracy})

    # Log model under the given registered-model name
    tracker.log_model(model, registered_model_name="identity-risk-model")

    print(f"Run ID: {tracker.run_id}")
    print(f"AUC: {auc:.4f}")

## View Experiment Results

In [None]:
# Get all runs recorded under the experiment.
# NOTE(review): the experiment name is hard-coded; confirm it matches the
# experiment configured by setup_mlflow().
runs = mlflow.search_runs(experiment_names=["identity-risk-scoring"])
print(f"Total runs: {len(runs)}")

# search_runs only creates a metrics.*/params.* column when at least one
# returned run logged that key, so plain column selection raises KeyError on
# an empty or partially-logged experiment. reindex yields the same table when
# the columns exist and NaN-filled columns otherwise.
summary_columns = ['run_id', 'metrics.auc', 'params.n_estimators', 'params.max_depth']
runs.reindex(columns=summary_columns).head()

## Get Best Model

In [None]:
# Ask the project helper for the best model according to the "auc" metric and
# print its details. The result is used here as a mapping with 'run_id',
# 'metrics', and 'model_uri' keys.
try:
    best = get_best_model(metric="auc")
    print(f"Best Run ID: {best['run_id']}")
    # 'N/A' is shown when the metrics mapping has no 'auc' entry.
    print(f"Best AUC: {best['metrics'].get('auc', 'N/A')}")
    print(f"Model URI: {best['model_uri']}")
except Exception as e:
    # Deliberately broad: any failure in the lookup or the prints above is
    # reported as text instead of raising.
    print(f"Error: {e}")

## MLflow UI

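To view the MLflow UI, start the server from a terminal. By default `mlflow ui` reads runs from `./mlruns` in the current directory; if the tracking URI printed in the Setup section points elsewhere, add `--backend-store-uri <tracking-uri>`: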
```bash
mlflow ui --port 5000
```
Then open http://localhost:5000