# 07 - MLflow Experiments

Run and compare CatBoost, Bayesian, and Linear baselines with unified MLflow logging.


In [1]:
import mlflow
import shutil

from IPython.display import clear_output
from pathlib import Path
from mlflow.tracking import MlflowClient
from sympy import prime

from nba_ingame_prob.consts import proj_paths
from nba_ingame_prob.training.hpo import (
    replicate_bayesian_best, 
    replicate_catboost_best, 
    replicate_linear_logistic_best, 
    replicate_linear_regression_best,
    run_study_bayesian, 
    run_study_catboost, 
    run_study_linear_logistic, 
    run_study_linear_regression,
    run_hpo_with_override,
    replicate_with_custom_config
)

In [2]:
models = {
    "linear_regression": {
        "hpo_fn": run_study_linear_regression,
        "replicate_fn": replicate_linear_regression_best,
        "config": proj_paths.config.linear_regression,
    },
    "linear_logistic": {
        "hpo_fn": run_study_linear_logistic,
        "replicate_fn": replicate_linear_logistic_best,
        "config": proj_paths.config.linear_logistic,
    },
    "catboost": {
        "hpo_fn": run_study_catboost,
        "replicate_fn": replicate_catboost_best,
        "config": proj_paths.config.catboost,
    },
    "bayesian": {
        "hpo_fn": run_study_bayesian,
        "replicate_fn": replicate_bayesian_best,
        "config": proj_paths.config.bayesian,
    },
}

In [3]:
hpo_results = {}
n_trials = 50

for model_name, model_info in models.items():
    hpo_fn = model_info["hpo_fn"]
    config_path = model_info["config"]
    experiment_name = "NBAGames-HPO"
    
    study = run_hpo_with_override(
        hpo_fn, config_path, 
        study_name=f"{model_name}-study", 
        experiment_name=experiment_name, 
        n_trials=n_trials, 
        reserve_validation=True
    )
    
    hpo_results[model_name] = {
        "study": study,
        "best_params": study.best_params,
        "best_value": study.best_value,
    }
    
clear_output(wait=True)
print("\nHPO Results Summary:")
for model_name, hpo_result in hpo_results.items():
    print(f"{model_name}: {hpo_result['best_value']:.4f} | {hpo_result['best_params']}")


HPO Results Summary:
linear_regression: 0.4489 | {'alpha': 99.94523496627072}
linear_logistic: 0.4388 | {'C': 0.0036383924346184548}
catboost: 0.4047 | {'learning_rate': 0.09636223593149941, 'max_depth': 5, 'iterations': 1400, 'l2_leaf_reg': 0.4290344235973468, 'colsample_bylevel': 0.9433600798423123, 'subsample': 0.5369048376010586, 'max_bin': 91}
bayesian: 0.4050 | {'learning_rate': 0.002178737122149179, 'weight_decay': 1.2171989575527562e-06, 'lr_gamma': 0.8354521167934316, 'team_hidden_dim': 4, 'team_layers': 3, 'res_hidden_dim': 22, 'res_layers': 2, 'embedding_dim': None}


In [4]:
n_trials = 30

primes = [prime(i) for i in range(1, 30)]
seed_study_results = {}

for model_name, result in hpo_results.items():
    print(f"Running SEEDS evaluation for {model_name}...")
    study = result["study"]
    config_path = models[model_name]["config"]
    replicate_fn = models[model_name]["replicate_fn"]
    experiment_name = "NBAGames-SEEDS"
    
    # Use helper function with explicit reserve_validation=True and get summary
    results, summary = replicate_with_custom_config(
        replicate_fn, config_path, study, 
        seeds=primes, 
        experiment_name=experiment_name, 
        reserve_validation=True
    )
    
    seed_study_results[model_name] = summary
    print(f"  Mean: {summary['mean']:.4f} ± {summary['std']:.4f}")
    print(f"  Best seed: {summary['best_seed']} (metric: {summary['best_metric']:.4f})")

clear_output(wait=True)
print("\nSEEDS Results Summary:")
for model_name, summary in seed_study_results.items():
    print(f"{model_name}: {summary['mean']:.4f} ± {summary['std']:.4f} | Best: {summary['best_metric']:.4f} @ seed {summary['best_seed']}")


SEEDS Results Summary:
linear_regression: 0.4489 ± 0.0000 | Best: 0.4489 @ seed 2
linear_logistic: 0.4388 ± 0.0000 | Best: 0.4388 @ seed 2
catboost: 0.4145 ± 0.0057 | Best: 0.4047 @ seed 47
bayesian: 0.4116 ± 0.0058 | Best: 0.4017 @ seed 5


In [19]:
# FINAL Phase: Evaluate best seed on test set (reserve_validation=False)
final_eval_results = {}

for model_name in models.keys():
    print(f"Running FINAL evaluation for {model_name}...")
    study = hpo_results[model_name]["study"]
    config_path = models[model_name]["config"]
    replicate_fn = models[model_name]["replicate_fn"]
    best_seed = seed_study_results[model_name]["best_seed"]
    experiment_name = "NBAGames-FINAL"
    
    # Use helper function with explicit reserve_validation=False for test evaluation
    results, summary = replicate_with_custom_config(
        replicate_fn, config_path, study, 
        seeds=[best_seed], 
        experiment_name=experiment_name, 
        reserve_validation=False
    )
    
    final_metric = summary['best_metric']  # Should be the only metric since single seed
    final_eval_results[model_name] = {"final_metric": final_metric}
    print(f"  Test metric: {final_metric:.4f}")

# Find best model based on test performance
best_final_model = min(final_eval_results.items(), key=lambda x: x[1]["final_metric"])  # Lower log loss is better

clear_output(wait=True)
print("\nFINAL Results Summary:")
for model_name, result in final_eval_results.items():
    print(f"{model_name}: {result['final_metric']:.4f}")
print(f"\nBest model: {best_final_model[0]} (log loss: {best_final_model[1]['final_metric']:.4f})")


FINAL Results Summary:
linear_regression: 0.5024
linear_logistic: 0.5029
catboost: 0.4739
bayesian: 0.4757

Best model: catboost (log loss: 0.4739)


In [20]:
production_dir = proj_paths.models / "production"
production_dir.mkdir(exist_ok=True)

print("Deploying best configuration for each model class to production...\n")

client = MlflowClient()
final_experiment = mlflow.get_experiment_by_name("NBAGames-FINAL")

for model_name in models.keys():
    print(f"Deploying {model_name}...")
    
    best_seed = seed_study_results[model_name]["best_seed"]
    test_metric = final_eval_results[model_name]["final_metric"]
    
    runs = client.search_runs(
        experiment_ids=[final_experiment.experiment_id],
        filter_string=f"tags.seed_study = '{model_name}' AND tags.seed = '{best_seed}'",
        max_results=1
    )
    
    if not runs:
        print(f"  No FINAL run found for {model_name} with seed {best_seed}")
        continue
        
    final_run = runs[0]
    print(f"  Downloading artifacts from FINAL run: {final_run.info.run_id}")
    
    artifacts_dir = client.download_artifacts(final_run.info.run_id, "artifacts")
    artifacts_path = Path(artifacts_dir)
    
    downloaded_files = []
    
    if artifacts_path.exists():
        for file_path in artifacts_path.rglob("*"):
            if file_path.is_file() and file_path.suffix in ('.pth', '.pkl', '.joblib', '.cbm', '.yaml'):
                print(f"    Copying {file_path.name}...")
                target_path = production_dir / model_name / file_path.name
                target_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(file_path, target_path)
                downloaded_files.append(target_path)
                print(f"      -> {target_path}")
    
    print(f"  ✓ {model_name} deployed ({len(downloaded_files)} files, seed: {best_seed}, test log loss: {test_metric:.4f})\n")

print("Production deployment complete!")
print(f"All model artifacts saved to: {production_dir}")
print("\nDeployed models:")
for model_name in models.keys():
    test_metric = final_eval_results[model_name]["final_metric"]
    best_seed = seed_study_results[model_name]["best_seed"]
    print(f"  {model_name}: seed {best_seed}, test log loss {test_metric:.4f}")

print(f"\nOverall best model: {best_final_model[0]} (log loss: {best_final_model[1]['final_metric']:.4f})")

Deploying best configuration for each model class to production...

Deploying linear_regression...
  Downloading artifacts from FINAL run: be6127461d484f6a8713a0e493fea545


Downloading artifacts:   0%|          | 0/3 [00:00<?, ?it/s]

    Copying run_config.yaml...
      -> /workspaces/nba-ingame-prob/models/production/linear_regression/run_config.yaml
    Copying model.pkl...
      -> /workspaces/nba-ingame-prob/models/production/linear_regression/model.pkl
    Copying scalers.pkl...
      -> /workspaces/nba-ingame-prob/models/production/linear_regression/scalers.pkl
  ✓ linear_regression deployed (3 files, seed: 2, test log loss: 0.5024)

Deploying linear_logistic...
  Downloading artifacts from FINAL run: f9888c52fa1147649f9e5a0c887058ec
  Downloading artifacts from FINAL run: f9888c52fa1147649f9e5a0c887058ec


Downloading artifacts:   0%|          | 0/3 [00:00<?, ?it/s]

    Copying run_config.yaml...
      -> /workspaces/nba-ingame-prob/models/production/linear_logistic/run_config.yaml
    Copying model.pkl...
      -> /workspaces/nba-ingame-prob/models/production/linear_logistic/model.pkl
    Copying scalers.pkl...
      -> /workspaces/nba-ingame-prob/models/production/linear_logistic/scalers.pkl
  ✓ linear_logistic deployed (3 files, seed: 2, test log loss: 0.5029)

Deploying catboost...
  Downloading artifacts from FINAL run: c6d6e6ebfc034450952eedcf73a7ed5c


Downloading artifacts:   0%|          | 0/3 [00:00<?, ?it/s]

    Copying run_config.yaml...
      -> /workspaces/nba-ingame-prob/models/production/catboost/run_config.yaml
    Copying model.cbm...
      -> /workspaces/nba-ingame-prob/models/production/catboost/model.cbm
    Copying scalers.pkl...
      -> /workspaces/nba-ingame-prob/models/production/catboost/scalers.pkl
  ✓ catboost deployed (3 files, seed: 47, test log loss: 0.4739)

Deploying bayesian...
  Downloading artifacts from FINAL run: 7d0cd2fc6b544388a27ee252670d25ae
  Downloading artifacts from FINAL run: 7d0cd2fc6b544388a27ee252670d25ae


Downloading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

    Copying run_config.yaml...
      -> /workspaces/nba-ingame-prob/models/production/bayesian/run_config.yaml
    Copying model.pth...
      -> /workspaces/nba-ingame-prob/models/production/bayesian/model.pth
    Copying model_init.yaml...
      -> /workspaces/nba-ingame-prob/models/production/bayesian/model_init.yaml
    Copying scalers.pkl...
      -> /workspaces/nba-ingame-prob/models/production/bayesian/scalers.pkl
  ✓ bayesian deployed (4 files, seed: 5, test log loss: 0.4757)

Production deployment complete!
All model artifacts saved to: /workspaces/nba-ingame-prob/models/production

Deployed models:
  linear_regression: seed 2, test log loss 0.5024
  linear_logistic: seed 2, test log loss 0.5029
  catboost: seed 47, test log loss 0.4739
  bayesian: seed 5, test log loss 0.4757

Overall best model: catboost (log loss: 0.4739)
