In [None]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import subprocess
from itertools import product
import yaml, os

In [None]:
def _current_git_commit():
    """Return the HEAD commit hash of the current working tree, or "unknown".

    The exit status of git is checked so that a failure (not inside a
    repository, git not installed, ...) yields the explicit sentinel
    "unknown" instead of git's / the shell's error text being recorded as
    the data version.
    """
    try:
        completed = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        # OSError: git executable missing; CalledProcessError: non-zero exit.
        return "unknown"
    return completed.stdout.strip() or "unknown"


def _load_features_and_target(csv_path, target="y"):
    """Read ``csv_path`` and split it into (features frame, target series)."""
    frame = pd.read_csv(csv_path)
    return frame.drop(columns=target), frame[target]


def run_experiments(params_path="params.yaml"):
    """Train and log one random forest per hyper-parameter combination.

    Reads the YAML file at ``params_path`` and uses these entries:

    * ``data.train`` / ``data.val`` / ``data.test`` -- CSV paths; each file
      must contain a ``y`` column, which is used as the target.
    * ``mlflow.uri`` / ``mlflow.experiment`` -- tracking server URI and
      experiment name.
    * ``model.n_estimators_list`` / ``model.max_depth_list`` -- value lists
      whose Cartesian product defines the runs.

    For every combination a ``RandomForestClassifier`` (``random_state=42``)
    is fitted on the training split inside its own MLflow run. Each run logs
    the parameters ``n_estimators``, ``max_depth`` and ``data_version`` (the
    current git commit hash, or "unknown" when it cannot be determined), the
    metrics ``val_f1`` and ``test_f1``, and the fitted model under the
    artifact path ``"model"``.

    Args:
        params_path: Path of the YAML parameter file. Defaults to
            ``"params.yaml"`` in the current working directory.

    Returns:
        None. Results are reported through MLflow and stdout only.
    """
    with open(params_path) as f:
        params = yaml.safe_load(f)

    data_cfg = params["data"]
    mlflow_cfg = params["mlflow"]
    model_cfg = params["model"]

    mlflow_uri = mlflow_cfg["uri"]
    experiment_name = mlflow_cfg["experiment"]

    n_estimators_list = model_cfg["n_estimators_list"]
    max_depth_list = model_cfg["max_depth_list"]

    # ------------------------------
    # Load processed data
    X_train, y_train = _load_features_and_target(data_cfg["train"])
    X_val, y_val = _load_features_and_target(data_cfg["val"])
    X_test, y_test = _load_features_and_target(data_cfg["test"])

    # ------------------------------
    # Get current data version (Git commit hash)
    data_version = _current_git_commit()
    print("Using data version:", data_version)

    # ------------------------------
    # Set remote MLflow server
    mlflow.set_tracking_uri(mlflow_uri)
    mlflow.set_experiment(experiment_name)

    # ------------------------------
    # Run experiments
    for n_estimators, max_depth in product(n_estimators_list, max_depth_list):
        with mlflow.start_run():
            mlflow.log_param("n_estimators", n_estimators)
            mlflow.log_param("max_depth", max_depth)
            mlflow.log_param("data_version", data_version)

            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
            model.fit(X_train, y_train)

            # NOTE(review): f1_score is called with its default averaging;
            # confirm the target is binary, otherwise an explicit `average=`
            # is presumably needed.
            y_val_pred = model.predict(X_val)
            val_f1 = f1_score(y_val, y_val_pred)
            mlflow.log_metric("val_f1", val_f1)

            # NOTE(review): the test split is scored for every combination;
            # select models on val_f1 only so test_f1 stays an unbiased estimate.
            y_test_pred = model.predict(X_test)
            test_f1 = f1_score(y_test, y_test_pred)
            mlflow.log_metric("test_f1", test_f1)

            mlflow.sklearn.log_model(model, "model")

            print(f"Run logged: n_estimators={n_estimators}, max_depth={max_depth}, val_f1={val_f1:.4f}, test_f1={test_f1:.4f}")

In [None]:
# Entry point: launch the full hyper-parameter sweep when this code is executed
# directly; the guard keeps a plain import of the module from starting any runs.
if __name__ == "__main__":
    run_experiments()