In [None]:
import os

import joblib
import mlflow
import pandas as pd
import yaml
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestClassifier
# Confirmation message printed once every import above has succeeded.
print("Setup complete.")

In [None]:
# Read the pipeline configuration from params.yaml and pull out the two MLflow
# settings used below: the experiment name and the tracking URI.
with open("params.yaml") as config_file:
    params = yaml.safe_load(config_file)

_mlflow_cfg = params["mlflow"]
EXPERIMENT_NAME = _mlflow_cfg["experiment"]
MLFLOW_URI = _mlflow_cfg["uri"]

In [None]:
# CSV locations of the train / validation / test splits, taken from the
# "data" section of the loaded params.
train_path, val_path, test_path = (
    params["data"][split] for split in ("train", "val", "test")
)

In [None]:
# Load the three splits, then separate every frame into its feature columns
# (everything except "y") and its target column "y".
train = pd.read_csv(train_path)
val = pd.read_csv(val_path)
test = pd.read_csv(test_path)

X_train, X_val, X_test = (frame.drop(columns="y") for frame in (train, val, test))
y_train, y_val, y_test = (frame["y"] for frame in (train, val, test))

In [None]:
# Point MLflow at the configured tracking URI and fetch the experiment's runs,
# ordered so that the run with the highest logged `val_f1` metric comes first.
mlflow.set_tracking_uri(MLFLOW_URI)
client = MlflowClient()
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)

# get_experiment_by_name returns None for an unknown name; fail here with a
# readable message instead of an AttributeError on `.experiment_id` below.
if experiment is None:
    raise ValueError(
        f"MLflow experiment {EXPERIMENT_NAME!r} was not found at tracking URI {MLFLOW_URI!r}"
    )

runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.val_f1 DESC"],
)

In [None]:
# `runs` was requested in descending `val_f1` order, so the first entry is the
# best run. Guard against an experiment with no runs, which would otherwise
# surface as a bare IndexError.
if not runs:
    raise ValueError("No MLflow runs were returned for the experiment; cannot select a best run.")

best_run = runs[0]
# Hyperparameters logged with that run; the cells below cast them from strings.
best_params = best_run.data.params

In [None]:
# Rebuild the classifier from the best run's logged hyperparameters. The values
# are handled as strings here: they are cast with int(), and an unlimited depth
# is represented by the literal text "None".
_raw_max_depth = best_params["max_depth"]

best_model = RandomForestClassifier(
    n_estimators=int(best_params["n_estimators"]),
    max_depth=int(_raw_max_depth) if _raw_max_depth != "None" else None,
    random_state=42,
    verbose=1,
)

In [None]:
# Stack the train and validation rows (features and targets in the same order)
# and fit the final model on the combined data.
X_train_val = pd.concat((X_train, X_val), axis=0)
y_train_val = pd.concat((y_train, y_val), axis=0)
best_model.fit(X=X_train_val, y=y_train_val)

In [None]:
# Persist the retrained model three ways: as an artifact of a new MLflow run,
# as a local joblib pickle, and as a version of "Best_RF_Model" in the MLflow
# Model Registry.
model_path = "models/best_model.pkl"

# joblib.dump does not create missing parent directories. Create the folder
# before the run starts so a fresh checkout does not fail midway with the model
# already logged and the run marked as failed.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# NOTE(review): no experiment is selected before start_run(), so this run goes
# to whatever experiment is currently active rather than necessarily
# EXPERIMENT_NAME — confirm that is intended.
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(best_model, "best_model")
    joblib.dump(best_model, model_path)
    print("✅ Best model retrained on train+val and logged.")

    # Register the artifact logged above, addressed by this run's id.
    model_uri = f"runs:/{run.info.run_id}/best_model"
    mlflow.register_model(model_uri, "Best_RF_Model")
    print("✅ Model registered in MLflow Model Registry")