## Train regression models to predict calories burned and track them with MLflow

### Imports and Config

In [0]:
%pip install mlflow --upgrade --pre
%pip show mlflow | grep Version
# NOTE(review): the install above is unpinned and `--pre` allows pre-release
# builds, so the MLflow version can differ from one run to the next. The
# `pip show` line prints the resolved version so it is visible in the output.
# Consider pinning an exact version once one has been validated.

In [0]:
import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from mlflow.models.signature import infer_signature
from pyspark.sql import SparkSession
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

### Load data from Gold table

In [0]:
# Read the Gold-layer table through the notebook's Spark session and collect
# it into a pandas DataFrame for scikit-learn; the preview is the cell output.
GOLD_TABLE = "workspace.fitness_dlt.gold_daily_activity_ml"

df = spark.table(GOLD_TABLE)
pandas_df = df.toPandas()
pandas_df.head()

### Prepare Features and Labels

In [0]:
# Keep only complete rows: any row with at least one null is discarded.
pandas_df = pandas_df.dropna()

# The label is the calorie column; every remaining column is a feature.
TARGET_COLUMN = "avg_calories"
y = pandas_df[TARGET_COLUMN]
X = pandas_df.drop(columns=[TARGET_COLUMN])

# One-hot encode the categorical activity level. drop_first=True omits the
# first category so the dummy columns are not perfectly collinear.
X_encoded = pd.get_dummies(
    X,
    columns=["activity_level"],
    drop_first=True,
)

### Train-Test Split

In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Persist the training column order so the prediction notebook can rebuild
# its feature frame with the same layout.
feature_order = list(X_train.columns)
joblib.dump(feature_order, "/dbfs/tmp/features.joblib")

### Train Model

In [0]:
# Baseline: ordinary least-squares regression fitted on the training split.
# The fit call stays the last expression so the fitted estimator is shown
# as the cell output.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

### Evaluate Model

In [0]:
# Score the baseline on the held-out split.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One print call, one metric per line.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared: {r2}",
    sep="\n",
)

### Track with MLflow

In [0]:
# Record the linear-regression baseline as its own named MLflow run:
# parameters, hold-out metrics, and the fitted model with an input/output
# signature (matching how the RandomForest run logs its model).
with mlflow.start_run(run_name="LinearRegression_Baseline"):
    mlflow.log_params({
        "model_type": "LinearRegression",
        # map(str, ...) keeps the join from failing on non-string column labels.
        "features_used": ", ".join(map(str, X_encoded.columns)),
        "train_size": len(X_train),
        "test_size": len(X_test),
    })

    # `mse` and `r2` are the hold-out metrics computed in the evaluation cell.
    mlflow.log_metrics({"mse": mse, "r2": r2})

    # The signature is inferred from the training frame and the model's own
    # predictions, so it does not depend on `y_pred` from another cell.
    baseline_signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(
        model,
        "calorie_burn_model",
        signature=baseline_signature,
    )

### Train, Evaluate, and Track using RandomForestRegressor

In [0]:
# Random-forest candidate: train, evaluate on the hold-out split, log the run
# to MLflow and register the logged model in the Model Registry.
# `infer_signature` and `RandomForestRegressor` come from the import cell at
# the top of the notebook.

# Single source of truth for the hyperparameters, so the values given to the
# estimator and the values logged to MLflow cannot drift apart.
rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
}

with mlflow.start_run(run_name="RandomForest_Tuned") as run:
    model = RandomForestRegressor(random_state=42, **rf_params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # RMSE is computed as sqrt(MSE). The `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, so this form works on both old and new versions.
    rmse = float(mean_squared_error(y_test, y_pred)) ** 0.5
    r2 = r2_score(y_test, y_pred)
    # R² on the training data, logged next to the test R² so over-fitting
    # is visible in the run.
    training_r2 = model.score(X_train, y_train)

    mlflow.log_params({"model_type": "RandomForestRegressor", **rf_params})
    mlflow.log_metrics({
        "rmse": rmse,
        "r2": r2,
        "training_r2": training_r2,
    })

    print(f"RMSE: {rmse:.2f}")
    print(f"R² (test): {r2:.4f}")
    print(f"R² (train): {training_r2:.4f}")

    # Infer the model signature from the training frame and its predictions.
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model with the signature and keep the returned ModelInfo.
    model_info = mlflow.sklearn.log_model(model, "model", signature=signature)

    # Register the model using the URI reported by log_model rather than
    # rebuilding it from the run id and artifact path, so the two cannot
    # get out of sync.
    model_uri = model_info.model_uri
    model_name = "calorie_prediction_model"

    mlflow.register_model(model_uri=model_uri, name=model_name)
    print(f"Model registered as '{model_name}'")

In [0]:
from datetime import datetime

# Closing summary. `model_name`, `rmse` and `r2` are the values left behind
# by the RandomForest cell, which must have been run first.
completed_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

summary_lines = [
    "✅ ML Pipeline Complete",
    f"Model trained and registered: {model_name}",
    f"Final test RMSE: {rmse:.2f}",
    f"Final test R² score: {r2:.2f}",
    f"Run completed at: {completed_at}",
]
print("\n".join(summary_lines))