### Trained and compared Linear and Ridge regression models, logging parameters, metrics, and signatures using MLflow with Unity Catalog governance.

In [0]:
%fs
ls /Volumes/workspace/ecommerce/ecommerce_data

path,name,size,modificationTime
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv,2019-Nov.csv,9006762395,1767962633000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv,2019-Oct.csv,5668612855,1767962553000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/,delta/,0,1768891934121
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/incremental/,incremental/,0,1768891934121
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/parquet/,parquet/,0,1768891934121


In [0]:
import mlflow
import mlflow.sklearn

from pyspark.sql import functions as F

In [0]:
features_df = spark.table("ecommerce.gold.user_features_ml")

In [0]:
model_df = features_df.select("views","sessions","purchases","view_to_purchase_rate","purchase_per_session","total_spent").fillna(0)
pdf = model_df.toPandas()

In [0]:
X = pdf.drop(columns=["total_spent"])
y = pdf["total_spent"]

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [0]:
import numpy as np
from mlflow.models import infer_signature
with mlflow.start_run(run_name="linear_regression_baseline"):
    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Compute RMSE manually since 'squared' argument is not supported
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Log parameters
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("train_rows", len(X_train))
    mlflow.log_param("test_rows", len(X_test))

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Infer model signature
    signature = infer_signature(X_train, model.predict(X_train))

    # Log model with signature
    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        registered_model_name="user_spend_regression",
        signature=signature
    )

    print("RMSE:", rmse)
    print("R2:", r2)

Registered model 'user_spend_regression' already exists. Creating a new version of this model...


RMSE: 417.62571368386665
R2: 0.5899802048424623


Created version '3' of model 'workspace.default.user_spend_regression'.


In [0]:
from mlflow.models.signature import infer_signature

In [0]:
# Create input example (first few rows)
input_example = X_train.iloc[:5]

# Infer model signature
signature = infer_signature(X_train, model.predict(X_train))



### When logging models with MLflow and Unity Catalog, ensure all Spark-derived Decimal types are cast to floats to avoid serialization issues and enable proper schema inference

In [0]:
import numpy as np

# Convert Decimal â†’ float
for col in X.columns:
    X[col] = X[col].astype(float)

y = y.astype(float)

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
from mlflow.models.signature import infer_signature

input_example = X_train.head(5)
signature = infer_signature(X_train, model.predict(X_train))

### using MLflow with Unity Catalog, models must be logged with explicit input/output signatures and examples

In [0]:
with mlflow.start_run(run_name="linear_regression_baseline"):

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
        registered_model_name="user_spend_regression"
    )

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-6425185855522582>, line 8[0m
[1;32m      4[0m model[38;5;241m.[39mfit(X_train, y_train)
[1;32m      6[0m y_pred [38;5;241m=[39m model[38;5;241m.[39mpredict(X_test)
[0;32m----> 8[0m rmse [38;5;241m=[39m mean_squared_error(y_test, y_pred, squared[38;5;241m=[39m[38;5;28;01mFalse[39;00m)
[1;32m      9[0m r2 [38;5;241m=[39m r2_score(y_test, y_pred)
[1;32m     11[0m mlflow[38;5;241m.[39mlog_param([38;5;124m"[39m[38;5;124mmodel_type[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mLinearRegression[39m[38;5;124m"[39m)

File [0;32m/databricks/python/lib/python3.12/site-packages/sklearn/utils/_param_validation.py:194[0m, in [0;36mvalidate_params.<locals>.decorator.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m    191[0m func_sig [38;5;241m=[39m signature(fun

In [0]:
import numpy as np
rmse = np.sqrt(mean_squared_error(y_test, y_pred))



In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

with mlflow.start_run(run_name="linear_regression_baseline"):

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    input_example = X_train.head(5)
    signature = infer_signature(X_train, model.predict(X_train))

    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
        registered_model_name="user_spend_regression"
    )

    print("RMSE:", rmse)
    print("R2:", r2)



In [0]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

with mlflow.start_run(run_name="ridge_regression_baseline"):

    # Train model
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Metrics (version-safe)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Log parameters
    mlflow.log_param("model_type", "Ridge")
    mlflow.log_param("alpha", 1.0)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Signature & input example (Unity Catalog compliant)
    input_example = X_train.head(5)
    signature = infer_signature(X_train, model.predict(X_train))

    # Log & register model
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
        registered_model_name="user_spend_regression"
    )

    print("Ridge RMSE:", rmse)
    print("Ridge R2:", r2)

Registered model 'user_spend_regression' already exists. Creating a new version of this model...


Ridge RMSE: 417.62566638299336
Ridge R2: 0.5899802977212896


Created version '4' of model 'workspace.default.user_spend_regression'.
