In [0]:
import os
import mlflow
import mlflow.spark
from pyspark.sql import SparkSession

In [0]:
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/ecommerce/mlflow_tmp"

In [0]:
%sql
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.mlflow_tmp;

In [0]:
spark = SparkSession.builder.getOrCreate()
DATA_PATH = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"

In [0]:
df = (spark.read.option("header", True).option("inferSchema", True).csv(DATA_PATH))

In [0]:
display(df.limit(5))
print("Rows:", df.count())
print("Columns:", df.columns)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-01T00:00:00.000Z,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
2019-10-01T00:00:00.000Z,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2019-10-01T00:00:01.000Z,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
2019-10-01T00:00:01.000Z,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
2019-10-01T00:00:04.000Z,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


Rows: 42448764
Columns: ['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session']


In [0]:
target_col = "price"

numeric_cols = dict(df.dtypes)

feature_cols = [
    c for c in df.columns
    if c != target_col and numeric_cols[c] in ("int", "double")
]

df_model = df.select(feature_cols + [target_col]).dropna()

print("Features:", feature_cols)

Features: ['product_id', 'user_id']


In [0]:
train_df, test_df = df_model.randomSplit([0.8, 0.2], seed=42)


In [0]:
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_raw"
)

scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features"
)

In [0]:
lr = LinearRegression(
    featuresCol="features",
    labelCol=target_col
)

ridge = LinearRegression(
    featuresCol="features",
    labelCol=target_col,
    regParam=0.5,
    elasticNetParam=0.0
)

rf = RandomForestRegressor(
    featuresCol="features",
    labelCol=target_col,
    numTrees=50,
    maxDepth=8,
    seed=42
)

models = {
    "LinearRegression": lr,
    "RidgeRegression": ridge,
    "RandomForest": rf
}


In [0]:
mlflow.set_experiment(
    "/Users/venkat.mce38@gmail.com/ecommerce-model-comparison"
)


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/864934663048954', creation_time=1768969030263, experiment_id='864934663048954', last_update_time=1768969408175, lifecycle_stage='active', name='/Users/venkat.mce38@gmail.com/ecommerce-model-comparison', tags={'mlflow.experiment.sourceName': '/Users/venkat.mce38@gmail.com/ecommerce-model-comparison',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'venkat.mce38@gmail.com',
 'mlflow.ownerId': '76192472775648'}>

In [0]:
from mlflow.models.signature import infer_signature

In [0]:
evaluator = RegressionEvaluator(
    labelCol=target_col,
    predictionCol="prediction",
    metricName="rmse"
)

results = []

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):

        pipeline = Pipeline(stages=[
            assembler,
            scaler,
            model
        ])

        pipeline_model = pipeline.fit(train_df)

        predictions = pipeline_model.transform(test_df)
        rmse = evaluator.evaluate(predictions)

        # ----- Signature + Input Example -----
        input_example = train_df.limit(5).toPandas()
        pred_sample = pipeline_model.transform(train_df.limit(5)) \
                                     .select("prediction") \
                                     .toPandas()

        signature = infer_signature(input_example, pred_sample)

        # ----- MLflow Logging -----
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("num_features", len(feature_cols))
        mlflow.log_metric("rmse", rmse)

        mlflow.spark.log_model(
            pipeline_model,
            artifact_path="model",
            input_example=input_example,
            signature=signature
        )

        results.append((model_name, rmse))
        print(f"{model_name} → RMSE: {rmse}")



LinearRegression → RMSE: 348.4490879365764




RidgeRegression → RMSE: 348.4490476683829




RandomForest → RMSE: 293.1631144404939


In [0]:
results_df = spark.createDataFrame(
    results,
    ["Model", "RMSE"]
)

display(results_df.orderBy("RMSE"))

Model,RMSE
RandomForest,293.1631144404939
RidgeRegression,348.4490476683829
LinearRegression,348.4490879365764


In [0]:
best_model, best_rmse = min(results, key=lambda x: x[1])

print("✅ BEST MODEL SELECTED")
print("Model:", best_model)
print("RMSE :", best_rmse)

✅ BEST MODEL SELECTED
Model: RandomForest
RMSE : 293.1631144404939


In [0]:
best_model, best_rmse, best_run_id = min(
    results, key=lambda x: x[1]
)

mlflow.register_model(
    model_uri=f"runs:/{best_run_id}/model",
    name="workspace.ecommerce.best_price_model"
)

print("✅ Best model registered successfully")


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-6743076984187771>, line 2[0m
[1;32m      1[0m best_model, best_rmse, best_run_id [38;5;241m=[39m [38;5;28mmin[39m(
[0;32m----> 2[0m     results, key[38;5;241m=[39m[38;5;28;01mlambda[39;00m x: x[[38;5;241m1[39m]
[1;32m      3[0m )
[1;32m      5[0m mlflow[38;5;241m.[39mregister_model(
[1;32m      6[0m     model_uri[38;5;241m=[39m[38;5;124mf[39m[38;5;124m"[39m[38;5;124mruns:/[39m[38;5;132;01m{[39;00mbest_run_id[38;5;132;01m}[39;00m[38;5;124m/model[39m[38;5;124m"[39m,
[1;32m      7[0m     name[38;5;241m=[39m[38;5;124m"[39m[38;5;124mworkspace.ecommerce.best_price_model[39m[38;5;124m"[39m
[1;32m      8[0m )
[1;32m     10[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124m✅ Best model registered successfully[39m[38;5;124m"[39m)

[0;31mNameError