In [0]:
import pandas as pd
import seaborn as sns
#
from pyspark.sql.functions import *
#
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
#
import mlflow
#
import logging

In [0]:
# Quiet the "mlflow" logger: only records at FATAL level or above are emitted.
mlflow_logger = logging.getLogger("mlflow")
mlflow_logger.setLevel(logging.FATAL)

In [0]:
# Fetch the seaborn "tips" sample dataset.
tips_df = sns.load_dataset("tips")
# Mirror the same data as a Spark DataFrame for the pyspark.ml pipeline.
tips_sdf = spark.createDataFrame(tips_df)
# Preview only the first five rows of the Spark copy.
tips_preview_sdf = tips_sdf.limit(5)
display(tips_preview_sdf)

In [0]:
# Render tips_df (the object returned by sns.load_dataset) in full, as opposed to
# the 5-row preview of the Spark copy shown in the previous cell.
display(tips_df)

In [0]:
# Data-quality check: list every row whose "size" value is missing.
rows_missing_size = tips_sdf.filter("size is null")
display(rows_missing_size)

In [0]:
# Build the modelling frame and the feature-engineering stages of the pipeline.
#
# The two-valued text columns are turned into 0/1 flags with SQL CASE expressions.
# Spark string equality is case-sensitive and the seaborn "tips" data spells the
# smoker values "Yes"/"No", so the flag has to compare against 'Yes'; comparing
# against 'yes' never matched and left the smoker feature constantly 0.
#
# NOTE: this cell rebinds tips_sdf. Re-running it on its own would apply the CASE
# expressions to the already-encoded integer columns, so re-run from the cell that
# creates tips_sdf instead.
tips_sdf = tips_sdf.selectExpr("total_bill",
                               "tip",
                               "case when sex = 'Female' then 1 else 0 end as sex",
                               "case when smoker = 'Yes' then 1 else 0 end as smoker",
                               "case when time = 'Dinner' then 1 else 0 end as time",
                               "day",
                               "size")
#
# 80/20 train/test split; the fixed seed makes the split (and therefore the RMSE
# logged later) repeatable across re-runs instead of changing on every execution.
train_df, test_df = tips_sdf.randomSplit([.8, .2], seed=42)
#
# Columns that get indexed + one-hot encoded, columns fed to the model as-is,
# and the label column.
ohe_cols = ["size", "day"]
num_cols = ["total_bill", "sex", "smoker", "time"]
target_col = "tip"
#
# Intermediate column names produced by each stage.
index_cols = [c+"_index" for c in ohe_cols]
encoded_cols = [c+"_ohe" for c in ohe_cols]
#
# Stage 1: map each categorical value to a numeric index. handleInvalid="skip"
# asks StringIndexer to drop rows it considers invalid (per the PySpark docs,
# e.g. values not seen when the indexer was fitted).
string_indexer = StringIndexer(inputCols=ohe_cols, outputCols=index_cols, handleInvalid="skip")
#
# Stage 2: one-hot encode the indexed columns.
ohe = OneHotEncoder()
ohe.setInputCols(index_cols)
ohe.setOutputCols(encoded_cols)
#
# Stage 3: concatenate encoded and pass-through columns into one "features" vector.
assembler_inputs = encoded_cols + num_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

In [0]:
# Gradient-boosted tree regressor: reads the assembled "features" vector, is
# fitted against the label column, and is limited to 5 boosting iterations.
gbt = GBTRegressor(
    maxIter=5,
    labelCol=target_col,
    featuresCol="features",
)
# RMSE scorer comparing the label column with the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol=target_col,
)

Observe how the model is logged in the next cell. For a Spark ML model, the call is:

`mlflow.spark.log_model(model: obj, model_name: str)`

If it is a scikit-learn model, use the sklearn flavor instead:

`mlflow.sklearn.log_model(model: obj, model_name: str)`

In [0]:
# Artifact path the fitted pipeline is logged under; later cells rebuild the
# "runs:/<run_id>/<model_name>" URI from it.
model_name = "GBT-Regressor"
#
# Train, log and evaluate the pipeline inside a single MLflow run.
with mlflow.start_run(run_name="Tip-run") as run:
    #
    # feature-preparation stages followed by the regressor
    stages = [string_indexer, ohe, vec_assembler, gbt]
    pipeline = Pipeline(stages=stages)
    #
    # fit every stage on the training split
    model = pipeline.fit(train_df)
    #
    # manually log the fitted pipeline model to mlflow
    mlflow.spark.log_model(model, model_name)
    #
    # manually log the hyper-parameter, read from the estimator itself so the
    # logged value cannot drift from what was actually trained (it used to be
    # a hard-coded 5 duplicated from the GBTRegressor constructor)
    mlflow.log_param("maxIter", gbt.getMaxIter())
    #
    # score the held-out split
    pred_df = model.transform(test_df)
    #
    # RMSE of the predictions against the label column
    rmse = evaluator.evaluate(pred_df)
    #
    # manually log the metric to mlflow
    mlflow.log_metric("rmse", rmse)

In [0]:
# Show every run returned by mlflow.search_runs() as the cell output.
experiment_runs = mlflow.search_runs()
experiment_runs

In [0]:
# Leaderboard view: keep only the tuned parameter, the run id and the RMSE,
# ordered from lowest to highest RMSE.
leaderboard_cols = ["params.maxIter", "run_id", "metrics.rmse"]
runs_leaderboard = mlflow.search_runs()[leaderboard_cols]
runs_leaderboard.sort_values(by=["metrics.rmse"], ascending=True)

In [0]:
# Rank the runs by ascending RMSE and take the run id of the first (lowest) row.
runs_by_rmse = mlflow.search_runs().sort_values(by=["metrics.rmse"], ascending=True)
best_run_id = runs_by_rmse.iloc[0].run_id

In [0]:
# "runs:/" URI pointing at the model artifact logged under model_name in the best run.
best_model = "runs:/{run_id}/{artifact_path}".format(
    run_id=best_run_id,
    artifact_path=model_name,
)

In [0]:
# Load the Spark model stored at the best run's URI back from MLflow.
loaded_model = mlflow.spark.load_model(model_uri=best_model)

In [0]:
# Score the held-out split with the reloaded model and show the actual tip
# next to the predicted value.
scored_test_sdf = loaded_model.transform(test_df)
tip_vs_prediction_sdf = scored_test_sdf.select("tip", "prediction")
display(tip_vs_prediction_sdf)