In [0]:
import pandas as pd
import seaborn as sns
#
from pyspark.sql.functions import *
#
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression, FMRegressor, LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
#
import mlflow
#
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
#
import logging

In [0]:
# Mute MLflow's own logger: only records at CRITICAL level (FATAL is an alias of it) get through.
logging.getLogger("mlflow").setLevel(logging.CRITICAL)

In [0]:
# Raw "tips" dataset from seaborn, as a pandas DataFrame
tips_df = sns.load_dataset(name="tips")
#
# Same data as a Spark DataFrame
tips_sdf = spark.createDataFrame(data=tips_df)
#
# Preview the first five rows
display(tips_sdf.limit(5))

In [0]:
# Build the modelling frame from the raw pandas data rather than from `tips_sdf`
# itself: re-running this cell then always starts from the original string
# columns instead of re-encoding values that are already 0/1.
tips_sdf = spark.createDataFrame(tips_df).selectExpr(
    "total_bill",
    "tip",
    "case when sex = 'Female' then 1 else 0 end as sex",
    # The seaborn tips data spells this flag 'Yes'/'No'; compare case-insensitively
    # so smokers are encoded as 1 (a literal 'yes' never matched and gave all zeros).
    "case when lower(smoker) = 'yes' then 1 else 0 end as smoker",
    "case when time = 'Dinner' then 1 else 0 end as time",
    "day",
    "size",
)
#
# 80/20 train/test split with a fixed seed
train_df, test_df = tips_sdf.randomSplit(weights=[.8, .2], seed=42)
#
# Column roles: label, features passed through as-is, features to one-hot encode
target_col = "tip"
num_cols = ["total_bill", "sex", "smoker", "time"]
ohe_cols = ["size", "day"]
#
# Intermediate column names produced by the indexing and encoding stages
index_cols = [f"{c}_index" for c in ohe_cols]
ohe_out_cols = [f"{c}_ohe" for c in ohe_cols]
#
# Stage 1: index each categorical column (handleInvalid="skip" is passed for invalid values)
string_indexer = StringIndexer(inputCols=ohe_cols, outputCols=index_cols, handleInvalid="skip")
#
# Stage 2: one-hot encode the indexed columns
ohe = OneHotEncoder(inputCols=index_cols, outputCols=ohe_out_cols)
#
# Stage 3: assemble encoded and numeric columns into the single "features" vector
assembler_inputs = ohe_out_cols + num_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

In [0]:
# Candidate estimators; each reads the assembled "features" vector and predicts `target_col`.
glr = GeneralizedLinearRegression(featuresCol="features", labelCol=target_col, maxIter=10)
lrm = LinearRegression(featuresCol="features", labelCol=target_col)
# Pin the seed so the factorization machine's fit is repeatable across notebook runs,
# in line with the seeded train/test split.
fmr = FMRegressor(featuresCol="features", labelCol=target_col, stepSize=0.001, seed=42)
# Shared evaluator: RMSE between the label and the "prediction" column.
evaluator = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName="rmse")

In [0]:
# Describe the model inputs WITHOUT the label: `tip` is the value being predicted,
# so it must not be listed as a required input column at inference time.
# The label column is used as the example output.
# NOTE(review): the fitted pipeline emits a "prediction" column — confirm that an
# output named "tip" is acceptable for consumers of the signature.
signature = mlflow.models.infer_signature(train_df.drop("tip"), train_df.select("tip"))
print(signature)

In [0]:
# First five training rows as a pandas frame; passed as `input_example` when models are logged.
input_example = train_df.toPandas().iloc[:5]
input_example

In [0]:
# Artifact path under which the fitted pipeline is logged
model_name = "linear-regression"
#
with mlflow.start_run(run_name="Tip-run") as run:
    #
    # preprocessing stages followed by the linear regression estimator
    stages = [string_indexer, ohe, vec_assembler, lrm]
    #
    # set pipeline
    pipeline = Pipeline(stages=stages)
    #
    # fit pipeline to train set
    model = pipeline.fit(train_df)
    #
    # manually log model to mlflow
    mlflow.spark.log_model(model, model_name, signature=signature, input_example=input_example)
    #
    # manually log parameter to mlflow: read the value from the estimator that was
    # actually fitted, so the tracked parameter cannot drift from the model
    # (a hard-coded 11 was logged here before, unrelated to `lrm`'s setting)
    mlflow.log_param("maxIter", lrm.getMaxIter())
    #
    # predict test set
    pred_df = model.transform(test_df)
    #
    # evaluate prediction (RMSE, per the shared evaluator)
    rmse = evaluator.evaluate(pred_df)
    #
    # manually log metric to mlflow
    mlflow.log_metric("rmse", rmse)

In [0]:
# Parent run grouping one child run per candidate estimator
with mlflow.start_run(run_name="tips_evaluation") as run_parent:
    for regression_model in (glr, lrm, fmr):
        # Run/artifact name: the estimator's string form up to its first underscore
        model_name = str(regression_model).partition("_")[0]
        #
        with mlflow.start_run(run_name=model_name, nested=True) as run:
            # Same preprocessing for every candidate; only the final stage changes
            stages = [string_indexer, ohe, vec_assembler, regression_model]
            pipeline = Pipeline(stages=stages)
            #
            # Train on the training split and log the fitted pipeline
            model = pipeline.fit(train_df)
            mlflow.spark.log_model(model, model_name, signature=signature, input_example=input_example)
            #
            # Score the test split and record the evaluator's metric on the child run
            pred_df = model.transform(test_df)
            rmse = evaluator.evaluate(pred_df)
            mlflow.log_metric("rmse", rmse)

In [0]:
# Turn on MLflow autologging with default settings; it is switched off again
# further down by `mlflow.autolog(disable=True)`.
mlflow.autolog()

In [0]:
# Build the linear-regression pipeline explicitly. Reusing the `pipeline` variable
# left over from the model-comparison loop would silently fit that loop's last
# estimator (the FMRegressor) instead of `lrm`.
pipeline_lrm_autolog = Pipeline(stages=[string_indexer, ohe, vec_assembler, lrm])
#
# fit pipeline to train set (no explicit MLflow calls here; autologging is enabled)
model_lrm_autolog = pipeline_lrm_autolog.fit(train_df)
#
# predict test set
pred_df = model_lrm_autolog.transform(test_df)
#
# evaluate: the metric value is the cell's displayed output
evaluator.evaluate(pred_df)

In [0]:
# Tag columns left out of the displayed run table
dropped_tag_cols = ['tags.mlflow.source.name',
                    'tags.mlflow.databricks.notebookPath',
                    'tags.mlflow.user',
                    'tags.mlflow.databricks.workspaceURL',
                    'tags.mlflow.databricks.cluster.info']
#
# Runs returned by mlflow.search_runs(), converted to a Spark DataFrame
output_mlflow = spark.createDataFrame(mlflow.search_runs()).drop(*dropped_tag_cols)
#
# Show only the run with the latest end time
display(output_mlflow.orderBy("end_time", ascending=False).limit(1))

In [0]:
def train_model(maxIter, regParam, elasticNetParam, labelCol):
    """
    Fit and evaluate one linear-regression pipeline for a set of hyperparameters.

    The work happens inside a nested MLflow run, so it is recorded as a child
    of whichever run is active when the function is called (the Hyperopt run).
    The preprocessing stages and the train/test frames are taken from the
    notebook's globals (`string_indexer`, `ohe`, `vec_assembler`, `train_df`,
    `test_df`).

    Parameters
    ----------
    maxIter : int or float
        Maximum number of iterations. Cast to ``int`` because Hyperopt's
        ``hp.quniform`` yields whole numbers as floats.
    regParam : float
        Regularization strength passed to ``LinearRegression``.
    elasticNetParam : float
        Elastic-net mixing parameter passed to ``LinearRegression``.
    labelCol : str
        Name of the target column, used by both the estimator and the evaluator.

    Returns
    -------
    tuple
        ``(fitted_pipeline_model, rmse)`` where ``rmse`` is computed on ``test_df``.
    """
    # Use MLflow to track training.
    # Specify "nested=True" since this single model will be logged as a child run of Hyperopt's run.
    with mlflow.start_run(nested=True):
        #
        # honour the `labelCol` argument rather than reaching for the global target column
        model_hyperopt = LinearRegression(maxIter=int(maxIter),
                                          regParam=regParam,
                                          elasticNetParam=elasticNetParam,
                                          labelCol=labelCol)
        #
        # metric spelled out so the value logged as "rmse" below is unambiguous
        evaluator_hyperopt = RegressionEvaluator(labelCol=labelCol,
                                                 predictionCol="prediction",
                                                 metricName="rmse")
        #
        # set pipeline: shared preprocessing stages + this candidate estimator
        pipeline = Pipeline(stages=[string_indexer, ohe, vec_assembler, model_hyperopt])
        #
        # fit pipeline to train set
        fitted_pipeline = pipeline.fit(train_df)
        #
        # predict test set
        pred_df = fitted_pipeline.transform(test_df)
        #
        # evaluate
        rmse = evaluator_hyperopt.evaluate(pred_df)
        #
        # log rmse for each child run
        mlflow.log_metric("rmse", rmse)
    #
    return fitted_pipeline, rmse

In [0]:
def objective(params):
    """Hyperopt objective: train one model from `params` and report its RMSE as the loss."""
    # Only the score is needed here; the fitted model returned by train_model is discarded.
    _, test_rmse = train_model(
        maxIter=params["maxIter"],
        regParam=params["regParam"],
        elasticNetParam=params["elasticNetParam"],
        labelCol=target_col,
    )
    return dict(loss=test_rmse, status=STATUS_OK)

In [0]:
# Hyperopt search space for the three hyperparameters read by `objective`
search_spaces = dict(
    maxIter=hp.quniform("maxIter", 1, 100, 1),
    regParam=hp.uniform("regParam", 0.1, 10),
    elasticNetParam=hp.uniform("elasticNetParam", 0, 1),
)

In [0]:
# Parent run under which the nested runs opened by `train_model` are recorded
with mlflow.start_run(run_name="hyperopt_tips"):
    # 15 evaluations of `objective` over `search_spaces`, using tpe.suggest
    argmin = fmin(objective, search_spaces, algo=tpe.suggest, max_evals=15, trials=Trials())

In [0]:
# `argmin` is the value returned by `fmin` in the Hyperopt cell above.
print("Parameters of the best model: ", argmin)

In [0]:
# Switch MLflow autologging back off; the remaining cells log to MLflow explicitly.
mlflow.autolog(disable=True)

In [0]:
# WARNING: this rebinds the name `LinearRegression`, imported at the top of the
# notebook from pyspark.ml.regression, to the scikit-learn class. Any Spark cell
# that uses `LinearRegression` and is re-run after this point (e.g. the `lrm`
# definition or `train_model`) will pick up the scikit-learn class instead.
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
#
import matplotlib.pyplot as plt

In [0]:
# load dataset previously prepared
pandas_tips = tips_sdf.toPandas()
#
# set features dataset
pandas_tips_features = pandas_tips.drop(["tip", "day", "time"], axis=1)
#
# set target
pandas_tips_target   = pandas_tips["tip"]
#
# train test split
pd_df_X_train, pd_df_X_test, pd_df_y_train, pd_df_y_test = train_test_split(pandas_tips_features,
                                                                            pandas_tips_target,
                                                                            test_size=0.33,
                                                                            random_state=42)
#
# fit 
fitted_rfr_model = RandomForestRegressor().fit(pd_df_X_train, pd_df_y_train)

In [0]:
# Disabled alternative: compute SHAP values by hand and log the summary plot as an artifact.
# import shap
# import matplotlib.pyplot as plt
# explainer = shap.TreeExplainer(fitted_rfr_model)
# shap_values = explainer.shap_values(pd_df_X_test)
# # Generate summary plot and save
# shap.summary_plot(shap_values, pd_df_X_test,
# show=False)
# plt.savefig("shap_summary.png")
# # Log as artifact
# mlflow.log_artifact("shap_summary.png")


In [0]:
# Log SHAP explanations of the random forest's predictions on the test-split features
with mlflow.start_run(run_name="shap_tips"):
    mlflow.shap.log_explanation(predict_function=fitted_rfr_model.predict,
                                features=pd_df_X_test)

In [0]:
%python
import shap
import numpy as np
import matplotlib.pyplot as plt

# Define the path to the artifacts
artifact_path = "dbfs:/databricks/mlflow-tracking/a89510df8d5e43debb832600e5d3723f/e6377081e9e742bd920f830ba7e1fc48/artifacts/model_explanations_shap"

# Load the artifacts directly from DBFS
shap_values = np.load( artifact_path + "/shap_values.npy")
base_values = np.load( artifact_path + "/base_values.npy")

# Create a SHAP explainer object
explainer = shap.Explainer(fitted_rfr_model, pd_df_X_test)

# Generate the summary plot
shap.summary_plot(shap_values, pd_df_X_test)
plt.show()

In [0]:
with mlflow.start_run(run_name="figure_tips"):
    # One importance value per training column, read from the fitted forest's
    # `feature_importances_` attribute
    feature_importances = pd.Series(data=fitted_rfr_model.feature_importances_,
                                    index=pd_df_X_train.columns)
    #
    # Bar chart on an explicit figure/axes pair
    fig, ax = plt.subplots()
    feature_importances.plot.bar(ax=ax)
    ax.set(title="Feature importances using MDI", ylabel="Mean decrease in impurity")
    #
    # Store the figure as an artifact of this run
    mlflow.log_figure(figure=fig, artifact_file="feature_importance_rf.png")