In [1]:
import os
import joblib
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

backend_uri = os.environ['MLFLOW_TRACKING_URI']
artifact_uri = os.environ['MLFLOW_ARTIFACT_STORE']
mlflow.set_tracking_uri(backend_uri)
experiment_name = "testing"

In [2]:
def delete_temp_files(files):
    for f in files:
        if os.path.exists(f):
            os.remove(f)

In [3]:
def set_mlflow_experiment(experiment_name):
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name, artifact_location=artifact_uri)
        experiment = mlflow.get_experiment(experiment_id)

    mlflow.set_experiment(experiment.name)

In [4]:
data = pd.read_csv("../../dataset/housing.csv")

features = ["housing_median_age", "total_rooms"]
target = ["housing_median_value"]
x = data[features]
y = data[features]

# Simple sklearn pipeline logged as sklearn

In [5]:
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("model", LinearRegression())
])

pipe.fit(x, y)

Pipeline(steps=[('scale', StandardScaler()), ('model', LinearRegression())])

In [6]:
pipeline_path = "pipe.pkl"
joblib.dump(pipe, pipeline_path)

['pipe.pkl']

In [7]:
set_mlflow_experiment(experiment_name)

with mlflow.start_run():
        mlflow.log_param("pyfunc", 0)
        mlflow.sklearn.log_model(pipe, 'basic_sklearn_pipeline')
mlflow.end_run()

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



# Simple sklearn pipeline logged as pyfunc

In [8]:
artifacts_1 = {
    "model": pipeline_path
}

In [9]:
class ModelWrapper_1(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        self.model = joblib.load(context.artifacts["model"])
        return super().load_context(context)
    
    def predict(self, context, model_input):
        return self.model.predict(model_input)

In [10]:
set_mlflow_experiment(experiment_name)
with mlflow.start_run():
        mlflow.log_param("pyfunc", 1)
        mlflow.pyfunc.log_model('pyfunc_pipeline',
                                python_model = ModelWrapper_1(),
                                artifacts = artifacts_1)
mlflow.end_run()

delete_temp_files([pipeline_path])

# Pipeline and model stored separately

In [11]:
pipe = Pipeline([("scaler", StandardScaler())])
pipe.fit(x)
x_transformed = pipe.transform(x)

model = LinearRegression()
model.fit(x_transformed, y)

LinearRegression()

In [12]:
pipeline_path = "pipe.pkl"
joblib.dump(pipe, pipeline_path)

model_path = "model.pkl"
joblib.dump(model, model_path)

['model.pkl']

In [13]:
artifacts_2 = {
    "pipeline": pipeline_path,
    "model": model_path
}

class ModelWrapper_2(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        self.pipeline = joblib.load(context.artifacts["pipeline"])
        self.model = joblib.load(context.artifacts["model"])
        return super().load_context(context)
    
    def predict(self, context, model_input):
        input_matrix = self.pipeline.transform(model_input)
        return self.model.predict(input_matrix)

In [14]:
set_mlflow_experiment(experiment_name)
with mlflow.start_run():
        mlflow.log_param("pyfunc", 2)
        mlflow.pyfunc.log_model('pyfunc_right_way',
                                python_model = ModelWrapper_2(),
                                artifacts = artifacts_2)
mlflow.end_run()

delete_temp_files([pipeline_path, model_path])

# Pipeline and Model stored together in a folder under the current folder.
## This does not work! The .pkl files have to be saved locally before logging the pyfunc model

In [15]:
save_path = os.path.join(experiment_name, "artifacts")
try:
    os.makedirs(save_path)
except:
    pass

pipeline_save_file = "pipe.pkl"
joblib.dump(pipe, os.path.join(save_path, pipeline_save_file))

model_save_file = "model.pkl"
joblib.dump(model, os.path.join(save_path, model_save_file))

['testing\\artifacts\\model.pkl']

In [16]:
artifacts_3 = {
    "pipeline": pipeline_save_file, 
    "model": model_save_file 
}

class ModelWrapper_3(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        self.pipeline = joblib.load(context.artifacts["pipeline"])
        self.model = joblib.load(context.artifacts["model"])
        return super().load_context(context)
    
    def predict(self, context, model_input):
        input_matrix = self.pipeline.transform(model_input)
        return self.model.predict(input_matrix)

In [21]:
set_mlflow_experiment(experiment_name)
with mlflow.start_run():
        mlflow.log_param("pyfunc", 3)
        mlflow.pyfunc.log_model('pyfunc_stored_differently',
                                python_model = ModelWrapper_3(),
                                artifacts = artifacts_3,
                                code_path = [save_path])
mlflow.end_run()
delete_temp_files([os.path.join(save_path, pipeline_save_file), 
                   os.path.join(save_path, model_save_file)])

FileNotFoundError: [Errno 2] No such file or directory: 'pipe.pkl'