In [3]:
from sys import version_info
import os
import tarfile
import pandas as pd
import numpy as np
import joblib
import marshal
from types import FunctionType
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import mlflow
import warnings

from custom_data_transformers.utils import HouseColumnTransformerFunc

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Directory that holds the housing archive, relative to the notebook.
HOUSING_PATH = "../dataset"
# Running interpreter version formatted as "major.minor.micro".
PYTHON_VERSION = ".".join(str(part) for part in version_info[:3])

# File names the fitted pipeline and the fitted model are serialized to.
pipeline_path, model_path = "pipe.pkl", "model.pkl"

In [4]:
# Both locations come from the environment; a missing variable raises KeyError here.
backend_uri = os.environ["MLFLOW_TRACKING_URI"]
artifact_uri = os.environ["MLFLOW_ARTIFACT_STORE"]

# Point the MLflow client at the configured tracking backend.
mlflow.set_tracking_uri(backend_uri)

In [5]:
def fetch_housing_data(housing_path=HOUSING_PATH):
    """Extract ``housing.tgz`` and load ``housing.csv`` into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory containing ``housing.tgz``. The archive is extracted into
        this same directory.

    Returns
    -------
    pandas.DataFrame
        Contents of ``housing.csv`` read from ``housing_path``.
    """
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only run this on an archive from a trusted source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# Extract the archive under HOUSING_PATH (the default) and load the CSV into `housing`.
housing = fetch_housing_data()

### Create pipeline

In [6]:
# Column roles: regression target, categorical inputs, numeric inputs.
targetCol = "median_house_value"
catCols = ["ocean_proximity"]
numCols = [
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]

# Wrap the project helper so it can be used as a pipeline step.
customColTrans = FunctionTransformer(
    func=HouseColumnTransformerFunc,
    kw_args={"add_bedrooms_per_room": False},
)

# Numeric columns: impute, standardize, then apply the custom transform.
# NOTE(review): the custom transform runs on already-scaled values - confirm this order is intended.
num_pipeline = Pipeline(steps=[
    ("Imputer", SimpleImputer()),
    ("Scaler", StandardScaler()),
    ("Transform", customColTrans),
])

# Send numeric columns through num_pipeline and one-hot encode the categorical ones.
full_pipeline = ColumnTransformer(transformers=[
    ("Numerical_Pipeline", num_pipeline, numCols),
    ("OneHot", OneHotEncoder(), catCols),
])

### Data processing

In [7]:
# Separate the model inputs from the regression target.
x = housing[numCols + catCols]
y = housing[targetCol]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Fit the pipeline on the training rows only, then apply it unchanged to the held-out rows.
x_train = full_pipeline.fit_transform(x_train)
x_test = full_pipeline.transform(x_test)

In [8]:
# Display the (rows, columns) shape of the transformed held-out matrix.
x_test.shape

(4128, 10)

### Build Model

In [9]:
# Fit a linear regression on the transformed training data.
model = LinearRegression()
model.fit(x_train, y_train)

# Score the held-out rows and report the root-mean-squared error.
predictions = model.predict(x_test)
rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=predictions)
)
print(rmse)

73202.85375260453


### Save the pipeline and the model

In [10]:
# Serialize the fitted pipeline and model; the trailing `;` suppresses the cell output.
joblib.dump(value=full_pipeline, filename=pipeline_path);
joblib.dump(value=model, filename=model_path);

In [11]:
# Create a Conda environment for the new MLflow Model that contains all necessary dependencies.
# `sklearn` must be imported as a module in this cell: the `from sklearn.<sub> import ...`
# lines used elsewhere in the notebook do not bind the name `sklearn`, so
# `sklearn.__version__` would raise NameError on a fresh kernel.
import cloudpickle
import sklearn

conda_env = {
    'channels': ['defaults'],
    'dependencies': [
      # Pin the interpreter to the version this notebook ran under.
      'python={}'.format(PYTHON_VERSION),
      'pip',
      {
        # Pin the serialization-sensitive packages to the versions installed in this kernel.
        'pip': [
          'mlflow',
          'build_library',
          'scikit-learn=={}'.format(sklearn.__version__),
          'joblib=={}'.format(joblib.__version__),
          'cloudpickle=={}'.format(cloudpickle.__version__),
        ],
      },
    ],
    'name': 'model_env'
}

<IPython.core.display.Javascript object>

In [12]:
# Define the model class
class ModelWrapper(mlflow.pyfunc.PythonModel):
    """pyfunc model that runs the saved pipeline before the saved regressor."""

    def load_context(self, context):
        """Load the "pipeline" and "model" artifacts listed in ``context.artifacts``."""
        artifact_paths = context.artifacts
        self.pipeline = joblib.load(artifact_paths["pipeline"])
        self.model = joblib.load(artifact_paths["model"])

    def predict(self, context, model_input):
        """Transform ``model_input`` with the pipeline and return the model's predictions."""
        return self.model.predict(self.pipeline.transform(model_input))

In [13]:
# Map each artifact name to its serialized file (the pipeline and the model).
# `mlflow.pyfunc.save_model` / `mlflow.pyfunc.log_model` receive this mapping and
# copy the files into the new MLflow Model's directory.
artifacts = dict(pipeline=pipeline_path, model=model_path)

In [14]:
# Reuse the experiment when it already exists; otherwise create it with the
# artifact location read from the environment.
experiment = mlflow.get_experiment_by_name("advanced_pipeline")
if experiment is None:
    experiment_id = mlflow.create_experiment(
        "advanced_pipeline",
        artifact_location=artifact_uri,
    )
    experiment = mlflow.get_experiment(experiment_id)

mlflow.set_experiment(experiment.name)

# Log the evaluation metric and the wrapped pipeline + model in a single run.
mlflow_pyfunc_model_path = "mlflow_advanced_pipeline"
with mlflow.start_run():
    mlflow.log_metric("rmse", rmse)
    mlflow.pyfunc.log_model(
        mlflow_pyfunc_model_path,
        python_model=ModelWrapper(),
        artifacts=artifacts,
        conda_env=conda_env,
    )

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



In [14]:
# Alternative (disabled): save the MLflow Model to a local path instead of logging it to a run
# mlflow_pyfunc_model_path = "mlflow_advanced_workflow"
# mlflow.pyfunc.save_model(
#         path=mlflow_pyfunc_model_path, python_model=ModelWrapper(), artifacts=artifacts,
#         conda_env=conda_env)