In [None]:
import io
import os
import warnings
from sys import version_info

import cloudpickle
import dvc.api
import joblib
import mlflow
import numpy as np
import pandas as pd
import sklearn
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Silence every warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

# MLflow endpoints come from the environment; a missing variable raises KeyError.
backend_uri = os.environ['MLFLOW_TRACKING_URI']
artifact_uri = os.environ['MLFLOW_ARTIFACT_STORE']
mlflow.set_tracking_uri(backend_uri)

# Running interpreter version rendered as "major.minor.micro".
PYTHON_VERSION = ".".join(str(part) for part in version_info[:3])

# Local file names for the serialized preprocessing pipeline and the model.
pipeline_path, model_path = "pipe.pkl", "model.pkl"

# DVC coordinates of the dataset: tracked file, repository location and revision.
path = 'dataset/housing.csv'
repo = '../.git'
version = 'v1'  # Git tag

# Resolve the storage URL of the tracked file at the chosen revision.
data_url = dvc.api.get_url(path=path, repo=repo, rev=version)

In [None]:
def fetch_data_from_s3(path, repo, version):
    """Read a DVC-tracked CSV file through the DVC API and parse it.

    Args:
        path: Path of the tracked file, forwarded to ``dvc.api.read``.
        repo: Repository location, forwarded to ``dvc.api.read``.
        version: Revision, forwarded to ``dvc.api.read`` as ``rev``.

    Returns:
        The comma-separated content parsed into a ``pandas.DataFrame``.
    """
    csv_text = dvc.api.read(path=path, repo=repo, rev=version)
    # Wrap the text in an in-memory buffer so pandas can parse it like a file.
    buffer = io.StringIO(csv_text)
    return pd.read_csv(buffer, sep=',')


def fetch_data_from_fs(url):
    """Load the comma-separated file at ``url`` into a ``pandas.DataFrame``.

    Args:
        url: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(url, sep=',')
    return frame


def fetch_data(url):
    """Load the dataset, choosing the reader from the URL prefix.

    The text before the first ``":"`` in ``url`` is compared
    case-insensitively with ``"S3"``. On a match the data is read through
    ``fetch_data_from_s3`` using the module-level ``path``, ``repo`` and
    ``version`` (``url`` itself is not passed on); otherwise ``url`` is
    handed to ``fetch_data_from_fs``.

    Args:
        url: Location string of the dataset.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    scheme = url.split(":")[0]
    if scheme.upper() != "S3":
        return fetch_data_from_fs(url)
    return fetch_data_from_s3(path, repo, version)

In [None]:
# Load the dataset from the resolved URL and preview its first five rows.
housing = fetch_data(data_url)
housing.head(5)

### Create pipeline

In [None]:
# Column roles: regression target, categorical feature(s), numeric features.
targetCol = "median_house_value"
catCols = ["ocean_proximity"]
numCols = ["housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"]

# Numeric columns: impute missing values (SimpleImputer defaults), then standardize.
num_pipeline = Pipeline([
                    ("Imputer", SimpleImputer()),
                    ("Scaler", StandardScaler())
                ])

# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero row instead of raising, so a rare category missing from the training
# split cannot break transform() on the test split or at serving time.
full_pipeline = ColumnTransformer([
                    ("Numerical_Pipeline", num_pipeline, numCols),
                    ("OneHot", OneHotEncoder(handle_unknown="ignore"), catCols)
                ])

### Data processing

In [None]:
# Feature matrix (numeric columns first, then categorical) and target vector.
x = housing[numCols + catCols]
y = housing[targetCol]

# Shuffled 80/20 split with a fixed seed.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Fit the preprocessing on the training split only, then apply it to both splits.
x_train = full_pipeline.fit_transform(x_train_raw)
x_test = full_pipeline.transform(x_test_raw)

In [None]:
# Names of the transformed columns: the numeric columns followed by the
# categories learned by the fitted one-hot encoder for its first input column.
onehot_encoder = full_pipeline.named_transformers_['OneHot']
categories = list(onehot_encoder.categories_[0])
feature_names = [*numCols, *categories]

### Build Model

In [None]:
# Fit an ordinary least-squares model on the transformed training split.
model = LinearRegression().fit(x_train, y_train)

# Evaluate on the held-out split: RMSE is the square root of the mean squared error.
predictions = model.predict(x_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

### Save the pipeline and the model

In [None]:
# Serialize the fitted model first, then the fitted preprocessing pipeline.
for _obj, _target in ((model, model_path), (full_pipeline, pipeline_path)):
    joblib.dump(_obj, _target)

In [None]:
# Map a unique artifact name to each serialized file. The mapping is handed to
# `mlflow.pyfunc.log_model`, and the same names are used as keys of
# `context.artifacts` when the files are loaded back in `ModelWrapper.load_context`.
artifacts = dict(pipeline=pipeline_path, model=model_path)

# Define the model class
class ModelWrapper(mlflow.pyfunc.PythonModel):
    """Pyfunc wrapper that applies the saved preprocessing pipeline before the model.

    Both objects are restored from the ``"pipeline"`` and ``"model"`` artifacts
    in ``load_context``; ``predict`` then transforms the input and returns the
    model's predictions.
    """

    # Column labels assigned to the input, by position, before it is passed to
    # the pipeline.
    FEATURE_COLUMNS = ("housing_median_age", "total_rooms", "total_bedrooms",
                       "population", "households", "median_income", "ocean_proximity")

    def load_context(self, context):
        """Load the pipeline and the model from the artifact paths in ``context``."""
        self.pipeline = joblib.load(context.artifacts["pipeline"])
        self.model = joblib.load(context.artifacts["model"])

    def predict(self, context, model_input):
        """Return predictions for ``model_input``.

        Args:
            context: Pyfunc context; not used here.
            model_input: DataFrame whose columns are, by position, those listed
                in ``FEATURE_COLUMNS``. Its labels are overwritten on a copy, so
                the caller's frame is left untouched.

        Returns:
            The output of ``self.model.predict`` on the transformed input.

        Raises:
            ValueError: Raised by pandas when the number of input columns
                differs from ``len(FEATURE_COLUMNS)``.
        """
        # Relabel a copy: assigning ``.columns`` on the argument itself would
        # silently rename the columns of the caller's DataFrame.
        features = model_input.copy()
        features.columns = list(self.FEATURE_COLUMNS)
        input_matrix = self.pipeline.transform(features)
        return self.model.predict(input_matrix)

In [None]:
# Create a Conda environment for the new MLflow Model that contains all necessary dependencies.
# Every pip requirement, including mlflow itself, is pinned to the version installed in this
# kernel so the serving environment matches the one the model was trained and pickled with.
conda_env = {
    'channels': ['defaults'],
    'dependencies': [
      'python={}'.format(PYTHON_VERSION),
      'pip',
      {
        'pip': [
          'mlflow=={}'.format(mlflow.__version__),
          'scikit-learn=={}'.format(sklearn.__version__),
          'joblib=={}'.format(joblib.__version__),
          'cloudpickle=={}'.format(cloudpickle.__version__),
        ],
      },
    ],
    'name': 'model_env'
}

In [None]:
# Input schema: numeric features as "double" first, then categorical features as
# "string", in the same order as the feature frame (numCols + catCols).
column_specs = [ColSpec("double", name) for name in numCols]
column_specs += [ColSpec("string", name) for name in catCols]

input_schema = Schema(column_specs)
# Output: a single unnamed "double" column.
output_schema = Schema([ColSpec('double')])
sign = ModelSignature(inputs=input_schema, outputs=output_schema)

In [None]:
experiment_name = "regres_sk_housing"

# Reuse the experiment when it already exists; otherwise create it with the
# configured artifact location and fetch the newly created record.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(
        experiment_name,
        artifact_location=artifact_uri,
    )
    experiment = mlflow.get_experiment(experiment_id)

mlflow.set_experiment(experiment.name)

# The model is logged under an artifact path named after the experiment.
mlflow_pyfunc_model_path = experiment_name

# Run parameters, logged in this order.
run_params = {
    'data_url': data_url,
    'data_version': version,
    'input_rows': x_train.shape[0],
    'input_columns': x_train.shape[1],
}

with mlflow.start_run():
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    mlflow.log_metric("rmse", rmse)
    mlflow.pyfunc.log_model(
        mlflow_pyfunc_model_path,
        python_model=ModelWrapper(),
        artifacts=artifacts,
        signature=sign,
        conda_env=conda_env,
    )

    # Explain the fitted regressor on the first 100 transformed test rows.
    shap_sample = pd.DataFrame(data=x_test[:100], columns=feature_names)
    mlflow.shap.log_explanation(model.predict, shap_sample)

In [None]:
# Optional alternative (disabled): save the MLflow Model to a local path instead of logging it to a run
# mlflow.pyfunc.save_model(
#         path=mlflow_pyfunc_model_path, python_model=ModelWrapper(), artifacts=artifacts,
#         conda_env=conda_env)