# MLflow Tutorial

This is a basic overview of MLflow, a tool for tracking and managing machine-learning models. The example predicts the quality of wine using the [sklearn.linear_model.ElasticNet](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html) model and the UCI [Wine Quality dataset](http://archive.ics.uci.edu/ml/datasets/Wine+Quality).



## Set your password (mandatory)

In [1]:
#PASSWORD = ""  # uncomment this line and set your password

## Enable kubectl to run MLflow backend (mandatory)

In [2]:
%kubeRefresh --pwd $PASSWORD



kubeconfig refresh failed


In [3]:
# This magic sets the environment variables that the MLflow backend requires.
%loadMlflow

Backend configured


## Set your experiment name

In [4]:
# Magic function '%Setexp' replaces the two lines below.
#mlflow.set_experiment('demoexp')
#mlflow.set_tag('mlflow.user','chris')
%Setexp --name demoexp

In [5]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

import mlflow
    
import mlflow.sklearn
from mlflow import log_metric, log_param, log_artifact
import time



  and should_run_async(code)


In [6]:
# Wine Quality Sample

homedir = !echo $HOME  # get user home directory for data
homedir = homedir[0]

def train(in_alpha, in_l1_ratio):
    """Train an ElasticNet model on the wine-quality data and log it to MLflow.

    Reads the local copy of the wine-quality CSV under ``homedir``, splits it
    into training and test sets, fits the model, prints RMSE / MAE / R2 for
    the test set, logs the parameters, metrics and fitted model to MLflow,
    and finally ends the active MLflow run.

    Args:
        in_alpha: Value for ElasticNet's ``alpha`` argument; anything
            ``float()`` accepts. ``None`` selects the default of 0.5.
        in_l1_ratio: Value for ElasticNet's ``l1_ratio`` argument; anything
            ``float()`` accepts. ``None`` selects the default of 0.5.
    """

    def eval_metrics(actual, pred):
        """Return ``(rmse, mae, r2)`` for predictions against actual values."""
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    warnings.filterwarnings("ignore")
    # Seed NumPy's global RNG so that the unseeded split below is repeatable.
    np.random.seed(40)

    # Read the local copy of the data set. It originates from
    # http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
    data_path = homedir + "/examples/mlflow/wine-quality.csv"
    data = pd.read_csv(data_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    # The frames are not called `train`/`test` to avoid shadowing this function.
    train_df, test_df = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train_df.drop(["quality"], axis=1)
    test_x = test_df.drop(["quality"], axis=1)
    train_y = train_df[["quality"]]
    test_y = test_df[["quality"]]

    # Fall back to the defaults when no value is provided. The argument itself
    # must be tested: float(None) raises TypeError and never returns None.
    alpha = 0.5 if in_alpha is None else float(in_alpha)
    l1_ratio = 0.5 if in_l1_ratio is None else float(in_l1_ratio)

    # Execute ElasticNet
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    lr.fit(train_x, train_y)

    # Evaluate Metrics
    predicted_qualities = lr.predict(test_x)
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    # Print out metrics
    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Log parameter, metrics, and model to MLflow
    mlflow.log_param("alpha", alpha)
    mlflow.log_param("l1_ratio", l1_ratio)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    mlflow.sklearn.log_model(lr, "model")

    # End the current run so the next call to train() is tracked separately.
    mlflow.end_run()

  and should_run_async(code)


## Train and track models 
MLflow provides a tracking UI that shows every historical training run and its logged model components, which is also useful for hyperparameter tuning. Here we try three hyperparameter combinations:

In [7]:
train(0.5, 0.5)

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 0.82224284975954
  MAE: 0.6278761410160693
  R2: 0.12678721972772689


In [8]:
train(0.2, 0.2)

Elasticnet model (alpha=0.200000, l1_ratio=0.200000):
  RMSE: 0.7859129997062342
  MAE: 0.6155290394093895
  R2: 0.2022463182289208


In [9]:
train(0.1, 0.1)

Elasticnet model (alpha=0.100000, l1_ratio=0.100000):
  RMSE: 0.7792546522251949
  MAE: 0.6112547988118586
  R2: 0.2157063843066196


## Train the best model
Use the Ezmeral Training cluster, which can be configured with more resources, to train the model on big data with the selected hyperparameters (alpha=0.5, l1_ratio=0.5).

In [10]:
%attachments

Training Cluster        ML Engine
----------------------  -----------
trainingengineinstance  python


Ensure that the data set exists in the shared project folder:

In [11]:
!cp $homedir/examples/mlflow/wine-quality.csv /bd-fs-mnt/project_repo/data/

Copy and paste the name of the Training Cluster as a magic (e.g., `%%trainingengineinstance`) to run the model on the training cluster 

In [12]:
%%capture history_url

%%trainingengineinstance

# Make sure to copy and paste all of the training code after the magic.

import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

import mlflow
    
import mlflow.sklearn
from mlflow import log_metric, log_param, log_artifact

def train(in_alpha, in_l1_ratio, data_path=None):
    """Train an ElasticNet model on the wine-quality data and log it to MLflow.

    Reads the wine-quality CSV, splits it into training and test sets, fits
    the model, prints RMSE / MAE / R2 for the test set, logs the parameters,
    metrics and fitted model to MLflow, and finally ends the active MLflow
    run.

    Args:
        in_alpha: Value for ElasticNet's ``alpha`` argument; anything
            ``float()`` accepts. ``None`` selects the default of 0.5.
        in_l1_ratio: Value for ElasticNet's ``l1_ratio`` argument; anything
            ``float()`` accepts. ``None`` selects the default of 0.5.
        data_path: Path of the CSV file to read. ``None`` (the default) reads
            the copy in the shared project repository volume.
    """

    def eval_metrics(actual, pred):
        """Return ``(rmse, mae, r2)`` for predictions against actual values."""
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    warnings.filterwarnings("ignore")
    # Seed NumPy's global RNG so that the unseeded split below is repeatable.
    np.random.seed(40)

    # Read from shared data volume (make sure to put the data in the Project
    # Repository on ECP) unless the caller supplied an explicit path.
    if data_path is None:
        data_path = "/bd-fs-mnt/project_repo/data/wine-quality.csv"
    data = pd.read_csv(data_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    # The frames are not called `train`/`test` to avoid shadowing this function.
    train_df, test_df = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train_df.drop(["quality"], axis=1)
    test_x = test_df.drop(["quality"], axis=1)
    train_y = train_df[["quality"]]
    test_y = test_df[["quality"]]

    # Fall back to the defaults when no value is provided. The argument itself
    # must be tested: float(None) raises TypeError and never returns None.
    alpha = 0.5 if in_alpha is None else float(in_alpha)
    l1_ratio = 0.5 if in_l1_ratio is None else float(in_l1_ratio)

    # Execute ElasticNet
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    lr.fit(train_x, train_y)

    # Evaluate Metrics
    predicted_qualities = lr.predict(test_x)
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    # Print out metrics
    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Log parameter, metrics, and model to MLflow
    mlflow.log_param("alpha", alpha)
    mlflow.log_param("l1_ratio", l1_ratio)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    mlflow.sklearn.log_model(lr, "model")

    # End the current run so a later call to train() is tracked separately.
    mlflow.end_run()
    
train(0.5, 0.5)


In [13]:
# Pull the job history URL out of the output captured by `%%capture history_url`.
# NOTE(review): assumes the URL is the third space-separated token of that
# captured text; confirm against the training magic's message format.
historyurl = history_url.stdout.split(' ')[2]
print(historyurl)

http://trainingengineinstance-restserver-stxtr-0.trainingengineinstance8cdvg.k8s-aiml-t1.svc.cluster.local:10001/history/28



In [14]:
%logs --url $historyurl
time.sleep(5) # wait here for the training to finish to see the final log
%logs --url $historyurl
# rerun this cell if you don't see "Job Status" updates

Job Status: Running
Job Status: Finished
Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
RMSE: 0.82224284975954
MAE: 0.6278761410160693
R2: 0.12678721972772689




## Reference:
- This example is a notebook version of `train.py`; the data set comes from the paper:
  - P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. *Modeling wine preferences by data mining from physicochemical properties*. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.