In [1]:
import mlflow
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split

In [2]:
# Request WARNING-level root logging and create the logger used by the helpers below.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(name=__name__)

## Preparing the data

In [3]:
# path = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 
path = "../data/winequality.csv"

In [4]:
def get_data(uri):
    """Load a semicolon-separated CSV into a DataFrame.

    Args:
        uri: Local path or URL accepted by ``pandas.read_csv``.

    Returns:
        The ``pandas.DataFrame`` parsed with ``sep=";"``.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raised; it is logged with its
            traceback and then re-raised unchanged.
    """
    try:
        return pd.read_csv(uri, sep=";")
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e)
        # Re-raise: falling through to a `return data` here would fail with an
        # UnboundLocalError and hide the real cause of the failure.
        raise


In [7]:
data = get_data(path)

In [10]:
# Save the loaded frame as a semicolon-separated CSV under the relative data/ directory.
# NOTE(review): this writes to data/..., while `path` reads from ../data/... — confirm which is intended.
data.to_csv("data/winequality.csv", sep=";", index=False)

In [8]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [9]:
# Fixed seed so the split, and therefore every metric logged below, is reproducible
# after Restart Kernel -> Run All.
train, test = train_test_split(data, random_state=42)

# "quality" is the regression target; every other column is used as a feature.
X_train = train.drop(columns=["quality"])
X_test = test.drop(columns=["quality"])
y_train = train[["quality"]]
y_test = test[["quality"]]

## Training with mlflow

In [10]:
mlflow.set_tracking_uri("http://localhost:5000")

In [11]:
# Reuse the experiment if the tracking server already has it; otherwise create it.
experiment_name = "wine-quality-experiment"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

In [12]:
experiment

### Modelling

In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor

In [14]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
def eval_metrics(y_true, y_pred):
    """Score predictions against the ground truth.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(rmse, mae, r2)``: square root of the mean squared error,
        mean absolute error, and R² score, in that order.
    """
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    return root_mse, mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)
        
    

def train_model(X_train, X_test, y_train, y_test, experiment_id, parameters, model_type="sklearn"):
    """Train a regressor inside a new MLflow run and log params, metrics and the model.

    Args:
        X_train: Training features passed to ``model.fit``.
        X_test: Held-out features passed to ``model.predict``.
        y_train: Training targets passed to ``model.fit``.
        y_test: Held-out targets used to compute the metrics.
        experiment_id: MLflow experiment under which the run is created.
        parameters: Keyword arguments for the estimator constructor; also logged
            as the run's parameters.
        model_type: ``"sklearn"`` trains an ``ElasticNet``; ``"xgboost"`` trains
            an ``XGBRegressor``.

    Returns:
        Tuple ``(model, run_id)``: the fitted estimator and the ID of the run
        it was logged to.

    Raises:
        ValueError: If ``model_type`` is neither ``"sklearn"`` nor ``"xgboost"``.
    """
    # Validate before opening a run so a typo does not leave an empty, failed
    # run behind on the tracking server.
    if model_type not in ("sklearn", "xgboost"):
        raise ValueError("Invalid model_type")

    with mlflow.start_run(experiment_id=experiment_id) as run:
        print(run.info.artifact_uri, run.info.run_name, run.info.run_id)
        if model_type == "sklearn":
            model = ElasticNet(**parameters)
            log_model_func = mlflow.sklearn.log_model
        else:
            model = XGBRegressor(**parameters)
            log_model_func = mlflow.xgboost.log_model

        model.fit(X_train, y_train)

        # Evaluate on the held-out set
        predicted_qualities = model.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, predicted_qualities)

        # Log metrics and parameters
        metric_dict = {"RMSE": rmse, "MAE": mae, "R2": r2}
        print(f"  parameters: {parameters}")
        mlflow.log_params(parameters)
        print(f"  metrics: {metric_dict}")
        mlflow.log_metrics(metric_dict)

        # Log the fitted model under the "model" artifact path
        log_model_func(model, "model")
        return model, run.info.run_id

In [17]:
# XGBoost run with n_estimators=10; the cell displays the (model, run_id) tuple.
model_type = "xgboost"
bst_10 = train_model(
    X_train=X_train.values,
    X_test=X_test.values,
    y_train=y_train.values,
    y_test=y_test.values,
    experiment_id=experiment_id,
    parameters=dict(alpha=10, n_estimators=10),
    model_type=model_type,
)
bst_10

mlflow-artifacts:/2/99dea9ded3a74bef87f65a4b79e562f6/artifacts bemused-robin-43 99dea9ded3a74bef87f65a4b79e562f6
  parameters: {'alpha': 10, 'n_estimators': 10}
  metrics: {'RMSE': 0.6418836987126934, 'MAE': 0.49828591108322146, 'R2': 0.2846656839739785}


(XGBRegressor(alpha=10, base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=10, n_jobs=None, num_parallel_tree=None,
              predictor=None, ...),
 '99dea9ded3a74bef87f65a4b79e562f6')

In [18]:
# XGBoost run with n_estimators=100; the cell displays the (model, run_id) tuple.
model_type = "xgboost"
bst_100 = train_model(
    X_train=X_train.values,
    X_test=X_test.values,
    y_train=y_train.values,
    y_test=y_test.values,
    experiment_id=experiment_id,
    parameters=dict(alpha=10, n_estimators=100),
    model_type=model_type,
)
bst_100

mlflow-artifacts:/2/12f4f11316914e669733544324d1acc1/artifacts enthused-panda-29 12f4f11316914e669733544324d1acc1
  parameters: {'alpha': 10, 'n_estimators': 100}
  metrics: {'RMSE': 0.5977678682048612, 'MAE': 0.4560557770729065, 'R2': 0.3796146981063687}


(XGBRegressor(alpha=10, base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, ...),
 '12f4f11316914e669733544324d1acc1')

In [20]:
# -p keeps the cell re-runnable: plain mkdir fails once the directory exists.
!mkdir -p xgb_wine

Prepare artefacts for deployment

In [23]:
%%writefile xgb_wine/model-settings.json
{
    "name": "wine-xgboost",
    "implementation": "mlserver_xgboost.XGBoostModel",
    "parameters": {
        "version": "v0.1.0"
    }
}

Overwriting xgb_wine/model-settings.json


In [24]:
def prepare_xgb_folder(run_id):
    """Export a logged XGBoost model to ``xgb_wine/`` and attach that folder to its run.

    Loads the model stored at ``runs:/<run_id>/model``, saves it as
    ``xgb_wine/model.bst`` and logs the whole ``xgb_wine/`` directory to the
    same run under the artifact path ``xgb_wine/model``.

    Args:
        run_id: ID of the MLflow run that holds the logged model.
    """
    import os

    # Create the folder on demand so the function does not rely on an earlier mkdir cell.
    os.makedirs("xgb_wine/", exist_ok=True)

    bst = mlflow.xgboost.load_model(f'runs:/{run_id}/model')
    model_file_name = 'xgb_wine/model.bst'
    bst.save_model(model_file_name)

    # Resume the existing run by ID only: MLflow applies experiment_id solely when
    # creating a new run, so no notebook-global experiment_id is needed here.
    with mlflow.start_run(run_id=run_id):
        mlflow.log_artifacts("xgb_wine/", artifact_path="xgb_wine/model")
    print("Artefacts logged")

# Export the n_estimators=10 model; its run id in the recorded output is
# 99dea9ded3a74bef87f65a4b79e562f6.
print(bst_10[-1])
prepare_xgb_folder(bst_10[-1])

99dea9ded3a74bef87f65a4b79e562f6
Artefacts logged


Deploying the model

In [25]:
%%writefile ../seldon/xgb_wine.yaml
# SeldonDeployment that serves the exported XGBoost model via the XGBOOST_SERVER implementation.
apiVersion: machinelearning.seldon.io/v1alpha2
kind: SeldonDeployment
metadata:
  name: xgb-wine-k8s
  namespace: ml-models
spec:
  name: xgb-wines
  predictors:
    - graph:
        children: []
        # Name of the secret referenced for the model download.
        envSecretRefName: seldon-init-container-secret
        implementation: XGBOOST_SERVER
        # Run-specific folder: 99dea9de... is the run id printed by the n_estimators=10
        # training cell, and xgb_wine/model is the artifact path used by prepare_xgb_folder.
        # Update the run id after retraining.
        modelUri: s3://models/2/99dea9ded3a74bef87f65a4b79e562f6/artifacts/xgb_wine/model/
        name: classifier
      name: default
      replicas: 1

Writing ../seldon/xgb_wine.yaml


In [26]:
!kubectl apply -n ml-models -f ../seldon/xgb_wine.yaml

seldondeployment.machinelearning.seldon.io/xgb-wine-k8s created


Testing the model

In [34]:
# Run in a separate terminal first: kubectl port-forward -n ml-models svc/xgb-wine-k8s-default 8000

In [27]:
import requests

In [28]:
response = requests.post(
    "http://localhost:8000/api/v1.0/predictions",
    json={"data": {"ndarray": [[7.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.9978, 3.51, 0.56, 9.4]]}},
    # Bound the wait so an unresponsive server cannot hang the kernel indefinitely.
    timeout=30,
)
# Surface the HTTP status on failure instead of a confusing JSON/KeyError below.
response.raise_for_status()

print("wine quality prediction:", response.json()['data']['ndarray'][0])

wine quality prediction: 5.027922630310059


Deploy a new version

In [30]:
# Export the n_estimators=100 model; its run id in the recorded output is
# 12f4f11316914e669733544324d1acc1.
print(bst_100[-1])
prepare_xgb_folder(bst_100[-1])

12f4f11316914e669733544324d1acc1
Artefacts logged


In [31]:
%%writefile ../seldon/xgb_wine.yaml
# Same SeldonDeployment name (xgb-wine-k8s), now pointing at the n_estimators=100 run's artifacts.
apiVersion: machinelearning.seldon.io/v1alpha2
kind: SeldonDeployment
metadata:
  name: xgb-wine-k8s
  namespace: ml-models
spec:
  name: xgb-wines
  predictors:
    - graph:
        children: []
        # Name of the secret referenced for the model download.
        envSecretRefName: seldon-init-container-secret
        implementation: XGBOOST_SERVER
        # Run-specific folder: 12f4f113... is the run id printed by the n_estimators=100
        # training cell, and xgb_wine/model is the artifact path used by prepare_xgb_folder.
        # Update the run id after retraining.
        modelUri: s3://models/2/12f4f11316914e669733544324d1acc1/artifacts/xgb_wine/model/
        name: classifier
      name: default
      replicas: 1

Overwriting ../seldon/xgb_wine.yaml


In [33]:
!kubectl apply -n ml-models -f ../seldon/xgb_wine.yaml

seldondeployment.machinelearning.seldon.io/xgb-wine-k8s unchanged


In [34]:
response = requests.post(
    "http://localhost:8000/api/v1.0/predictions",
    json={"data": {"ndarray": [[7.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.9978, 3.51, 0.56, 9.4]]}},
    # Bound the wait so an unresponsive server cannot hang the kernel indefinitely.
    timeout=30,
)
# Surface the HTTP status on failure instead of a confusing JSON/KeyError below.
response.raise_for_status()

print("wine quality prediction:", response.json()['data']['ndarray'][0])

wine quality prediction: 5.0922346115112305


Train an sklearn model and deploy the MLflow artefact

In [36]:
# ElasticNet run (alpha=0.0005, l1_ratio=0.99); the cell displays the (model, run_id) tuple.
en_99 = train_model(
    X_train,
    X_test,
    y_train,
    y_test,
    experiment_id,
    parameters={"alpha": 0.0005, "l1_ratio": 0.99},
    model_type="sklearn",
)
en_99

mlflow-artifacts:/2/3304a579914342b5b2ab830cbe925e0c/artifacts sincere-cow-804 3304a579914342b5b2ab830cbe925e0c
  parameters: {'alpha': 0.0005, 'l1_ratio': 0.99}
  metrics: {'RMSE': 0.6205373614258609, 'MAE': 0.4896880688132586, 'R2': 0.33145255102153803}


(ElasticNet(alpha=0.0005, l1_ratio=0.99), '3304a579914342b5b2ab830cbe925e0c')

In [37]:
import conda_pack

# Pack the conda environment into a tarball (overwriting any previous archive);
# the call is the cell's last expression, so its return value is displayed.
env_file_path = "environment.tar.gz"
conda_pack.pack(
    output=env_file_path,
    verbose=True,
    force=True,
    ignore_missing_files=True,
    ignore_editable_packages=False,
)

Collecting packages...
Packing environment at '/Users/danielvargas/miniconda3/envs/mlflow-k8s' to 'environment.tar.gz'
[########################################] | 100% Completed | 26.6s


'environment.tar.gz'

In [38]:
# Attach the packed environment to the ElasticNet run, next to the logged model
# (run 3304a579914342b5b2ab830cbe925e0c in the recorded output).
run_id = en_99[1]
with mlflow.start_run(run_id=run_id, experiment_id=experiment_id):
    mlflow.log_artifact(local_path=env_file_path, artifact_path="model")

In [39]:
%%writefile ../seldon/mlflow_wine.yaml
# SeldonDeployment that serves the MLflow-logged ElasticNet model via the MLFLOW_SERVER implementation.
apiVersion: machinelearning.seldon.io/v1alpha2
kind: SeldonDeployment
metadata:
  name: mlflow-wine
  namespace: ml-models
spec:
  # The test request below targets the /v2/models/<name>/infer endpoint.
  protocol: kfserving
  name: wines
  predictors:
    - graph:
        children: []
        # Name of the secret referenced for the model download.
        envSecretRefName: seldon-init-container-secret
        implementation: MLFLOW_SERVER
        # Run-specific folder: 3304a579... is the run id printed by the ElasticNet training
        # cell; "model" is the artifact path holding the logged model and environment.tar.gz.
        # Update the run id after retraining.
        modelUri: s3://models/2/3304a579914342b5b2ab830cbe925e0c/artifacts/model
        # Graph node name; it is the model name used in the inference URL (/v2/models/classifier/infer).
        name: classifier
      name: default
      replicas: 1


Writing ../seldon/mlflow_wine.yaml


In [40]:
!kubectl apply -n ml-models -f ../seldon/mlflow_wine.yaml

seldondeployment.machinelearning.seldon.io/mlflow-wine created


In [68]:
# Run in a separate terminal first: kubectl port-forward -n ml-models svc/mlflow-wine-default 8000

In [41]:
# Inference payload for a single wine sample: one FP32 input of shape [1] per
# feature column, with the request as a whole tagged as content type "pd".
inference_request = {
    "parameters": {"content_type": "pd"},
    "inputs": [
        {
            "name": column,
            "shape": [1],
            "datatype": "FP32",
            "data": [value],
            "parameters": {"content_type": "np"},
        }
        for column, value in {
            "fixed acidity": 7.4,
            "volatile acidity": 0.7000,
            "citric acid": 0,
            "residual sugar": 1.9,
            "chlorides": 0.076,
            "free sulfur dioxide": 11,
            "total sulfur dioxide": 34,
            "density": 0.9978,
            "pH": 3.51,
            "sulphates": 0.56,
            "alcohol": 9.4,
        }.items()
    ],
}


In [44]:
endpoint = "http://localhost:8000/v2/models/classifier/infer"
# Bound the wait so an unresponsive server cannot hang the kernel indefinitely.
response = requests.post(endpoint, json=inference_request, timeout=30)

In [45]:
# raise_for_status() reports the failing status code and URL, which a bare assert does not.
response.raise_for_status()
print("Wine quality:", response.json()['outputs'][0]['data'][0])


Wine quality: 5.008359887222566
