# Train a Scikit-Learn model in SageMaker and track it with MLflow

## Setup Environment

In [1]:
import sys
# Add the directory two levels up to the import path (presumably the project
# root that holds the `deep` package imported below — TODO confirm).
sys.path.append('../../')

In [2]:
import sagemaker
import pandas as pd
from sklearn.datasets import load_boston
from sagemaker.sklearn.estimator import SKLearn
from sklearn.model_selection import train_test_split


In [3]:
from deep.constants import *
from deep.utils import *

Use the project constants `DEV_BUCKET`, `SAGEMAKER_ROLE` and `MLFLOW_SERVER` for the session bucket, the execution role and the tracking URI; the example does not work with other values.

In [12]:
# SageMaker session whose default bucket is the development bucket's name.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

# Role and MLflow tracking URI both come from the star-imported project constants.
role, tracking_uri = SAGEMAKER_ROLE, MLFLOW_SERVER

## Prepare data
We load a dataset from sklearn, split it into train and test sets, and upload both to S3. Any other dataset and preprocessing can be used instead.

In [5]:
# Boston housing dataset from scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release.
data = load_boston()

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

# One frame per split: the feature columns plus the label in a `target` column.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

## Upload data to S3

Please keep this format for the job name. Replace `sklearn` with the library you are using (e.g. `pytorch`), and replace `test` with any suffix you like.

In [6]:
# Job name in the "<library>-<formatted time>-<suffix>" format.
job_name = f"sklearn-{formatted_time()}-test"

# Both CSVs are stored under a prefix of DEV_BUCKET that is specific to this job.
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name
train_path, test_path = (
    str(input_path / filename)
    for filename in ('boston_train.csv', 'boston_test.csv')
)

# Write each split to its destination path.
trainX.to_csv(train_path)
testX.to_csv(test_path)

## Train

In [10]:
# Regex used to pull the `median-AE` metric out of the training job's log output.
metric_definitions = [
    {
        'Name': 'median-AE',
        'Regex': "AE-at-50th-percentile: ([0-9.]+).*$",
    },
]

# Values handed to the training script as hyperparameters: the MLflow tracking
# settings, the model settings, and the feature / target column names.
hyperparameters = {
    'tracking_uri': tracking_uri,
    'experiment_name': 'boston-housing',
    'n-estimators': 100,
    'min-samples-leaf': 3,
    'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
    'target': 'target',
}

estimator = SKLearn(
    # Training script and the directory it is packaged from.
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / '../examples/mlflow-sklearn'),
    framework_version='0.23-1',
    # Where the packaged code and the resulting model artifacts are stored.
    code_location=str(input_path),
    output_path=str(DEV_BUCKET / 'models/'),
    # Compute and permissions.
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    # Script inputs and the metric to extract.
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    # NOTE(review): in SageMaker Python SDK v2 the job name is a `fit()`
    # argument; confirm this constructor keyword is actually honoured.
    job_name=job_name,
)

In [11]:
# Launch the training job with the `train` and `test` channels pointing at the
# uploaded CSVs. The job name is passed here because `fit()` is where the
# SageMaker SDK accepts an explicit `job_name`; without it the SDK generates a
# name of its own and the `sklearn-<time>-test` name would not be used.
estimator.fit({'train': train_path, 'test': test_path}, job_name=job_name)

2021-06-24 11:32:21 Starting - Starting the training job...
2021-06-24 11:32:44 Starting - Launching requested ML instancesProfilerReport-1624534338: InProgress
......
2021-06-24 11:33:44 Starting - Preparing the instances for training.........
2021-06-24 11:35:35 Downloading - Downloading input data
2021-06-24 11:35:35 Training - Downloading the training image...
2021-06-24 11:36:05 Training - Training image download completed. Training in progress.[34m2021-06-24 11:35:51,613 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-06-24 11:35:51,629 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-06-24 11:35:51,646 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-06-24 11:35:58,973 sagemaker-training-toolkit INFO     Installing module with the following command:[0m
[34m/miniconda3/bin/python -m pip install . -r requirements.txt[0m
[34mProcessing /