# Train a Scikit-Learn model in SageMaker and track with MLFlow

## Setup Environment

In [None]:
!pip install -q --upgrade pip
!pip install -q --upgrade sagemaker==2.16.3

In [1]:
import sys
sys.path.append('../../..//../')

In [2]:
import sagemaker
import pandas as pd
from sklearn.datasets import load_boston
from sagemaker.sklearn.estimator import SKLearn
from sklearn.model_selection import train_test_split


In [3]:
from deep.constants import *
from deep.utils import *

In [28]:
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'

MLFLOW_SERVER = "mlflow-terratest-387470f3-828569864.us-east-1.elb.amazonaws.com:80"
tracking_uri = MLFLOW_SERVER 

## Prepare data
We load a dataset from sklearn, split it and send it to S3

In [33]:
# we use the Boston housing dataset 
data = load_boston()

X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)

trainX = pd.DataFrame(X_train, columns=data.feature_names)
trainX['target'] = y_train

testX = pd.DataFrame(X_test, columns=data.feature_names)
testX['target'] = y_test

In [34]:
job_name = f"sklearn-{formatted_time()}-test"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name
train_path = str(input_path / 'boston_train.csv')
test_path = str(input_path / 'boston_test.csv')

trainX.to_csv(train_path)
testX.to_csv(test_path)

In [35]:
# # send data to S3. SageMaker will take training data from s3
# train_path = sess.upload_data(path='boston_train.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')
# test_path = sess.upload_data(path='boston_test.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')

## Train

In [36]:
hyperparameters = {
    'tracking_uri': tracking_uri,
    'experiment_name': 'boston-housing',
    'n-estimators': 100,
    'min-samples-leaf': 3,
    'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
    'target': 'target'
}

metric_definitions = [{'Name': 'median-AE', 'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"}]

estimator = SKLearn(
    entry_point='train.py',
    source_dir='source_dir',
    role=role,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version='0.23-1',
    job_name=job_name,
)

In [37]:
estimator.fit({'train':train_path, 'test': test_path})

2021-06-23 18:53:07 Starting - Starting the training job...
2021-06-23 18:53:31 Starting - Launching requested ML instancesProfilerReport-1624474384: InProgress
...
2021-06-23 18:54:11 Starting - Preparing the instances for training.........
2021-06-23 18:55:40 Downloading - Downloading input data...
2021-06-23 18:56:12 Training - Downloading the training image..[34m2021-06-23 18:56:30,872 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-06-23 18:56:30,875 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-06-23 18:56:30,884 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-06-23 18:56:31,269 sagemaker-training-toolkit INFO     Installing module with the following command:[0m
[34m/miniconda3/bin/python -m pip install . -r requirements.txt[0m
[34mProcessing /opt/ml/code[0m
[34mCollecting mlflow==1.12.1
  Downloading mlflow-1.12.1-py3-none-any.w

UnexpectedStatusException: Error for Training job sagemaker-scikit-learn-2021-06-23-18-53-03-265: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 100, in run
    wait, capture_error
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 161, in run
    cwd=environment.code_dir,
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 81, in check_error
    raise error_class(return_code=return_code, cmd=" ".join(cmd), output=stderr)
sagemaker_training.errors.ExecuteUserScriptError: ExecuteUserScriptError:
Command "/miniconda3/bin/python 