In [46]:
import numpy as np
import pandas as pd
import sqlite3 as sq
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

## Cleaning data before applying Autopilot

In [None]:
# Load the gzip-compressed CSV straight into a DataFrame; pandas handles the decompression.
df4 = pd.read_csv('df_final.csv.gz', compression='gzip')
df4.head()

Unnamed: 0,year,department,base_salary,overtime,irregular_cash,total_benefits,city_id,annual_average_cpi
0,2020,Parks,5257.5,0.0,139.32,418.88,2,258.8
1,2020,City Mgmt,7699.19,1916.9,0.0,746.36,2,258.8
2,2020,City Mgmt,2619.15,930.5,0.0,275.51,2,258.8
3,2020,City Mgmt,1870.62,591.22,0.0,191.07,2,258.8
4,2020,Public Works,158812.14,0.0,5676.94,60283.48,2,258.8


## Feature engineering

In [55]:
# Cast city_id and year to strings so get_dummies treats them as categories, not numbers.
df4 = df4.astype({'city_id': str, 'year': str})
# One-hot encode every string column; drop_first=True removes one level per
# category to avoid the dummy variable trap.
df4 = pd.get_dummies(df4, drop_first=True)
df4.head()

Unnamed: 0,base_salary,overtime,irregular_cash,total_benefits,annual_average_cpi,year_2014,year_2015,year_2016,year_2017,year_2018,...,department_Human Services,department_IT,department_Law and Reg,"department_Libraries, Arts, Science, Museums",department_Parks,department_Police,department_Port,department_Public Works,city_id_2,city_id_3
0,5257.5,0.0,139.32,418.88,258.8,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,7699.19,1916.9,0.0,746.36,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2619.15,930.5,0.0,275.51,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1870.62,591.22,0.0,191.07,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,158812.14,0.0,5676.94,60283.48,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [56]:
# Rescale every monetary column to the price level of the highest CPI found in
# the data (described in the original note as present-day, 2021).
# NOTE: the columns are overwritten, so running this cell twice applies the
# adjustment twice.
money_columns = ['base_salary', 'overtime', 'irregular_cash', 'total_benefits']
max_cpi = df4['annual_average_cpi'].max()

# Row-wise factor max_cpi / row_cpi, applied to all monetary columns at once.
df4[money_columns] = df4[money_columns].mul(max_cpi / df4['annual_average_cpi'], axis=0)
df4.head()

Unnamed: 0,base_salary,overtime,irregular_cash,total_benefits,annual_average_cpi,year_2014,year_2015,year_2016,year_2017,year_2018,...,department_Human Services,department_IT,department_Law and Reg,"department_Libraries, Arts, Science, Museums",department_Parks,department_Police,department_Port,department_Public Works,city_id_2,city_id_3
0,5505.341963,0.0,145.887635,438.626275,258.8,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,8062.134815,2007.26391,0.0,781.543895,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2742.618431,974.364374,0.0,288.49772,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1958.802241,619.090495,0.0,200.077164,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,166298.647372,0.0,5944.554637,63125.28238,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [2]:
# import sagemaker
# from sagemaker import get_execution_role
# import pandas as pd
# import numpy as np
# import boto3
# import os

In [None]:
# This cell runs before the cell that normally creates `sess`, so on a fresh
# kernel `sess` would be undefined here. Create the session locally so the
# notebook survives Restart & Run All.
import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = '<your_s3_prefix>'  # placeholder value; replace with a real S3 key prefix before use

# Autopilot

In [57]:
import boto3
import sagemaker
import pandas as pd
import json

In [58]:

# SageMaker SDK session, its default S3 bucket, and the execution role of this notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Low-level SageMaker API client, created for the region of the active boto3 configuration.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name="sagemaker", region_name=region)

In [59]:
# auto_df is a second name for the same DataFrame object as df4 (no copy is made).
auto_df = df4
auto_df.head()

Unnamed: 0,base_salary,overtime,irregular_cash,total_benefits,annual_average_cpi,year_2014,year_2015,year_2016,year_2017,year_2018,...,department_Human Services,department_IT,department_Law and Reg,"department_Libraries, Arts, Science, Museums",department_Parks,department_Police,department_Port,department_Public Works,city_id_2,city_id_3
0,5505.341963,0.0,145.887635,438.626275,258.8,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,8062.134815,2007.26391,0.0,781.543895,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2742.618431,974.364374,0.0,288.49772,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1958.802241,619.090495,0.0,200.077164,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,166298.647372,0.0,5944.554637,63125.28238,258.8,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [60]:
# Split all data into 75% train and 25% holdout (test_size=0.25); random_state fixes the shuffle
auto_train, auto_test = train_test_split(auto_df, test_size=0.25, random_state=508)

In [61]:
# (rows, columns) of the training split
auto_train.shape

(1074602, 26)

#### Write the train and test CSV files (with header) for Autopilot

In [66]:
# Local file names for the two splits.
autopilot_train_path = "./compensation_cpi_autopilot.csv"
autopilot_test_path = "./compensation_cpi_autopilot_test.csv"

# Write each split with a header row and without the DataFrame index.
for split_frame, split_path in (
    (auto_train, autopilot_train_path),
    (auto_test, autopilot_test_path),
):
    split_frame.to_csv(split_path, index=False, header=True)

#### Upload data to s3 for autopilot

In [63]:
# Upload the local training CSV under the "data" key prefix (no bucket argument is passed)
# and keep the S3 URI that upload_data returns.
train_s3_prefix = "data"
autopilot_train_s3_uri = sess.upload_data(path=autopilot_train_path, key_prefix=train_s3_prefix)
autopilot_train_s3_uri

's3://sagemaker-us-east-1-924490614652/data/compensation_cpi_autopilot.csv'

In [None]:
# Upload the holdout CSV under the same "data" key prefix as the training file
# and keep the S3 URI that upload_data returns.
train_s3_prefix = "data"
autopilot_test_s3_uri = sess.upload_data(path=autopilot_test_path, key_prefix=train_s3_prefix)
autopilot_test_s3_uri

's3://sagemaker-us-east-1-924490614652/data/compensation_cpi_autopilot_test.csv'

In [64]:
# List the uploaded training object with the AWS CLI to confirm it exists in S3
!aws s3 ls $autopilot_train_s3_uri

2023-04-12 19:46:18  113300010 compensation_cpi_autopilot.csv


In [65]:
# Persist the training-data S3 URI with IPython's %store so it can be restored later via `%store -r`
%store autopilot_train_s3_uri

Stored 'autopilot_train_s3_uri' (str)


#### View train data stored in S3 bucket

In [19]:
# Copy the uploaded training CSV from S3 back into ./tmp/ for inspection
!aws s3 cp $autopilot_train_s3_uri ./tmp/

download: s3://sagemaker-us-east-1-924490614652/data/compensation_cpi_autopilot.csv to tmp/compensation_cpi_autopilot.csv


In [22]:
# NOTE(review): `csv` is imported here but not referenced in this cell.
import csv

# Read the copy downloaded from S3 to check that the upload round-trips correctly.
df5 = pd.read_csv("./tmp/compensation_cpi_autopilot.csv")
df5.head()


Unnamed: 0,base_salary,overtime,irregular_cash,total_benefits,annual_average_cpi,year_2014,year_2015,year_2016,year_2017,year_2018,...,department_Human Services,department_IT,department_Law and Reg,"department_Libraries, Arts, Science, Museums",department_Parks,department_Police,department_Port,department_Public Works,city_id_2,city_id_3
0,23593.27511,0.0,1462.881959,11261.339705,251.1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,131477.705112,0.0,686.945501,58580.548416,236.7,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,102632.296947,9207.026043,6414.123918,49454.379521,258.8,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,31509.960417,0.0,0.0,10898.784417,240.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,134209.848042,30539.780417,14578.37725,41240.554167,240.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0


#### Setup the S3 Location for the Autopilot-Generated Assets

In [23]:
# Key prefix (inside the session bucket) under which Autopilot writes its outputs.
prefix_model_output = "models/autopilot"

# Assemble s3://<bucket>/<prefix>.
model_output_s3_uri = "s3://" + "/".join((bucket, prefix_model_output))

print(model_output_s3_uri)

s3://sagemaker-us-east-1-924490614652/models/autopilot


In [24]:
# Upper bound on the number of candidates Autopilot may train.
max_candidates = 3

# Stopping conditions: per-training-job runtime, candidate count, and overall job runtime (seconds).
completion_criteria = {
    "MaxRuntimePerTrainingJobInSeconds": 900,
    "MaxCandidates": max_candidates,
    "MaxAutoMLJobRuntimeInSeconds": 5400,
}
job_config = {"CompletionCriteria": completion_criteria}

# Training data location in S3 and the column Autopilot should learn to predict.
train_data_source = {
    "S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": str(autopilot_train_s3_uri)}
}
input_data_config = [
    {
        "DataSource": train_data_source,
        "TargetAttributeName": "total_benefits",
    }
]

# S3 location that receives everything the job generates.
output_data_config = {"S3OutputPath": str(model_output_s3_uri)}

#### Check For Existing Autopilot Jobs

In [26]:
# list_auto_ml_jobs returns one page of results per call. Walk every page with
# the client paginator so that a running job beyond the first page is not
# missed, and keep the same {"AutoMLJobSummaries": [...]} shape as a raw response.
existing_jobs_response = {
    "AutoMLJobSummaries": [
        summary
        for page in sm.get_paginator("list_auto_ml_jobs").paginate()
        for summary in page.get("AutoMLJobSummaries", [])
    ]
}

In [27]:
# Totals derived from the job listing: all known jobs, and those still in progress.
num_existing_jobs = 0
running_jobs = 0

if "AutoMLJobSummaries" not in existing_jobs_response:
    print("[OK] Please continue.")
else:
    job_list = existing_jobs_response["AutoMLJobSummaries"]
    num_existing_jobs = len(job_list)
    # Count only the summaries whose status is exactly "InProgress".
    running_jobs = sum(1 for job in job_list if job.get("AutoMLJobStatus") == "InProgress")
    print("[INFO] You have {} Autopilot job(s) currently running << Should be 0 jobs.".format(running_jobs))

[INFO] You have 0 Autopilot job(s) currently running << Should be 0 jobs.


### Launch the SageMaker Autopilot Job

In [28]:
from time import gmtime, strftime, sleep

In [29]:
# Try to restore a job name saved earlier with `%store auto_ml_job_name`.
%store -r auto_ml_job_name

try:
    auto_ml_job_name  # bare reference: raises NameError when nothing was restored
except NameError:
    # Nothing restored: build a new name from the current UTC day-hour-minute-second.
    timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())
    auto_ml_job_name = "automl-dm-" + timestamp_suffix
    print("Created AutoMLJobName: " + auto_ml_job_name)

no stored variable or alias auto_ml_job_name
Created AutoMLJobName: automl-dm-10-07-32-11


In [30]:
# Show the job name that the following cells will use
print(auto_ml_job_name)

automl-dm-10-07-32-11


In [31]:
# Persist the job name with %store so the `%store -r` above can restore it in a later session
%store auto_ml_job_name

Stored 'auto_ml_job_name' (str)


In [32]:
# Launch the Autopilot job unless one is already counted as running.
max_running_jobs = 1

if running_jobs < max_running_jobs:  # Limiting to max. 1 Jobs
    try:
        sm.create_auto_ml_job(
            AutoMLJobName=auto_ml_job_name,
            InputDataConfig=input_data_config,
            OutputDataConfig=output_data_config,
            AutoMLJobConfig=job_config,
            RoleArn=role,
        )
    except sm.exceptions.ResourceInUse:
        # A job with this name already exists (e.g. the cell was re-run with a restored name).
        print("[INFO] You have already launched an Autopilot job. Please continue see the output of this job.")
    except Exception as err:
        # Any other failure (permissions, validation, limits, ...) must stay visible:
        # the following cells cannot work without a created job.
        print("[ERROR] Could not create Autopilot Job {}: {}".format(auto_ml_job_name, err))
        raise
    else:
        print("[OK] Autopilot Job {} created.".format(auto_ml_job_name))
        running_jobs = running_jobs + 1
else:
    print(
        "[INFO] You have already launched {} Autopilot running job(s). Please continue see the output of the running job.".format(
            running_jobs
        )
    )

[OK] Autopilot Job automl-dm-10-07-32-11 created.


### Analyze Data and Generate Notebooks

In [33]:
# Fetch the job description, then poll every 15 seconds until it reports
# either a primary or a secondary status.
job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

while not (
    "AutoMLJobStatus" in job_description_response
    or "AutoMLJobSecondaryStatus" in job_description_response
):
    job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    print("[INFO] Autopilot Job has not yet started. Please wait. ")
    print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))
    print("[INFO] Waiting for Autopilot Job to start...")
    sleep(15)

print("[OK] AutoMLJob started.")

[OK] AutoMLJob started.


#### Review the SageMaker Processing Jobs

In [34]:
# `display` lives in IPython.display; importing it from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a console link to the Processing Jobs page of the current region.
# target="_blank" is the HTML keyword for "open in a new tab".
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/">Processing Jobs</a></b>'.format(
            region
        )
    )
)

In [35]:
%%time

job_status = job_description_response["AutoMLJobStatus"]
job_sec_status = job_description_response["AutoMLJobSecondaryStatus"]

if job_status not in ("Stopped", "Failed"):
    while job_status in ("InProgress") and job_sec_status in ("Starting", "AnalyzingData"):
        job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
        job_status = job_description_response["AutoMLJobStatus"]
        job_sec_status = job_description_response["AutoMLJobSecondaryStatus"]
        print(job_status, job_sec_status)
        sleep(15)
    print("[OK] Data analysis phase completed.\n")

print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))

InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress FeatureEngineering
[OK] Data analysis phase completed.



NameError: name 'json' is not defined

In [37]:
# Pretty-print the latest job description; default=str serialises values json cannot encode natively
print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))

{
    "AutoMLJobArn": "arn:aws:sagemaker:us-east-1:924490614652:automl-job/automl-dm-10-07-32-11",
    "AutoMLJobArtifacts": {
        "CandidateDefinitionNotebookLocation": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb",
        "DataExplorationNotebookLocation": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb"
    },
    "AutoMLJobConfig": {
        "CompletionCriteria": {
            "MaxAutoMLJobRuntimeInSeconds": 5400,
            "MaxCandidates": 3,
            "MaxRuntimePerTrainingJobInSeconds": 900
        }
    },
    "AutoMLJobName": "automl-dm-10-07-32-11",
    "AutoMLJobSecondaryStatus": "FeatureEngineering",
    

#### Waiting For Generated Notebooks

In [38]:
# Refresh the job description, then poll every 15 seconds until it contains
# an "AutoMLJobArtifacts" entry.
job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

while "AutoMLJobArtifacts" not in job_description_response:
    job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    print("[INFO] Autopilot Job has not yet generated the artifacts. Please wait. ")
    print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))
    print("[INFO] Waiting for AutoMLJobArtifacts...")
    sleep(15)

print("[OK] AutoMLJobArtifacts generated.")

[OK] AutoMLJobArtifacts generated.


In [39]:
# Refresh the job description, then poll every 15 seconds until the artifacts
# entry lists the data exploration notebook.
job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

while "DataExplorationNotebookLocation" not in job_description_response["AutoMLJobArtifacts"]:
    job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    print("[INFO] Autopilot Job has not yet generated the notebooks. Please wait. ")
    print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))
    print("[INFO] Waiting for DataExplorationNotebookLocation...")
    sleep(15)

print("[OK] DataExplorationNotebookLocation found.")

[OK] DataExplorationNotebookLocation found.


In [40]:
# S3 URI of the generated data exploration notebook.
generated_resources = job_description_response["AutoMLJobArtifacts"]["DataExplorationNotebookLocation"]
# Strip the notebook-specific tail to get the folder that holds all generated resources.
download_path = generated_resources.rsplit("/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb")[0]
# The last path component of that folder is used as the job id in the console link below.
job_id = download_path.rsplit("/", 1)[-1]

In [41]:
# `display` lives in IPython.display; importing it from IPython.core.display is deprecated.
from IPython.display import display, HTML

if not job_id:
    print("No AutoMLJobArtifacts found.")
else:
    # Link to the S3 console folder built from bucket, output prefix, job name and job id.
    # target="_blank" is the HTML keyword for "open in a new tab".
    display(
        HTML(
            '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/{}/sagemaker-automl-candidates/{}/">S3 Generated Resources</a></b>'.format(
                bucket, prefix_model_output, auto_ml_job_name, job_id
            )
        )
    )


### Download Generated Notebooks & Code

In [42]:
# Show the S3 folder that will be downloaded in the next cell
print(download_path)

s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e


In [43]:
try:
    !aws s3 cp --recursive $download_path .
except:
    print('Could not download the generated resources. Make sure the path is correct.')

download: s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/generated_module/MANIFEST.in to generated_module/MANIFEST.in
download: s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/generated_module/README.md to generated_module/README.md
download: s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/generated_module/candidate_data_processors/dpp0.py to generated_module/candidate_data_processors/dpp0.py
download: s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/generated_module/candidate_data_processors/sagemaker_serve.py

In [44]:
# List the downloaded candidate data-processor scripts
!ls ./generated_module/candidate_data_processors

dpp0.py  dpp1.py  dpp2.py  sagemaker_serve.py  trainer.py


In [45]:
# List the downloaded notebooks folder
!ls ./notebooks

SageMakerAutopilotCandidateDefinitionNotebook.ipynb  sagemaker_automl
SageMakerAutopilotDataExplorationNotebook.ipynb


### Feature Engineering


In [46]:
# `display` lives in IPython.display; importing it from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a console link to the Training Jobs page of the current region.
# target="_blank" is the HTML keyword for "open in a new tab".
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/">Training Jobs</a></b>'.format(
            region
        )
    )
)

In [47]:
# `display` lives in IPython.display; importing it from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a console link to the Batch Transform Jobs page of the current region.
# target="_blank" is the HTML keyword for "open in a new tab".
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/transform-jobs/">Batch Transform Jobs</a></b>'.format(
            region
        )
    )
)

In [48]:
%%time

job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job_status = job_description_response["AutoMLJobStatus"]
job_sec_status = job_description_response["AutoMLJobSecondaryStatus"]
print(job_status)
print(job_sec_status)
if job_status not in ("Stopped", "Failed"):
    while job_status in ("InProgress") and job_sec_status in ("FeatureEngineering"):
        job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
        job_status = job_description_response["AutoMLJobStatus"]
        job_sec_status = job_description_response["AutoMLJobSecondaryStatus"]
        print(job_status, job_sec_status)
        sleep(15)
    print("[OK] Feature engineering phase completed.\n")

print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))

InProgress
FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress ModelTuning
[OK] Feature engineering phase completed.

{
    "AutoMLJobArn": "arn:aws:sagemaker:us-east-1:924490614652:automl-job/automl-dm-10-07-32-11",
    "AutoMLJobArtifacts": {
        "CandidateDefinitionNotebookLocation": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb",
        "DataExplorationNotebookLocation": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb"
    },
    "AutoMLJobConfig": {
        "CompletionCriteria": {
            "MaxAutoMLJobRuntimeInSecond

### Model Training and Tuning


In [49]:
# `display` lives in IPython.display; importing it from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a console link to the Hyperparameter Tuning Jobs page of the current region.
# target="_blank" is the HTML keyword for "open in a new tab".
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/hyper-tuning-jobs/">Hyperparameter Tuning Jobs</a></b>'.format(
            region
        )
    )
)

In [50]:
%%time
#Tuning  phase
job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job_status = job_description_response["AutoMLJobStatus"]
job_sec_status = job_description_response["AutoMLJobSecondaryStatus"]
print(job_status)
print(job_sec_status)
if job_status not in ("Stopped", "Failed"):
    while job_status in ("InProgress") and job_sec_status in ("ModelTuning"):
        job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
        job_status = job_description_response["AutoMLJobStatus"]
        job_sec_status = job_description_response["AutoMLJobSecondaryStatus"]
        print(job_status, job_sec_status)
        sleep(15)
    print("[OK] Model tuning phase completed.\n")

print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))


InProgress
ModelTuning
InProgress ModelTuning
InProgress ModelTuning
InProgress ModelTuning
InProgress ModelTuning
InProgress ModelTuning
InProgress ModelTuning
InProgress ModelTuning
InProgress MergingAutoMLTaskReports
[OK] Model tuning phase completed.

{
    "AutoMLJobArn": "arn:aws:sagemaker:us-east-1:924490614652:automl-job/automl-dm-10-07-32-11",
    "AutoMLJobArtifacts": {
        "CandidateDefinitionNotebookLocation": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb",
        "DataExplorationNotebookLocation": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb"
    },
    "AutoMLJobConfig": {
        "CompletionCriteria": {


In [51]:
%%time
#Training phase
job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job_status = job_description_response["AutoMLJobStatus"]
print(job_status)
if job_status not in ("Stopped", "Failed"):
    while job_status not in ("Completed"):
        job_description_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
        job_status = job_description_response["AutoMLJobStatus"]
        print(job_status)
        sleep(10)
    print("[OK] Autopilot Job completed.\n")
else:
    print(job_status)

InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed
[OK] Autopilot Job completed.

CPU times: user 397 ms, sys: 25.6 m

### Viewing All Candidates


In [59]:
# List the job's candidates sorted by their final objective metric value.
# NOTE(review): no SortOrder is passed; the listing printed further down shows the
# largest value first, and the objective reported there is validation:mse, so the
# first entry is not the best candidate — confirm whether ascending order is wanted.
candidates_response = sm.list_candidates_for_auto_ml_job(
    AutoMLJobName=auto_ml_job_name, SortBy="FinalObjectiveMetricValue"
)

#### Check that candidates exist

In [60]:
# Show the top-level keys of the response; "Candidates" is the one used below
print(candidates_response.keys())

dict_keys(['Candidates', 'ResponseMetadata'])


In [61]:
# Poll every 10 seconds until the listing response contains a "Candidates" entry.
while "Candidates" not in candidates_response:
    candidates_response = sm.list_candidates_for_auto_ml_job(
        AutoMLJobName=auto_ml_job_name, SortBy="FinalObjectiveMetricValue"
    )
    print("[INFO] Autopilot Job is generating the Candidates. Please wait.")
    print(json.dumps(candidates_response, indent=4, sort_keys=True, default=str))
    sleep(10)

# Keep the candidate list itself for the cells that follow.
candidates = candidates_response["Candidates"]
print("[OK] Candidates generated.")

[OK] Candidates generated.


In [55]:
# Show which fields the first candidate record carries
print(candidates[0].keys())

dict_keys(['CandidateName', 'FinalAutoMLJobObjectiveMetric', 'ObjectiveStatus', 'CandidateSteps', 'CandidateStatus', 'InferenceContainers', 'CreationTime', 'EndTime', 'LastModifiedTime', 'CandidateProperties'])


In [56]:
# Re-list the candidates every 10 seconds until the first one has a "CandidateName".
# NOTE(review): candidates[0] assumes the list is non-empty.
while "CandidateName" not in candidates[0]:
    candidates_response = sm.list_candidates_for_auto_ml_job(
        AutoMLJobName=auto_ml_job_name, SortBy="FinalObjectiveMetricValue"
    )
    candidates = candidates_response["Candidates"]
    print("[INFO] Autopilot Job is generating CandidateName. Please wait. ")
    print(json.dumps(candidates, indent=4, sort_keys=True, default=str))
    sleep(10)

print("[OK] CandidateName generated.")

[OK] CandidateName generated.


In [57]:
# Re-list the candidates every 10 seconds until the first one has a
# "FinalAutoMLJobObjectiveMetric" entry.
# NOTE(review): candidates[0] assumes the list is non-empty.
while "FinalAutoMLJobObjectiveMetric" not in candidates[0]:
    candidates_response = sm.list_candidates_for_auto_ml_job(
        AutoMLJobName=auto_ml_job_name, SortBy="FinalObjectiveMetricValue"
    )
    candidates = candidates_response["Candidates"]
    print("[INFO] Autopilot Job is generating FinalAutoMLJobObjectiveMetric. Please wait. ")
    print(json.dumps(candidates, indent=4, sort_keys=True, default=str))
    sleep(10)

print("[OK] FinalAutoMLJobObjectiveMetric generated.")

[OK] FinalAutoMLJobObjectiveMetric generated.


In [58]:
# Pretty-print every candidate record; default=str serialises values json cannot encode natively
print(json.dumps(candidates, indent=4, sort_keys=True, default=str))


[
    {
        "CandidateName": "automl-dm-10-07-32-11KWw7cLa0hBd-003-66814648",
        "CandidateProperties": {
            "CandidateMetrics": [
                {
                    "MetricName": "MAE",
                    "Set": "Validation",
                    "StandardMetricName": "MAE",
                    "Value": 6531.89404296875
                },
                {
                    "MetricName": "RMSE",
                    "Set": "Validation",
                    "StandardMetricName": "RMSE",
                    "Value": 9452.228515625
                },
                {
                    "MetricName": "MSE",
                    "Set": "Validation",
                    "StandardMetricName": "MSE",
                    "Value": 89344632.0
                },
                {
                    "MetricName": "R2",
                    "Set": "Validation",
                    "StandardMetricName": "R2",
                    "Value": 0.8436599969863892
                }
  

In [62]:
# One line per candidate: position in the list, candidate name, objective metric value.
for index, candidate in enumerate(candidates):
    print(
        "{}  {}  {}".format(
            index,
            candidate["CandidateName"],
            candidate["FinalAutoMLJobObjectiveMetric"]["Value"],
        )
    )

0  automl-dm-10-07-32-11KWw7cLa0hBd-003-66814648  89344632.0
1  automl-dm-10-07-32-11KWw7cLa0hBd-002-3efd9c25  89344632.0
2  automl-dm-10-07-32-11KWw7cLa0hBd-001-3332516c  70039264.0


### Inspect Trials using Experiments API

In [63]:
# NOTE(review): TrainingJobAnalytics is imported but not referenced in this cell.
from sagemaker.analytics import ExperimentAnalytics, TrainingJobAnalytics

# Look up the experiment whose name is the job name plus the "-aws-auto-ml-job" suffix.
exp = ExperimentAnalytics(
    sagemaker_session=sess,
    experiment_name=auto_ml_job_name + "-aws-auto-ml-job",
)

# Tabular view of the experiment's trial components, printed as plain text.
df = exp.dataframe()
print(df)

                                  TrialComponentName  \
0  automl-dm-10-07-32-11KWw7cLa0hBd-002-3efd9c25-...   
1  automl-dm-10-07-32-11KWw7cLa0hBd-003-66814648-...   
2  automl-dm-10-07-32-11KWw7cLa0hBd-001-3332516c-...   
3  automl-dm-10-07-32-11-dpp1-csv-1-ac6a2ee0cdc54...   
4  automl-dm-10-07-32-11-dpp2-csv-1-da418f26489c4...   
5  automl-dm-10-07-32-11-dpp2-1-3421a8be86b04fe6b...   
6  automl-dm-10-07-32-11-dpp1-1-9be00f19eeb64a97a...   
7  automl-dm-10-07-32-11-db-1-88312a4a0ab24b24885...   

                                         DisplayName  \
0  automl-dm-10-07-32-11KWw7cLa0hBd-002-3efd9c25-...   
1  automl-dm-10-07-32-11KWw7cLa0hBd-003-66814648-...   
2  automl-dm-10-07-32-11KWw7cLa0hBd-001-3332516c-...   
3  automl-dm-10-07-32-11-dpp1-csv-1-ac6a2ee0cdc54...   
4  automl-dm-10-07-32-11-dpp2-csv-1-da418f26489c4...   
5  automl-dm-10-07-32-11-dpp2-1-3421a8be86b04fe6b...   
6  automl-dm-10-07-32-11-dpp1-1-9be00f19eeb64a97a...   
7  automl-dm-10-07-32-11-db-1-88312a4a0ab24b248

### Explore the Best Candidate


In [65]:
# Fetch the job description again; the cells below read its "BestCandidate" entry
best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

In [66]:
# Show the top-level keys of the job description
print(best_candidate_response.keys())

dict_keys(['AutoMLJobName', 'AutoMLJobArn', 'InputDataConfig', 'OutputDataConfig', 'RoleArn', 'AutoMLJobConfig', 'CreationTime', 'EndTime', 'LastModifiedTime', 'BestCandidate', 'AutoMLJobStatus', 'AutoMLJobSecondaryStatus', 'GenerateCandidateDefinitionsOnly', 'AutoMLJobArtifacts', 'ResolvedAttributes', 'ResponseMetadata'])


In [67]:
# Re-describe the job every 10 seconds until the response contains "BestCandidate".
while "BestCandidate" not in best_candidate_response:
    best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    print("[INFO] Autopilot Job is generating BestCandidate. Please wait. ")
    print(json.dumps(best_candidate_response, indent=4, sort_keys=True, default=str))
    sleep(10)

# Keep the best-candidate record for the cells that follow.
best_candidate = best_candidate_response["BestCandidate"]
print("[OK] BestCandidate generated.")

[OK] BestCandidate generated.


In [68]:
# Pretty-print the full job description, including the best candidate
print(json.dumps(best_candidate_response, indent=4, sort_keys=True, default=str))

{
    "AutoMLJobArn": "arn:aws:sagemaker:us-east-1:924490614652:automl-job/automl-dm-10-07-32-11",
    "AutoMLJobArtifacts": {
        "CandidateDefinitionNotebookLocation": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb",
        "DataExplorationNotebookLocation": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/sagemaker-automl-candidates/automl-dm-10-07-32-11-pr-1-5552ac2104684f7498e34be6144a83be2c1e/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb"
    },
    "AutoMLJobConfig": {
        "CompletionCriteria": {
            "MaxAutoMLJobRuntimeInSeconds": 5400,
            "MaxCandidates": 3,
            "MaxRuntimePerTrainingJobInSeconds": 900
        }
    },
    "AutoMLJobName": "automl-dm-10-07-32-11",
    "AutoMLJobSecondaryStatus": "Completed",
    "AutoMLJo

In [69]:
# Show which fields the best-candidate record carries
print(best_candidate.keys())

dict_keys(['CandidateName', 'FinalAutoMLJobObjectiveMetric', 'ObjectiveStatus', 'CandidateSteps', 'CandidateStatus', 'InferenceContainers', 'CreationTime', 'EndTime', 'LastModifiedTime', 'CandidateProperties'])


In [70]:
# Re-describe the job every 10 seconds until the best candidate has a "CandidateName".
while "CandidateName" not in best_candidate:
    best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    best_candidate = best_candidate_response["BestCandidate"]
    print("[INFO] Autopilot Job is generating BestCandidate CandidateName. Please wait. ")
    print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))
    sleep(10)

print("[OK] BestCandidate CandidateName generated.")


[OK] BestCandidate CandidateName generated.


In [71]:
# Re-describe the job every 10 seconds until the best candidate has a
# "FinalAutoMLJobObjectiveMetric" entry.
while "FinalAutoMLJobObjectiveMetric" not in best_candidate:
    best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    best_candidate = best_candidate_response["BestCandidate"]
    print("[INFO] Autopilot Job is generating BestCandidate FinalAutoMLJobObjectiveMetric. Please wait. ")
    print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))
    sleep(10)

print("[OK] BestCandidate FinalAutoMLJobObjectiveMetric generated.")

[OK] BestCandidate FinalAutoMLJobObjectiveMetric generated.


In [72]:
# Report the best candidate's name together with its objective metric name and value.
best_candidate_identifier = best_candidate["CandidateName"]
print("Candidate name: {}".format(best_candidate_identifier))
objective_metric = best_candidate["FinalAutoMLJobObjectiveMetric"]
print("Metric name: {}".format(objective_metric["MetricName"]))
print("Metric value: {}".format(objective_metric["Value"]))

Candidate name: automl-dm-10-07-32-11KWw7cLa0hBd-001-3332516c
Metric name: validation:mse
Metric value: 70039264.0


In [73]:
# Pretty-print the complete best-candidate record
print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))

{
    "CandidateName": "automl-dm-10-07-32-11KWw7cLa0hBd-001-3332516c",
    "CandidateProperties": {
        "CandidateArtifactLocations": {
            "Explainability": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/documentation/explainability/output",
            "ModelInsights": "s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/documentation/model_monitor/output"
        },
        "CandidateMetrics": [
            {
                "MetricName": "MAE",
                "Set": "Validation",
                "StandardMetricName": "MAE",
                "Value": 5832.0615234375
            },
            {
                "MetricName": "RMSE",
                "Set": "Validation",
                "StandardMetricName": "RMSE",
                "Value": 8368.9462890625
            },
            {
                "MetricName": "MSE",
                "Set": "Validation",
                "StandardMetricName": "MSE",
                

In [75]:
# Print type and name of every step behind the best candidate, collecting the names in `steps`.
steps = []
for step in best_candidate["CandidateSteps"]:
    step_type = step["CandidateStepType"]
    print("Candidate Step Type: {}".format(step_type))
    step_name = step["CandidateStepName"]
    print("Candidate Step Name: {}".format(step_name))
    steps.append(step_name)

Candidate Step Type: AWS::SageMaker::ProcessingJob
Candidate Step Name: automl-dm-10-07-32-11-db-1-88312a4a0ab24b24885cd11e84c5b1392bb9
Candidate Step Type: AWS::SageMaker::TrainingJob
Candidate Step Name: automl-dm-10-07-32-11-dpp1-1-9be00f19eeb64a97a58d95421a9ccb529a
Candidate Step Type: AWS::SageMaker::TransformJob
Candidate Step Name: automl-dm-10-07-32-11-dpp1-csv-1-ac6a2ee0cdc54bbfbe42995b7e2c5e
Candidate Step Type: AWS::SageMaker::TrainingJob
Candidate Step Name: automl-dm-10-07-32-11KWw7cLa0hBd-001-3332516c


### See the Containers and Models within the Inference Pipeline

In [88]:
# Re-describe the job every 10 seconds until the best candidate lists its "InferenceContainers".
while "InferenceContainers" not in best_candidate:
    best_candidate_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    best_candidate = best_candidate_response["BestCandidate"]
    print("[INFO] Autopilot Job is generating BestCandidate InferenceContainers. Please wait. ")
    print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))
    sleep(10)

print("[OK] BestCandidate InferenceContainers generated.")

[OK] BestCandidate InferenceContainers generated.


In [89]:
# Container definitions of the best candidate, iterated in the next cell
best_candidate_containers = best_candidate["InferenceContainers"]

In [90]:
# For each container print its image URI and model artifact location, followed by a separator line.
for container in best_candidate_containers:
    for field in ("Image", "ModelDataUrl"):
        print(container[field])
    print("======================")

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-sklearn-automl:2.5-1-cpu-py3
s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/data-processor-models/automl-dm-10-07-32-11-dpp1-1-9be00f19eeb64a97a58d95421a9ccb529a/output/model.tar.gz
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.3-1-cpu-py3
s3://sagemaker-us-east-1-924490614652/models/autopilot/automl-dm-10-07-32-11/tuning/automl-dm--dpp1-xgb/automl-dm-10-07-32-11KWw7cLa0hBd-001-3332516c/output/model.tar.gz


In [92]:
%%html

<!-- Shows a notice plus a hidden button bound to the "kernelmenu:shutdown" command;
     the script below clicks that button as soon as this output is rendered. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Click the first element with class "sm-command-button"; any error is ignored on purpose.
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

// Save a checkpoint and delete the notebook session through the `Jupyter.notebook` object.
// Any error (for example when `Jupyter` is not defined in this front end) is ignored on purpose.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}