In [1]:
import sagemaker
import boto3
from sagemaker import get_execution_role

region = boto3.Session().region_name

session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'sagemaker/autopilot-dm'

role = get_execution_role()

sm = boto3.Session().client(service_name='sagemaker',region_name=region)

In [2]:
!wget -N https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip
!unzip -o bank-additional.zip

local_data_path = './bank-additional/bank-additional-full.csv'

--2020-03-03 09:24:42--  https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip
Resolving sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com (sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com)... 52.218.216.209
Connecting to sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com (sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com)|52.218.216.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 432828 (423K) [application/zip]
Saving to: ‘bank-additional.zip’


2020-03-03 09:24:42 (48.3 MB/s) - ‘bank-additional.zip’ saved [432828/432828]

Archive:  bank-additional.zip
   creating: bank-additional/
  inflating: bank-additional/bank-additional-names.txt  
  inflating: bank-additional/bank-additional.csv  
  inflating: bank-additional/bank-additional-full.csv  


In [3]:
import pandas as pd

data = pd.read_csv(local_data_path)
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 10)         # Keep the output on one page
data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,334,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,383,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,189,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,442,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [4]:
train_data = data.sample(frac=0.8,random_state=200)

test_data = data.drop(train_data.index)

test_data_no_target = test_data.drop(columns=['y'])

In [5]:
train_file = 'train_data.csv';
train_data.to_csv(train_file, index=False, header=True)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=prefix + "/train")
print('Train data uploaded to: ' + train_data_s3_path)

test_file = 'test_data.csv';
test_data_no_target.to_csv(test_file, index=False, header=False)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=prefix + "/test")
print('Test data uploaded to: ' + test_data_s3_path)

Train data uploaded to: s3://sagemaker-us-west-2-141763122129/sagemaker/autopilot-dm/train/train_data.csv
Test data uploaded to: s3://sagemaker-us-west-2-141763122129/sagemaker/autopilot-dm/test/test_data.csv


In [6]:
input_data_config = [{
      'DataSource': {
        'S3DataSource': {
          'S3DataType': 'S3Prefix',
          'S3Uri': 's3://{}/{}/train'.format(bucket,prefix)
        }
      },
      'TargetAttributeName': 'y'
    }
  ]

output_data_config = {
    'S3OutputPath': 's3://{}/{}/output'.format(bucket,prefix)
  }

In [7]:
from time import gmtime, strftime, sleep
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'automl-banking-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      RoleArn=role)

AutoMLJobName: automl-banking-03-09-26-57


{'AutoMLJobArn': 'arn:aws:sagemaker:us-west-2:141763122129:automl-job/automl-banking-03-09-26-57',
 'ResponseMetadata': {'RequestId': '4d6975c3-b035-4be4-94f7-6e37249df23b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4d6975c3-b035-4be4-94f7-6e37249df23b',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '97',
   'date': 'Tue, 03 Mar 2020 09:26:57 GMT'},
  'RetryAttempts': 0}}

In [8]:
print ('JobStatus - Secondary Status')
print('------------------------------')


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
job_run_status = describe_response['AutoMLJobStatus']
    
while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    timestamp = strftime('%d-%H-%M-%S', gmtime())
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    
    print (timestamp + ":" + describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
    sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
03-09-29-29:InProgress - AnalyzingData
03-09-29-59:InProgress - AnalyzingData
03-09-30-29:InProgress - AnalyzingData
03-09-30-59:InProgress - AnalyzingData
03-09-31-29:InProgress - AnalyzingData
03-09-31-59:InProgress - AnalyzingData
03-09-32-29:InProgress - AnalyzingData
03-09-32-59:InProgress - AnalyzingData
03-09-33-30:InProgress - AnalyzingData
03-09-34-00:InProgress - AnalyzingData
03-09-34-30:InProgress - AnalyzingData
03-09-35-00:InProgress - FeatureEngineering
03-09-35-30:InProgress - FeatureEngineering
03-09-36-00:InProgress - FeatureEngineering
03-09-36-30:InProgress - FeatureEngineering
03-09-37-00:InProgress - FeatureEngineering
03-09-37-30:InProgress - FeatureEngineering
03-09-38-00:InProgress - FeatureEngineering
03-09-38-31:InProgress - FeatureEngineering
03-09-39-01:InProgress - FeatureEngineering
03-09-39-31:InProgress - FeatureEngineering
03-09-40-01:InProgress - FeatureEngineering


In [9]:
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
print(best_candidate)
print('\n')
print("CandidateName: " + best_candidate_name)
print("FinalAutoMLJobObjectiveMetricName: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

{'CandidateName': 'tuning-job-1-bd5f4ade6b9345f3b1-148-df9fd5c7', 'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:accuracy', 'Value': 0.9160720109939575}, 'ObjectiveStatus': 'Succeeded', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-west-2:141763122129:processing-job/db-1-be3d772a4cbe41f580ffbf2df2d6965f5a61eb6762254132a6560bd31f', 'CandidateStepName': 'db-1-be3d772a4cbe41f580ffbf2df2d6965f5a61eb6762254132a6560bd31f'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-west-2:141763122129:training-job/automl-ban-dpp5-1-a76908e539914599bdc2517726927fd51a02fe88a60d4', 'CandidateStepName': 'automl-ban-dpp5-1-a76908e539914599bdc2517726927fd51a02fe88a60d4'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-west-2:141763122129:transform-job/automl-ban-dpp5-rpb-1-39a7fb3e6ec545c3842a0becd24c8e4f770905ebf', 'CandidateStepNa

In [10]:
model_name = 'automl-banking-model-' + timestamp_suffix

model = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

print('Model ARN corresponding to the best candidate is : {}'.format(model['ModelArn']))

Model ARN corresponding to the best candidate is : arn:aws:sagemaker:us-west-2:141763122129:model/automl-banking-model-03-09-26-57


In [11]:
transform_job_name = 'automl-banking-transform-' + timestamp_suffix

transform_input = {
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': test_data_s3_path
            }
        },
        'ContentType': 'text/csv',
        'CompressionType': 'None',
        'SplitType': 'Line'
    }

transform_output = {
        'S3OutputPath': 's3://{}/{}/inference-results'.format(bucket,prefix),
    }

transform_resources = {
        'InstanceType': 'ml.m5.4xlarge',
        'InstanceCount': 1
    }

sm.create_transform_job(TransformJobName = transform_job_name,
                        ModelName = model_name,
                        TransformInput = transform_input,
                        TransformOutput = transform_output,
                        TransformResources = transform_resources
)

{'TransformJobArn': 'arn:aws:sagemaker:us-west-2:141763122129:transform-job/automl-banking-transform-03-09-26-57',
 'ResponseMetadata': {'RequestId': '12dfdcec-d9e7-4868-850b-611a0f775d55',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '12dfdcec-d9e7-4868-850b-611a0f775d55',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '113',
   'date': 'Wed, 04 Mar 2020 04:16:14 GMT'},
  'RetryAttempts': 0}}

In [12]:
print ('JobStatus')
print('----------')


describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
job_run_status = describe_response['TransformJobStatus']
print (job_run_status)

while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    timestamp = strftime('%d-%H-%M-%S', gmtime())
    describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
    job_run_status = describe_response['TransformJobStatus']
    print (timestamp + ":" + job_run_status)
    sleep(30)

JobStatus
----------
InProgress
04-04-18-03:InProgress
04-04-18-33:InProgress
04-04-19-03:InProgress
04-04-19-33:InProgress
04-04-20-04:Completed


In [13]:
s3_output_key = '{}/inference-results/test_data.csv.out'.format(prefix);
local_inference_results_path = 'inference_results.csv'

s3 = boto3.resource('s3')
inference_results_bucket = s3.Bucket(session.default_bucket())

inference_results_bucket.download_file(s3_output_key, local_inference_results_path);

data = pd.read_csv(local_inference_results_path, sep=';')
pd.set_option('display.max_rows', 10)         # Keep the output on one page
data

Unnamed: 0,no
0,no
1,no
2,no
3,no
4,no
...,...
8232,yes
8233,yes
8234,no
8235,yes


In [14]:
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, SortBy='FinalObjectiveMetricValue')['Candidates']
index = 1
for candidate in candidates:
  print (str(index) + "  " + candidate['CandidateName'] + "  " + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))
  index += 1

1  tuning-job-1-bd5f4ade6b9345f3b1-148-df9fd5c7  0.9160720109939575
2  tuning-job-1-bd5f4ade6b9345f3b1-065-2ac33d5c  0.9151620268821716
3  tuning-job-1-bd5f4ade6b9345f3b1-209-e12afedc  0.9151620268821716
4  tuning-job-1-bd5f4ade6b9345f3b1-083-7b4cddb2  0.9144030213356018
5  tuning-job-1-bd5f4ade6b9345f3b1-043-fbe6b2c6  0.9144030213356018
6  tuning-job-1-bd5f4ade6b9345f3b1-069-42a99b60  0.9142510294914246
7  tuning-job-1-bd5f4ade6b9345f3b1-117-138c848e  0.9140989780426025
8  tuning-job-1-bd5f4ade6b9345f3b1-206-ee694be3  0.9139469861984253
9  tuning-job-1-bd5f4ade6b9345f3b1-103-09b85850  0.913644015789032
10  tuning-job-1-bd5f4ade6b9345f3b1-222-ddc8bcbf  0.913644015789032
