# Imports and Variables

In [157]:
import os
import boto3
from botocore.exceptions import ClientError
import logging
import sagemaker
from sagemaker import get_execution_role
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from time import gmtime, strftime, sleep
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
aws_user = os.environ['AWS_USER']
aws_key = os.environ['AWS_KEY']
region_name = 'eu-central-1'

session = boto3.session.Session(
    aws_access_key_id=aws_user,
    aws_secret_access_key=aws_key,
    region_name=region_name)

s3_client = session.client('s3')
s3 = session.resource('s3')

region_name = 'eu-central-1'
bucket_name = 'niy-certification'

# Prepare AutoML

## Preprocessing

In [6]:
file_uris = {}
file_uris['Jan_2019'] = 'https://niy-certification.s3.eu-central-1.amazonaws.com/usdot/Jan_2019_usdot.csv'
file_uris['Jan_2020'] = 'https://niy-certification.s3.eu-central-1.amazonaws.com/usdot/Jan_2020_usdot.csv'
file_uris['Feb_2019'] = 'https://niy-certification.s3.eu-central-1.amazonaws.com/usdot/Feb_2019_usdot.csv'
file_uris['Feb_2020'] = 'https://niy-certification.s3.eu-central-1.amazonaws.com/usdot/Feb_2020_usdot.csv'

In [126]:
df_raw = pd.read_csv(file_uris['Jan_2019'], 
                 dtype={'ARR_TIME': str, 
                        'DEP_TIME': str, 
                        'DAY_OF_MONTH': str})

In [127]:
# drop flights which are either canceled or diverted
# as this is a conceptually different thing than being delayed
df_raw = df_raw.drop(df_raw.index[(df_raw['CANCELLED'] == 1) | 
                         (df_raw['DIVERTED'] == 1)]).reset_index(drop=True)

In [128]:
df_raw['DAY_OF_MONTH'] = df_raw['DAY_OF_MONTH'].astype(float)

In [129]:
columns_to_drop = ['YEAR',
                   'MONTH',
                   'FL_DATE',
                   'TAIL_NUM', # TO CHECK
                   'ORIGIN',
                   'ORIGIN_CITY_NAME', # TO CHECK
                   'ORIGIN_STATE_NM',
                   'DEST_CITY_NAME',
                   'DEST_STATE_NM',
                   'DEP_TIME', 
                   'DEP_TIME_BLK',
                   'ARR_TIME',
                   'ARR_TIME_BLK',
                   'CANCELLED',
                   'DIVERTED',
                   'CARRIER_DELAY',
                   'WEATHER_DELAY', 
                   'NAS_DELAY', 
                   'SECURITY_DELAY', 
                   'LATE_AIRCRAFT_DELAY',
                   'FLIGHTS', # only one entry for column
                   'Unnamed: 54']

In [146]:
df = df_raw.drop(columns_to_drop, axis=1)

In [147]:
# TO DO FOR LATER ON
df_raw['OP_UNIQUE_CARRIER'].unique() # OHE
df_raw['ORIGIN'].nunique() # OHE
df_raw['ORIGIN_STATE_ABR'].nunique() # OHE
df_raw['DEST'].nunique() # OHE

360

In [148]:
# INCLUDE FOR LATER
df = df.drop(['OP_CARRIER', 'ORIGIN_STATE_ABR', 'DEST', 'DEST_STATE_ABR'], axis=1)

In [149]:
df = df.drop(['CRS_DEP_TIME', 
              'OP_CARRIER_AIRLINE_ID',	
              'OP_CARRIER_FL_NUM',	
              'ORIGIN_AIRPORT_ID',	
              'ORIGIN_AIRPORT_SEQ_ID',	
              'ORIGIN_CITY_MARKET_ID',	
              'ORIGIN_STATE_FIPS',	
              'DEST_AIRPORT_ID',	
              'DEST_AIRPORT_SEQ_ID',	
              'DEST_CITY_MARKET_ID',
              'DEST_STATE_FIPS',
              'DEP_DELAY',
              'DEP_DELAY_NEW',
              'DEP_DEL15', 
              'DEP_DELAY_GROUP', 
              'WHEELS_OFF', 
              'WHEELS_ON', 
              'CRS_ARR_TIME',
              'ARR_DELAY',
              'ARR_DEL15', 
              'ARR_DELAY_GROUP', 
              'CRS_ELAPSED_TIME', 
              'ACTUAL_ELAPSED_TIME', 
              'AIR_TIME'], axis=1)

In [150]:
df = df[sorted(df.columns)]

In [240]:
df.columns

Index(['ARR_DELAY_NEW', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE',
       'OP_UNIQUE_CARRIER'],
      dtype='object')

## Train Test Split

In [241]:
df_train, df_test = train_test_split(df, 
                                     test_size=0.2,
                                     random_state=42)

In [242]:
enc = OneHotEncoder(handle_unknown = 'ignore')

In [243]:
enc.fit(df_train[['OP_UNIQUE_CARRIER']])
enc.categories_

[array(['9E', 'AA', 'AS', 'AX', 'B6', 'C5', 'CP', 'DL', 'EM', 'EV', 'F9',
        'G4', 'G7', 'HA', 'KS', 'MQ', 'NK', 'OH', 'OO', 'PT', 'QX', 'UA',
        'WN', 'YV', 'YX', 'ZW'], dtype=object)]

In [245]:
enc_columns = sorted(df_train['OP_UNIQUE_CARRIER'].unique())

In [246]:
df_train = pd.concat([df_train,
           pd.DataFrame(index=df_train.index,
                        data=enc.transform(df_train[['OP_UNIQUE_CARRIER']]).toarray(), 
                        columns=enc_columns)], axis=1).drop('OP_UNIQUE_CARRIER', axis=1)

In [247]:
df_test = pd.concat([df_test,
           pd.DataFrame(index=df_test.index,
                        data=enc.transform(df_test[['OP_UNIQUE_CARRIER']]).toarray(), 
                        columns=enc_columns)], axis=1).drop('OP_UNIQUE_CARRIER', axis=1)

In [254]:
file_path_train = '/Users/yasinedin/Downloads/automl/train.csv'
file_path_test = '/Users/yasinedin/Downloads/automl/test.csv'

In [255]:
df_train.to_csv(file_path_train, index=False, header=True)
df_test.to_csv(file_path_test, index=False, header=False)

## Uploading selected data to S3

In [256]:
# upload data to s3
try:
    train_response = s3_client.upload_file(file_path_train, bucket_name, 'automl/train.csv')
    print('Train data successfully uploaded')
except ClientError as e:
    logging.error(e)
    print('Something went wrong.')
    
try:
    test_response = s3_client.upload_file(file_path_test, bucket_name, 'automl/test.csv')
    print('Test data successfully uploaded')
except ClientError as e:
    logging.error(e)
    print('Something went wrong.')

Train data successfully uploaded
Test data successfully uploaded


In [257]:
# list files in bucket
for key in s3_client.list_objects(Bucket=bucket_name)['Contents']:
    print(key['Key'])

automl/test.csv
automl/train.csv
kaggle/
kaggle/Feb_2019_ontime.csv
kaggle/Feb_2020_ontime.csv
kaggle/Jan_2019_ontime.csv
kaggle/Jan_2020_ontime.csv
usdot/
usdot/Feb_2019_usdot.csv
usdot/Feb_2020_usdot.csv
usdot/Jan_2019_usdot.csv
usdot/Jan_2020_usdot.csv


# SageMaker AutoML

In [258]:
sm_session = sagemaker.Session(session)

In [259]:
sm = session.client(service_name='sagemaker', region_name=region_name)

In [260]:
role = 'arn:aws:iam::898627427171:role/service-role/AmazonSageMaker-ExecutionRole-20201106T104926'

In [298]:
timestamp_suffix = strftime('%Y%m%d-%H%M')

auto_ml_job_name = 'niy-aidacert-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

AutoMLJobName: niy-aidacert-20201130-1701


In [299]:
input_data_config = [{
      'DataSource': {
        'S3DataSource': {
          'S3DataType': 'S3Prefix',
          'S3Uri': f's3://{bucket_name}/automl/train.csv'
        }
      },
      'TargetAttributeName': 'ARR_DELAY_NEW'
    }
  ]

job_config = {
    'CompletionCriteria': {
      'MaxRuntimePerTrainingJobInSeconds': 180,
      'MaxAutoMLJobRuntimeInSeconds': 1080
    },
}

output_data_config = {
    'S3OutputPath': f's3://{bucket_name}/automl/output'
  }

problem_type = 'Regression'
job_objective = { 'MetricName': 'MSE' }

In [300]:
sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig=job_config,
                      AutoMLJobObjective=job_objective,
                      ProblemType=problem_type,
                      RoleArn=role)

{'AutoMLJobArn': 'arn:aws:sagemaker:eu-central-1:898627427171:automl-job/niy-aidacert-20201130-1701',
 'ResponseMetadata': {'RequestId': 'aa1469b6-f9c8-4ebb-8d52-88684b15a057',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'aa1469b6-f9c8-4ebb-8d52-88684b15a057',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '100',
   'date': 'Mon, 30 Nov 2020 16:01:06 GMT'},
  'RetryAttempts': 0}}

In [301]:
print ('JobStatus - Secondary Status')
print('------------------------------')


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print(describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
job_run_status = describe_response['AutoMLJobStatus']
    
while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    
    print(describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
    sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - F

In [302]:
describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
describe_response

{'AutoMLJobName': 'niy-aidacert-20201130-1701',
 'AutoMLJobArn': 'arn:aws:sagemaker:eu-central-1:898627427171:automl-job/niy-aidacert-20201130-1701',
 'InputDataConfig': [{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://niy-certification/automl/train.csv'}},
   'TargetAttributeName': 'ARR_DELAY_NEW'}],
 'OutputDataConfig': {'S3OutputPath': 's3://niy-certification/automl/output'},
 'RoleArn': 'arn:aws:iam::898627427171:role/service-role/AmazonSageMaker-ExecutionRole-20201106T104926',
 'AutoMLJobObjective': {'MetricName': 'MSE'},
 'ProblemType': 'Regression',
 'AutoMLJobConfig': {'CompletionCriteria': {'MaxRuntimePerTrainingJobInSeconds': 180,
   'MaxAutoMLJobRuntimeInSeconds': 1080}},
 'CreationTime': datetime.datetime(2020, 11, 30, 17, 1, 5, 476000, tzinfo=tzlocal()),
 'EndTime': datetime.datetime(2020, 11, 30, 17, 19, 12, 961000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2020, 11, 30, 17, 19, 12, 987000, tzinfo=tzlocal()),
 'AutoMLJobStatu

In [303]:
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
print('\n')
print("CandidateName: " + best_candidate_name)
print("FinalAutoMLJobObjectiveMetricName: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

KeyError: 'BestCandidate'

In [None]:
job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job.keys()

dict_keys(['AutoMLJobName', 'AutoMLJobArn', 'InputDataConfig', 'OutputDataConfig', 'RoleArn', 'AutoMLJobObjective', 'ProblemType', 'AutoMLJobConfig', 'CreationTime', 'EndTime', 'LastModifiedTime', 'BestCandidate', 'AutoMLJobStatus', 'AutoMLJobSecondaryStatus', 'GenerateCandidateDefinitionsOnly', 'AutoMLJobArtifacts', 'ResolvedAttributes', 'ResponseMetadata'])

In [None]:
job['ResolvedAttributes']

{'AutoMLJobObjective': {'MetricName': 'F1macro'},
 'ProblemType': 'MulticlassClassification',
 'CompletionCriteria': {'MaxCandidates': 250,
  'MaxRuntimePerTrainingJobInSeconds': 300,
  'MaxAutoMLJobRuntimeInSeconds': 1800}}

In [None]:
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, SortBy='FinalObjectiveMetricValue')['Candidates']
index = 1
for candidate in candidates:
    print(str(index) + "  " + candidate['CandidateName'] + "  " + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))
    index += 1

1  tuning-job-1-ecbc1a8683ff49e89c-008-60ec475e  0.9968799948692322
2  tuning-job-1-ecbc1a8683ff49e89c-011-fea7c40f  0.9962499737739563
3  tuning-job-1-ecbc1a8683ff49e89c-009-3e60571d  0.9962499737739563
4  tuning-job-1-ecbc1a8683ff49e89c-010-887b97f7  0.9962499737739563
5  tuning-job-1-ecbc1a8683ff49e89c-002-5d3c30b6  0.9956300258636475
6  tuning-job-1-ecbc1a8683ff49e89c-001-d9c83bf5  0.9948099851608276
7  tuning-job-1-ecbc1a8683ff49e89c-012-c7ec3de5  0.9948099851608276
8  tuning-job-1-ecbc1a8683ff49e89c-016-e32ee717  0.9946200251579285
9  tuning-job-1-ecbc1a8683ff49e89c-006-2e728d26  0.9938899874687195
10  tuning-job-1-ecbc1a8683ff49e89c-004-ccabe506  0.9938899874687195


In [None]:
# autopilot cadidate definition notebook
sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']

's3://ye-1490/output/automl-yedin-12-03-43/sagemaker-automl-candidates/pr-1-925ccccedf5b44c4b30f8b03cd685e36aa5ed021fd1d4882857b2c76f0/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb'

In [None]:
# data exploration notebook
sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']['DataExplorationNotebookLocation']

's3://ye-1490/output/automl-yedin-12-03-43/sagemaker-automl-candidates/pr-1-925ccccedf5b44c4b30f8b03cd685e36aa5ed021fd1d4882857b2c76f0/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb'

In [None]:
bucket = s3.Bucket(bucket_name)
bucket.download_file('output/automl-yedin-12-03-43/sagemaker-automl-candidates/pr-1-925ccccedf5b44c4b30f8b03cd685e36aa5ed021fd1d4882857b2c76f0/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb', 'candidate_notebook.ipynb');

In [None]:
bucket.download_file('output/automl-yedin-12-03-43/sagemaker-automl-candidates/pr-1-925ccccedf5b44c4b30f8b03cd685e36aa5ed021fd1d4882857b2c76f0/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb', 'data_exploration.ipynb');

# Inference

In [None]:
timestamp_suffix = strftime('%H-%M-%S')
model_name = 'automl-yedin-' + timestamp_suffix

model = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

print('Model ARN corresponding to the best candidate is : {}'.format(model['ModelArn']))

Model ARN corresponding to the best candidate is : arn:aws:sagemaker:eu-central-1:898627427171:model/automl-yedin-14-47-30


In [None]:
transform_job_name = 'automl-yedin-' + timestamp_suffix

transform_input = {
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': 's3://{}/test.csv'.format(bucket_name)
            }
        },
        'ContentType': 'text/csv',
        'CompressionType': 'None',
        'SplitType': 'Line'
    }

transform_output = {
        'S3OutputPath': 's3://{}/inference-results'.format(bucket_name),
    }

transform_resources = {
        'InstanceType': 'ml.m5.4xlarge',
        'InstanceCount': 1
    }

sm.create_transform_job(TransformJobName = transform_job_name,
                        ModelName = model_name,
                        TransformInput = transform_input,
                        TransformOutput = transform_output,
                        TransformResources = transform_resources
)

{'TransformJobArn': 'arn:aws:sagemaker:eu-central-1:898627427171:transform-job/automl-yedin-14-47-30',
 'ResponseMetadata': {'RequestId': 'da72145c-9e7d-43e9-a63e-80009e0f8025',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'da72145c-9e7d-43e9-a63e-80009e0f8025',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '101',
   'date': 'Mon, 23 Nov 2020 13:47:40 GMT'},
  'RetryAttempts': 0}}

In [None]:
print ('JobStatus')
print('----------')


describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
job_run_status = describe_response['TransformJobStatus']
print (job_run_status)

while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
    job_run_status = describe_response['TransformJobStatus']
    print (job_run_status)
    sleep(30)

JobStatus
----------
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


In [None]:
s3_output_key = 'inference-results/test.csv.out'
local_inference_results_path = 'inference_results.csv'

inference_results_bucket = s3.Bucket(bucket_name)
inference_results_bucket.download_file(s3_output_key, local_inference_results_path);

data = pd.read_csv(local_inference_results_path, sep=';', names=['predicted'])

s3.Bucket(name='ye-1490')


In [None]:
data['true'] = y_test

In [None]:
f1_score(data['true'], data['predicted'], average='weighted')

0.9578332990698479

In [None]:
cm = confusion_matrix(data['true'], data['predicted'])

In [None]:
data['true'].value_counts().sort_index()

1    496
2    471
3    420
4    491
5    532
6    537
Name: true, dtype: int64

In [None]:
data['predicted'].value_counts().sort_index()

1    521
2    465
3    404
4    459
5    561
6    537
Name: predicted, dtype: int64

In [None]:
print(cm)
# rows are true labels
# columns are predicted labels

[[491   5   0   0   0   0]
 [ 27 440   4   0   0   0]
 [  3  17 400   0   0   0]
 [  0   3   0 441  47   0]
 [  0   0   0  18 514   0]
 [  0   0   0   0   0 537]]
