In [None]:
#Importing Libraries

In [None]:
import csv
import io

import numpy as np
import pandas as pd

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# Region of the default boto3 session; both SageMaker clients below are pinned to it.
region = boto3.Session().region_name
print(region)

# SageMaker SDK session and the default bucket it reports.
session = sagemaker.Session()
bucket = session.default_bucket()
print(bucket)

# Key prefix used by this notebook when building its S3 locations.
prefix = 'sagemaker20220615/automl-dm'

# Execution role passed to the SageMaker API calls made later in the notebook.
role = get_execution_role()

# Low-level clients: `sm` for the SageMaker API, `sm_rt` for invoking endpoints.
sm = boto3.client(service_name='sagemaker', region_name=region)
sm_rt = boto3.client(service_name='runtime.sagemaker', region_name=region)

In [None]:
#Reading CSV files and combining them into single dataframe

In [None]:
s3_client = boto3.client('s3')
bucket_name = 'news-headlines'

# A single list call returns at most 1,000 keys, and a page for an empty bucket
# carries no 'Contents' entry, so walk every page and tolerate empty ones.
file_list = []
paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name):
    file_list.extend(obj['Key'] for obj in page.get('Contents', []))
print(file_list)

In [None]:
files = file_list[:]
print(files)

# Read every object into its own frame and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies all rows on each call.
frames = []
for file in files:
    response = s3_client.get_object(Bucket=bucket_name, Key=file)
    response_body = response["Body"].read()
    temp = pd.read_csv(io.BytesIO(response_body), header=0, delimiter=",", low_memory=False)
    print(temp.shape)
    frames.append(temp)

# pd.concat rejects an empty list, so fall back to an empty frame when no files were listed.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 50)         # Keep the output on one page
data[:10]

In [None]:
# Size of the combined frame as (rows, columns).
data.shape

In [None]:
# Only the last expression of a cell is rendered, so print the column index explicitly.
print(data.columns)
data.head(10)

In [None]:
#Splitting the dataset into a 0.95:0.05 train:test ratio

In [None]:
# Shuffle once with a fixed seed so the split is reproducible, then cut by position.
# Positional slicing replaces np.split, which goes through the deprecated
# DataFrame.swapaxes and also produced an unused, empty third piece.
shuffled = data.sample(frac=1, random_state=123)
split_at = int(0.95 * len(data))
train_data = shuffled.iloc[:split_at]   # first 95% of the shuffled rows
test_data = shuffled.iloc[split_at:]    # remaining rows

# Persist both parts locally, with a header row and without the index column.
train_data.to_csv('automl-train.csv', index=False, header=True, sep=',')
test_data.to_csv('automl-test.csv', index=False, header=True, sep=',')

In [None]:
# Shape and column index of the hold-out split, one per line.
print(test_data.shape, test_data.columns, sep="\n")

In [None]:
# Preview the first ten rows of the hold-out split.
test_data.head(10)

In [None]:
#Storing training and testing datasets in S3

In [None]:
# Both CSV files are uploaded under the same "<prefix>/input" key prefix.
input_key_prefix = "{}/input".format(prefix)
session.upload_data(path="automl-train.csv", key_prefix=input_key_prefix)
session.upload_data(path="automl-test.csv", key_prefix=input_key_prefix)

In [None]:
#A: Using Amazon SageMaker Autopilot

In [None]:
#Setting up the Amazon SageMaker Autopilot job

In [None]:
# Stopping rules for the Autopilot job: a cap per training job and a cap for the whole run.
job_config = {
    'CompletionCriteria': {
      'MaxRuntimePerTrainingJobInSeconds': 600,
      # 'MaxCandidates': 10,
      'MaxAutoMLJobRuntimeInSeconds': 3600
    },
}

# Point at the training CSV only. The "<prefix>/input" prefix also holds
# automl-test.csv, and an S3Prefix source picks up every object under the
# prefix, which would let the hold-out rows leak into training.
input_data_config = [
    {
        'DataSource':{
            'S3DataSource': {
                'S3DataType':'S3Prefix',
                'S3Uri':'s3://{}/{}/input/automl-train.csv'.format(bucket,prefix)
            }
        },

        # Name of the label column in the training CSV.
        'TargetAttributeName': 'y'
    }
]

# Where the job writes its artifacts.
output_data_config = {
    'S3OutputPath':'s3://{}/{}/output'.format(bucket,prefix)
}

problem_type = 'BinaryClassification'

# Metric used to rank candidates.
job_objective = { 'MetricName': 'F1' }

In [None]:
#Creating Amazon SageMaker AutoPilot job

In [None]:
from time import gmtime, strftime, sleep

# Day-hour-minute-second stamp (UTC) that keeps the job name distinct between runs.
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
auto_ml_job_name = 'automl-bankapp-{}'.format(timestamp_suffix)
print('AutoMLJobName: ', auto_ml_job_name)

# Gather the request in one place, then submit it.
create_job_request = dict(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=job_config,
    AutoMLJobObjective=job_objective,
    ProblemType=problem_type,
    RoleArn=role,
)
sm.create_auto_ml_job(**create_job_request)

In [None]:
 #Tracking the progress of the Amazon SageMaker AutoPilot job

In [None]:
%%time
print('Jobstatus - Secondary status')
print('----------------------------')
describe_response = sm.describe_auto_ml_job(AutoMLJobName = auto_ml_job_name) 
print(describe_response['AutoMLJobStatus'] +' - '+ describe_response['AutoMLJobSecondaryStatus'])
job_run_status = describe_response['AutoMLJobStatus']
while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    
    describe_response = sm.describe_auto_ml_job(AutoMLJobName = auto_ml_job_name) 
    job_run_status = describe_response['AutoMLJobStatus']
    print(describe_response['AutoMLJobStatus'] +' - '+ describe_response['AutoMLJobSecondaryStatus']) 
    sleep(30)

In [None]:
#Get Candidate Notebooks

In [None]:
# Look up where the job stored its two generated notebooks.
job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job_artifacts = job['AutoMLJobArtifacts']
job_candidate_notebook = job_artifacts['CandidateDefinitionNotebookLocation']
job_data_notebook = job_artifacts['DataExplorationNotebookLocation']

print(job_candidate_notebook, job_data_notebook, sep='\n')

In [None]:
#Listing all candidates

In [None]:
# Fetch the job's candidates, sorted by their final objective metric value.
candidates_response = sm.list_candidates_for_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    SortBy='FinalObjectiveMetricValue',
)
candidates = candidates_response['Candidates']

# One line per candidate: running number, name, metric value.
for rank, candidate in enumerate(candidates, start=1):
    metric_value = candidate['FinalAutoMLJobObjectiveMetric']['Value']
    print("{}  {}  {}".format(rank, candidate['CandidateName'], metric_value))

In [None]:
#Getting best candidate's info

In [None]:
# The job description carries the best candidate found by Autopilot.
job_description = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
best_candidate = job_description['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

print("Candidate name: {}".format(best_candidate_name))

In [None]:
 # Print the image and model-data URL of each inference container, separated by a dash line.
 for container in best_candidate['InferenceContainers']:
     print(container['Image'], container['ModelDataUrl'], '-', sep='\n')

In [None]:
#Deploying the best candidate

In [None]:
model_name = 'automl-dm-model-{}'.format(timestamp_suffix)

# Register a model made of the best candidate's inference containers.
create_model_request = {
    'Containers': best_candidate['InferenceContainers'],
    'ModelName': model_name,
    'ExecutionRoleArn': role,
}
model_arn = sm.create_model(**create_model_request)

print('Model ARN: ', model_arn['ModelArn'])

In [None]:
# Store captured traffic in this notebook's own bucket and prefix instead of a
# hard-coded bucket that the execution role may not own or be able to write to.
s3_capture_path = 's3://{}/{}/datacapture/{}/'.format(bucket, prefix, model_name)

print(s3_capture_path)

In [None]:
 data_capture_configuration = {
     "EnableCapture": True,                 # flag that turns data capture on and off
     "DestinationS3Uri": s3_capture_path,   # S3 location where captured data is saved
     "InitialSamplingPercentage": 100,      # sampling rate for captured data; 100% is the maximum
     # One option per capture mode; both "Output" and "Input" are enabled.
     "CaptureOptions": [
         {"CaptureMode": capture_mode} for capture_mode in ("Output", "Input")
     ],
     "CaptureContentTypeHeader": {
         # Content types whose payload should be decoded as CSV.
         "CsvContentTypes": ["text/csv"],
         # Content types whose payload should be decoded as JSON.
         "JsonContentTypes": ["application/json"],
     },
 }

In [None]:
#Creating the endpoint configuration

In [None]:
# Fresh timestamp for the endpoint-related resource names.
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
epc_name = 'automl-dm-epc-{}'.format(timestamp_suffix)
print('Endpoint configuration name:', epc_name)

# Single variant serving the model on one ml.m4.xlarge instance.
production_variant = {
    'InstanceType': 'ml.m4.xlarge',
    'InitialInstanceCount': 1,
    'ModelName': model_name,
    'VariantName': 'AllTraffic',
}
ep_config = sm.create_endpoint_config(
    EndpointConfigName=epc_name,
    ProductionVariants=[production_variant],
    DataCaptureConfig=data_capture_configuration,
)

In [None]:
#Creating the endpoints

In [None]:
ep_name = 'automl-dm-ep-{}'.format(timestamp_suffix)
variant_name = 'automl-dm-variant-{}'.format(timestamp_suffix)
print('Endpoint name:', ep_name)

# Create the endpoint from the endpoint configuration defined earlier.
create_endpoint_response = sm.create_endpoint(
    EndpointName=ep_name,
    EndpointConfigName=epc_name,
)

In [None]:
# Block until the waiter reports the endpoint as in service.
endpoint_waiter = sm.get_waiter('endpoint_in_service')
endpoint_waiter.wait(EndpointName=ep_name)

# Then read back and report its ARN and status.
resp = sm.describe_endpoint(EndpointName=ep_name)
status = resp['EndpointStatus']

print("Endpoint ARN   : {}".format(resp['EndpointArn']))
print("Endpoint status: {}".format(status))

In [None]:
# Predicting and scoring the candidate
# Getting the TP, TN, FP, FN counts manually

In [None]:
# Confusion-matrix counters plus a running count of scored rows.
tp = tn = fp = fn = count = 0

# Parse the file with the csv module so quoted fields that contain commas stay
# intact; newline='' leaves line-ending handling to the csv reader.
with open('automl-test.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)        # Skip header
    for row in reader:
        if not row:
            continue          # Ignore blank lines
        label = row[-1]       # 'yes'/'no' label, taken from the last column

        # Rebuild the CSV line without the label, re-quoting fields where needed.
        payload_buffer = io.StringIO()
        csv.writer(payload_buffer, lineterminator='').writerow(row[:-1])
        payload = payload_buffer.getvalue()

        response = sm_rt.invoke_endpoint(EndpointName=ep_name, ContentType='text/csv', Accept='text/csv', Body=payload)

        response = response['Body'].read().decode("utf-8")
        #print ("label %s response %s" %(label,response))

        if 'yes' in label:
            # Sample is positive
            if 'yes' in response:
                # True positive
                tp = tp + 1
            else:
                # False negative
                fn = fn + 1
        else:
            # Sample is negative
            if 'no' in response:
                # True negative
                tn = tn + 1
            else:
                # False positive
                fp = fp + 1
        count = count + 1
        if count % 100 == 0:
            # Progress marker every 100 rows; print() needs no `sys` import.
            print(count, end=' ')

print ("Done")

In [None]:
#Getting evaluation metrics

In [None]:
# Confusion matrix: first row holds the negative samples (tn, fp),
# second row the positive samples (fn, tp).
print ("%d %d" % (tn, fp))
print ("%d %d" % (fn, tp))


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


accuracy  = _ratio(tp + tn, tp + tn + fp + fn)
precision = _ratio(tp, tp + fp)
recall    = _ratio(tp, tp + fn)   # true positives over all positive samples
f1        = _ratio(2 * precision * recall, precision + recall)

print ("Accuracy: %.4f, Precision: %.4f, Recall: %.4f, F1: %.4f" % (accuracy, precision, recall, f1))

In [None]:
#Deleting the endpoint

In [None]:
# Delete the endpoint named by ep_name.
sm.delete_endpoint(
    EndpointName=ep_name,
)

In [None]:
#B. Manually

In [None]:
from sagemaker import image_uris

# image_uris.retrieve is the SageMaker Python SDK v2 replacement for the
# deprecated get_image_uri helper; it looks up the same XGBoost 1.0-1 image.
container = image_uris.retrieve(framework='xgboost',
                                region=boto3.Session().region_name,
                                version='1.0-1')
container

In [None]:
# S3 location handed to the estimator as its output path.
output_path = 's3://' + bucket + '/' + prefix + '/output'

In [None]:
# Names of the CSV files written by the split step; they were never defined
# before being used here.
train_file_name = 'automl-train.csv'
test_file_name = 'automl-test.csv'

# The files were uploaded under "<prefix>/input", so the URI needs that segment.
train_file_location = 's3://{0}/{1}/input/{2}'.format(bucket, prefix, train_file_name)
test_file_location = 's3://{0}/{1}/input/{2}'.format(bucket, prefix, test_file_name)
print(train_file_location)
print(test_file_location)

In [None]:
from sagemaker.inputs import TrainingInput

# TrainingInput is the SageMaker Python SDK v2 name for the deprecated
# sagemaker.session.s3_input.
# NOTE(review): the built-in XGBoost algorithm is documented to expect
# header-less CSV with the label in the first column, while these files were
# written with a header row -- confirm the data layout before training.
training_file = TrainingInput(s3_data=train_file_location, content_type="csv")
validation_file = TrainingInput(s3_data=test_file_location, content_type="csv")

In [None]:
#Initial Hyperparameters

In [None]:
# Starting hyperparameters for the XGBoost training job. Every value is a
# string except num_round, which is an integer.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round=50,
)

In [None]:
#Building the XGBoost model

In [None]:
# Uses the SageMaker Python SDK v2 argument names (instance_count, max_run, ...)
# so they match the v2-style image_uri argument; the train_* spellings are the
# deprecated v1 names.
estimator = sagemaker.estimator.Estimator(image_uri=container,
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1,
                                          instance_type='ml.m5.2xlarge',
                                          volume_size=5, # 5 GB
                                          output_path=output_path,
                                          use_spot_instances=True,
                                          max_run=300,
                                          max_wait=600)

In [None]:
# Train with the training file on the 'train' channel and the test file on 'validation'.
estimator.fit(
    inputs={
        'train': training_file,
        'validation': validation_file,
    }
)

In [None]:
#Deploy Machine Learning Model As Endpoints

In [None]:
# Deploy the trained estimator behind an endpoint with one ml.m4.xlarge instance.
xgb_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [None]:
#Prediction of the Test Data

In [None]:
from sagemaker.serializers import CSVSerializer

test_data_array = test_data.drop(['y'], axis=1).values #load the data into an array
# In SDK v2 the request content type comes from the serializer and
# Predictor.content_type is read-only, so only the serializer is set.
xgb_predictor.serializer = CSVSerializer()
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
# NOTE(review): predictions[1:] drops the first character of the response before
# parsing -- confirm the endpoint really prefixes its output with a character
# that should be skipped.
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

In [None]:
# Display the parsed prediction values.
predictions_array

In [None]:
#Getting evaluation metrics

In [None]:
# Cross-tabulate observed labels against the rounded predictions.
cm = pd.crosstab(index=test_data['y'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
# Make sure both predicted-class columns exist: if every prediction rounds to
# the same class, crosstab yields a single column and cm.iloc[..., 1] would fail.
cm = cm.reindex(columns=[0.0, 1.0], fill_value=0)
# NOTE(review): the row lookups below still assume both classes occur in test_data['y'].
tn = cm.iloc[0,0]
fn = cm.iloc[1,0]
tp = cm.iloc[1,1]
fp = cm.iloc[0,1]
p = (tp+tn)/(tp+tn+fp+fn)*100   # overall share of correct predictions, in percent

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No", "Yes"))
print("Observed")
# Percentages are taken per predicted column (tn+fn for "No", tp+fp for "Yes").
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Yes", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))

In [None]:
#Deleting the endpoint

In [None]:
# Predictor.endpoint is the deprecated v1 attribute; SDK v2 exposes the name as endpoint_name.
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint_name)