# Packages and Bucket Declaration

In [12]:
# Standard library
import os
import re

# Third-party
import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

# S3 location: bucket name and the key prefix under which this notebook
# stores its train/validation files and model output.
bucket = 'bankmarketingfull'
prefix = 'bankmarketingfull/xgboost'

# IAM role handed to the SageMaker estimator created later in the notebook.
role = get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


# Data Import, Splitting, and Saving to Bucket

In [3]:
# Read the encoded dataset from the bucket declared in the setup cell, so the
# bucket name lives in exactly one place.
dataset = pd.read_csv('s3://{}/bank_full_enc.csv'.format(bucket))

In [5]:
# Shuffle once with a fixed seed, then cut into 70% train / 20% validation /
# 10% test. Plain positional slices are used instead of np.split, which on a
# DataFrame relies on a deprecated pandas code path; the partitions are the same.
shuffled = dataset.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(dataset))
validation_end = int(0.9 * len(dataset))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# The training job reads these as raw CSV (label in column 0 per the training
# log), so neither a header row nor the index is written.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

In [13]:
# Upload the local CSVs. S3 keys always use '/' as separator, so they are built
# with string formatting rather than os.path.join (whose separator is
# OS-dependent), and a single bucket handle is reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('{}/train/train.csv'.format(prefix)).upload_file('train.csv')
s3_bucket.Object('{}/validation/validation.csv'.format(prefix)).upload_file('validation.csv')

# Channel definitions for the training job. Both prefixes end in '/' so that a
# sibling prefix that merely starts with "train" or "validation" is not matched.
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

# Modeling

In [14]:
# Resolve the built-in XGBoost image for whatever region this session runs in.
# The previous hand-maintained region -> image table raised KeyError outside
# four regions. version='latest' is expected to resolve to the same legacy
# `xgboost:latest` image that table pointed at -- NOTE(review): confirm the
# resolved URI for your region before relying on it.
region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve('xgboost', region, version='latest')

sess = sagemaker.Session()

# Single ml.m4.xlarge training instance; model artifacts are written under
# <prefix>/output in the notebook's bucket.
xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    instance_count=1,
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [15]:
# Hyperparameters for the training job: binary classification objective,
# learning rate 0.1, 25 boosting rounds.
hyperparameters = {
    'eta': 0.1,
    'objective': 'binary:logistic',
    'num_round': 25,
}
xgb.set_hyperparameters(**hyperparameters)

# Launch the training job with the train and validation channels.
data_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(data_channels)

INFO:sagemaker:Creating training-job with name: xgboost-2023-10-18-23-56-20-369


2023-10-18 23:56:20 Starting - Starting the training job...
2023-10-18 23:56:36 Starting - Preparing the instances for training.........
2023-10-18 23:57:59 Downloading - Downloading input data...
2023-10-18 23:58:29 Training - Downloading the training image...
2023-10-18 23:59:05 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2023-10-18:23:59:18:INFO] Running standalone xgboost training.[0m
[34m[2023-10-18:23:59:18:INFO] File size need to be processed in the node: 4.3mb. Available memory size in the node: 8559.09mb[0m
[34m[2023-10-18:23:59:18:INFO] Determined delimiter of CSV input is ','[0m
[34m[23:59:18] S3DistributionType set as FullyReplicated[0m
[34m[23:59:18] 31646x51 matrix with 1613946 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-10-18:23:59:18:INFO] Determined delimiter of CSV input is ','[0m
[34m[23:59:18] S3DistributionType set as FullyReplicated[0m
[34m[23:

In [16]:
# Deploy the trained model behind a one-instance real-time endpoint.
# Request payloads are serialized as CSV.
xgb_predictor = xgb.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge',
                           serializer=CSVSerializer())

INFO:sagemaker:Creating model with name: xgboost-2023-10-19-00-04-58-362
INFO:sagemaker:Creating endpoint-config with name xgboost-2023-10-19-00-04-58-362
INFO:sagemaker:Creating endpoint with name xgboost-2023-10-19-00-04-58-362


-----!

In [17]:
def predict(data, rows=500, predictor=None):
    """Score `data` against an endpoint in batches and return the scores.

    Args:
        data: 2-D array-like exposing `.shape`, one row per record to score.
        rows: Approximate batch size; the data is cut into
            `int(n_rows / rows + 1)` nearly equal pieces.
        predictor: Object with a `predict(batch)` method returning UTF-8
            encoded, comma-separated scores. Defaults to the notebook's
            `xgb_predictor`.

    Returns:
        1-D numpy float array holding the scores of all batches in order;
        an empty array when `data` has no rows (no request is sent).
    """
    if predictor is None:
        predictor = xgb_predictor  # endpoint created by the deploy cell

    # Nothing to score: avoid sending an empty payload to the endpoint.
    if data.shape[0] == 0:
        return np.array([])

    n_batches = int(data.shape[0] / float(rows) + 1)

    # Collect each batch's decoded response, then join once at the end.
    responses = [predictor.predict(batch).decode('utf-8')
                 for batch in np.array_split(data, n_batches)]

    return np.fromstring(','.join(responses), sep=',')

# Drop column 0 before scoring; the endpoint receives the remaining columns only.
test_features = test_data.to_numpy()[:, 1:]
predictions = predict(test_features)
predictions

array([0.24250209, 0.04707704, 0.04958318, ..., 0.0471497 , 0.05205324,
       0.04871094])

# Endpoint and Model Deletion

In [19]:
# Must run before delete_endpoint(): this call looks up the endpoint
# configuration (DescribeEndpointConfig), which delete_endpoint() removes.
# The error recorded below comes from running the two cells in the other order.
xgb_predictor.delete_model()

ClientError: An error occurred (ValidationException) when calling the DescribeEndpointConfig operation: Could not find endpoint configuration "xgboost-2023-10-19-00-04-58-362".

In [18]:
# Tears down the endpoint together with its endpoint configuration (see the
# log below). Run delete_model() first, since it needs that configuration.
xgb_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: xgboost-2023-10-19-00-04-58-362
INFO:sagemaker:Deleting endpoint with name: xgboost-2023-10-19-00-04-58-362
