In [1]:
# basic packages
import os
import subprocess
import logging
from tqdm import tqdm
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from time import localtime, strftime

In [2]:
# sklearn
from sklearn.model_selection import train_test_split

In [3]:
# sagemaker parameters
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import boto3

# S3 key prefix used for this project's objects.
# NOTE: the prefix must not contain a trailing '/'.
prefix = 'kaggle/tweetSentiment'

# Plain boto3 S3 client.
s3 = boto3.client('s3')

# SageMaker session, the session's default bucket, and the execution role.
session = sagemaker.Session()
bucket = session.default_bucket()
role = get_execution_role()

In [4]:
# Local directory layout: everything lives below `working_dir`.
working_dir = '/home/ec2-user/SageMaker/kaggle_data'
data_dir, cache_dir = (
    os.path.join(working_dir, subdir) for subdir in ('processed_data/', 'cache/')
)

In [5]:
# Load the processed training set (the CSV has no header row) and split it
# into train and validation parts: 25% goes to validation, stratified on
# column 0, with a fixed seed so the split is reproducible.
full_train_csv = os.path.join(data_dir, 'train_processed.csv')
train = pd.read_csv(full_train_csv, header=None)
train, validation = train_test_split(
    train, test_size=0.25, random_state=0, stratify=train[0].values
)

In [6]:
# Write both parts of the split to the local data directory as CSV files
# without a header row and without the index column.
train_split_csv = os.path.join(data_dir, 'train_processed_split.csv')
validation_csv = os.path.join(data_dir, 'validation.csv')
train.to_csv(train_split_csv, header=None, index=None)
validation.to_csv(validation_csv, header=None, index=None)

In [7]:
# data upload to S3
TRAIN_SPLIT_FILENAME = 'train_processed_split.csv'
VALIDATION_FILENAME = 'validation.csv'


def _list_s3_keys(s3_client, bucket_name, key_prefix):
    """Return every object key stored under ``key_prefix`` in ``bucket_name``.

    A single ``list_objects_v2`` call returns at most 1000 keys, so the
    paginator is used to walk all result pages.  A page has no 'Contents'
    entry when nothing matches the prefix; that case yields an empty list
    instead of raising ``KeyError``.

    Args:
        s3_client: boto3 S3 client.
        bucket_name: name of the bucket to list.
        key_prefix: only keys starting with this prefix are returned.

    Returns:
        list of str: the matching object keys.
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=key_prefix):
        keys.extend(content['Key'] for content in page.get('Contents', []))
    return keys


def _split_save_and_upload():
    """Re-create the train/validation split, save it locally and upload it.

    The split uses the same settings as everywhere else in this notebook
    (25% validation, stratified on column 0, ``random_state=0``).  The data
    frames are locals, so they are released as soon as the function returns.

    Returns:
        tuple: ``(train_location, validation_location)`` as returned by
        ``session.upload_data``.
    """
    full_train = pd.read_csv(os.path.join(data_dir, 'train_processed.csv'), header=None)
    train_part, validation_part = train_test_split(
        full_train,
        stratify=full_train[0].values,
        test_size=0.25,
        random_state=0
    )

    # Save them locally.
    train_part.to_csv(os.path.join(data_dir, TRAIN_SPLIT_FILENAME), header=None, index=None)
    validation_part.to_csv(os.path.join(data_dir, VALIDATION_FILENAME), header=None, index=None)

    # Upload the data to S3.
    uploaded_train = session.upload_data(
        path=os.path.join(data_dir, TRAIN_SPLIT_FILENAME),
        bucket=bucket,
        key_prefix=prefix
    )
    uploaded_validation = session.upload_data(
        path=os.path.join(data_dir, VALIDATION_FILENAME),
        bucket=bucket,
        key_prefix=prefix
    )
    return uploaded_train, uploaded_validation


# Keys that already exist in S3 under the project prefix.
s3_object_list = _list_s3_keys(s3, bucket, prefix)

# Keys the local data files would have in S3.  S3 keys always use '/' as the
# separator, so they are built explicitly rather than with os.path.join.
local_data_list = ['{}/{}'.format(prefix, f) for f in os.listdir(data_dir)]

# Upload the data if they are not present in S3.
if set(local_data_list).issubset(s3_object_list):
    train_location = 's3://{}/{}/{}'.format(bucket, prefix, TRAIN_SPLIT_FILENAME)
    validation_location = 's3://{}/{}/{}'.format(bucket, prefix, VALIDATION_FILENAME)

    print("input_data already present in S3.")

else:
    train_location, validation_location = _split_save_and_upload()

    # Drop the in-memory frames; only the S3 locations are needed from here on.
    # (No trailing comma: `train = None,` would bind the tuple `(None,)`.)
    train = None
    validation = None

    print("train and validation data uploaded to S3.")

input_data already present in S3.


In [8]:
# Look up the training image URI for 'xgboost' version '0.90-2' in the
# session's region; SageMaker provides a utility for this.
container = get_image_uri(session.boto_region_name, 'xgboost', '0.90-2')


def _csv_channel(channel_name, s3_uri):
    """Describe one uncompressed CSV input channel read from an S3 prefix."""
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": s3_uri,
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "ContentType": "csv",
        "CompressionType": "None"
    }


# Definition of the base training job.  It is the template the hyperparameter
# tuning job starts every training job from.
training_params = {
    # Permissions for the training job: reuse the role of the current session.
    "RoleArn": role,

    # The algorithm to run; the container holds the training code.
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },

    # Where the resulting model artifacts are stored.
    "OutputDataConfig": {
        "S3OutputPath": os.path.join("s3://", bucket, prefix, "output")
    },

    # Compute resources for each training job.
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.xlarge",
        "VolumeSizeInGB": 5
    },

    # Safety net in case the training script never terminates on its own.
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 86400
    },

    # Only the _static_ hyperparameters go here, i.e. the ones SageMaker must
    # _not_ vary during tuning.
    "StaticHyperParameters": {
        "gamma": "4",
        "subsample": "0.8",
        "objective": "binary:logistic",
        "early_stopping_rounds": "100",
        "num_round": "1000"
    },

    # Where SageMaker retrieves the training and validation data from.
    "InputDataConfig": [
        _csv_channel("train", train_location),
        _csv_channel("validation", validation_location)
    ]
}

In [9]:
def _numeric_range(name, min_value, max_value):
    """Build one tunable-range entry; the bounds are given as strings."""
    return {"MaxValue": max_value, "MinValue": min_value, "Name": name}


# Dictionary describing the tuning job SageMaker should perform.
tuning_job_config = {
    # Hyperparameters SageMaker is allowed to vary, grouped by type, each
    # with the range it may explore.
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            _numeric_range("eta", "0.05", "0.5"),
        ],
        "IntegerParameterRanges": [
            _numeric_range("max_depth", "3", "12"),
            _numeric_range("min_child_weight", "2", "8"),
        ],
    },
    # How many models are fit in total, and how many at the same time.
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 21,
        "MaxParallelTrainingJobs": 3,
    },
    # How SageMaker picks the next hyperparameters as new models are fit.
    "Strategy": "Bayesian",
    # How models are ranked against each other.
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:f1",
        "Type": "Maximize",
    },
}

In [10]:
# SageMaker requires every tuning job to have a unique name.  The current
# local timestamp is appended to make it unique; the name is also what is
# used to look the job up again later.
tuning_job_name = "tuning-job{}".format(strftime("%Y-%m-%d-%H-%M-%S", localtime()))

# Ask SageMaker to create (and execute) the tuning job.
tuning_job_request = {
    "HyperParameterTuningJobName": tuning_job_name,
    "HyperParameterTuningJobConfig": tuning_job_config,
    "TrainingJobDefinition": training_params,
}
session.sagemaker_client.create_hyper_parameter_tuning_job(**tuning_job_request)

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-2:815596061983:hyper-parameter-tuning-job/tuning-job2020-06-03-15-49-53',
 'ResponseMetadata': {'RequestId': '345d93b9-44d6-4867-b27e-26dd80904430',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '345d93b9-44d6-4867-b27e-26dd80904430',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '130',
   'date': 'Wed, 03 Jun 2020 15:49:53 GMT'},
  'RetryAttempts': 0}}

In [11]:
# Wait for the tuning job identified by `tuning_job_name`.  The captured
# output of this cell shows a row of progress dots followed by the job
# description.
session.wait_for_tuning_job(tuning_job_name)

...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................!


{'HyperParameterTuningJobName': 'tuning-job2020-06-03-15-49-53',
 'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-2:815596061983:hyper-parameter-tuning-job/tuning-job2020-06-03-15-49-53',
 'HyperParameterTuningJobConfig': {'Strategy': 'Bayesian',
  'HyperParameterTuningJobObjective': {'Type': 'Maximize',
   'MetricName': 'validation:f1'},
  'ResourceLimits': {'MaxNumberOfTrainingJobs': 21,
   'MaxParallelTrainingJobs': 3},
  'ParameterRanges': {'IntegerParameterRanges': [{'Name': 'max_depth',
     'MinValue': '3',
     'MaxValue': '12',
     'ScalingType': 'Auto'},
    {'Name': 'min_child_weight',
     'MinValue': '2',
     'MaxValue': '8',
     'ScalingType': 'Auto'}],
   'ContinuousParameterRanges': [{'Name': 'eta',
     'MinValue': '0.05',
     'MaxValue': '0.5',
     'ScalingType': 'Auto'}],
   'CategoricalParameterRanges': []}},
 'TrainingJobDefinition': {'StaticHyperParameters': {'_tuning_objective_metric': 'validation:f1',
   'early_stopping_rounds': '100',
   'gamma': 

In [None]:
# Fetch the description of the tuning job and read the name of its best
# training job from it.
tuning_job_info = session.sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
best_training_job = tuning_job_info['BestTrainingJob']
best_training_job_name = best_training_job['TrainingJobName']

In [12]:
# Save the name of the best training job to xgboost_model.txt.
# The file is written directly from Python instead of through `echo ... >` in
# a shell, so the job name is never interpreted by a shell.  The trailing
# newline keeps the file content identical to what `echo` produced.
with open("xgboost_model.txt", "w") as model_name_file:
    model_name_file.write("{}\n".format(best_training_job_name))

0