In [1]:
# basic packages
import os
import subprocess
import logging
from tqdm import tqdm
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from time import localtime, strftime

In [2]:
# sagemaker parameters
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import boto3

# S3 client (used further down to list the objects stored under the prefix).
s3 = boto3.client('s3')

# SageMaker session, the session's default bucket, and the execution role
# returned by get_execution_role().
session = sagemaker.Session()
bucket = session.default_bucket()
role = get_execution_role()

# Key prefix under which this project's objects live in the bucket.
# The prefix must not contain '/' at the end.
prefix = 'kaggle/tweetSentiment'

In [3]:
# directories
working_dir = '/home/ec2-user/SageMaker/kaggle_data'
data_dir = os.path.join(working_dir, 'processed_data/')
output_dir = os.path.join(working_dir, 'output/')

# Create the output directory (and any missing parent directories) directly
# from Python instead of shelling out to `mkdir`: no shell quoting problems
# with unusual paths, and exist_ok=True keeps the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# data upload to S3
def _write_train_validation_split(directory, validation_fraction=0.25, seed=0):
    """Split train_processed.csv into stratified train/validation CSV files.

    Reads ``train_processed.csv`` (no header row) from ``directory``, shuffles
    it with a fixed seed and, within every distinct value of column 0 (the
    column the split is stratified on), sends ``validation_fraction`` of the
    rows to the validation set. The two parts are written next to the input
    as ``train_processed_split.csv`` and ``validation.csv`` without header or
    index.

    The split is done with pandas only, because no visible cell of this
    notebook imports a ``train_test_split`` helper.
    """
    full_train = pd.read_csv(os.path.join(directory, 'train_processed.csv'), header=None)

    # Shuffle once, then take the first share of rows of every class.
    shuffled = full_train.sample(frac=1, random_state=seed)
    rank_in_class = shuffled.groupby(0).cumcount()
    class_size = shuffled[0].map(shuffled[0].value_counts())
    in_validation = rank_in_class < (class_size * validation_fraction).round()

    shuffled[~in_validation].to_csv(
        os.path.join(directory, 'train_processed_split.csv'), header=None, index=None
    )
    shuffled[in_validation].to_csv(
        os.path.join(directory, 'validation.csv'), header=None, index=None
    )


# NOTE(review): list_objects_v2 returns at most 1000 keys per call; switch to
# a paginator if the prefix can ever hold more objects than that.
s3_object_dict = s3.list_objects_v2(
    Bucket=bucket,
    Prefix=prefix
)

# Keys already stored under the prefix. The response has no 'Contents' entry
# when nothing matches, so fall back to an empty list explicitly rather than
# catching KeyError around the whole upload logic (which would also swallow
# unrelated KeyErrors).
s3_object_list = [content['Key'] for content in s3_object_dict.get('Contents', [])]

# S3 keys that the files of the local data directory map to.
local_data_list = [os.path.join(prefix, f) for f in os.listdir(data_dir)]

if s3_object_list and set(local_data_list).issubset(s3_object_list):
    # Every local file is already in S3: just point at the existing test data.
    test_location = 's3://{}/{}/test_processed.csv'.format(bucket, prefix)
    print("input_data already present in S3.")

else:
    if s3_object_list:
        # As before, the train/validation split is only regenerated when the
        # prefix already holds some objects but not all of the local files.
        _write_train_validation_split(data_dir)

    # Upload the test data to S3 (the only file this notebook needs there).
    test_location = session.upload_data(
        path=os.path.join(data_dir, 'test_processed.csv'),
        bucket=bucket,
        key_prefix=prefix
    )

    # Only test_processed.csv is uploaded here, so say exactly that.
    print("test data uploaded to S3.")

input_data already present in S3.


In [5]:
# The tuning job name is the first whitespace-delimited token stored in
# tuning_job_name.txt.
with open('tuning_job_name.txt', 'r') as f:
    tokens = f.read().split()

tuning_job_name = tokens[0]
print(tuning_job_name)

tuning-job2020-05-30-19-45-31


In [6]:
# Describe the tuning job, pick the training job it reports as best, and read
# the S3 location of that training job's model artifacts.
sm_client = session.sagemaker_client

tuning_job_info = sm_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
best_training_job_name = tuning_job_info['BestTrainingJob']['TrainingJobName']

training_job_info = sm_client.describe_training_job(
    TrainingJobName=best_training_job_name
)
model_artifacts = training_job_info['ModelArtifacts']['S3ModelArtifacts']

In [10]:
# Container image used for inference: xgboost '0.90-1' in the session's region.
container = get_image_uri(session.boto_region_name, 'xgboost', '0.90-1')

# The model name has to be unique, so it is derived from the name of the best
# training job.
model_name = '{}-model'.format(best_training_job_name)

# Tell SageMaker which image serves the model and where the trained model
# artifacts are stored.
primary_container = dict(Image=container, ModelDataUrl=model_artifacts)

# Register the SageMaker model.
model_info = session.sagemaker_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)

In [11]:
# The transform job name has to be unique, so the current local time is
# appended to a fixed prefix.
job_timestamp = strftime("%Y-%m-%d-%H-%M-%S", localtime())
transform_job_name = 'boston-xgboost-batch-transform-' + job_timestamp

# Destination of the transform results.
transform_output_config = {
    "S3OutputPath": "s3://{}/{}/batchTransform/".format(bucket, prefix)
}

# Input: CSV data found at `test_location`, split line by line.
transform_input_config = {
    "ContentType": "text/csv",
    "SplitType": "Line",
    "DataSource": {
        "S3DataSource": {
            "S3DataType": "S3Prefix",
            "S3Uri": test_location,
        }
    }
}

# Instances the job runs on.
transform_resources_config = {
    "InstanceType": "ml.m4.xlarge",
    "InstanceCount": 1
}

# Full description of the batch transform job, assembled from the parts above.
transform_request = {
    "TransformJobName": transform_job_name,
    "ModelName": model_name,
    "MaxConcurrentTransforms": 1,
    "MaxPayloadInMB": 6,
    "BatchStrategy": "MultiRecord",
    "TransformOutput": transform_output_config,
    "TransformInput": transform_input_config,
    "TransformResources": transform_resources_config,
}

In [12]:
# Submit the batch transform job described by `transform_request` ...
transform_response = session.sagemaker_client.create_transform_job(
    **transform_request
)

# ... and wait for it through the session helper, keeping its return value.
transform_desc = session.wait_for_transform_job(transform_job_name)

...............................................!


In [22]:
# Copy the batch transform result from S3 into the local output directory.
transform_output = "s3://{}/{}/batchTransform/test_processed.csv.out".format(bucket, prefix)

# The command is passed as an argument list (no shell=True), so whitespace or
# shell metacharacters in the S3 URI or the local path reach the AWS CLI
# verbatim instead of being re-interpreted by a shell.
subprocess.check_call([
    'aws', 's3', 'cp',
    transform_output,
    os.path.join(output_dir, 'test_processed.csv.out'),
])

0

In [23]:
# Build the submission file: the ids from test.csv next to the rounded
# predictions produced by the batch transform job.
test = pd.read_csv('test.csv')
test_ids = test['id']  # deliberately not named `id`, which would shadow the builtin
pred = pd.read_csv(os.path.join(output_dir, 'test_processed.csv.out'), header=None)

# pd.concat(axis=1) aligns on the index and pads missing rows with NaN, so a
# row-count mismatch would silently produce a corrupt submission file.
assert len(pred) == len(test_ids), (
    "expected one prediction per test row, got {} predictions for {} rows".format(
        len(pred), len(test_ids)
    )
)

out = pd.concat([test_ids, pred.round().astype(int)], axis=1)
out.columns = ['id', 'target']

out.to_csv(os.path.join(output_dir, 'out.csv'), index=False)

In [24]:
# Upload the submission file to S3. The command is passed as an argument list
# (no shell=True), so the local path and the S3 URI reach the AWS CLI verbatim
# even if they contain whitespace or shell metacharacters.
subprocess.check_call([
    'aws', 's3', 'cp',
    os.path.join(output_dir, 'out.csv'),
    's3://{}/{}/out.csv'.format(bucket, prefix),
])

0