In [1]:
# basic packages
import os
import subprocess
import logging
from tqdm import tqdm
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from time import localtime, strftime

In [2]:
# sagemaker parameters
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import boto3

# AWS handles shared by the rest of the notebook.
s3 = boto3.client('s3')
session = sagemaker.Session()
bucket = session.default_bucket()
role = get_execution_role()

# S3 key prefix for this project's artifacts; it must not contain '/' at the end.
prefix = 'kaggle/tweetSentiment'

In [3]:
# directories
working_dir = '/home/ec2-user/SageMaker/kaggle_data'
data_dir = os.path.join(working_dir, 'processed_data/')
output_dir = os.path.join(working_dir, 'output/')
# Create the output directory without going through a shell: this also creates
# missing parent directories, is safe for paths containing spaces, and is a
# no-op when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# tf_model.txt holds the training job name; only its first
# whitespace-separated token is used.
with open('tf_model.txt', 'r') as model_file:
    tokens = model_file.read().split()
tf_model = tokens[0]
print(tf_model)


tensorflow-training-200610-1950-009-1915ccbb


In [5]:
from sagemaker.tensorflow.model import TensorFlowModel

In [6]:
from sagemaker.tensorflow import TensorFlow

# Estimator settings: the training script is source/train.py, run on a single
# ml.p2.xlarge instance with TensorFlow 2.1.0 / Python 3 and the parameter
# server distribution enabled.
estimator_config = dict(
    entry_point='train.py',
    source_dir='source',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.p2.xlarge',
    framework_version='2.1.0',
    py_version='py3',
    distributions={'parameter_server': {'enabled': True}},
)
tf_estimator = TensorFlow(**estimator_config)

Parameter distribution will be renamed to {'parameter_server': {'enabled': True}} in SageMaker Python SDK v2.


In [7]:
# `attach` is a classmethod: it returns a new estimator bound to the training
# job named in `tf_model`, which then replaces `tf_estimator`.
tf_estimator = TensorFlow.attach(tf_model)

2020-06-10 20:55:00 Starting - Preparing the instances for training
2020-06-10 20:55:00 Downloading - Downloading input data
2020-06-10 20:55:00 Training - Training image download completed. Training in progress.
2020-06-10 20:55:00 Uploading - Uploading generated training model
2020-06-10 20:55:00 Completed - Training job completed[34m2020-06-10 20:53:01,664 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-06-10 20:53:01,665 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value loss to Json.[0m
[34mReturning the value itself[0m
[34m2020-06-10 20:53:01,695 sagemaker_tensorflow_container.training INFO     Appending the training job name to model_dir: s3://sagemaker-us-east-2-815596061983/kaggle/tweetSentiment/model/tensorflow-training-2020-06-10-19-50-44-027/model/tensorflow-training-200610-1950-009-1915ccbb/model[0m
[34m2020-06-10 20:53:02,140 sagemaker-containers INFO     Installing modu

[34mEpoch 11/100[0m
[34mEpoch 12/100[0m
[34mEpoch 13/100[0m
[34mEpoch 14/100[0m
[34mEpoch 15/100[0m
[34mEpoch 16/100[0m
[34mEpoch 17/100[0m
[34mEpoch 18/100[0m
[34mEpoch 19/100[0m
[34mEpoch 20/100[0m
[34mEpoch 21/100[0m
[34mEpoch 22/100[0m
[34mEpoch 23/100[0m
[34mEpoch 24/100[0m
[34mEpoch 25/100[0m
[34mEpoch 26/100[0m
[34mEpoch 27/100[0m
[34mEpoch 28/100[0m
[34mEpoch 29/100[0m
[34mEpoch 30/100[0m
[34mEpoch 31/100[0m
[34mEpoch 32/100[0m
[34mEpoch 33/100[0m
[34mEpoch 34/100[0m
[34mEpoch 35/100[0m
[34mEpoch 36/100[0m
[34mEpoch 37/100[0m
[34mEpoch 38/100[0m
[34mEpoch 39/100[0m
[34mEpoch 40/100[0m
[34mEpoch 41/100[0m
[34mEpoch 42/100[0m
[34mEpoch 43/100[0m
[34mEpoch 44/100[0m
[34mEpoch 45/100[0m
[34mEpoch 46/100[0m
[34mEpoch 47/100[0m
[34mEpoch 48/100[0m
[34mEpoch 49/100[0m
[34mEpoch 50/100[0m
[34mEpoch 51/100[0m
[34mEpoch 52/100[0m
[34mEpoch 53/100[0m
[34mEpoch 54/100[0m
[34mEpoch 55/100[0m
[34mEpoch

In [8]:
# Deploy the attached model behind a single-instance endpoint; `predictor`
# is the handle used to send inference requests to it.
predictor = tf_estimator.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


-------------!

In [9]:
# Load the processed test set (the file has no header row) as a NumPy array.
test = pd.read_csv(
    'processed_data/test_processed.csv',
    header=None,
).to_numpy()


In [10]:
# Sending all the test data at once returns 'Broken Pipe' error.
# So send them by chunks.

def predict_by_chunks(test_arr, chunk_size, endpoint=None):
    """Run inference on ``test_arr`` in batches and return integer predictions.

    The rows of ``test_arr`` are split into near-equal batches of at most
    ``chunk_size`` rows, each batch is sent to the endpoint separately, and the
    ``'predictions'`` entries of the responses are concatenated in order.

    Args:
        test_arr: NumPy array whose first axis indexes the samples.
        chunk_size: Maximum number of rows sent per request; must be positive.
        endpoint: Object exposing ``predict(batch)`` that returns a mapping
            with a ``'predictions'`` key. Defaults to the notebook-level
            ``predictor``.

    Returns:
        A 1-D integer NumPy array with the flattened predictions of every
        batch. The result is always flat, regardless of how many batches
        were needed; an empty input yields an empty array.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive number')
    if endpoint is None:
        # Resolved at call time so the function can be defined before deploy.
        endpoint = predictor

    n_rows = test_arr.shape[0]
    if n_rows == 0:
        return np.array([], dtype=int)

    # Round up so that no batch exceeds chunk_size and inputs smaller than
    # chunk_size still produce one batch (rounding down gave zero sections).
    n_chunks = int(np.ceil(n_rows / chunk_size))

    # Collect per-batch results and concatenate once instead of re-copying
    # the accumulated array on every iteration.
    batches = [
        np.asarray(endpoint.predict(chunk)['predictions']).ravel()
        for chunk in np.array_split(test_arr, n_chunks)
    ]

    # NOTE(review): astype(int) truncates toward zero. If the endpoint returns
    # probabilities rather than class labels, a threshold is needed - confirm
    # against the model's output layer.
    return np.concatenate(batches).astype(int)

In [11]:
# Score the processed test array in batches sized by chunk_size=100.
y_pred = predict_by_chunks(test_arr=test, chunk_size=100)

In [12]:
# Rebind `test` to the frame read from test.csv (read with its header row);
# from here on `test` is a DataFrame, no longer the processed array.
test = pd.read_csv('test.csv')

In [13]:
# Build the submission frame: one predicted target per id in `test`.
# Fail loudly on a length mismatch instead of letting index alignment pad the
# shorter side with NaN.
targets = np.ravel(y_pred)
if len(targets) != len(test):
    raise ValueError(
        'Got {} predictions for {} test rows'.format(len(targets), len(test))
    )
out = pd.DataFrame({'id': test['id'].to_numpy(), 'target': targets})

In [14]:
# Write the submission into output_dir and upload that same file, so the file
# that is uploaded is always the one that was just written.
out_path = os.path.join(output_dir, 'tf_out.csv')
out.to_csv(out_path, header=True, index=False)
# Pass an argument list (no shell) so the path is handed to the AWS CLI verbatim.
subprocess.check_call(
    ['aws', 's3', 'cp', out_path, 's3://{}/{}/tf_out.csv'.format(bucket, prefix)]
)

0

In [15]:
# Show the S3 URI of the uploaded submission. Built with the same format string
# as the upload command rather than os.path.join, which is meant for
# filesystem paths and uses a platform-dependent separator.
's3://{}/{}/tf_out.csv'.format(bucket, prefix)

's3://sagemaker-us-east-2-815596061983/kaggle/tweetSentiment/tf_out.csv'

In [16]:
# Tear down the endpoint that `tf_estimator.deploy` created for `predictor`.
predictor.delete_endpoint()