In [18]:
# basic packages
import os
import subprocess
import logging
from tqdm import tqdm
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from time import localtime, strftime

In [19]:
# sklearn
from sklearn.model_selection import train_test_split

In [20]:
# sagemaker parameters
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import boto3

# AWS handles used throughout the notebook.
session = sagemaker.Session()
s3 = boto3.client('s3')
role = get_execution_role()

# S3 destination: the session's default bucket plus a fixed key prefix.
bucket = session.default_bucket()
prefix = 'kaggle/tweetSentiment'  # Prefix must not end with '/'!

In [21]:
# Local directory layout: one root folder with a sub-folder for processed
# data and one for cache files.
working_dir = '/home/ec2-user/SageMaker/kaggle_data'
data_dir, cache_dir = (
    os.path.join(working_dir, subfolder)
    for subfolder in ('processed_data/', 'cache/')
)

In [22]:
# Read the processed training file (no header row) and hold out 25% of it as
# validation data, stratifying on the values of column 0 with a fixed seed.
train = pd.read_csv(os.path.join(data_dir, 'train_processed.csv'), header=None)
train, validation = train_test_split(
    train, test_size=0.25, random_state=0, stratify=train[0].values
)

In [23]:
# Write both halves of the split into the local data directory,
# without a header row and without the index column.
for split_frame, split_filename in (
    (train, 'train_processed_split.csv'),
    (validation, 'validation.csv'),
):
    split_frame.to_csv(os.path.join(data_dir, split_filename), header=None, index=None)

In [24]:
# data upload to S3
def _split_and_upload():
    """Re-create the train/validation split, save it locally and upload it to S3.

    Reads 'train_processed.csv' from ``data_dir``, splits it 75/25 (stratified on
    column 0, random_state=0), writes 'train_processed_split.csv' and
    'validation.csv' back into ``data_dir`` and uploads both files under
    ``bucket``/``prefix`` via ``session.upload_data``.

    Returns:
        tuple: ``(train_location, validation_location)`` -- the values returned
        by ``session.upload_data`` for the two uploaded files.
    """
    # Split the training data into train and validation data.
    full_frame = pd.read_csv(os.path.join(data_dir, 'train_processed.csv'), header=None)
    train_frame, validation_frame = train_test_split(
        full_frame,
        stratify=full_frame[0].values,
        test_size=0.25,
        random_state=0
    )

    # Save them locally.
    train_path = os.path.join(data_dir, 'train_processed_split.csv')
    validation_path = os.path.join(data_dir, 'validation.csv')
    train_frame.to_csv(train_path, header=None, index=None)
    validation_frame.to_csv(validation_path, header=None, index=None)

    # Upload the data to S3.
    uploaded_train = session.upload_data(
        path=train_path,
        bucket=bucket,
        key_prefix=prefix
    )
    uploaded_validation = session.upload_data(
        path=validation_path,
        bucket=bucket,
        key_prefix=prefix
    )
    return uploaded_train, uploaded_validation


# List what is already stored under the prefix.
# NOTE(review): a single list_objects_v2 call returns one page of keys only;
# confirm that the prefix never holds more objects than fit in one page.
s3_object_dict = s3.list_objects_v2(
    Bucket=bucket,
    Prefix=prefix
)

# The response has no 'Contents' entry when nothing matches the prefix, so an
# empty list stands for "nothing in S3 yet". Reading it with .get() instead of
# catching KeyError around the whole cell keeps unrelated KeyErrors (e.g. from
# the DataFrame code) from being mistaken for an empty bucket.
s3_object_list = [content['Key'] for content in s3_object_dict.get('Contents', [])]

# Fetch filenames in the data directory.
local_data_list = os.listdir(data_dir)

# Combine the name with the S3 bucket prefix.
local_data_list = [os.path.join(prefix, f) for f in local_data_list]

# 'data_location' is a path to a directory in which .csv files are located.
data_location = os.path.join('s3://', bucket, prefix)

# Upload the data if they are not present in S3.
if s3_object_list and set(local_data_list).issubset(s3_object_list):
    train_location = os.path.join(data_location, 'train_processed_split.csv')
    validation_location = os.path.join(data_location, 'validation.csv')

    print("input_data already present in S3.")

else:
    # Drop the in-memory frames before re-reading the data. Both names are set
    # to plain None (a trailing comma here would create the tuple (None,)).
    train = None
    validation = None

    train_location, validation_location = _split_and_upload()

    print("train and validation data uploaded to S3.")

input_data already present in S3.


In [29]:
from sagemaker.tensorflow import TensorFlow
# Build the TensorFlow estimator that the hyperparameter tuner will launch.
tf_estimator = TensorFlow(
    # Training code: source/train.py
    source_dir='source',
    entry_point='train.py',
    # Framework / runtime versions
    py_version='py3',
    framework_version='2.1.0',
    # Execution role and compute: a single ml.p2.xlarge instance
    role=role,
    train_instance_type='ml.p2.xlarge',
    train_instance_count=1,
    distributions={'parameter_server': {'enabled': True}},
    # Output is written below the same bucket/prefix as the input data
    output_path=os.path.join('s3://', bucket, prefix, 'model'),
    # Fixed hyperparameters handed to the training script
    # NOTE(review): 'input_dim' presumably has to match the feature width of
    # the processed CSV files -- confirm against train.py.
    hyperparameters={
        'input_dim': 5223,
        'epochs': 100,
        'batch_size': 128,
    },
)



In [30]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# Hyperparameter tuning set-up.

# Continuous search intervals for the two tuned hyperparameters.
hyperparameter_ranges = {
    'lr': ContinuousParameter(0.0001, 0.01),
    'drop_rate': ContinuousParameter(0.1, 0.4),
}

# Name/regex pair handed to the tuner as the definition of the 'loss' metric.
# NOTE(review): this pattern also matches inside text such as 'val_loss: 0.5';
# confirm which value train.py is expected to report.
metric_definitions = [
    {'Name': 'loss', 'Regex': 'loss: ([0-9\\.]+)'},
]

# Tuner that minimises 'loss' over at most 10 jobs, run one at a time.
tuner = HyperparameterTuner(
    tf_estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_metric_name='loss',
    objective_type='Minimize',
    early_stopping_type='Auto',
    max_parallel_jobs=1,  # only 1 instance allowed
    max_jobs=10,
)

In [31]:
# Start the tuning job, using the S3 prefix in 'data_location' as input.
# NOTE(review): the estimator's output_path ('.../model') lies below this same
# prefix -- confirm that this is intended for the input location.
tuner.fit(data_location)



In [None]:
# Block this cell until the tuning job started above has finished.
tuner.wait()

...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................!


In [34]:
# Display the name of the best training job found by the tuner.
tuner.best_training_job()

'tensorflow-training-200604-2204-006-ca546ec3'

In [35]:
# Persist the name of the best training job in 'tf_model.txt' (current
# working directory). The file is written directly from Python instead of
# through a shell command, so the job name is never interpreted by a shell.
# The name is followed by a single newline, as `echo` would write it.
with open('tf_model.txt', 'w') as model_name_file:
    model_name_file.write('{}\n'.format(tuner.best_training_job()))

0