In [1]:
# Import libraries
from sagemaker import get_execution_role
import boto3, sys, os
import sagemaker

# S3 prefix
bucket = 'sagemaker-getting-start-test'
prefix = 'sagemaker/sklearn-gradient'

sagemaker_session = sagemaker.Session()
# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()
my_region = boto3.session.Session().region_name # set the region of the instance
print("Execution role is " + role)
print("Success - the MySageMakerInstance is in the " + my_region + ".")

Execution role is arn:aws:iam::251344623468:role/service-role/AmazonSageMaker-ExecutionRole-20191017T203175
Success - the MySageMakerInstance is in the ap-northeast-1.


In [2]:
s3 = boto3.resource('s3')

try:
    if my_region == 'ap-northeast-1':
        s3.create_bucket(Bucket=bucket)
    else:
        s3.create_bucket(Bucket=bucket, CreateBucketConfiguration={'LocationConstraint': my_region})
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ', e)

S3 error:  An error occurred (IllegalLocationConstraintException) when calling the CreateBucket operation: The unspecified location constraint is incompatible for the region specific endpoint this request was sent to.


In [3]:
import csv
import time
import json
import requests
import numpy as np
import pandas as pd
from datetime import datetime, date, timedelta, timezone

headers = {'Content-Type': 'application/json'}
api_url_base = 'https://public.bitbank.cc'
pair = 'btc_jpy'
period = '1min'

today = datetime.today()
yesterday = today - timedelta(days=1)
yesterday = "{0:%Y%m%d}".format(yesterday)

def api_ohlcv(timestamp):
    api_url = '{0}/{1}/candlestick/{2}/{3}'.format(api_url_base, pair, period, timestamp)
    response = requests.get(api_url, headers=headers)

    if response.status_code == 200:
        ohlcv = json.loads(response.content.decode('utf-8'))['data']['candlestick'][0]['ohlcv']
        return ohlcv
    else:
        return None

ohlcv = api_ohlcv(yesterday)
open, high, low, close, volume, timestamp = [],[],[],[],[],[]
for i in ohlcv:
    open.append(int(i[0]))
    high.append(int(i[1]))
    low.append(int(i[2]))
    close.append(int(i[3]))
    volume.append(float(i[4]))
    time_str = str(i[5])
    timestamp.append(datetime.fromtimestamp(int(time_str[:10])).strftime('%Y/%m/%d %H:%M:%M'))

date_time_index = pd.to_datetime(timestamp) # convert to DateTimeIndex type
df = pd.DataFrame({'open': open, 'high': high, 'low': low, 'close': close, 'volume': volume}, index=date_time_index)
print("DataFrame shape: {}".format(df.shape))
print(df.columns)
df.head(10)

DataFrame shape: (1440, 5)
Index(['open', 'high', 'low', 'close', 'volume'], dtype='object')


Unnamed: 0,open,high,low,close,volume
2019-12-07 00:00:00,818995,818995,818995,818995,0.011
2019-12-07 00:01:01,818995,819010,818995,819010,0.2255
2019-12-07 00:02:02,819955,820518,819955,820515,1.4304
2019-12-07 00:03:03,820514,820523,820514,820518,0.9708
2019-12-07 00:04:04,820519,820519,820518,820518,0.32
2019-12-07 00:05:05,820519,820519,820518,820518,0.5341
2019-12-07 00:06:06,820518,820518,820518,820518,0.3407
2019-12-07 00:07:07,820518,820519,820000,820518,3.6664
2019-12-07 00:08:08,820500,820524,820500,820524,1.4373
2019-12-07 00:09:09,820524,821234,820524,821234,3.9741


In [4]:
# labelling
f = lambda x: 1 if x>0.0001 else -1 if x<-0.0001 else 0 if -0.0001<=x<=0.0001 else np.nan
y = df.rename(columns={'close': 'y'}).loc[:, 'y'].pct_change(1).shift(-1).fillna(0)
X = df.copy()
y_ = pd.DataFrame(y.map(f), columns=['y'])
df_ = pd.concat([X, y_], axis=1)

# check the shape
print('----------------------------------------------------------------------------------------')
print('X shape: (%i,%i)' % X.shape)
print('y shape: (%i,%i)' % y_.shape)
print('----------------------------------------------------------------------------------------')
print(y_.groupby('y').size())
print('y=1 up, y=0 stay, y=-1 down')
print('----------------------------------------------------------------------------------------')

----------------------------------------------------------------------------------------
X shape: (1440,5)
y shape: (1440,1)
----------------------------------------------------------------------------------------
y
-1    264
 0    912
 1    264
dtype: int64
y=1 up, y=0 stay, y=-1 down
----------------------------------------------------------------------------------------


In [5]:
# Create directory and write csv
os.makedirs('./data', exist_ok=True)
df_.to_csv("./data/ohlcv.csv", header=False, index=False)

# Upload the csv file to S3
WORK_DIRECTORY = 'data'
print("Default buckert: {}".format(sagemaker_session.default_bucket()))
train_input = sagemaker_session.upload_data("./data/ohlcv.csv", key_prefix="{}/{}".format(prefix, WORK_DIRECTORY))

Default buckert: sagemaker-ap-northeast-1-251344623468


In [6]:
from sagemaker.sklearn import SKLearn
script_path = 'scikit_learn_gradient.py'

# Initialise SDK
sklearn_estimator = SKLearn(
        entry_point=script_path,
        role = role,
        train_instance_type="ml.c4.xlarge",
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/output".format(sagemaker_session.default_bucket())
)

print("Estimator object: {}".format(sklearn_estimator))

Estimator object: <sagemaker.sklearn.estimator.SKLearn object at 0x7fa929431588>


In [7]:
# Run model training job
sklearn_estimator.fit({'train': train_input})

2019-12-08 02:57:07 Starting - Starting the training job...
2019-12-08 02:57:09 Starting - Launching requested ML instances......
2019-12-08 02:58:12 Starting - Preparing the instances for training...
2019-12-08 02:59:01 Downloading - Downloading input data...
2019-12-08 02:59:37 Training - Training image download completed. Training in progress...[34m2019-12-08 02:59:38,570 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2019-12-08 02:59:38,572 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2019-12-08 02:59:38,582 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2019-12-08 02:59:38,851 sagemaker-containers INFO     Module scikit_learn_gradient does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2019-12-08 02:59:38,851 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2019-12-08 02:59:38,851 sagemaker-containers INFO     Generating MANIFE

In [9]:
# Deploy trained model to an endpoint
predictor = sklearn_estimator.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge", endpoint_name="sagemaker-scikit-learn")

---------------------------------------------------------------------------------------------------!

In [10]:
today = datetime.today()
today = "{0:%Y%m%d}".format(today)

ohlcv = api_ohlcv(today)
open, high, low, close, volume, timestamp = [],[],[],[],[],[]
for i in ohlcv:
    open.append(int(i[0]))
    high.append(int(i[1]))
    low.append(int(i[2]))
    close.append(int(i[3]))
    volume.append(float(i[4]))
    time_str = str(i[5])
    timestamp.append(datetime.fromtimestamp(int(time_str[:10])).strftime('%Y/%m/%d %H:%M:%M'))

date_time_index = pd.to_datetime(timestamp) # convert to DateTimeIndex type
df_t = pd.DataFrame({'open': open, 'high': high, 'low': low, 'close': close, 'volume': volume}, index=date_time_index)

print('----------------------------------------------------------------------------------------')
print('The last minute feature selections is below:')
print(df_t.tail(1))
input_X = df_t.iloc[-1]

----------------------------------------------------------------------------------------
The last minute feature selections is below:
                       open    high     low   close  volume
2019-12-08 03:15:15  804035  804035  804000  804000   0.299


In [11]:
print(type(input_X))
print(type(input_X.values))
prediction = predictor.predict(input_X.values.reshape(1,-1))
print("Next minute prediction is: {}".format(prediction))

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
Next minute prediction is: [0]


In [31]:
sklearn_estimator.delete_endpoint()