In [None]:
!pip install s3fs
!pip install sagemaker

In [2]:
BUCKET='ml-lab-pyspark'
!export AWS_PROFILE=ml-lab

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

In [4]:
deals=pd.read_csv('../data/deals_replay.csv',
    header=None,
    names=["timestamp","deal_type","order_type","stop","limit","price","stop_price","limit_price","loss_or_profit"],
    dtype={
        "timestamp":str
    }
)

## All orders sorted by timestamp

In [5]:
deals.sort_values(by="timestamp")

Unnamed: 0,timestamp,deal_type,order_type,stop,limit,price,stop_price,limit_price,loss_or_profit
585,1584100045.375273,stop,buy,10,10,1.25193,1.25093,1.25293,-0.001
719,1584100045.375273,limit,sell,10,10,1.25176,1.25276,1.25076,0.001
17636,1584100045.375273,limit,sell,20,20,1.25176,1.25376,1.24976,0.002
15462,1584100045.375273,stop,buy,20,20,1.25193,1.24993,1.25393,-0.002
17637,1584100045.480978,limit,sell,20,20,1.25177,1.25377,1.24977,0.002
...,...,...,...,...,...,...,...,...,...
292,1584102585.325714,stop,sell,20,20,1.24964,1.25164,1.24764,-0.002
293,1584102585.747461,stop,sell,20,20,1.24964,1.25164,1.24764,-0.002
244,1584102585.747461,stop,sell,10,10,1.24964,1.25064,1.24864,-0.001
361,1584102585.747461,limit,buy,10,10,1.24981,1.24881,1.25081,0.001


### All `buy` type orders where the stop/limit was 10

In [6]:
qualified_deals=deals.query('order_type=="buy" & stop==10')
qualified_deals

Unnamed: 0,timestamp,deal_type,order_type,stop,limit,price,stop_price,limit_price,loss_or_profit
102,1584102154.841821,limit,buy,10,10,1.24902,1.24802,1.25002,0.001
103,1584102155.189365,limit,buy,10,10,1.24901,1.24801,1.25001,0.001
104,1584102161.575295,limit,buy,10,10,1.24902,1.24802,1.25002,0.001
105,1584102162.229208,limit,buy,10,10,1.24902,1.24802,1.25002,0.001
106,1584102163.200662,limit,buy,10,10,1.24902,1.24802,1.25002,0.001
...,...,...,...,...,...,...,...,...,...
24268,1584102007.087893,stop,buy,10,10,1.25001,1.24901,1.25101,-0.001
24269,1584102008.139766,stop,buy,10,10,1.25001,1.24901,1.25101,-0.001
24270,1584102008.448525,stop,buy,10,10,1.25001,1.24901,1.25101,-0.001
24271,1584102008.748936,stop,buy,10,10,1.25001,1.24901,1.25101,-0.001


In [7]:
qualified_deals.describe()

Unnamed: 0,stop,limit,price,stop_price,limit_price,loss_or_profit
count,7171.0,7171.0,7171.0,7171.0,7171.0,7171.0
mean,10.0,10.0,1.251397,1.250397,1.252397,-0.000171
std,0.0,0.0,0.000649,0.000649,0.000649,0.000985
min,10.0,10.0,1.24901,1.24801,1.25001,-0.001
25%,10.0,10.0,1.25093,1.24993,1.25193,-0.001
50%,10.0,10.0,1.25145,1.25045,1.25245,-0.001
75%,10.0,10.0,1.25186,1.25086,1.25286,0.001
max,10.0,10.0,1.25295,1.25195,1.25395,0.001


Among above orders, below have made a profit. We want to predict these orders:

In [8]:
len(qualified_deals.query('deal_type=="limit"'))

2973

In [9]:
price=pd.read_csv('../data/price.csv',
    header=None,
    names=["datetime","timestamp","MID_OPEN","BID","OFFER","CHANGE","CHANGE_PCT","HIGH","LOW"],
    dtype={
        "timestamp":str,
        "datetime":str
    }
)

In [10]:
price.sort_values(by="timestamp")

Unnamed: 0,datetime,timestamp,MID_OPEN,BID,OFFER,CHANGE,CHANGE_PCT,HIGH,LOW
49,20-03-13 11:47:24,1584100045.375273,1.25699,1.25176,1.25193,-0.00515,-0.41,1.26256,1.25052
50,20-03-13 11:47:25,1584100045.480978,1.25699,1.25177,1.25194,-0.00513,-0.41,1.26256,1.25052
51,20-03-13 11:47:25,1584100045.680272,1.25699,1.25180,1.25200,-0.00509,-0.40,1.26256,1.25052
52,20-03-13 11:47:25,1584100045.953611,1.25699,1.25184,1.25202,-0.00506,-0.40,1.26256,1.25052
53,20-03-13 11:47:26,1584100046.104995,1.25699,1.25182,1.25202,-0.00507,-0.40,1.26256,1.25052
...,...,...,...,...,...,...,...,...,...
44,20-03-13 12:24:50,1584102584.117409,1.25699,1.24964,1.24981,-0.00727,-0.58,1.26256,1.24884
45,20-03-13 12:24:50,1584102584.525387,1.25699,1.24963,1.24980,-0.00727,-0.58,1.26256,1.24884
46,20-03-13 12:24:50,1584102584.931247,1.25699,1.24964,1.24981,-0.00727,-0.58,1.26256,1.24884
47,20-03-13 12:24:50,1584102585.325714,1.25699,1.24964,1.24981,-0.00726,-0.58,1.26256,1.24884


In [11]:
deals_with_price=pd.merge(qualified_deals, price,how="left",on="timestamp",sort=True)
deals_with_price['MID']=(deals_with_price['BID']+deals_with_price['OFFER'])/2
deals_with_price

Unnamed: 0,timestamp,deal_type,order_type,stop,limit,price,stop_price,limit_price,loss_or_profit,datetime,MID_OPEN,BID,OFFER,CHANGE,CHANGE_PCT,HIGH,LOW,MID
0,1584100045.375273,stop,buy,10,10,1.25193,1.25093,1.25293,-0.001,20-03-13 11:47:24,1.25699,1.25176,1.25193,-0.00515,-0.41,1.26256,1.25052,1.251845
1,1584100045.480978,stop,buy,10,10,1.25194,1.25094,1.25294,-0.001,20-03-13 11:47:25,1.25699,1.25177,1.25194,-0.00513,-0.41,1.26256,1.25052,1.251855
2,1584100045.680272,stop,buy,10,10,1.25200,1.25100,1.25300,-0.001,20-03-13 11:47:25,1.25699,1.25180,1.25200,-0.00509,-0.40,1.26256,1.25052,1.251900
3,1584100045.953611,stop,buy,10,10,1.25202,1.25102,1.25302,-0.001,20-03-13 11:47:25,1.25699,1.25184,1.25202,-0.00506,-0.40,1.26256,1.25052,1.251930
4,1584100046.104995,stop,buy,10,10,1.25202,1.25102,1.25302,-0.001,20-03-13 11:47:26,1.25699,1.25182,1.25202,-0.00507,-0.40,1.26256,1.25052,1.251920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7166,1584102584.117409,limit,buy,10,10,1.24981,1.24881,1.25081,0.001,20-03-13 12:24:50,1.25699,1.24964,1.24981,-0.00727,-0.58,1.26256,1.24884,1.249725
7167,1584102584.525387,limit,buy,10,10,1.24980,1.24880,1.25080,0.001,20-03-13 12:24:50,1.25699,1.24963,1.24980,-0.00727,-0.58,1.26256,1.24884,1.249715
7168,1584102584.931247,limit,buy,10,10,1.24981,1.24881,1.25081,0.001,20-03-13 12:24:50,1.25699,1.24964,1.24981,-0.00727,-0.58,1.26256,1.24884,1.249725
7169,1584102585.325714,limit,buy,10,10,1.24981,1.24881,1.25081,0.001,20-03-13 12:24:50,1.25699,1.24964,1.24981,-0.00726,-0.58,1.26256,1.24884,1.249725


In [32]:
_format='%y-%m-%d %H:%M:%S'

def datetime_to_int(x):
    return int(dt.strptime(x,_format).timestamp())

datetime_to_int('20-03-13 11:47:24')

1584100044

In [33]:
from datetime import datetime as dt
deals_with_price['datetime_int']=deals_with_price['datetime'].apply(datetime_to_int)
deals_with_price

Unnamed: 0,timestamp,deal_type,order_type,stop,limit,price,stop_price,limit_price,loss_or_profit,datetime,MID_OPEN,BID,OFFER,CHANGE,CHANGE_PCT,HIGH,LOW,MID,datetime_int
0,1584100045.375273,stop,buy,10,10,1.25193,1.25093,1.25293,-0.001,20-03-13 11:47:24,1.25699,1.25176,1.25193,-0.00515,-0.41,1.26256,1.25052,1.251845,1584100044
1,1584100045.480978,stop,buy,10,10,1.25194,1.25094,1.25294,-0.001,20-03-13 11:47:25,1.25699,1.25177,1.25194,-0.00513,-0.41,1.26256,1.25052,1.251855,1584100045
2,1584100045.680272,stop,buy,10,10,1.25200,1.25100,1.25300,-0.001,20-03-13 11:47:25,1.25699,1.25180,1.25200,-0.00509,-0.40,1.26256,1.25052,1.251900,1584100045
3,1584100045.953611,stop,buy,10,10,1.25202,1.25102,1.25302,-0.001,20-03-13 11:47:25,1.25699,1.25184,1.25202,-0.00506,-0.40,1.26256,1.25052,1.251930,1584100045
4,1584100046.104995,stop,buy,10,10,1.25202,1.25102,1.25302,-0.001,20-03-13 11:47:26,1.25699,1.25182,1.25202,-0.00507,-0.40,1.26256,1.25052,1.251920,1584100046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7166,1584102584.117409,limit,buy,10,10,1.24981,1.24881,1.25081,0.001,20-03-13 12:24:50,1.25699,1.24964,1.24981,-0.00727,-0.58,1.26256,1.24884,1.249725,1584102290
7167,1584102584.525387,limit,buy,10,10,1.24980,1.24880,1.25080,0.001,20-03-13 12:24:50,1.25699,1.24963,1.24980,-0.00727,-0.58,1.26256,1.24884,1.249715,1584102290
7168,1584102584.931247,limit,buy,10,10,1.24981,1.24881,1.25081,0.001,20-03-13 12:24:50,1.25699,1.24964,1.24981,-0.00727,-0.58,1.26256,1.24884,1.249725,1584102290
7169,1584102585.325714,limit,buy,10,10,1.24981,1.24881,1.25081,0.001,20-03-13 12:24:50,1.25699,1.24964,1.24981,-0.00726,-0.58,1.26256,1.24884,1.249725,1584102290


In [34]:
mean_price_each_second=deals_with_price[['datetime','BID','OFFER']].groupby('datetime').mean().reset_index()
mean_price_each_second['datetime_int']=mean_price_each_second['datetime'].apply(datetime_to_int)
mean_price_each_second['MID']=(mean_price_each_second['BID']+mean_price_each_second['OFFER'])/2
mean_price_each_second

Unnamed: 0,datetime,BID,OFFER,datetime_int,MID
0,20-03-13 11:47:24,1.251760,1.251930,1584100044,1.251845
1,20-03-13 11:47:25,1.251803,1.251987,1584100045,1.251895
2,20-03-13 11:47:26,1.251812,1.252033,1584100046,1.251923
3,20-03-13 11:47:27,1.251766,1.251976,1584100047,1.251871
4,20-03-13 11:47:28,1.251683,1.251903,1584100048,1.251793
...,...,...,...,...,...
1824,20-03-13 12:24:46,1.249650,1.249828,1584102286,1.249739
1825,20-03-13 12:24:47,1.249620,1.249800,1584102287,1.249710
1826,20-03-13 12:24:48,1.249616,1.249786,1584102288,1.249701
1827,20-03-13 12:24:49,1.249602,1.249777,1584102289,1.249690


In [35]:
import numpy as np
timepoints=np.concatenate(
    (np.arange(-1,-60,-1),
    np.arange(-60,-600,-60))
)
timepoints

array([  -1,   -2,   -3,   -4,   -5,   -6,   -7,   -8,   -9,  -10,  -11,
        -12,  -13,  -14,  -15,  -16,  -17,  -18,  -19,  -20,  -21,  -22,
        -23,  -24,  -25,  -26,  -27,  -28,  -29,  -30,  -31,  -32,  -33,
        -34,  -35,  -36,  -37,  -38,  -39,  -40,  -41,  -42,  -43,  -44,
        -45,  -46,  -47,  -48,  -49,  -50,  -51,  -52,  -53,  -54,  -55,
        -56,  -57,  -58,  -59,  -60, -120, -180, -240, -300, -360, -420,
       -480, -540])

In [36]:
mean_price_each_second.query('datetime_int==%s' % 1584100048)['MID'].values[0]

1.251793333333333

In [37]:
training_data=pd.DataFrame(data=None)
# training_data['BID_CHANGE']=deals_with_price['BID']-deals_with_price['MID_OPEN']
training_data['BID_TO_OPEN_PCT']=(deals_with_price['BID']/deals_with_price['MID_OPEN']-1)*100
# training_data['OFFER_CHANGE']=deals_with_price['OFFER']-deals_with_price['MID_OPEN']
training_data['OFFER_TO_OPEN_PCT']=(deals_with_price['OFFER']/deals_with_price['MID_OPEN']-1)*100
# training_data['HIGH_TO_OPEN']=deals_with_price['HIGH']-deals_with_price['MID_OPEN']
training_data['MID_TO_HIGH_PCT']=((deals_with_price['BID']+deals_with_price['OFFER'])/2/deals_with_price['HIGH']-1)*100
# training_data['LOW_TO_OPEN']=deals_with_price['LOW']-deals_with_price['MID_OPEN']
training_data['MID_TO_LOW_PCT']=((deals_with_price['BID']+deals_with_price['OFFER'])/2/deals_with_price['LOW']-1)*100

training_data

Unnamed: 0,BID_TO_OPEN_PCT,OFFER_TO_OPEN_PCT,MID_TO_HIGH_PCT,MID_TO_LOW_PCT
0,-0.416073,-0.402549,-0.848673,0.105956
1,-0.415278,-0.401753,-0.847880,0.106756
2,-0.412891,-0.396980,-0.844316,0.110354
3,-0.409709,-0.395389,-0.841940,0.112753
4,-0.411300,-0.395389,-0.842732,0.111953
...,...,...,...,...
7166,-0.584730,-0.571206,-1.016585,0.070866
7167,-0.585526,-0.572001,-1.017377,0.070065
7168,-0.584730,-0.571206,-1.016585,0.070866
7169,-0.584730,-0.571206,-1.016585,0.070866


In [38]:
def get_mid_at_timepoint(row,timepoint):
    _timestamp=row['datetime_int']+timepoint
    mid=row['MID']
    values=mean_price_each_second.query('datetime_int==%s' % _timestamp)['MID'].values
    if len(values)>0:
        mid_old=values[0]
        return (mid/mid_old-1)*100
    else:
        return

for t in timepoints:    
    training_data['CHANGE_PCT'+str(t)]=deals_with_price.apply(lambda x:get_mid_at_timepoint(x,t),axis=1)


In [39]:
# training_data.drop('profit',axis=1,inplace=True)

In [40]:
training_data[1000:1100]

Unnamed: 0,BID_TO_OPEN_PCT,OFFER_TO_OPEN_PCT,MID_TO_HIGH_PCT,MID_TO_LOW_PCT,CHANGE_PCT-1,CHANGE_PCT-2,CHANGE_PCT-3,CHANGE_PCT-4,CHANGE_PCT-5,CHANGE_PCT-6,...,CHANGE_PCT-59,CHANGE_PCT-60,CHANGE_PCT-120,CHANGE_PCT-180,CHANGE_PCT-240,CHANGE_PCT-300,CHANGE_PCT-360,CHANGE_PCT-420,CHANGE_PCT-480,CHANGE_PCT-540
1000,-0.499606,-0.485286,-0.931441,0.029590,-0.001199,-0.001199,0.000933,-0.001199,0.000000,-0.000999,...,0.002998,0.003898,0.000266,-0.061123,-0.077091,,,,,
1001,-0.499606,-0.484491,-0.931045,0.029990,-0.000799,-0.000799,0.001332,-0.000799,0.000400,-0.000600,...,0.003398,0.004297,0.000666,-0.060724,-0.076692,,,,,
1002,-0.500402,-0.485286,-0.931837,0.029190,-0.000700,-0.001599,-0.001599,0.000533,-0.001599,-0.000400,...,0.001279,0.002598,0.003598,-0.060724,-0.076692,,,,,
1003,-0.500402,-0.484491,-0.931441,0.029590,-0.000300,-0.001199,-0.001199,0.000933,-0.001199,0.000000,...,0.001679,0.002998,0.003998,-0.060325,-0.076293,,,,,
1004,-0.500402,-0.483695,-0.931045,0.029990,0.000600,0.000100,-0.000799,-0.000799,0.001332,-0.000799,...,0.002199,0.002079,0.002265,-0.060724,-0.075295,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,-0.496424,-0.482900,-0.928669,0.032389,0.000400,0.000533,-0.000160,0.000400,-0.001999,,...,0.003198,0.003598,0.010794,0.006316,-0.065114,-0.084173,,,,
1096,-0.497220,-0.483695,-0.929461,0.031589,-0.001599,-0.000400,-0.000266,-0.000959,-0.000400,-0.002798,...,0.003038,0.002398,0.006929,0.003358,-0.065780,-0.083874,,,,
1097,-0.491651,-0.478126,-0.923916,0.037187,0.005596,0.003997,0.005197,0.005330,0.004637,0.005197,...,0.010394,0.008635,0.013492,0.007675,-0.059123,-0.076819,,,,
1098,-0.496424,-0.482900,-0.928669,0.032389,0.000799,-0.000799,0.000400,0.000533,-0.000160,0.000400,...,0.005597,0.003838,0.008695,0.002878,-0.063916,-0.081612,,,,


In [52]:
training_data.fillna(method='backfill',inplace=True)
training_data.fillna(method='pad',inplace=True)

In [53]:
labels=deals_with_price['deal_type'].apply(lambda x:1 if x=='limit' else 0)
labels

0       0
1       0
2       0
3       0
4       0
       ..
7166    1
7167    1
7168    1
7169    1
7170    1
Name: deal_type, Length: 7171, dtype: int64

In [76]:
count=len(training_data)

import math
training=math.floor(0.7*count)
test=math.floor(0.15*count)
validation=math.floor(0.15*count)
evaluation=math.floor(0.2*count)

In [95]:
training_data_set=training_data[0:training]
training_labels=labels[0:training]
len(training_data_set)

5019

In [93]:
test_data_set=training_data[(training+1):(training+test)]
test_labels=labels[(training+1):(training+test)]
len(test_data_set)

1074

In [78]:
validation_data_set=training_data[(training+test+1):]
validation_labels=labels[(training+test+1):]
len(validation_data_set)

1076

In [101]:
training_data_set.to_csv('../data/training_data.csv',header=None,index=False)
test_data_set.to_csv('../data/test_data.csv',header=None,index=False)
validation_data_set.to_csv('../data/validation_data.csv',header=None,index=False)

In [96]:
training_data_np=training_data_set.to_numpy().astype('float32')
training_labels_np=training_labels.to_numpy().astype('float32')

test_data_np=test_data_set.to_numpy().astype('float32')
test_labels_np=test_labels.to_numpy().astype('float32')

validation_data_np=validation_data_set.to_numpy().astype('float32')
validation_labels_np=validation_labels.to_numpy().astype('float32')

In [86]:
import io
import sagemaker.amazon.common as smac
import boto3
session=boto3.Session(profile_name='ml-lab')

In [97]:
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, training_data_np, training_labels_np)
buf.seek(0)
session.resource('s3').Bucket(BUCKET).Object('training_data.io').upload_fileobj(buf)

In [98]:
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, test_data_np, test_labels_np)
buf.seek(0)
session.resource('s3').Bucket(BUCKET).Object('test_data.io').upload_fileobj(buf)

In [99]:
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, validation_data_np, validation_labels_np)
buf.seek(0)
session.resource('s3').Bucket(BUCKET).Object('validation_data.io').upload_fileobj(buf)

In [58]:
evaluation_data=training_data.copy()
evaluation_data['profit']=labels
evaluation_data_set=evaluation_data[(len(evaluation_data)-evaluation):]

Unnamed: 0,BID_TO_OPEN_PCT,OFFER_TO_OPEN_PCT,MID_TO_HIGH_PCT,MID_TO_LOW_PCT,CHANGE_PCT-1,CHANGE_PCT-2,CHANGE_PCT-3,CHANGE_PCT-4,CHANGE_PCT-5,CHANGE_PCT-6,...,CHANGE_PCT-60,CHANGE_PCT-120,CHANGE_PCT-180,CHANGE_PCT-240,CHANGE_PCT-300,CHANGE_PCT-360,CHANGE_PCT-420,CHANGE_PCT-480,CHANGE_PCT-540,profit
5737,-0.481309,-0.466989,-0.913224,0.051184,0.000320,-0.000400,-0.002158,0.000320,-0.004529,-0.002398,...,-0.005915,-0.052968,-0.058158,-0.122250,-0.122250,-0.098522,-0.096328,-0.065264,-0.051032,0
5738,-0.484491,-0.466989,-0.914808,0.049585,-0.001279,-0.001998,-0.003757,-0.001279,-0.006128,-0.003997,...,-0.007513,-0.054566,-0.059756,-0.123846,-0.123846,-0.100119,-0.097925,-0.066861,-0.052630,0
5739,-0.483695,-0.466989,-0.914412,0.049984,-0.000879,-0.001599,-0.003357,-0.000879,-0.005728,-0.003597,...,-0.007114,-0.054167,-0.059357,-0.123447,-0.123447,-0.099719,-0.097526,-0.066462,-0.052230,0
5740,-0.484491,-0.466193,-0.914412,0.049984,-0.001279,-0.000879,-0.001599,-0.003357,-0.000879,-0.005728,...,-0.008392,-0.054626,-0.060873,-0.121513,-0.122550,-0.101415,-0.103648,-0.063189,-0.050733,0
5741,-0.481309,-0.467784,-0.913620,0.050784,-0.000480,-0.000080,-0.000799,-0.002558,-0.000080,-0.004929,...,-0.007593,-0.053827,-0.060075,-0.120715,-0.121752,-0.100616,-0.102850,-0.062390,-0.049934,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7166,-0.584730,-0.571206,-1.016585,0.070866,0.002801,0.001920,0.001200,-0.001100,-0.006401,-0.008801,...,-0.016198,-0.071604,-0.083725,0.052239,-0.131524,-0.010801,-0.030637,-0.100521,-0.080619,1
7167,-0.585526,-0.572001,-1.017377,0.070065,0.002000,0.001120,0.000400,-0.001900,-0.007201,-0.009601,...,-0.016198,-0.071604,-0.083725,0.052239,-0.131524,-0.010801,-0.031437,-0.101320,-0.081419,1
7168,-0.584730,-0.571206,-1.016585,0.070866,0.002801,0.001920,0.001200,-0.001100,-0.006401,-0.008801,...,-0.016198,-0.071604,-0.083725,0.052239,-0.131524,-0.010801,-0.030637,-0.100521,-0.080619,1
7169,-0.584730,-0.571206,-1.016585,0.070866,0.002801,0.001920,0.001200,-0.001100,-0.006401,-0.008801,...,-0.016198,-0.071604,-0.083725,0.052239,-0.131524,-0.010801,-0.030637,-0.100521,-0.080619,1
