In [None]:
import os

from blazingsql import BlazingContext

# Create the BlazingSQL context used by every query in this notebook.
bc = BlazingContext()

# Register the S3-compatible bucket under the prefix 'estee'.
# Credentials are read from the environment so they never live in the
# notebook (or its version history). Export ESTEE_S3_ACCESS_KEY_ID and
# ESTEE_S3_SECRET_KEY before starting the kernel; a missing variable raises
# KeyError here rather than failing later with an opaque authentication error.
# NOTE(review): the key pair that used to be hardcoded in this cell should be
# treated as leaked and rotated.
bc.s3(
    'estee',
    bucket_name='datalake',
    access_key_id=os.environ['ESTEE_S3_ACCESS_KEY_ID'],
    secret_key=os.environ['ESTEE_S3_SECRET_KEY'],
    # The endpoint is not a secret; keep the previous value as the default.
    endpoint_override=os.environ.get('ESTEE_S3_ENDPOINT', "http://130.245.177.209:9000"),
)

# Expose the CSV as the SQL table 'estee'.
bc.create_table('estee', 's3://estee/epapenhausen/TCS 0804 to 0831.csv')

In [None]:
# Pull the whole table and discard the leftover 'Unnamed: 0' column.
raw = bc.sql("select * from estee").drop(['Unnamed: 0'], axis=1)

In [None]:
# Column the analysis is built around.
target = 'MidReturn10Min'

# Attributes collinear with the target; they are excluded before mining.
drop = [
    'MidReturn2Min',
    'MidReturn4Min',
    'MidReturn6Min',
    'MidReturn8Min',
    'Return_10_Min',
    'Return_1_Min',
    'Return_3_Min',
    'Return_20_Min',
    'Return_60_Min',
    'EpochTime',
]

### Multiclass Mining

For pattern mining with a multiclass target variable, the target attribute is expected to map classes to consecutive integers starting at 0 (i.e. 0 for class 1, 1 for class 2, etc.). Multiclass mining uses the same statistical tests as binary pattern mining (i.e. binary mining is performed for each class). Each pattern is specific to a single class (i.e. a pattern specifies a region where the occurrence of a particular class is unusually high). The effect size used is the odds ratio and is specific to each class.

The es_thresh parameter specifies the minimum effect size (as measured by the odds ratio) for a pattern to be considered 'interesting'. The odds ratio quantifies the change in odds of seeing a specific class within the pattern versus outside the pattern (i.e. values > 1 indicate a higher likelihood of seeing the target class in the pattern). The es_thresh parameter can either be a float or a Python dictionary mapping each class id (e.g. 0, 1, or 2) to its corresponding minimum effect size. This allows for specifying different effect size thresholds for different classes.

In [None]:
# Turn the continuous target into a three-class label.
# Values above +t become class 0, values below -t become class 2,
# and everything in between becomes class 1.
t = 35


def _trade_class(x):
    """Return the class id for one target value: 0 = buy, 1 = hold, 2 = sell."""
    if x > t:
        return 0
    if x < -t:
        return 2
    return 1


raw['trade'] = raw[target].applymap(_trade_class)

### Cross sectional analysis

In [None]:
import data_context_map.pattern_miner as pm

# Minimum odds ratio per class id: 0 (buy) -> 2.0, 1 (hold) -> 5.0, 2 (sell) -> 2.0
es_thresh = {
    0: 2.0,
    1: 5.0,
    2: 2.0,
}

# 'trade' was derived from the target, so the target is removed together
# with the attributes listed in `drop` before mining.
col_drop = [*drop, target]
out = pm.DataContextMap(
    raw.to_pandas().drop(columns=col_drop),
    dependent='trade',
    mine_type='multiclass',
    es_thresh=es_thresh,
    max_pattern=100,
)
out.render()

### Time series analysis

#### There are noticeable gaps in EpochTime. I suspect these correspond to different days.

In [None]:
import matplotlib.pyplot as plt

# Wide default figure size; set globally, so it also applies to later figures.
plt.rcParams["figure.figsize"] = (20, 5)

# Scatter the target against EpochTime to expose the gaps in the timeline.
# The columns are converted to pandas before being handed to matplotlib.
fig, ax = plt.subplots()
ax.scatter(raw['EpochTime'].to_pandas(), raw[target].to_pandas())
# Label the figure so it can be read without the surrounding code.
ax.set_xlabel('EpochTime')
ax.set_ylabel(target)
ax.set_title(target + ' over EpochTime')
plt.show()

#### We will assign all points in each section to the same 'Epoch'.

In [None]:
# Half-open [start, stop) EpochTime windows, one per section of the timeline.
# The window at position i (counting from 1) is assigned Epoch id i.
# Rows matching no window keep the default Epoch 0: that covers everything
# before the first window and any row falling in a range no window covers
# (e.g. 1596850000-1597000000 or 1597350000-1597500000).
EPOCH_WINDOWS = [
    (1596550000, 1596650000),  # 1
    (1596650000, 1596750000),  # 2
    (1596750000, 1596850000),  # 3
    (1597000000, 1597100000),  # 4
    (1597100000, 1597150000),  # 5
    (1597150000, 1597250000),  # 6
    (1597250000, 1597350000),  # 7
    (1597500000, 1597700000),  # 8
    (1597700000, 1597800000),  # 9
    (1597800000, 1597850000),  # 10
    (1597850000, 1597950000),  # 11
    (1597950000, 1598200000),  # 12
    (1598200000, 1598350000),  # 13
    (1598350000, 1598500000),  # 14
    (1598500000, 1598700000),  # 15
    (1598700000, 1600000000),  # 16
]

# Start every row at Epoch 0, then overwrite the rows inside each window.
# Resetting the column first keeps the cell idempotent on re-run.
raw['Epoch'] = 0
for epoch_id, (start, stop) in enumerate(EPOCH_WINDOWS, start=1):
    in_window = (raw['EpochTime'] >= start) & (raw['EpochTime'] < stop)
    raw.loc[in_window, 'Epoch'] = epoch_id


In [None]:
import data_context_map.pattern_miner as pm

# Same feature set as the cross-sectional run, but mined over time:
# 'Epoch' is passed as the temporal attribute.
col_drop = [*drop, target]
out = pm.DataContextMap(
    raw.to_pandas().drop(columns=col_drop),
    dependent='trade',
    temporal='Epoch',
    min_stable=1,
    ts_width=1,
    mine_type='multiclass',
    es_thresh={0: 2.0, 1: 5.0, 2: 2.0},
    max_pattern=100,
)
out.render()