Download data from [kaggle](https://www.kaggle.com/shivinder/googlestockpricing/data).

In [1]:
import pandas as pd

df = pd.read_csv("C:/Users/staraustin/PycharmProjects/finance_ml/examples/data/TSLA1minETH.csv")
df.index = pd.DatetimeIndex(df['Date'].values)
close = df["Close"]

In [2]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
2020-09-23 11:00:00,20200923 110000,385.62,386.7,385.22,386.54,173996
2020-09-23 11:01:00,20200923 110100,386.54,387.89,386.11,387.54,188062
2020-09-23 11:02:00,20200923 110200,387.61,387.95,387.08,387.71,157564
2020-09-23 11:03:00,20200923 110300,387.67,388.39,387.2,388.37,143958
2020-09-23 11:04:00,20200923 110400,388.33,388.34,387.18,387.19,168513


In [3]:
import numpy as np
import pandas as pd

def get_daily_vol(close, span=100):
    use_idx = close.index.searchsorted(close.index - pd.Timedelta(days=1))
    use_idx = use_idx[use_idx > 0]
    # Get rid of duplications in index
    use_idx = np.unique(use_idx)
    prev_idx = pd.Series(close.index[use_idx - 1], index=close.index[use_idx])
    ret = close.loc[prev_idx.index] / close.loc[prev_idx.values].values - 1
    vol = ret.ewm(span=span).std()
    return vol

In [4]:
vol = get_daily_vol(df["Close"])
vol.head()

2020-09-23 11:01:00         NaN
2020-09-23 11:02:00    0.001519
2020-09-23 11:03:00    0.001075
2020-09-23 11:04:00    0.002486
2020-09-23 11:05:00    0.002846
Name: Close, dtype: float64

# 3.1

In [5]:
import numbers


def cusum_filter(close, h):
    # asssum that E y_t = y_{t-1}
    t_events = []
    s_pos, s_neg = 0, 0
    ret = close.pct_change().dropna()
    diff = ret.diff().dropna()
    # time variant threshold
    if isinstance(h, numbers.Number):
        h = pd.Series(h, index=diff.index)
    h = h.reindex(diff.index, method='bfill')
    h = h.dropna()
    for t in h.index:
        s_pos = max(0, s_pos + diff.loc[t])
        s_neg = min(0, s_neg + diff.loc[t])
        if s_pos > h.loc[t]:
            s_pos = 0
            t_events.append(t)
        elif s_neg < -h.loc[t]:
            s_neg = 0
            t_events.append(t)
    return pd.DatetimeIndex(t_events)

In [6]:
cusum_filter(df["Close"], 0.1)

DatetimeIndex([], dtype='datetime64[ns]', freq=None)

In [7]:
vol = get_daily_vol(close)
sampled_idx = cusum_filter(close, vol)
sampled_idx

DatetimeIndex(['2020-09-23 11:02:00', '2020-09-23 11:03:00',
               '2020-09-23 11:04:00', '2020-09-23 11:06:00',
               '2020-09-23 11:07:00', '2020-09-23 11:09:00',
               '2020-09-23 11:12:00', '2020-09-23 11:14:00',
               '2020-09-23 11:16:00', '2020-09-23 11:17:00',
               ...
               '2020-11-12 17:19:00', '2020-11-12 17:46:00',
               '2020-11-12 18:03:00', '2020-11-12 18:46:00',
               '2020-11-12 18:47:00', '2020-11-12 18:50:00',
               '2020-11-12 18:51:00', '2020-11-12 18:52:00',
               '2020-11-12 18:54:00', '2020-11-12 18:55:00'],
              dtype='datetime64[ns]', length=12084, freq=None)

In [8]:
sampled_idx.shape

(12084,)

In [9]:
df.shape

(30452, 6)

In [10]:
def get_t1(close, t_events, num_days):
    t1 = close.index.searchsorted(t_events + pd.Timedelta(days=num_days))
    t1 = t1[t1 < close.shape[0]]
    t1 = pd.Series(close.index[t1], index=t_events[:t1.shape[0]])
    return t1

In [11]:
t1 = get_t1(close, sampled_idx, num_days=1)
print(t1.shape)
t1.head()

(12084,)


2020-09-23 11:02:00   2020-09-24 11:02:00
2020-09-23 11:03:00   2020-09-24 11:03:00
2020-09-23 11:04:00   2020-09-24 11:04:00
2020-09-23 11:06:00   2020-09-24 11:06:00
2020-09-23 11:07:00   2020-09-24 11:07:00
dtype: datetime64[ns]

In [12]:
def apply_ptslt1(close, events, ptsl, molecule):
    """Return datafram about if price touches the boundary"""
    # Sample a subset with specific indices
    _events = events.loc[molecule]
    # Time limit
    
    out = pd.DataFrame(index=_events.index)
    # Set Profit Taking and Stop Loss
    if ptsl[0] > 0:
        pt = ptsl[0] *  _events["trgt"]
    else:
        # Switch off profit taking
        pt = pd.Series(index=_events.index)
    if ptsl[1] > 0:
        sl = -ptsl[1] * _events["trgt"]
    else:
        # Switch off stop loss
        sl = pd.Series(index=_events.index)
    # Replace undifined value with the last time index
    time_limits = _events["t1"].fillna(close.index[-1])
    for loc, t1 in time_limits.iteritems():
        df = close[loc:t1]
        # Change the direction depending on the side
        df = (df / close[loc] - 1) * _events.at[loc, 'side']
        # print(df)
        # print(loc, t1, df[df < sl[loc]].index.min(), df[df > pt[loc]].index.min())
        out.at[loc, 'sl'] = df[df < sl[loc]].index.min()
        out.at[loc, 'pt'] = df[df > pt[loc]].index.min()
    out['t1'] = _events['t1'].copy(deep=True)
    return out


def get_3barriers(close, t_events, ptsl, trgt, min_ret=0, num_threads=1,
                  t1=False, side=None):
    # Get sampled target values
    trgt = trgt.loc[t_events]
    trgt = trgt[trgt > min_ret]
    # Get time boundary t1
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=t_events)
    # Define the side
    if side is None:
        _side = pd.Series(1., index=trgt.index)
        _ptsl = [ptsl, ptsl]
    else:
        _side = side.loc[trgt.index]
        _ptsl = ptsl[:2]
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': _side}, axis=1)
    events = events.dropna(subset=['trgt'])
    time_idx = apply_ptslt1(close, events, _ptsl, events.index)
    # Skip when all of barrier are not touched
    events['t1'] = time_idx.dropna(how='all').min(axis=1)
    events = events.drop('side', axis=1)
    return events

In [13]:
trgt = vol
events = get_3barriers(close, t_events=sampled_idx, trgt=trgt,
                       ptsl=1, t1=t1)
events.head()

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: DatetimeIndex(['2020-09-23 16:05:00', '2020-09-23 16:12:00',\n               '2020-09-23 16:41:00', '2020-09-24 03:14:00',\n               '2020-09-24 03:25:00',\n               ...\n               '2020-11-12 15:33:00', '2020-11-12 15:48:00',\n               '2020-11-12 16:17:00', '2020-11-12 16:20:00',\n               '2020-11-12 16:59:00'],\n              dtype='datetime64[ns]', length=1927, freq=None). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

In [15]:
def get_bins(events, close):
    # Prices algined with events
    events = events.dropna(subset=['t1'])
    px = events.index.union(events['t1'].values).drop_duplicates()
    px = close.reindex(px, method='bfill')
    # Create out object
    out = pd.DataFrame(index=events.index)
    out['ret'] = px.loc[events['t1'].values].values / px.loc[events.index] - 1.
    if 'side' in events:
        out['ret'] *= events['side']
    out['bin'] = np.sign(out['ret'])
    if 'side' in events:
        out.loc[out['ret'] <= 0, 'bin'] = 0
    return out

In [16]:
bins = get_bins(events, close)
bins.head()

Unnamed: 0,ret,bin
2004-08-24,0.010775,1.0
2004-08-25,0.018019,1.0
2004-08-31,-0.020709,-1.0
2004-09-02,-0.014777,-1.0
2004-09-07,0.007088,1.0


# 3.2

In [17]:
def drop_labels(events, min_pct=0.05):
    while True:
        df = events['bin'].value_counts(normalize=True)
        if df.min() > min_pct or df.shape[0] < 3:
            break
        print('dropped label', df.argmin(), df.min())
        events = events[events['bin'] != df.argmin()]
    return events

In [18]:
dropped_bins = drop_labels(bins)
print(bins.shape)
print(dropped_bins.shape)

dropped label 0.0 0.0008216926869350862
(1217, 2)
(1216, 2)


  
  import sys


In [19]:
bins = dropped_bins

# 3.3

In [20]:
def get_3barriers(close, t_events, ptsl, trgt, min_ret=0, num_threads=1,
                  t1=False, side=None):
    # Get sampled target values
    trgt = trgt.loc[t_events]
    trgt = trgt[trgt > min_ret]
    # Get time boundary t1
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=t_events)
    # Define the side
    if side is None:
        _side = pd.Series(1., index=trgt.index)
        _ptsl = [ptsl, ptsl]
    else:
        _side = side.loc[trgt.index]
        _ptsl = ptsl[:2]
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': _side}, axis=1)
    events = events.dropna(subset=['trgt'])
    time_idx = apply_ptslt1(close, events, _ptsl, events.index)
    # Skip when all of barrier are not touched
    time_idx = time_idx.dropna(how='all')
    events['t1_type'] = time_idx.idxmin(axis=1)
    events['t1'] = time_idx.min(axis=1)
    if side is None:
        events = events.drop('side', axis=1)
    return events

def get_bins(events, close):
    # Prices algined with events
    events = events.dropna(subset=['t1'])
    px = events.index.union(events['t1'].values).drop_duplicates()
    px = close.reindex(px, method='bfill')
    # Create out object
    out = pd.DataFrame(index=events.index)
    out['ret'] = px.loc[events['t1'].values].values / px.loc[events.index] - 1.
    if 'side' in events:
        out['ret'] *= events['side']
    out['bin'] = np.sign(out['ret'])
    # 0 when touching vertical line
    out['bin'].loc[events['t1_type'] == 't1'] = 0
    if 'side' in events:
        out.loc[out['ret'] <= 0, 'bin'] = 0
    return out

In [21]:
t1 = get_t1(close, sampled_idx, num_days=1)
events = get_3barriers(close, t_events=sampled_idx, trgt=trgt,
                       ptsl=1, t1=t1)
events.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  after removing the cwd from sys.path.


Unnamed: 0,t1,trgt,t1_type
2004-08-24,2004-08-25,0.036396,t1
2004-08-25,2004-08-26,0.02993,t1
2004-08-31,2004-09-01,0.026605,t1
2004-09-02,2004-09-03,0.024097,t1
2004-09-07,2004-09-08,0.02361,t1


In [22]:
print(events['t1_type'].unique())
print(events['t1_type'].describe())

['t1' 'pt' 'sl']
count     1217
unique       3
top         t1
freq       906
Name: t1_type, dtype: object


In [23]:
bins = get_bins(events, close)
bins.head()

Unnamed: 0,ret,bin
2004-08-24,0.010775,0.0
2004-08-25,0.018019,0.0
2004-08-31,-0.020709,0.0
2004-09-02,-0.014777,0.0
2004-09-07,0.007088,0.0


In [24]:
bins['bin'].value_counts()

 0.0    906
 1.0    180
-1.0    131
Name: bin, dtype: int64

# 3.4

In [70]:
import talib
import numpy as np


def macd_side(close):
    macd, signal, hist = talib.MACD(close.values)
    hist = pd.Series(hist).fillna(1).values
    return pd.Series(2 * ((hist > 0).astype(float) - 0.5), index=close.index[-len(hist):])

In [71]:
import numpy as np

vol = get_daily_vol(close)
sampled_idx = cusum_filter(close, vol)
t1 = get_t1(close, sampled_idx, num_days=1)
side =  macd_side(close)
events = get_3barriers(close, t_events=sampled_idx, trgt=vol,
                       ptsl=[1, 2], t1=t1, side=side)
events.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  after removing the cwd from sys.path.


Unnamed: 0,side,t1,trgt,t1_type
2004-08-24,1.0,2004-08-25,0.036396,t1
2004-08-25,1.0,2004-08-26,0.02993,t1
2004-08-31,1.0,2004-09-01,0.026605,t1
2004-09-02,1.0,2004-09-03,0.024097,t1
2004-09-07,1.0,2004-09-08,0.02361,t1


In [72]:
bins = get_bins(events, close)
bins.head()

Unnamed: 0,ret,bin
2004-08-24,0.010775,0.0
2004-08-25,0.018019,0.0
2004-08-31,-0.020709,0.0
2004-09-02,-0.014777,0.0
2004-09-07,0.007088,0.0


In [27]:
bins['bin'].unique()

array([0., 1.])

In [93]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
x = np.hstack([events['side'].values[:, np.newaxis], close.loc[events.index].values[:, np.newaxis]])
y = bins['bin'].values
clf.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [94]:
clf.predict(x)

array([0., 0., 0., ..., 0., 0., 0.])

In [91]:
x.shape

(1217, 2)

In [76]:
events['side'].values

array([ 1.,  1.,  1., ..., -1., -1.,  1.])

In [37]:
help(talib.MACD)

Help on built-in function MACD in module talib.func:

MACD(...)
    MACD(real[, fastperiod=?, slowperiod=?, signalperiod=?])
    
    Moving Average Convergence/Divergence (Momentum Indicators)
    
    Inputs:
        real: (any ndarray)
    Parameters:
        fastperiod: 12
        slowperiod: 26
        signalperiod: 9
    Outputs:
        macd
        macdsignal
        macdhist



In [42]:
macd, signal, hist = talib.MACD(close.values)

In [45]:
np.max(macd[100:] - signal[100:]  - hist[100:] )

0.0

In [49]:
macd[np.isfinite(macd)].shape

(3092,)

In [51]:
signal = signal[np.isfinite(signal)]

In [55]:
2 * ((signal > 0).astype(float) - 0.5)

array([1., 1., 1., ..., 1., 1., 1.])

In [68]:
macd.fill(1)

In [69]:
macd

array([1., 1., 1., ..., 1., 1., 1.])