In [1]:
%run ../utils/commonImports.py
%run ../utils/tradingImports.py
%matplotlib inline

# Properties

In [2]:
dataRoot = 'D:\\Dropbox\\My work\\krypl-project\\dataLabeled'
featuresRoot = 'D:\\Dropbox\\My work\\krypl-project\\dataFeatures'

In [3]:
fileName = 'return-01-within-day.tsv'
file = '{dataRoot}\\{fileName}'.format(dataRoot=dataRoot, fileName=fileName)
featuresFile = '{dataRoot}\\{fileName}'.format(dataRoot=featuresRoot, fileName=fileName)
data = pd.read_csv(file, sep='\t')

In [4]:
featured = data.copy()

# Return

In [5]:
def lag(arr, l):
    laged = arr[:arr.shape[0]-l]
    laged = np.array([0]*l + laged.tolist())
    return laged

In [6]:
def _return(arr):
    arr0 = arr[:arr.shape[0]-1]
    arr1 = arr[1:]
    r = (arr1 / arr0) - 1
    r = np.insert(r, 0, 0., axis=0)
    return r

In [7]:
featured['return'] = _return(featured['close'].values)
featured['return_1'] = lag(featured['return'].values, 1)
featured['return_2'] = lag(featured['return'].values, 2)

# Z-Score

In [8]:
from scipy import stats
def zscore(x, window):
    r = x.rolling(window=window)
    m = r.mean().shift(1)
    s = r.std(ddof=0).shift(1)
    z = (x-m)/s
    return z

In [9]:
featured['zscore_36_return'] = zscore(featured['return'], 36)
featured['zscore_288_return'] = zscore(featured['return'], 288)

In [10]:
featured['zscore_36_volume'] = zscore(featured['volume'], 36)
featured['zscore_288_volume'] = zscore(featured['volume'], 288)
featured['zscore_2016_volume'] = zscore(featured['volume'], 2016)

# Sliding Mean

In [11]:
def sliding_mean(x, window):
    return x.rolling(center=False,window=window).mean()

In [12]:
sliding_mean(featured['close'], 36)

0               nan
1               nan
2               nan
3               nan
4               nan
5               nan
6               nan
7               nan
8               nan
9               nan
10              nan
11              nan
12              nan
13              nan
14              nan
15              nan
16              nan
17              nan
18              nan
19              nan
20              nan
21              nan
22              nan
23              nan
24              nan
25              nan
26              nan
27              nan
28              nan
29              nan
            ...    
160925   6,256.2874
160926   6,256.4497
160927   6,256.7438
160928   6,257.3022
160929   6,257.8977
160930   6,258.4507
160931   6,258.9789
160932   6,259.5748
160933   6,260.1974
160934   6,261.0131
160935   6,261.6251
160936   6,263.5598
160937   6,265.2763
160938   6,265.9830
160939   6,266.3229
160940   6,266.5720
160941   6,266.4050
160942   6,266.0520
160943   6,265.7479


# Change In Price

$ pma = zscore(\frac{p}{avg(p, w_m)} - 1, w_z) $

In [13]:
def change_in_price(x, mean_window, zscore_window):
    mean_x = sliding_mean(x, mean_window)
    change_x = (x / mean_x) - 1
    zscore_change = zscore(change_x, zscore_window)
    return zscore_change

In [14]:
featured['pma36'] = change_in_price(featured['close'], 36, 288)
featured['pma288'] = change_in_price(featured['close'], 288, 288)
featured['pma2016'] = change_in_price(featured['close'], 2016, 288)

In [15]:
featured['vma36'] = change_in_price(featured['volume'], 36, 288)
featured['vma288'] = change_in_price(featured['volume'], 288, 288)
featured['vma2016'] = change_in_price(featured['volume'], 2016, 288)

# Sliding means change

$ ma = zscore(\frac{avg(p, w_1)}{avg(p, w_2)}-1, w_z) $

In [16]:
def sliding_means_change(x, w1, w2, zscore_window):
    mean1 = sliding_mean(x, w1)
    mean2 = sliding_mean(x, w2)
    change = (mean1 / mean2) - 1
    zscored = zscore(change, zscore_window)
    return zscored

In [17]:
featured['ma_12_108'] = sliding_means_change(featured['close'], 12, 108, 288)
featured['ma_36_288'] = sliding_means_change(featured['close'], 12, 108, 288)

# Price acceleration

$ pavg = \frac{p}{avg(p, w_m)} $

$ ac = zscore(\frac{pavg}{avg(pavg, w_m)}, w_z) $

In [18]:
def price_acceleration(x, w_mean, zscore_window):
    pavg = sliding_mean(x, w_mean)
    pavg_mean = sliding_mean(pavg, w_mean)
    acceleration = pavg / pavg_mean
    zscored = zscore(acceleration, zscore_window)
    return zscored

In [19]:
featured['ac36'] = price_acceleration(featured['close'], 36, 288)
featured['ac288'] = price_acceleration(featured['close'], 288, 288)

# Volatility

In [20]:
def std(x, window):
    return x.rolling(center=False,window=window).std()


def volatility(x, w_std, w_zscore):
    vol = std(x, w_std)
    return zscore(vol, w_zscore)

In [21]:
featured['vol36'] = volatility(featured['return'], 36, 288)
featured['vol288'] = volatility(featured['return'], 288, 288)
featured['vol2016'] = volatility(featured['return'], 2016, 288)

# Change in Volatility

$ sd = std(r, w_s)

$ dv =  zscore(\frac{std}{avg(std, w_a)}, w_z) $

In [22]:
def volatility_change(x, w_std, w_a,  w_zscore):
    sd = std(x, w_std)
    sd_mean = sliding_mean(sd, w_a)
    change = sd / sd_mean
    return zscore(change, w_zscore)

In [23]:
featured['dv_36_288'] = volatility_change(featured['return'], 36, 288, 288)
featured['dv_36_288'] = volatility_change(featured['return'], 288, 2016, 288)

In [24]:
featured.head()

Unnamed: 0,timestamp,period,open,high,low,close,volume,quoteVolume,weightedAverage,date,...,vma288,vma2016,ma_12_108,ma_36_288,ac36,ac288,vol36,vol288,vol2016,dv_36_288
0,1483225200,5min,964.0,964.0,964.0,964.0,3.7374,0.0039,964.0,2017-01-01 00:00:00,...,,,,,,,,,,
1,1483225500,5min,962.86,962.86,962.86,962.86,74.4955,0.0774,962.86,2017-01-01 00:05:00,...,,,,,,,,,,
2,1483225800,5min,964.0,964.0,962.86,962.86,273.307,0.2837,963.2768,2017-01-01 00:10:00,...,,,,,,,,,,
3,1483226100,5min,964.0,964.0,964.0,964.0,69.1332,0.0717,964.0,2017-01-01 00:15:00,...,,,,,,,,,,
4,1483226400,5min,964.0,965.0,962.86,965.0,3050.495,3.1637,964.2074,2017-01-01 00:20:00,...,,,,,,,,,,


# Save Features

In [25]:
featured.columns

Index(['timestamp', 'period', 'open', 'high', 'low', 'close', 'volume',
       'quoteVolume', 'weightedAverage', 'date', 'maxReturn', 'tillMax',
       'minReturn', 'label', 'return', 'return_1', 'return_2',
       'zscore_36_return', 'zscore_288_return', 'zscore_36_volume',
       'zscore_288_volume', 'zscore_2016_volume', 'pma36', 'pma288', 'pma2016',
       'vma36', 'vma288', 'vma2016', 'ma_12_108', 'ma_36_288', 'ac36', 'ac288',
       'vol36', 'vol288', 'vol2016', 'dv_36_288'],
      dtype='object')

In [28]:
write_cols = ['date', 'label', 'close', 'return', 'return_1', 'return_2', 'zscore_36_return',
       'zscore_288_return', 'pma36', 'ma_12_108', 'ma_36_288', 'ac36', 'ac288',
       'zscore_36_volume', 'zscore_288_volume', 'zscore_2016_volume', 'vol36',
       'vol288', 'vol2016', 'dv_36_288']

In [29]:
write_tsv(featured[write_cols].dropna(), featuresFile)

In [26]:
featured.shape

(160955, 36)

In [27]:
featured.dropna().shape

(158365, 36)

# Explore Features

In [8]:
featured.columns

Index(['close', 'timestamp', 'high', 'low', 'open', 'quoteVolume', 'volume',
       'weightedAverage', 'date', 'entryReturn_001', 'entryReturn_003',
       'entryReturn_006', 'entryReturn_01', 'entryReturn_03', 'entryReturn_05',
       'entryReturn_07', 'entryReturn_1', 'entryReturn_12', 'entryReturn_15',
       'entryReturn_18', 'entryReturn_2', 'entryReturn_22', 'entryReturn_25',
       'entryReturn_28', 'entryReturn_3', 'labelSlope_close_12', 'returnOpen',
       'returnClose', 'returnHigh', 'returnLow', 'labelSlope_returnClose_12',
       'return', 'return_1', 'return_2'],
      dtype='object')