In [1]:
%run ../../utils/commonImports.py
%run ../../utils/tradingImports.py
%matplotlib inline

# Properties

In [2]:
dataRoot = 'D:\\Dropbox\\My work\\krypl-project\\dataLabeled'
featuresRoot = 'D:\\Dropbox\\My work\\krypl-project\\dataFeatures'

fileName = 'return-01-loss-max-01-within-day.tsv'
fileNameDropedNA = 'return-01-loss-max-01-within-day-droped-na.tsv'
featuresFile = f'{featuresRoot}\\{fileName}'
featuresFileDropedNA = f'{featuresRoot}\\{fileNameDropedNA}'

In [3]:
file = f'{dataRoot}\\{fileName}'
data = pd.read_csv(file, sep='\t')
featured = data.copy()

# Return

In [4]:
def lag(arr, l):
    laged = arr[:arr.shape[0]-l]
    laged = np.array([0]*l + laged.tolist())
    return laged

In [5]:
def _return(arr):
    arr0 = arr[:arr.shape[0]-1]
    arr1 = arr[1:]
    r = (arr1 / arr0) - 1
    r = np.insert(r, 0, 0., axis=0)
    return r

In [6]:
featured['return'] = _return(featured['close'].values)
featured['return_1'] = lag(featured['return'].values, 1)
featured['return_2'] = lag(featured['return'].values, 2)

# Z-Score

In [7]:
from scipy import stats
def zscore(x, window):
    r = x.rolling(window=window)
    m = r.mean().shift(1)
    s = r.std(ddof=0).shift(1)
    z = (x-m)/s
    return z

In [8]:
featured['zscore_36_return'] = zscore(featured['return'], 36)
featured['zscore_288_return'] = zscore(featured['return'], 288)

In [9]:
featured['zscore_36_volume'] = zscore(featured['volume'], 36)
featured['zscore_288_volume'] = zscore(featured['volume'], 288)
featured['zscore_2016_volume'] = zscore(featured['volume'], 2016)

# Sliding Mean

In [10]:
def sliding_mean(x, window):
    return x.rolling(center=False,window=window).mean()

In [11]:
sliding_mean(featured['close'], 36)

0               nan
1               nan
2               nan
3               nan
4               nan
5               nan
6               nan
7               nan
8               nan
9               nan
10              nan
11              nan
12              nan
13              nan
14              nan
15              nan
16              nan
17              nan
18              nan
19              nan
20              nan
21              nan
22              nan
23              nan
24              nan
25              nan
26              nan
27              nan
28              nan
29              nan
            ...    
360262   6,496.8732
360263   6,496.6553
360264   6,495.7416
360265   6,494.8440
360266   6,494.4865
360267   6,493.9882
360268   6,493.5159
360269   6,492.6906
360270   6,492.5938
360271   6,492.5554
360272   6,492.4701
360273   6,493.0376
360274   6,493.4741
360275   6,493.6836
360276   6,493.5285
360277   6,493.5506
360278   6,493.5063
360279   6,492.5320
360280   6,491.4848


# Change In Price

$ pma = zscore(\frac{p}{avg(p, w_m)} - 1, w_z) $

In [12]:
def change_in_price(x, mean_window, zscore_window):
    mean_x = sliding_mean(x, mean_window)
    change_x = (x / mean_x) - 1
    zscore_change = zscore(change_x, zscore_window)
    return zscore_change

In [13]:
featured['pma36'] = change_in_price(featured['close'], 36, 288)
featured['pma288'] = change_in_price(featured['close'], 288, 288)
featured['pma2016'] = change_in_price(featured['close'], 2016, 288)

In [14]:
featured['vma36'] = change_in_price(featured['volume'], 36, 288)
featured['vma288'] = change_in_price(featured['volume'], 288, 288)
featured['vma2016'] = change_in_price(featured['volume'], 2016, 288)

# Sliding means change

$ ma = zscore(\frac{avg(p, w_1)}{avg(p, w_2)}-1, w_z) $

In [15]:
def sliding_means_change(x, w1, w2, zscore_window):
    mean1 = sliding_mean(x, w1)
    mean2 = sliding_mean(x, w2)
    change = (mean1 / mean2) - 1
    zscored = zscore(change, zscore_window)
    return zscored

In [16]:
featured['ma_12_108'] = sliding_means_change(featured['close'], 12, 108, 288)
featured['ma_36_288'] = sliding_means_change(featured['close'], 12, 288, 288)

# Price acceleration

$ pavg = \frac{p}{avg(p, w_m)} $

$ ac = zscore(\frac{pavg}{avg(pavg, w_m)}, w_z) $

In [17]:
def price_acceleration(x, w_mean, zscore_window):
    pavg = sliding_mean(x, w_mean)
    pavg_mean = sliding_mean(pavg, w_mean)
    acceleration = pavg / pavg_mean
    zscored = zscore(acceleration, zscore_window)
    return zscored

In [18]:
featured['ac36'] = price_acceleration(featured['close'], 36, 288)
featured['ac288'] = price_acceleration(featured['close'], 288, 288)

# Volatility

In [19]:
def std(x, window):
    return x.rolling(center=False,window=window).std()


def volatility(x, w_std, w_zscore):
    vol = std(x, w_std)
    return zscore(vol, w_zscore)

In [20]:
featured['vol36'] = volatility(featured['return'], 36, 288)
featured['vol288'] = volatility(featured['return'], 288, 288)
featured['vol2016'] = volatility(featured['return'], 2016, 288)

# Change in Volatility

$ sd = std(r, w_s)

$ dv =  zscore(\frac{std}{avg(std, w_a)}, w_z) $

In [21]:
def volatility_change(x, w_std, w_a,  w_zscore):
    sd = std(x, w_std)
    sd_mean = sliding_mean(sd, w_a)
    change = sd / sd_mean
    return zscore(change, w_zscore)

In [22]:
featured['dv_36_288'] = volatility_change(featured['return'], 36, 288, 288)
featured['dv_288_2016'] = volatility_change(featured['return'], 288, 2016, 288)

In [23]:
featured.head()

Unnamed: 0,timestamp,period,open,high,low,close,volume,quoteVolume,weightedAverage,date,...,vma2016,ma_12_108,ma_36_288,ac36,ac288,vol36,vol288,vol2016,dv_36_288,dv_288_2016
0,1424372400,5min,225.0,225.0,225.0,225.0,1.0,0.0044,225.0,2015-02-19 20:00:00,...,,,,,,,,,,
1,1424373000,5min,225.0,225.0,225.0,225.0,1.0,0.0044,225.0,2015-02-19 20:10:00,...,,,,,,,,,,
2,1424373300,5min,225.0,225.0,225.0,225.0,0.0,0.0,225.0,2015-02-19 20:15:00,...,,,,,,,,,,
3,1424373600,5min,225.0,225.0,225.0,225.0,0.0,0.0,225.0,2015-02-19 20:20:00,...,,,,,,,,,,
4,1424373900,5min,225.0,225.0,225.0,225.0,0.0,0.0,225.0,2015-02-19 20:25:00,...,,,,,,,,,,


# Save Features

In [24]:
featured.columns

Index(['timestamp', 'period', 'open', 'high', 'low', 'close', 'volume',
       'quoteVolume', 'weightedAverage', 'date', 'maxReturn', 'tillMax',
       'minReturn', 'label', 'return', 'return_1', 'return_2',
       'zscore_36_return', 'zscore_288_return', 'zscore_36_volume',
       'zscore_288_volume', 'zscore_2016_volume', 'pma36', 'pma288', 'pma2016',
       'vma36', 'vma288', 'vma2016', 'ma_12_108', 'ma_36_288', 'ac36', 'ac288',
       'vol36', 'vol288', 'vol2016', 'dv_36_288', 'dv_288_2016'],
      dtype='object')

In [25]:
write_tsv(fillna(featured, 0), featuresFile)
write_tsv(dropna(featured), featuresFileDropedNA)

In [26]:
featured.shape

(360292, 37)

In [27]:
dropna(featured).shape

(323391, 37)

In [28]:
dropna(featured).describe()

Unnamed: 0,timestamp,open,high,low,close,volume,quoteVolume,weightedAverage,maxReturn,tillMax,...,vma2016,ma_12_108,ma_36_288,ac36,ac288,vol36,vol288,vol2016,dv_36_288,dv_288_2016
count,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0,...,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0,323391.0
mean,1483618664.2578,3120.7577,3127.6637,3113.2482,3120.5208,65340.2579,13.5572,3120.4525,0.0031,136.7691,...,0.0435,-0.0134,-0.0182,-0.0092,0.0075,0.0762,-0.0781,-0.0264,0.1224,-0.023
std,28654744.5802,3933.3445,3944.369,3921.2168,3932.9298,183985.4619,33.2333,3932.827,0.0256,116.4608,...,2.0713,1.2758,1.4347,1.2414,1.6941,1.2879,1.6602,1.7326,1.2875,1.651
min,1425447000.0,178.7193,178.7193,174.5701,178.7193,0.0,0.0,0.0,-1.0,-1.0,...,-1.5447,-13.5045,-12.029,-33.4562,-5.363,-6.0729,-51.6309,-49.843,-5.9688,-14.9462
25%,1459698150.0,437.0674,437.5,437.0,437.1639,0.7008,0.0014,437.0783,0.0,23.0,...,-0.4014,-0.7615,-1.0272,-0.6986,-1.5208,-0.7781,-1.3781,-1.4281,-0.74,-1.3388
50%,1483952400.0,904.914,905.9,903.5,904.7731,1983.1365,1.4232,904.7568,0.011,97.0,...,-0.2176,-0.0153,-0.022,-0.002,0.018,-0.1641,-0.1841,-0.1074,-0.1288,-0.0767
75%,1508206650.0,5621.0972,5632.9403,5610.0,5621.0012,52868.2268,13.1682,5621.0327,0.0133,286.0,...,-0.0514,0.7421,1.0039,0.687,1.536,0.734,1.2481,1.4045,0.7779,1.2872
max,1537207200.0,19896.6873,19903.4402,19860.12,19896.6873,8905611.9065,1244.438,19888.01,0.2598,286.0,...,385.3693,50.0138,9.3768,50.3711,4.9823,69.142,37.4764,34.4536,92.977,37.2431


In [29]:
dropna(featured['vma2016']).describe()

count   357,400.0000
mean          1.0671
std         277.6016
min          -1.5447
25%          -0.3780
50%          -0.1822
75%          -0.0627
max     111,739.8308
Name: vma2016, dtype: float64