In [11]:
import numpy as np
import pandas as pd
from scipy import stats

In [None]:
# Look-back, in rows, used for the period returns: it is the number of periods
# passed to DataFrame.shift when the lagged close prices are built further down.
# NOTE(review): the inspected sample shows mostly 5-minute buckets, so 12 rows
# would be roughly one hour — confirm the bar size of close_hk.csv.
kTPERIOD_RETURNS = 12

# Prepare Data
Timeline alignment (done), returns for 51 names, gap filling, and filtering out overnight / auction time buckets.

In [2]:
# Read the raw table and index it by its first CSV column, which pandas
# labels "Unnamed: 0" because the header cell is empty.
data_origin = pd.read_csv("close_hk.csv").set_index("Unnamed: 0")

# Turn the index into a real DatetimeIndex so that time-of-day selection
# (between_time and friends) works and time windows can be changed later.
data_origin.index = pd.DatetimeIndex(data_origin.index)

In [3]:
# Work on close prices only.
# Gaps are forward-filled (last observation carried forward) on the PRICES,
# i.e. before any return is computed, so returns are never filled themselves.
data_close = (
    data_origin
    .filter(regex="close")
    .ffill()
)

# Column labels are "<prefix>-<ticker>"; keep just the ticker part.
# The two-element unpacking deliberately fails on any label that does not
# split into exactly two pieces.
data_close.columns = [
    ticker
    for _prefix, ticker in (label.split('-') for label in data_close.columns)
]

In [5]:
# Hand-rolled, right-aligned period returns: each row is compared with the
# close kTPERIOD_RETURNS rows earlier, so the first kTPERIOD_RETURNS rows are NaN.
# todo: rather than manual want to use pandas period functionality w/ datetime
data_closemt = data_close.shift(periods=kTPERIOD_RETURNS)
data_rtns = data_close.div(data_closemt).sub(1)

In [6]:
# Sanity check by eye for a single name: current close, lagged close and the
# resulting return side by side.
data_inspect_0001HK = pd.concat(
    [frame['0001.HK'] for frame in (data_close, data_closemt, data_rtns)],
    axis=1,
    keys=['close', 'close-minus-t', 'rtn'],
)
data_inspect_0001HK

Unnamed: 0_level_0,close,close-minus-t,rtn
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-04-23 01:15:00,92.75,,
2018-04-23 01:25:00,92.75,,
2018-04-23 01:30:00,92.60,,
2018-04-23 01:35:00,92.80,,
2018-04-23 01:40:00,93.05,,
2018-04-23 01:45:00,92.85,,
2018-04-23 01:50:00,92.80,,
2018-04-23 01:55:00,92.65,,
2018-04-23 02:00:00,92.75,,
2018-04-23 02:05:00,92.55,,


In [7]:
# Filter out overnight / auction time buckets: keep rows whose time of day is
# in [02:30:00, 08:00:00) — start inclusive, end exclusive.
# todo: with next commit move to time period based return calculation
#       so that we dont have to manually compensate for the opening gap
#       HK auction ends at 01:30 UTC but have to make sure we dont
#       carry over from the previous day
#
# DataFrame.between_time lost its include_start / include_end keywords in
# pandas 2.0 (replaced by `inclusive=`), which itself is unavailable on older
# releases. DatetimeIndex.indexer_between_time accepts the two flags on both
# old and new pandas and is what between_time uses internally, so the row
# positions are taken from it directly. Requires data_rtns to have a
# DatetimeIndex, exactly as between_time does.
data_rtns_intraday = data_rtns.iloc[
    data_rtns.index.indexer_between_time(start_time='02:30:00',
                                         end_time='08:00:00',
                                         include_start=True,
                                         include_end=False)
]

In [8]:
# Remove any remaining nulls: a row is dropped as soon as at least one name
# has a missing return in it.
data_rtns_intraday = data_rtns_intraday.dropna(axis=0, how='any')

# Split Data
Split the rows in order into 50% train / 25% validation / 25% test without overlap, then check each name for normality with the Jarque-Bera test.

In [13]:
# Positional split in row order: first half -> train, next quarter -> valid,
# last quarter -> test. Boundaries are floored row counts, so the three
# slices are contiguous and never share a row.
data_train, data_valid, data_test = (
    data_rtns_intraday.iloc[lo:hi]
    for lo, hi in (
        (None, len(data_rtns_intraday) // 2),
        (len(data_rtns_intraday) // 2, len(data_rtns_intraday) * 3 // 4),
        (len(data_rtns_intraday) * 3 // 4, None),
    )
)

In [14]:
def _print_normality_report(title, frame, alpha=0.01):
    """Print the columns of `frame` for which normality is NOT rejected.

    Runs the Jarque-Bera test on every column separately. The test's null
    hypothesis is that the sample is normally distributed, so a p-value
    above `alpha` means normality could not be rejected for that column.

    Parameters
    ----------
    title : str
        Header line printed before the per-column results.
    frame : pandas.DataFrame
        One column of returns per name.
    alpha : float, default 0.01
        Significance level; columns with p-value > alpha are printed.
    """
    print(title)
    for c in frame.columns:
        # Test ONE column at a time. The original train loop passed the whole
        # frame, which pools every name into a single sample and repeats the
        # same statistic for each column.
        _stats = stats.jarque_bera(frame[c])
        # _stats[1] is the p-value. The original wrapped the comparison in
        # np.abs(...), i.e. took the absolute value of a boolean.
        if _stats[1] > alpha:
            print("%s,%s" % (c, _stats))


_print_normality_report("Train Set Normality Test", data_train)
_print_normality_report("Valid Set Normality Test", data_valid)
_print_normality_report("Test  Set Normality Test", data_test)
        
# for reference compare this output to the original
# Train Set Normality Test
#Valid Set Normality Test
#close-0001.HK,(2.1481750498212047, 0.34160932312809555)
#close-0857.HK,(4.637475587038744, 0.09839770547696114)
#close-1928.HK,(6.163180176298237, 0.04588623545016124)
#Test  Set Normality Test
#close-0011.HK,(1.5136320646723238, 0.46915783482815754)
#close-0083.HK,(6.249342009393319, 0.04395139104629442)
#close-0386.HK,(0.07762786725849848, 0.9619296752596558)
#close-0883.HK,(0.03248964530800116, 0.9838864128838846)
#close-1928.HK,(1.6299724477465558, 0.44264542525881967)
#close-2628.HK,(6.8263428358432705, 0.03293657890775126)
#close-3328.HK,(0.210986153770102, 0.8998807028027639)
#close-3988.HK,(0.39744489306304304, 0.8197773938187253)

Train Set Normality Test
Valid Set Normality Test
0001.HK,(6.064224749785349, 0.048213685104578796)
0857.HK,(8.452645800870712, 0.014605999639945466)
Test  Set Normality Test
0011.HK,(3.2554994473743633, 0.19637096624740646)
0083.HK,(6.814479085838839, 0.03313253519634285)
0386.HK,(1.5497081805806185, 0.4607710070582528)
1928.HK,(4.337978009576364, 0.11429310830561656)
3328.HK,(1.1835132218824163, 0.5533544020994401)
3988.HK,(0.18692085297707353, 0.9107740592407177)
