In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# One-hour returns (alternative configuration, currently commented out)
#kTPERIOD_RETURNS = 12
#       The HK auction ends at 01:30 UTC, but we have to make sure we don't
#       carry over from the previous day
#kSTART_TIME = '02:30:00'
#kEND_TIME = '08:00:00'

In [8]:
# Five-minute returns: every return spans a single row of the price series.
kTPERIOD_RETURNS = 1
# The HK auction ends at 01:30 UTC; the window opens just after it so that
# the retained returns do not carry over from the previous day.
kSTART_TIME = '01:35:00'
kEND_TIME = '08:00:00'

# Prepare Data
Timeline alignment (done), returns for 51 names, gap filling, and filtering out overnight / auction time buckets.

In [9]:
# Read the raw file and index it by its first column (named "Unnamed: 0" by
# read_csv), then promote that index to a proper DatetimeIndex so that
# between_time works and the time window can be changed flexibly later on.
data_origin = (
    pd.read_csv("close_hk.csv")
    .set_index("Unnamed: 0")
    .pipe(lambda frame: frame.set_index(pd.DatetimeIndex(frame.index)))
)

In [10]:
# Work from close prices only.
# Gaps are forward-filled (last observation carried forward) on the PRICES,
# before any returns are computed - returns themselves are never filled.
data_close = data_origin.filter(regex="close").ffill()

# Column labels have two '-'-separated parts; keep only the second (the ticker).
data_close.columns = [
    ticker
    for _field, ticker in (label.split('-') for label in data_close.columns)
]

In [11]:
# Period returns built by hand, right-aligned: the value stamped on a row
# compares that row's close with the close kTPERIOD_RETURNS rows earlier.
# TODO: replace the row-count shift with pandas' datetime-aware period tools.
data_closemt = data_close.shift(periods=kTPERIOD_RETURNS)
data_rtns = data_close.div(data_closemt).sub(1)

In [12]:
# Sanity check by eye: close, lagged close and return side by side for one name.
data_inspect_0001HK = pd.concat(
    [frame['0001.HK'] for frame in (data_close, data_closemt, data_rtns)],
    axis=1,
    keys=['close', 'close-minus-t', 'rtn'],
)
data_inspect_0001HK

Unnamed: 0_level_0,close,close-minus-t,rtn
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-04-23 01:15:00,92.75,,
2018-04-23 01:25:00,92.75,92.75,0.000000
2018-04-23 01:30:00,92.60,92.75,-0.001617
2018-04-23 01:35:00,92.80,92.60,0.002160
2018-04-23 01:40:00,93.05,92.80,0.002694
2018-04-23 01:45:00,92.85,93.05,-0.002149
2018-04-23 01:50:00,92.80,92.85,-0.000539
2018-04-23 01:55:00,92.65,92.80,-0.001616
2018-04-23 02:00:00,92.75,92.65,0.001079
2018-04-23 02:05:00,92.55,92.75,-0.002156


In [13]:
# Filter out overnight / auction time buckets: keep only rows whose time of
# day lies in the half-open window [kSTART_TIME, kEND_TIME).
#
# The include_start / include_end keywords of between_time were deprecated in
# pandas 1.4 and removed in 2.0 (superseded by `inclusive`), so the window is
# built in a way that works on either side of that change: between_time keeps
# both end points by default, and rows stamped exactly at kEND_TIME are then
# dropped to make the right edge exclusive.
data_rtns_intraday = (
    data_rtns
    .between_time(start_time=kSTART_TIME, end_time=kEND_TIME)
    .loc[lambda frame: frame.index.time != pd.Timestamp(kEND_TIME).time()]
)

In [14]:
# Remove any remaining nulls: drop every timestamp (row) at which at least
# one name has a missing return. Re-running this cell is harmless, since a
# frame without nulls passes through unchanged.
data_rtns_intraday = data_rtns_intraday.dropna(axis=0, how="any")

# Split Data
Roughly 50% / 25% / 25% (train / validation / test) without overlap; then check normality.

In [15]:
# Splits are aligned to whole calendar days - a convenience, not a requirement.
# Days before the midpoint go to train, days from the midpoint up to the
# three-quarter mark go to validation, and the remainder goes to test.
_row_dates = data_rtns_intraday.index.date  # calendar date of every row, computed once
unique_dates = np.unique(_row_dates)
valid_start_date = unique_dates[len(unique_dates) // 2]
test_start_date = unique_dates[(len(unique_dates) * 3) // 4]

# Boolean row masks, one per split; the date boundaries make them disjoint.
idx_train = _row_dates < valid_start_date
idx_valid = (_row_dates >= valid_start_date) & (_row_dates < test_start_date)
idx_test = _row_dates >= test_start_date

# Inspection: per-mask True/False counts.
pd.DataFrame({'train':idx_train,'idx_valid':idx_valid,'idx_test':idx_test}).describe()

Unnamed: 0,train,idx_valid,idx_test
count,5478,5478,5478
unique,2,2,2
top,False,False,False
freq,2772,4092,4092


In [16]:
# Materialise the three splits by applying each boolean row mask positionally.
data_train, data_valid, data_test = (
    data_rtns_intraday.iloc[row_mask]
    for row_mask in (idx_train, idx_valid, idx_test)
)

In [17]:
def _print_normal_columns(title, frame, p_threshold=0.01):
    """Print the Jarque-Bera result for each column not rejected as normal.

    Prints `title`, then runs scipy.stats.jarque_bera on every column of
    `frame` separately and prints "<column>,<result>" for each column whose
    p-value is greater than `p_threshold`.

    Parameters
    ----------
    title : str
        Header line printed before the per-column results.
    frame : pandas.DataFrame
        One column per name; each column is tested on its own.
    p_threshold : float, default 0.01
        Columns with a p-value above this level are reported.
    """
    print(title)
    for c in frame.columns:
        # Test this column only. The train loop previously passed the whole
        # frame, so every iteration tested all names pooled together.
        _stats = stats.jarque_bera(frame[c])
        # _stats[1] is the p-value; it is never negative, so no abs() is needed.
        if _stats[1] > p_threshold:
            print("%s,%s" % (c, _stats))


_print_normal_columns("Train Set Normality Test", data_train)
_print_normal_columns("Valid Set Normality Test", data_valid)
_print_normal_columns("Test  Set Normality Test", data_test)
        

Train Set Normality Test
Valid Set Normality Test
Test  Set Normality Test
3988.HK,(6.738388767369635, 0.03441735335912799)
