In [12]:
%matplotlib inline    
import matplotlib as mplt
from matplotlib import cm
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import matplotlib
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt
# matplotlib.style.use('ggplot')

from scipy.stats import lognorm
from scipy.stats import norm
from scipy.stats import chisqprob

from numpy import prod
import seaborn as sns

# statistical models
import statsmodels as sm
from statsmodels.tsa.stattools import acf  
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.api import VAR, DynamicVAR

from statsmodels.stats import diagnostic

# local packages
from utils_libs import *
from utils_data_prep import *

In [13]:
# --- Load pre-processed order book data ---

# allow_pickle=True is passed explicitly: NumPy >= 1.16.3 refuses to read
# pickled payloads by default, which would make these loads raise ValueError.
# NOTE(review): presumably these .dat files were written with ndarray.dump(),
# like the training sets saved further down in this notebook -- confirm.
# SECURITY: unpickling can execute arbitrary code; only load files you produced yourself.
all_dta_minu = np.load("../dataset/bitcoin/dta_minu.dat", allow_pickle=True)
all_loc_hour = np.load("../dataset/bitcoin/loc_hour.dat", allow_pickle=True)

# Single-argument print(...) yields the same text under Python 2 and Python 3.
print("%d %d" % (len(all_dta_minu), len(all_loc_hour)))

# --- Load order book data files ---

# Alternative path: build both structures from the raw CSV order book files instead.
# all_dta_minu,all_loc_hour = load_raw_order_book_files('../dataset/bitcoin/order_book/*.csv', True)

539157 10641


In [14]:
# --- calculate price, return and volatility ---

# Minute-level price and request series derived from the loaded order book data.
price_minu, req_minu = cal_price_req_minu(all_dta_minu)

# Single-argument print(...) yields the same text under Python 2 and Python 3.
print("%s %s" % (np.shape(price_minu), np.shape(req_minu)))

# Hourly series computed from the minute prices and the hour locations.
# NOTE(review): 'per' presumably selects percentage returns -- confirm in the
# definition of cal_return_volatility_hour.
pvol_hour = cal_price_volatility_hour(all_loc_hour, price_minu)
return_minu, rvol_hour = cal_return_volatility_hour(all_loc_hour, price_minu, 'per')

# Lengths of the four series, in the order: price, return, price vol., return vol.
print("%d %d %d %d" % (len(price_minu), len(return_minu), len(pvol_hour), len(rvol_hour)))

(539157,) (539157, 2)
539157 528516 10641 10641


In [15]:
# --- extract features w.r.t. minutes ---

# One feature vector per minute, produced by orderbook_stat_features(all_dta_minu, i).
#
# An earlier inline implementation (mle_norm_2d on the ask and bid sides, plus
# skewness and the two request counts) was kept here as a disabled string block
# that was evaluated on every iteration; it has been removed together with the
# unused tmp_a / tmp_b locals.
# NOTE(review): the feature list written for that earlier version was
#   ask mean price, mean amount, var price, var amount,
#   bid mean price, mean amount, var price, var amount,
#   ask skew price, skew amount,
#   bid skew price, skew amount,
#   ask request, bid request
# The columns actually returned by orderbook_stat_features are not visible
# here -- confirm against the helper.
features_minu = [orderbook_stat_features(all_dta_minu, i)
                 for i in range(len(all_dta_minu))]

#  shape: [minutes, features]
features_minu = np.reshape(features_minu, (len(features_minu), -1))

# np.reshape already returns an ndarray, so no extra np.asarray is needed.
# Single-argument print(...) yields the same text under Python 2 and Python 3.
print(np.shape(features_minu))

(539157, 10)


In [None]:
# --- parameter set-up for preparing training and testing data ---

# Orders handed to the data-preparation helpers in the cells below.
# NOTE(review): presumably the number of past minutes / hours used as
# model input -- confirm against the helpers.
para_order_minu, para_order_hour = 10, 16

# Split ratio handed to the training_testing_* helpers.
# NOTE(review): presumably the fraction of samples used for training -- confirm.
para_train_split_ratio = 0.8

# Forwarded to prepare_feature_target as its last argument.
bool_feature_selection = True


In [61]:
# --- obtain training and testing data, garch ---

file_postfix = "garch"

# Training / testing split for the garch model.
# The arguments already sit inside parentheses, so no backslash continuation is needed.
vol_train, rt_train, vol_test, rt_test = training_testing_garch(
    rvol_hour, all_loc_hour, para_order_hour, para_train_split_ratio, price_minu)

# Single-argument print(...) yields the same text under Python 2 and Python 3.
print(" ".join(str(np.shape(part)) for part in (vol_train, rt_train, vol_test, rt_test)))
print("%d %d" % (len(rt_train) + len(rt_test), len(vol_train) + len(vol_test)))

# Persist each split as <name>_<file_postfix>.dat; ndarray.dump writes a pickle of the array.
for split_name, split_data in (("voltrain", vol_train), ("rttrain", rt_train),
                               ("voltest", vol_test), ("rttest", rt_test)):
    np.asarray(split_data).dump(
        "../dataset/bitcoin/training_data/" + split_name + "_" + file_postfix + ".dat")

(8515,) (8515,) (2125,) (2125,)
10640 10640


In [18]:
# --- obtain training and testing data, arima, structural time series ---

file_postfix = "stat"

# Training / testing split for the statistical models. Going by the variable names,
# x* is the target series and ex* the accompanying feature matrix -- confirm
# against training_testing_statistic.
# The arguments already sit inside parentheses, so no backslash continuation is needed.
xtrain, extrain, xtest, extest = training_testing_statistic(
    features_minu, rvol_hour, all_loc_hour,
    para_order_minu, para_order_hour, para_train_split_ratio)

# Single-argument print(...) yields the same text under Python 2 and Python 3.
print(" ".join(str(np.shape(part)) for part in (xtrain, extrain, xtest, extest)))

# Persist each split as <name>_<file_postfix>.dat; ndarray.dump writes a pickle of the array.
for split_name, split_data in (("xtrain", xtrain), ("xtest", xtest),
                               ("extrain", extrain), ("extest", extest)):
    np.asarray(split_data).dump(
        "../dataset/bitcoin/training_data/" + split_name + "_" + file_postfix + ".dat")


(8515,) (8515, 140) (2125,) (2125, 140)


In [6]:
# --- obtain training and testing data, plain regression ---

file_postfix = "v_minu_reg"

# NOTE(review): an older comment here listed the arguments as
# "features_minu, req_minu, pvol_hour, all_loc_hour", but the call actually passes
# an empty list in second position and rvol_hour as the target series.
prepared = prepare_feature_target(features_minu, [], rvol_hour, all_loc_hour,
                                  para_order_minu, para_order_hour, bool_feature_selection)

# The mixture cells unpack this very same call into two values while this cell
# expected three, so a fixed-arity unpack cannot succeed in every cell on a fresh
# "Restart & Run All". Index the result instead and treat the third value as optional.
x, y = prepared[0], prepared[1]
if len(prepared) > 2:
    var_explain = prepared[2]
else:
    # Reset rather than keep a stale value from an earlier kernel state.
    var_explain = None
    print("prepare_feature_target returned no explained-variance values; var_explain set to None")

# Single-argument print(...) yields the same text under Python 2 and Python 3.
# Shapes of the two components of the first sample.
print("%s %s" % (np.shape(x[0][0]), np.shape(x[0][1])))

xtrain, ytrain, xtest, ytest = training_testing_plain_regression(x, y, para_train_split_ratio)
print(" ".join(str(np.shape(part)) for part in (xtrain, ytrain, xtest, ytest)))

# Persist each split as <name>_<file_postfix>.dat; ndarray.dump writes a pickle of the array.
for split_name, split_data in (("xtrain", xtrain), ("xtest", xtest),
                               ("ytrain", ytrain), ("ytest", ytest)):
    np.asarray(split_data).dump(
        "../dataset/bitcoin/training_data/" + split_name + "_" + file_postfix + ".dat")


(16,) (10, 4)
(8500, 56) (8500,) (2125, 56) (2125,)


In [16]:
# --- obtain training and testing data, mixture: linear --- 

file_postfix = "v_minu_mix"

# Only the first two returned values are used. The plain-regression cell unpacks a
# third value (var_explain) from this very same call, so slicing keeps this cell
# working whether prepare_feature_target returns two values or three.
# The call passes an empty list in second position and rvol_hour as the target series.
x, y = prepare_feature_target(features_minu, [], rvol_hour, all_loc_hour,
                              para_order_minu, para_order_hour, bool_feature_selection)[:2]

# Single-argument print(...) yields the same text under Python 2 and Python 3.
print(len(x))

xtrain, ytrain, xtest, ytest = training_testing_mixture_mlp(x, y, para_train_split_ratio)

print("%d %s %s" % (len(xtrain[0]), np.shape(ytrain), np.shape(ytest)))

# Persist each split as <name>_<file_postfix>.dat; ndarray.dump writes a pickle of the array.
for split_name, split_data in (("xtrain", xtrain), ("xtest", xtest),
                               ("ytrain", ytrain), ("ytest", ytest)):
    np.asarray(split_data).dump(
        "../dataset/bitcoin/training_data/" + split_name + "_" + file_postfix + ".dat")

#  done for training-test preparation

10625
2 (8500,) (2125,)


In [28]:
# --- obtain training and testing data, mixture: rnn --- 

file_postfix = "v_minu_mix_rnn"

# Only the first two returned values are used. The plain-regression cell unpacks a
# third value (var_explain) from this very same call, so slicing keeps this cell
# working whether prepare_feature_target returns two values or three.
# NOTE(review): this cell passes pvol_hour as the target series, whereas the
# plain-regression and mixture-linear cells pass rvol_hour -- confirm this
# difference is intentional.
x, y = prepare_feature_target(features_minu, [], pvol_hour, all_loc_hour,
                              para_order_minu, para_order_hour, bool_feature_selection)[:2]

# Single-argument print(...) yields the same text under Python 2 and Python 3.
print(len(x))

xtrain, ytrain, xtest, ytest = training_testing_mixture_rnn(x, y, para_train_split_ratio)

print("%d %s %s" % (len(xtrain[0]), np.shape(ytrain), np.shape(ytest)))

# Persist each split as <name>_<file_postfix>.dat; ndarray.dump writes a pickle of the array.
for split_name, split_data in (("xtrain", xtrain), ("xtest", xtest),
                               ("ytrain", ytrain), ("ytest", ytest)):
    np.asarray(split_data).dump(
        "../dataset/bitcoin/training_data/" + split_name + "_" + file_postfix + ".dat")


10577
2 (8461,) (2116,)


In [None]:
# --- explained variance in feature selection ---

# Share of the entries of var_explain that are at least 0.95.
# var_explain is produced by the plain-regression cell, which must have run first.
n_entries = len(var_explain)
n_high = sum(1 for ratio in var_explain if ratio >= 0.95)
# Guard the division so an empty var_explain reports nan instead of raising ZeroDivisionError.
share_high = n_high * 1.0 / n_entries if n_entries else float('nan')

# Single-argument print(...) yields the same text under Python 2 and Python 3
# (the two spaces reproduce the output of the former print statement).
print('percent:  %s' % share_high)

fig, ax = plt.subplots()
fig.set_size_inches(5, 5)
ax.hist(var_explain, histtype='bar', label=['Variance explained'], bins=30)
ax.set_title('Explained variance in feature selection', fontsize=13)
ax.set_xlabel('Variance', fontsize=13)
ax.set_ylabel('Count', fontsize=13)
# Only the [0.95, 1] part of the axis is shown; the 30 bins still span the full data range.
ax.set_xlim(0.95, 1)
# The histogram was given a label, so display it.
ax.legend()