In [None]:
%matplotlib inline    
import matplotlib as mplt
from matplotlib import cm
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import matplotlib
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt
# matplotlib.style.use('ggplot')

from utils_libs import *
from utils_data_prep import *

from scipy.stats import lognorm
from scipy.stats import norm
# from scipy.stats import chisqprob

from numpy import prod
import seaborn as sns

# statistical models
import statsmodels as sm
from statsmodels.tsa.stattools import acf  
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.api import VAR, DynamicVAR

from statsmodels.stats import diagnostic
import datetime
import pickle
from datetime import timedelta
import math

In [None]:
# Import transaction (TRX) tick data for market 1 (Bitfinex) and market 2 (Bitstamp).
#
# The loaded objects are used through `.shape` / `.head(3)`, so they are not plain
# numeric arrays (presumably pickled pandas DataFrames). NumPy >= 1.16.3 refuses to
# unpickle unless `allow_pickle=True` is passed, hence the explicit flag.
# NOTE: unpickling can execute arbitrary code -- only load .dat files you produced yourself.

trx_df_market1 = np.load("../../dataset/bitcoin/large/trx_df_tick_data_bitfinex_market.dat", allow_pickle=True)
print(trx_df_market1.shape)
print(trx_df_market1.head(3))

trx_df_market2 = np.load("../../dataset/bitcoin/large/trx_df_tick_data_bitstamp_market.dat", allow_pickle=True)
print(trx_df_market2.shape)
print(trx_df_market2.head(3))

print('TRX data LOADED...\n')

In [None]:
# Extract per-market transaction features and time how long each extraction takes.
#
# `extract_trx_features_matrix_fast` is not defined in this notebook (presumably it
# comes from one of the `utils_*` star imports). It returns a pair:
#   - a 2-D feature matrix (indexed below as [rows, columns]), and
#   - a per-row sequence of minute labels (parsed later as '%Y-%m-%d %H-%M' strings).
#
# NOTE: `datetime` must still be the *module* here (bound by `import datetime` in the
# imports cell). A later cell runs `from datetime import datetime`, which rebinds the
# name to the class; re-running this cell after that fails on `datetime.datetime`.

# --- market 1
a = datetime.datetime.now()
trx_features_market1, trx_features_mins_market1 = extract_trx_features_matrix_fast(trx_df_market1)
b = datetime.datetime.now()
print('Finished extracting TRX features for market1')
print(b-a)  # elapsed wall-clock time
print(trx_features_market1.shape)
print(trx_features_market1[1:5, 0:7])     
print(trx_features_mins_market1[0:5])


# --- market 2 (same steps; `a` and `b` are reused as the timing variables)
a = datetime.datetime.now()
trx_features_market2, trx_features_mins_market2 = extract_trx_features_matrix_fast(trx_df_market2)
b = datetime.datetime.now()
print('Finished extracting TRX features for market 2')
print(b-a)  # elapsed wall-clock time
print(trx_features_market2.shape)
print(trx_features_market2[1:5, 0:7])   
print(trx_features_mins_market2[0:5])

In [None]:
# Import order-book (OB) data for market 1 (Bitfinex).
# NOTE(review): if these .dat files are pickles (as the TRX files appear to be),
# NumPy >= 1.16.3 needs `allow_pickle=True` here as well -- confirm against the files.

all_dta_minu_ob_market1 = np.load("../../dataset/bitcoin/large/dta_minu_bitfinex_ob.dat")
all_loc_hour_ob_market1 = np.load("../../dataset/bitcoin/large/loc_hour_bitfinex_ob.dat")
all_loc_month_ob_market1 = np.load("../../dataset/bitcoin/large/loc_month_bitfinex_ob.dat")
all_dta_minu_ticks_ob_market1 = np.load("../../dataset/bitcoin/large/dta_minu_ticks_bitfinex_ob.dat")
print('OB data LOADED...\n')

# This was a Python 2 print *statement*, which is a SyntaxError on Python 3
# (the rest of the notebook is Python 3, e.g. `pickle.load(..., encoding=...)`).
print(len(all_dta_minu_ob_market1), len(all_loc_hour_ob_market1), len(all_loc_month_ob_market1))

# How the data looks and is organised: indexed first by minute offset, then by side.
print( np.shape(all_dta_minu_ob_market1) )
min_idx = 5  # minute offset
bid_idx = 1  # bid or ask side
print( 'price amount: \n' )
print( all_dta_minu_ob_market1[min_idx][bid_idx] )

In [None]:
# Import pre-computed order-book (OB) features for market 1 (Bitfinex).
#
# Two objects are loaded:
#   - ob_features_minu_market1       : 2-D feature matrix (indexed below as [rows, columns])
#   - ob_features_minu_ticks_market1 : per-row minute labels; a later cell passes them to
#                                      extract_target_feature_pairs as the time axis of source 3
# NOTE(review): if these .dat files are pickles, NumPy >= 1.16.3 requires
# `allow_pickle=True` in np.load -- confirm against the files.

ob_features_minu_market1 = np.load("../../dataset/bitcoin/large/feature_minu_bitfinex_ob.dat")
ob_features_minu_ticks_market1 = np.load("../../dataset/bitcoin/large/feature_minu_ticks_bitfinex_ob.dat")

print('OB features LOADED...\n')

# Quick sanity check of shapes and the first few rows.
print(ob_features_minu_market1.shape)
print(ob_features_minu_market1[1:5,0:7])  
print(ob_features_minu_ticks_market1[1:5])
print(ob_features_minu_ticks_market1.shape)

In [None]:
# prepare data for learning: each sample is a pair (yi, ti), where yi is the target volume and ti is the time key into feature_dict
# sources = 1..M; feature_dict[key=(ti, source_id)] = matrix of features from that source

from datetime import datetime
import time

def get_source_feature_matrix(target_idx, features_tuple, feature_dict, source_id, delta1, delta2):
    """Store the history window for one target row and return its (target, time) pair.

    Parameters
    ----------
    target_idx : int
        Row index of the target observation.
    features_tuple : tuple
        ``(timestamps, features)``; ``timestamps`` is a sequence of epoch seconds and
        ``features`` a 2-D array with one row per timestamp. Both are assumed to be
        sorted by ascending time (the caller in this notebook sorts them).
    feature_dict : dict
        Mutated in place: ``feature_dict[(ti, source_id)]`` is set to the history
        matrix with ``delta2 - delta1`` rows, ordered oldest to newest.
    source_id : hashable
        Identifier of the data source, used as the second element of the dict key.
    delta1, delta2 : int
        Define the window length ``delta = delta2 - delta1`` (in minutes/rows).

    Returns
    -------
    tuple
        ``(yi, ti)``: column 0 of the target row and the target timestamp.

    Notes
    -----
    When the history is not contiguous, a zero matrix is filled row by row from the
    observations that lie at most ``delta`` minutes before ``ti``; missing minutes
    stay zero.
    """
    trx_features_timestamp_market_sort = features_tuple[0]
    trx_features_market_sort = features_tuple[1]

    delta = delta2 - delta1
    window_secs = delta * 60
    ti = trx_features_timestamp_market_sort[target_idx]
    yi = trx_features_market_sort[target_idx, 0]

    # Only look `delta2` rows back when that row exists. Without this guard a series
    # shorter than `delta2` raised IndexError, and longer series silently read a row
    # from the *end* of the data through negative-index wrap-around.
    has_full_history = target_idx >= delta2

    # NOTE(review): for strictly increasing, minute-aligned timestamps and delta1 > 0,
    # the row `delta2` steps back is at least `delta2` minutes old, so this equality
    # against `delta` minutes looks like it can never hold and the loop below does all
    # the work -- confirm whether `delta2 * 60` was intended.
    if has_full_history and (ti - trx_features_timestamp_market_sort[target_idx - delta2]) == window_secs:
        tmp_features_matrix = trx_features_market_sort[target_idx - delta2 + 1:target_idx - delta1 + 1, :]
    else:
        # NOTE(review): this branch always fills the window [ti - delta min, ti - 1 min],
        # which matches the slice above only for delta1 == 1 -- confirm.
        tmp_features_matrix = np.zeros([delta, trx_features_market_sort.shape[1]])
        offset_idx = 1
        # `and` short-circuits, so the bounds check runs before the lookup.
        while (target_idx >= offset_idx
               and (ti - trx_features_timestamp_market_sort[target_idx - offset_idx]) <= window_secs):
            minutes_back = (ti - trx_features_timestamp_market_sort[target_idx - offset_idx]) // 60
            # One minute back lands in the last row, `delta` minutes back in row 0.
            tmp_features_matrix[delta - minutes_back, :] = trx_features_market_sort[target_idx - offset_idx, :]
            offset_idx += 1

    feature_dict[(ti, source_id)] = tmp_features_matrix

    return (yi, ti)

def fillEmptyFeat(tmp_key, my_dict, elem):
    """Insert ``elem`` under ``tmp_key`` only if the key is absent.

    Returns 1 when a new entry was inserted and 0 when the key already existed
    (in which case ``my_dict`` is left untouched).
    """
    if tmp_key in my_dict:
        return 0
    my_dict[tmp_key] = elem
    return 1

def extract_target_feature_pairs(source_data, feature_dict, delta1, delta2):
    """Build the history matrices of one source and return its (target, time) pairs.

    ``source_data`` is indexed as ``(source_type, source_id, minute_labels, features)``.
    The minute labels are '%Y-%m-%d %H-%M' strings; they are converted to seconds since
    1970-01-01, the rows are sorted by that time, and ``get_source_feature_matrix`` is
    called for every row, which fills ``feature_dict`` in place.

    Returns the list of ``(yi, ti)`` tuples produced by those calls, in time order.
    """
    source_type, source_id = source_data[0], source_data[1]
    print(source_type)
    print("source id = " + str(source_id))

    minute_labels = source_data[2]
    feature_rows = source_data[3]

    # Minute label -> integer seconds since the epoch.
    epoch = datetime(1970, 1, 1)
    epoch_seconds = []
    for label in minute_labels:
        elapsed = datetime.strptime(label, '%Y-%m-%d %H-%M') - epoch
        epoch_seconds.append(int(elapsed.total_seconds()))

    # Put timestamps and feature rows into the same ascending-time order.
    order = np.argsort(epoch_seconds)
    sorted_seconds = [epoch_seconds[pos] for pos in order]
    sorted_rows = feature_rows[order, :]
    sorted_pair = (sorted_seconds, sorted_rows)

    pairs = []
    for idx in range(len(sorted_seconds)):
        pairs.append(
            get_source_feature_matrix(idx, sorted_pair, feature_dict, source_id, delta1, delta2)
        )

    print("Extracted!")
    return pairs
    
    
    
# Shared store filled by the extraction calls below.
# key = (timestep, source_id), value = feature matrix
feature_dict = {}

delta2 = 11  # look-back time steps
delta1 = 1   # k-step ahead
delta = delta2 - delta1


# Source 1 (market 1 transactions) also provides the targets.
source1_trx = ("trx-data", 1, trx_features_mins_market1, trx_features_market1)
target_feature_key = extract_target_feature_pairs(source1_trx, feature_dict, delta1, delta2)


# For the second market the returned pairs are discarded; the call only inserts
# the source-2 feature matrices into feature_dict.
source2_trx = ("trx-data", 2, trx_features_mins_market2, trx_features_market2)
_ = extract_target_feature_pairs(source2_trx, feature_dict, delta1, delta2)


# Source 3 (market 1 order book). This extraction used to be run twice with identical
# arguments; the second run only rewrote the same keys, so it is run once here.
source3_ob = ("ob-data", 3, ob_features_minu_ticks_market1, ob_features_minu_market1)
_ = extract_target_feature_pairs(source3_ob, feature_dict, delta1, delta2)


# Back-fill an all-zero matrix for every target timestep that has no entry for
# source 2 / source 3. The original used `map(...)`, which is lazy on Python 3 and
# therefore never called fillEmptyFeat; a list comprehension runs it eagerly.
# Note that all back-filled keys of one source share the same zero array object.
empty_elem = np.zeros([delta, trx_features_market2.shape[1]])
tmp_inserts = [fillEmptyFeat((x[1], 2), feature_dict, empty_elem) for x in target_feature_key]

empty_elem = np.zeros([delta, ob_features_minu_market1.shape[1]])
tmp_inserts = [fillEmptyFeat((x[1], 3), feature_dict, empty_elem) for x in target_feature_key]

print("finished")


In [None]:
import pickle

path_data = "../../dataset/bitcoin/"

# Load a previously organised dataset. The `with` block closes the file handle,
# which the original `pickle.load(open(...))` never did.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(path_data + 'target_market1_features_ob_trx_delta_5_data.pkl', "rb") as pkl_file:
    dta = pickle.load(pkl_file, encoding = 'latin1')

# Shape of the third source matrix of the first sample.
print(np.shape(dta[0][2][2]))

In [None]:
# ----- create (target, time, features) tuples (yi, ti, xi)
#
# target_feature_key : sequence of (y_i, timestep) pairs
# feature_dict       : maps (timestep, source_id) -> feature matrix
# xi                 : [source 1 matrix, source 2 matrix, source 3 matrix]

target_feature_data = [
    (sample[0], sample[1], [feature_dict[(sample[1], src_id)] for src_id in (1, 2, 3)])
    for sample in target_feature_key
]

print(len(target_feature_data))

In [None]:
# Display one sample (index 8) for visual inspection.
# Within a feature matrix the first column corresponds to the target variable,
# and rows are in increasing timestamp order.
target_feature_data[8]

In [None]:
# load organized data

import pickle

filename = '../../dataset/bitcoin/delta_15_data.pkl'

# `with` guarantees the handle is closed even if unpickling raises
# (the original open()/close() pair leaked it in that case).
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(filename, 'rb') as infile:
    target_feature_data_agg = pickle.load(infile)
print("loaded")

print(len(target_feature_data_agg))

print(target_feature_data_agg[1])

# Timestamps (element 1) of the first five samples.
print([i[1] for i in target_feature_data_agg[0:5]])

In [None]:
# Use the aggregated dataset as-is (no copy: both names refer to the same object).
# NOTE: the "remove correlated features" cell below rebinds target_feature_data,
# so run either this cell or that one depending on which variant is wanted.
target_feature_data = target_feature_data_agg


In [None]:
# remove correlated features: keep sources 0 and 1 untouched and retain only
# columns 0, 1, 3, 5, 6, 8, 9 of the third source matrix (index 2)

target_feature_data = [
    [sample[0], sample[1], [sample[2][0], sample[2][1], sample[2][2][:, [0, 1, 3, 5, 6, 8, 9]]]]
    for sample in target_feature_data_agg
]

print(target_feature_data[1])

# --- Load organized data

In [None]:
import pickle

path_data = "../../dataset/bitcoin/"

# Load a previously organised dataset. The `with` block closes the file handle,
# which the original `pickle.load(open(...))` never did.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(path_data + 'target_market1_features_ob_trx_delta_1_data.pkl', "rb") as pkl_file:
    dta = pickle.load(pkl_file, encoding = 'latin1')

# Shape of the third source matrix of the first sample.
print(np.shape(dta[0][2][2]))

# This dataset replaces whatever target_feature_data was built or loaded earlier.
target_feature_data = dta

In [None]:
# ----- dataset split
# Order-preserving split (no shuffling): first 70% train, next 10% validation,
# remaining 20% test.

cnt = len(target_feature_data)
train_split, val_split = int(cnt*0.7), int(cnt*0.8)

train_data, val_data, test_data = (
    target_feature_data[:train_split],
    target_feature_data[train_split:val_split],
    target_feature_data[val_split:],
)

print(len(train_data), len(val_data), len(test_data))

In [None]:
# ----- normalization
# Z-score every source matrix with the element-wise mean / standard deviation
# computed on the TRAINING split only; the same statistics are then applied to the
# validation and test splits.

src_x = []
src_num = len(train_data[0][2])

src_m = []    # per-source mean, same shape as one source matrix
src_std = []  # per-source standard deviation, same shape as one source matrix

for src_idx in range(src_num):
    # Stack of the training matrices of this source: [N T D]
    tmp_src = [tmp[2][src_idx] for tmp in train_data]
    tmp_mean = np.mean(tmp_src, axis = 0)
    tmp_std = np.std(tmp_src, axis = 0)

    src_m.append(tmp_mean)
    src_std.append(tmp_std)

    print('test', np.shape(tmp_src))

# First source of the first sample of each split, before normalization.
print((train_data[0][2][0]))
print(val_data[0][2][0])
print(test_data[0][2][0])


def _normalize_split(split_data, means, stds, eps=1e-5):
    """Normalize every sample of ``split_data`` in place.

    Each sample is replaced by a list ``[y, t, sources]`` in which the first
    ``len(means)`` source matrices are ``(x - mean) / (std + eps)``. ``eps`` keeps
    the division finite for zero-variance features.
    """
    for ins_idx, ins in enumerate(split_data):
        ins = list(ins)
        sources = list(ins[2])
        for src_idx in range(len(means)):
            sources[src_idx] = (sources[src_idx] - means[src_idx])/(stds[src_idx] + eps)
        ins[2] = sources
        split_data[ins_idx] = ins


# One shared helper replaces the three copy-pasted loops for the training,
# validation and testing data.
for tmp_split in (train_data, val_data, test_data):
    _normalize_split(tmp_split, src_m, src_std)

# Same samples again, after normalization.
print("\n")
print(train_data[0][2][0])
print(val_data[0][2][0])
print(test_data[0][2][0])

In [None]:
# Display the per-source training means computed in the normalization cell.
src_m

In [None]:
# Display the per-source training standard deviations computed in the normalization cell.
src_std

In [None]:
# auto market: keep only the first and third source matrices (indices 0 and 2)
# of every sample and drop the one at index 1.
# NOTE: train/val/test are rebound at the end, so running this cell a second time
# fails (the samples then hold only two sources).

auto_tr_data = [[ins[0], ins[1], [ins[2][0], ins[2][2]]] for ins in train_data]
print(np.shape(auto_tr_data[0][2][0]))

auto_val_data = [[ins[0], ins[1], [ins[2][0], ins[2][2]]] for ins in val_data]
auto_test_data = [[ins[0], ins[1], [ins[2][0], ins[2][2]]] for ins in test_data]

train_data, val_data, test_data = auto_tr_data, auto_val_data, auto_test_data


In [None]:
import pickle

# Persist the three splits. Each file is written inside a `with` block so the handle
# is flushed and closed deterministically (the original never closed the files).
with open("../../datasets/bitcoin/market2_tar10_len10/train.p", "wb") as out_file:
    pickle.dump(train_data, out_file)
with open("../../datasets/bitcoin/market2_tar10_len10/val.p", "wb") as out_file:
    pickle.dump(val_data, out_file)
with open("../../datasets/bitcoin/market2_tar10_len10/test.p", "wb") as out_file:
    pickle.dump(test_data, out_file)


# ---- load existing splits

In [None]:

import pickle
import numpy as np
import pandas as pd

# Each sample is [y, timestamp, x], where x is a list of per-source feature matrices.
# Files are opened with `with` so the handles are closed (the original leaked them).
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("../../datasets/bitcoin/market2_tar1_len10/train.p", "rb") as in_file:
    train_data = pickle.load(in_file, encoding = 'latin1')
with open("../../datasets/bitcoin/market2_tar1_len10/val.p", "rb") as in_file:
    val_data = pickle.load(in_file, encoding = 'latin1')
with open("../../datasets/bitcoin/market2_tar1_len10/test.p", "rb") as in_file:
    test_data = pickle.load(in_file, encoding = 'latin1')

print(len(train_data), len(val_data), len(test_data))

# Shapes of the source matrices of the first sample in each split.
print([np.shape(i) for i in train_data[0][2]])
print([np.shape(i) for i in val_data[0][2]])
print([np.shape(i) for i in test_data[0][2]])


In [None]:
# ----------------helper functions for removing daily patterns

from datetime import datetime
def get_intra_day_index(tmp_t, agg_level=60):
    """Return the intraday bucket of a POSIX timestamp.

    The timestamp is interpreted in UTC; the bucket is the number of minutes since
    midnight divided by ``agg_level`` (in minutes) and truncated to an integer, so
    the default of 60 yields the hour of day (0..23).
    """
    tmp = datetime.utcfromtimestamp(tmp_t)
    index = int((tmp.hour*60+tmp.minute)/agg_level)
    return index

def get_avg_train_volume(tr_dta, agg_level=60):
    """Average target volume per intraday bucket.

    Parameters
    ----------
    tr_dta : sequence
        Samples whose element 0 is the volume and element 1 the POSIX timestamp.
    agg_level : int
        Bucket width in minutes, forwarded to ``get_intra_day_index``. It was
        previously accepted but ignored, so buckets were always hourly.

    Returns
    -------
    dict
        Maps intraday index -> mean volume of the samples falling in that bucket.
    """
    vol_train = [ x[0] for x in tr_dta ]
    timestamp_train = [ x[1] for x in tr_dta ]
    my_train_dict = {'volume': vol_train, 'timestamp': timestamp_train }
    df_train_tmp = pd.DataFrame(data=my_train_dict)
    df_train_tmp['intraday_idx'] = df_train_tmp['timestamp'].map(
        lambda x: get_intra_day_index(x, agg_level)
    )
    avg_vol_dict_train = dict(df_train_tmp.groupby(['intraday_idx'])['volume'].mean())

    # dictionary: key is intraday index, value is average volume
    return avg_vol_dict_train

# ------
# Remove the daily pattern from the targets: divide each volume by the average
# training volume of its intraday bucket (default hourly buckets).

tr_hour_avg_vol = get_avg_train_volume(train_data)

# One loop over the three splits replaces three copy-pasted loops. Every sample is
# modified in place: element 0 becomes [raw volume, normalizer, deseasonalised volume].
# NOTE: this cell is not re-runnable on the same data -- after one run element 0 is
# already a list. A bucket absent from the training split raises KeyError.
for tmp_split in (train_data, val_data, test_data):
    for tmp_ins in tmp_split:
        tmp_y = tmp_ins[0]
        tmp_time = tmp_ins[1]

        tmp_y_normlizer = tr_hour_avg_vol[get_intra_day_index(tmp_time)]
        tmp_y_dese = tmp_y/tmp_y_normlizer

        tmp_ins[0] = [tmp_y, tmp_y_normlizer, tmp_y_dese]


In [None]:
# Display element 0 of the first training sample; after the deseasonalisation cell
# this is [raw volume, normalizer, deseasonalised volume].
train_data[0][0]

In [None]:
import pickle

# Persist the deseasonalised splits. Each file is written inside a `with` block so the
# handle is flushed and closed deterministically (the original never closed the files).
with open("../../datasets/bitcoin/market2_tar1_len10/train_dese.p", "wb") as out_file:
    pickle.dump(train_data, out_file)
with open("../../datasets/bitcoin/market2_tar1_len10/val_dese.p", "wb") as out_file:
    pickle.dump(val_data, out_file)
with open("../../datasets/bitcoin/market2_tar1_len10/test_dese.p", "wb") as out_file:
    pickle.dump(test_data, out_file)



In [None]:

import pickle
import numpy as np
import pandas as pd

# Each sample is [y, timestamp, x].
# Fix: val_data and test_data used to be loaded from train_dese.p as well (copy-paste
# slip), so all three "splits" were the training set. They now read val_dese.p and
# test_dese.p, the names used when the deseasonalised splits are written.
# NOTE(review): this reads from market2_tar10_len10, while the dump cell in this
# notebook writes to market2_tar1_len10 -- confirm the intended directory.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("../../datasets/bitcoin/market2_tar10_len10/train_dese.p", "rb") as in_file:
    train_data = pickle.load(in_file, encoding = 'latin1')
with open("../../datasets/bitcoin/market2_tar10_len10/val_dese.p", "rb") as in_file:
    val_data = pickle.load(in_file, encoding = 'latin1')
with open("../../datasets/bitcoin/market2_tar10_len10/test_dese.p", "rb") as in_file:
    test_data = pickle.load(in_file, encoding = 'latin1')

print(len(train_data), len(val_data), len(test_data))

# Shapes of the source matrices of the first sample in each split.
print([np.shape(i) for i in train_data[0][2]])
print([np.shape(i) for i in val_data[0][2]])
print([np.shape(i) for i in test_data[0][2]])

In [None]:
# Display element 0 (the target entry) of the first reloaded training sample.
train_data[0][0]