In [2]:
# written in python 3
# Yicheng Li
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
import random
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression

  from pandas.core import datetools


In [3]:
# Load the daily, interpolated price table and discard every row that still
# contains a missing value; the resulting frame is displayed as the cell output.
df = pd.read_pickle('df_daily_interpolated.pickle').dropna()
df

Unnamed: 0_level_0,BTC_high,BTC_low,BTC_volume,BTC_mean,BTC_weighted_mean,BTC_percent_change,ETH_high,ETH_low,ETH_volume,ETH_mean,ETH_weighted_mean,ETH_percent_change,LTC_high,LTC_low,LTC_volume,LTC_mean,LTC_weighted_mean,LTC_percent_change
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-08-18,576.99,573.00,3964.988429,575.137264,575.244977,-0.002374,11.06,10.76,35781.194561,10.910598,10.903948,-0.008333,3.70,3.62,1.000000,3.620000,3.620000,-0.014519
2016-08-19,576.99,572.68,4038.303001,574.886952,574.991976,-0.000435,10.86,10.65,29172.247700,10.782314,10.783117,-0.011758,4.00,3.70,201.532878,3.750000,3.798663,0.035912
2016-08-20,583.92,572.38,3581.224253,578.583768,578.712458,0.006431,11.37,10.70,35237.831199,11.129508,11.130709,0.032200,4.00,3.85,0.000000,3.875000,3.899331,0.033333
2016-08-21,583.99,579.34,2442.633748,581.963888,581.799733,0.005842,11.30,10.98,17991.202538,11.137447,11.140413,0.000713,4.00,4.00,0.249252,4.000000,4.000000,0.032258
2016-08-22,586.99,579.22,4493.421370,582.617917,583.490733,0.001124,11.34,11.04,36422.563901,11.224444,11.218667,0.007811,3.99,3.99,0.624692,3.990000,3.990000,-0.002500
2016-08-23,586.77,577.47,5092.274625,583.219936,582.467673,0.001033,11.20,10.85,32543.676234,11.042448,11.020368,-0.016214,5.49,3.61,5619.788148,4.136444,4.033563,0.036703
2016-08-24,582.68,575.29,3857.023188,579.767021,579.558619,-0.005920,11.16,10.98,24028.112626,11.059828,11.061050,0.001574,3.96,3.61,5183.135185,3.894078,3.902438,-0.058593
2016-08-25,579.00,573.12,4109.112561,575.374737,575.586207,-0.007576,11.35,10.99,28931.960300,11.193741,11.211083,0.012108,3.88,3.75,4985.640642,3.794897,3.790287,-0.025470
2016-08-26,579.86,574.85,3354.183171,577.718443,577.929145,0.004073,11.47,11.20,27315.076307,11.315585,11.315170,0.010885,3.90,3.77,3336.893101,3.815088,3.816234,0.005321
2016-08-27,579.45,568.95,3059.493089,574.127421,573.842679,-0.006216,11.28,11.17,19819.182591,11.234801,11.231986,-0.007139,3.85,3.62,1150.722207,3.797097,3.782341,-0.004716


# Prepare data as NumPy arrays

In [69]:
# function to create train, validation, test data given sequence length
def load_data(df, seq_len, valid_set_size_percentage=20, test_set_size_percentage=10):
    """Slice a price frame into chronological train/validation/test windows.

    Every column of ``df`` is standardized, then all contiguous windows of
    ``seq_len`` rows are formed. For each window the first ``seq_len - 1`` rows
    are the features and the (unscaled) ``'BTC_weighted_mean'`` value of the
    window's last row is the label. Windows are split in order: training first,
    then validation, then test.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that contains a ``'BTC_weighted_mean'`` column.
    seq_len : int
        Window length in rows, including the row whose label is predicted.
    valid_set_size_percentage : float, optional
        Percentage of all windows assigned to the validation split (default 20).
    test_set_size_percentage : float, optional
        Percentage of all windows assigned to the test split (default 10).

    Returns
    -------
    list
        ``[x_train, y_train, x_valid, y_valid, x_test, y_test]``; each ``x_*``
        has shape ``(n_windows, seq_len - 1, n_columns)`` and each ``y_*`` has
        shape ``(n_windows,)``.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1 or larger than ``len(df)``, or if the
        split percentages are negative or sum to more than 100.
    """
    if seq_len < 1 or seq_len > len(df):
        raise ValueError('seq_len must be between 1 and the number of rows in df.')
    if (valid_set_size_percentage < 0 or test_set_size_percentage < 0
            or valid_set_size_percentage + test_set_size_percentage > 100):
        raise ValueError('Split percentages must be non-negative and sum to at most 100.')

    # `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
    # removed in pandas 1.0; it returns the same ndarray on old and new versions.
    labels = df['BTC_weighted_mean'].values

    # NOTE(review): the standardization statistics are computed over the whole
    # frame, so validation and test rows influence the scaling of training rows.
    # Left as-is to keep the reported numbers unchanged; fit the scaler on the
    # training rows only if strict out-of-sample evaluation is required.
    data_raw = preprocessing.scale(df.values)  # standardizing features

    # create all possible sequences of length seq_len
    n_windows = len(data_raw) - seq_len + 1
    data = np.array([data_raw[start:start + seq_len] for start in range(n_windows)])

    valid_set_size = int(np.round(valid_set_size_percentage / 100 * data.shape[0]))
    test_set_size = int(np.round(test_set_size_percentage / 100 * data.shape[0]))
    train_set_size = data.shape[0] - (valid_set_size + test_set_size)
    valid_end = train_set_size + valid_set_size

    # Window i covers rows i .. i+seq_len-1, so its label sits at row i+seq_len-1.
    label_offset = seq_len - 1

    x_train = data[:train_set_size, :-1, :]  # cannot see last day, which we aim to predict
    y_train = labels[label_offset:label_offset + train_set_size]

    x_valid = data[train_set_size:valid_end, :-1, :]
    y_valid = labels[label_offset + train_set_size:label_offset + valid_end]

    x_test = data[valid_end:, :-1, :]
    y_test = labels[label_offset + valid_end:]

    return [x_train, y_train, x_valid, y_valid, x_test, y_test]

In [70]:
# create train, validation, test data
seq_len = 10  # choose sequence length
x_train, y_train, x_valid, y_valid, x_test, y_test = load_data(df, seq_len)

# Report the shape of every split, one line per array.
for split_name, split_array in (
        ('x_train', x_train), ('y_train', y_train),
        ('x_valid', x_valid), ('y_valid', y_valid),
        ('x_test', x_test), ('y_test', y_test)):
    print(split_name + '.shape = ', split_array.shape)

x_train.shape =  (410, 9, 18)
y_train.shape =  (410,)
x_valid.shape =  (117, 9, 18)
y_valid.shape =  (117,)
x_test.shape =  (58, 9, 18)
y_test.shape =  (58,)


In [28]:
# Sanity check: show the first (standardized) training window followed by the
# first three (unscaled) training labels.
print(x_train[:1], y_train[:3], sep='\n')

[[[-0.82147235 -0.83085295 -0.82862724 -0.82681235 -0.82698313
   -0.19242871 -0.82690143 -0.8299501  -0.81537586 -0.82882685
   -0.82975233 -0.28440096 -0.7273839  -0.73697647 -0.70596808
   -0.73255342 -0.73358613 -0.32671179]
  [-0.82147235 -0.83093033 -0.82256257 -0.82686907 -0.82704059
   -0.14203033 -0.82750635 -0.83032513 -0.85834931 -0.82923572
   -0.83013925 -0.3455584  -0.7238087  -0.73586438 -0.70556874
   -0.73088693 -0.7312927   0.40394137]
  [-0.81998666 -0.83100288 -0.86037273 -0.82603139 -0.82619554
    0.03646754 -0.8259638  -0.83015466 -0.81890898 -0.82812914
   -0.82902621  0.43944333 -0.7238087  -0.73377921 -0.70597007
   -0.72928453 -0.73000045  0.3665867 ]
  [-0.81997165 -0.8293198  -0.95455847 -0.82526546 -0.82549432
    0.02116875 -0.82617552 -0.82920003 -0.93105198 -0.82810384
   -0.82899513 -0.12285165 -0.7238087  -0.73169404 -0.70596958
   -0.72768214 -0.7287082   0.35100792]
  [-0.81932849 -0.82934881 -0.78491456 -0.82511726 -0.82511023
   -0.10149748 -0.826

In [71]:
# shuffle training data
# A fixed seed makes the permutation identical on every Restart & Run All; the
# same index array is applied to features and labels so the pairs stay aligned.
SHUFFLE_SEED = 0
s = np.random.RandomState(SHUFFLE_SEED).permutation(x_train.shape[0])
x_train = x_train[s]
y_train = y_train[s]

# Define evaluation metrics

In [9]:
def RMSE(y, y_pred):
    """Root-mean-square error between targets and predictions.

    Both inputs may be any array-like (list, tuple, ndarray); they are converted
    to float arrays and compared position by position. For 2-D inputs the error
    is reduced along the first axis only, giving one value per column.

    Parameters
    ----------
    y : array-like
        True values.
    y_pred : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``sqrt(sum((y_pred - y)**2) / len(y))``.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    if len(y) != len(y_pred):
        raise ValueError('Length of prediction array is not equal to length of y array.')
    if len(y) == 0:
        raise ValueError('Cannot compute RMSE of empty arrays.')
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # np.sum(axis=0) is the vectorized equivalent of the builtin sum() over rows.
    return np.sqrt(np.sum((y_pred - y) ** 2, axis=0) / len(y))

In [29]:
def SMAPE(y, y_pred):
    """Symmetric mean absolute percentage error, as a fraction in [0, 2].

    Computes ``mean(2 * |y - y_pred| / (|y| + |y_pred|))``. Inputs may be any
    array-like and are converted to float arrays. A term whose true and
    predicted values are both zero is counted as zero error, so a perfect
    prediction of 0 no longer turns the whole mean into NaN.

    Parameters
    ----------
    y : array-like
        True values.
    y_pred : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    numpy.float64
        The mean symmetric percentage error.

    Raises
    ------
    ValueError
        If the inputs differ in length.
    """
    if len(y) != len(y_pred):
        raise ValueError('Length of prediction array is not equal to length of y array.')
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y - y_pred) * 2
    denominator = np.abs(y) + np.abs(y_pred)
    # Divide only where the denominator is non-zero; the remaining terms keep
    # the 0.0 they were initialised with (0/0 is defined here as no error).
    terms = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator), where=denominator != 0)
    return np.mean(terms)

In [14]:
def normalized_RMSE(y, y_pred):
    """Root-mean-square of the relative error ``y_pred / y - 1``.

    Inputs may be any array-like and are converted to float arrays. For 2-D
    inputs the error is reduced along the first axis only. A zero in ``y``
    makes the corresponding ratio infinite (or NaN), which propagates to the
    result.

    Parameters
    ----------
    y : array-like
        True values; expected to be non-zero.
    y_pred : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``sqrt(sum((y_pred / y - 1)**2) / len(y))``.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    if len(y) != len(y_pred):
        raise ValueError('Length of prediction array is not equal to length of y array.')
    if len(y) == 0:
        raise ValueError('Cannot compute normalized RMSE of empty arrays.')
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    relative_error = y_pred / y - 1
    # np.sum(axis=0) is the vectorized equivalent of the builtin sum() over rows.
    return np.sqrt(np.sum(relative_error ** 2, axis=0) / len(y))

In [15]:
def RMSE_log_price(y, y_pred):
    """Root-mean-square error between the natural logs of prices.

    Inputs may be any array-like and are converted to float arrays. For 2-D
    inputs the error is reduced along the first axis only. Values must be
    strictly positive; ``np.log`` of a zero or negative value yields ``-inf``
    or NaN, which propagates to the result.

    Parameters
    ----------
    y : array-like
        True prices.
    y_pred : array-like
        Predicted prices, same length as ``y``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``sqrt(sum((log(y_pred) - log(y))**2) / len(y))``.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    if len(y) != len(y_pred):
        raise ValueError('Length of prediction array is not equal to length of y array.')
    if len(y) == 0:
        raise ValueError('Cannot compute log-price RMSE of empty arrays.')
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    log_error = np.log(y_pred) - np.log(y)
    # np.sum(axis=0) is the vectorized equivalent of the builtin sum() over rows.
    return np.sqrt(np.sum(log_error ** 2, axis=0) / len(y))

### Baseline performance

In [75]:
# Persistence baseline: predict each day's price as the previous day's price.

# Validation split: the first day has no predecessor inside the split, so it is
# left out and days 1..n-1 are scored against days 0..n-2.
print('baseline dev_SMAPE=',SMAPE(y_valid[1:], y_valid[:-1]))

# Test split: shift the test labels forward by one day and use the last
# validation price as the prediction for the first test day.
y_pred = np.empty_like(y_test)
y_pred[1:] = y_test[:-1]
y_pred[0] = y_valid[-1]
print('baseline test_SMAPE=',SMAPE(y_test, y_pred))

baseline dev_SMAPE= 0.04414065415219852
baseline test_SMAPE= 0.04176151462891089


### Linear regression with different window sizes

In [92]:
linear_reg = LinearRegression()

# Columns fed to the model (hyperparameter).
# NOTE(review): by the column order in the data preview these look like
# BTC_volume, BTC_mean and BTC_weighted_mean -- confirm against df.columns.
feature_set = [2,3,4]
feature_vec_len = len(feature_set)

# Sweep the look-back window: keep only the last `window_size` days of each
# sequence, flatten them into one feature vector per sample, fit on the training
# split and score on the validation split.
for window_size in range(1,seq_len):
    flat_shape = [-1, window_size*feature_vec_len]
    train_inputs = x_train[:,-window_size:,feature_set].reshape(flat_shape)
    valid_inputs = x_valid[:,-window_size:,feature_set].reshape(flat_shape)
    linear_reg.fit(train_inputs, y_train)
    y_pred = linear_reg.predict(valid_inputs)
    print('window_size=',window_size,'dev_SMAPE=',SMAPE(y_valid, y_pred))

# test: refit with the chosen window and score once on the held-out test split
window_size = 5
flat_shape = [-1, window_size*feature_vec_len]
train_inputs = x_train[:,-window_size:,feature_set].reshape(flat_shape)
test_inputs = x_test[:,-window_size:,feature_set].reshape(flat_shape)
linear_reg.fit(train_inputs, y_train)
y_pred = linear_reg.predict(test_inputs)
print('\nwindow_size=',window_size,'test_SMAPE=',SMAPE(y_test, y_pred))

window_size= 1 dev_SMAPE= 0.04015161324630197
window_size= 2 dev_SMAPE= 0.03990936829285309
window_size= 3 dev_SMAPE= 0.039573620773084586
window_size= 4 dev_SMAPE= 0.039630757044325816
window_size= 5 dev_SMAPE= 0.03955139045641161
window_size= 6 dev_SMAPE= 0.04024499694882543
window_size= 7 dev_SMAPE= 0.04006974232588165
window_size= 8 dev_SMAPE= 0.04005254515510125
window_size= 9 dev_SMAPE= 0.039925094446536614

window_size= 5 test_SMAPE= 0.035969917395846096


### Ridge regression

In [80]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=0.5) # hyperparameter
feature_set = list(range(1, 7)) # hyperparameter: column indices 1 through 6
feature_vec_len = len(feature_set)

# For each look-back window, flatten the last `window_size` days of the selected
# columns into one vector per sample, fit on train and score on validation.
for window_size in range(1,seq_len):
    flat_shape = [-1, window_size*feature_vec_len]
    train_inputs = x_train[:,-window_size:,feature_set].reshape(flat_shape)
    valid_inputs = x_valid[:,-window_size:,feature_set].reshape(flat_shape)
    ridge.fit(train_inputs, y_train)
    y_pred = ridge.predict(valid_inputs)
    print('window_size=',window_size,'dev_SMAPE=',SMAPE(y_valid, y_pred))

window_size= 1 dev_SMAPE= 0.04946054144834506
window_size= 2 dev_SMAPE= 0.050614321640243314
window_size= 3 dev_SMAPE= 0.052835283142928055
window_size= 4 dev_SMAPE= 0.05469671960110889
window_size= 5 dev_SMAPE= 0.056249623393495395
window_size= 6 dev_SMAPE= 0.05791508837383122
window_size= 7 dev_SMAPE= 0.059429309696637275
window_size= 8 dev_SMAPE= 0.0597667331672142
window_size= 9 dev_SMAPE= 0.060219128002133665


### Lasso Regression

In [85]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=1, max_iter=10000) # hyperparameter
feature_set = list(range(18)) # column indices 0 through 17
feature_vec_len = len(feature_set)

# For each look-back window, flatten the last `window_size` days of the selected
# columns into one vector per sample, fit on train and score on validation.
for window_size in range(1,seq_len):
    flat_shape = [-1, window_size*feature_vec_len]
    train_inputs = x_train[:,-window_size:,feature_set].reshape(flat_shape)
    valid_inputs = x_valid[:,-window_size:,feature_set].reshape(flat_shape)
    lasso.fit(train_inputs, y_train)
    y_pred = lasso.predict(valid_inputs)
    print('window_size=',window_size,'dev_SMAPE=',SMAPE(y_valid, y_pred))

window_size= 1 dev_SMAPE= 0.04396856966471148
window_size= 2 dev_SMAPE= 0.04384055542212914
window_size= 3 dev_SMAPE= 0.04406589914346942
window_size= 4 dev_SMAPE= 0.04400130101341173
window_size= 5 dev_SMAPE= 0.043863280981833216
window_size= 6 dev_SMAPE= 0.04407619003855108
window_size= 7 dev_SMAPE= 0.04431141629151225
window_size= 8 dev_SMAPE= 0.0441791130816495
window_size= 9 dev_SMAPE= 0.04414808578043597
