In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import random
import math

In [45]:
df = pd.read_pickle('df_daily_interpolated.pickle')

In [26]:
df.head()

Unnamed: 0_level_0,BTC_high,BTC_low,BTC_volume,BTC_mean,BTC_weighted_mean,BTC_percent_change,ETH_high,ETH_low,ETH_volume,ETH_mean,ETH_weighted_mean,ETH_percent_change,LTC_high,LTC_low,LTC_volume,LTC_mean,LTC_weighted_mean,LTC_percent_change
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-01-01,437.15,427.92,3863.277451,433.287069,433.460852,,,,,,,,,,,,,
2016-01-02,437.56,432.41,3276.709621,435.075572,435.297847,0.004128,,,,,,,,,,,,
2016-01-03,435.75,425.02,3904.325318,431.324176,430.707753,-0.008622,,,,,,,,,,,,
2016-01-04,435.79,431.37,5894.445723,433.615547,433.989539,0.005312,,,,,,,,,,,,
2016-01-05,435.64,430.0,5150.049476,433.37537,433.341902,-0.000554,,,,,,,,,,,,


In [6]:
df['ETH_high'].first_valid_index()

Timestamp('2016-05-18 00:00:00', freq='D')

In [7]:
df['LTC_high'].first_valid_index()

Timestamp('2016-08-17 00:00:00', freq='D')

In [17]:
from datetime import date
train_start_date = date(2016,8,17)
validation_end_date = date(2018,4,3)

In [4]:
total_days = (validation_end_date - train_start_date).days + 1 - 1 # first day has no percent change, so -1
print(total_days)

594


In [5]:
num_days_unused = (train_start_date - date(2016,1,1)).days + 1 # first day has no percent change, so +1
print(num_days_unused)

230


In [6]:
num_days_train = math.floor(total_days*0.7)
num_days_validate = total_days - num_days_train
print(num_days_train, num_days_validate)

415 179


In [46]:
df = df.reset_index()
df = df.drop(columns=['time'])

In [47]:
# # weird normalization
# for feature_name in df.columns:
#     max_value = df[feature_name].max()
#     min_value = df[feature_name].min()
#     df[feature_name] = (df[feature_name] - min_value) * 100000000 / (max_value - min_value)

df['BTC_percent_change'] = (df['BTC_percent_change'] - df['BTC_percent_change'].min())
df['LTC_percent_change'] = (df['LTC_percent_change'] - df['LTC_percent_change'].min())
df['ETH_percent_change'] = (df['ETH_percent_change'] - df['ETH_percent_change'].min())
df['ETH_volume'] = df['ETH_volume']/10
# df['LTC_low'] = df['LTC_low'] * 1e3
# df['LTC_high'] = df['LTC_high'] * 1e3
# df['LTC_mean'] = df['LTC_mean'] * 1e3
# df['LTC_weighted_mean'] = df['LTC_weighted_mean'] * 1e3

In [48]:
df.loc[num_days_unused]

BTC_high               576.990000
BTC_low                573.000000
BTC_volume            3964.988429
BTC_mean               575.137264
BTC_weighted_mean      575.244977
BTC_percent_change       0.145387
ETH_high                11.060000
ETH_low                 10.760000
ETH_volume            3578.119456
ETH_mean                10.910598
ETH_weighted_mean       10.903948
ETH_percent_change       0.238499
LTC_high                 3.700000
LTC_low                  3.620000
LTC_volume               1.000000
LTC_mean                 3.620000
LTC_weighted_mean        3.620000
LTC_percent_change       0.174819
Name: 230, dtype: float64

In [10]:
def SMAPE(y, y_pred):
    if len(y) != len(y_pred):
        raise ValueError('Length of prediction array is not equal to length of y array.')
    return np.mean(np.abs(y-y_pred)*2/(np.abs(y)+np.abs(y_pred)))

def normalized_RMSE(y, y_pred):
    if len(y) != len(y_pred):
        raise ValueError('Length of prediction array is not equal to length of y array.')
    return np.sqrt(sum((y_pred/y-1)**2)/len(y))

def RMSE_log_price(y, y_pred):
    if len(y) != len(y_pred):
        raise ValueError('Length of prediction array is not equal to length of y array.')
    return np.sqrt(sum((np.log(y_pred)-np.log(y))**2)/len(y))

In [62]:
# fixed-window linear regression
def train_with_window_size(window_size, features, loss_func='normalized_RMSE', verbose=True):
    if loss_func != 'normalized_RMSE' and loss_func != 'RMSE_log_price':
        raise ValueError('loss_func must be either normalized_RMSE or RMSE_log_price')
                 
    y = df.loc[num_days_unused+window_size:num_days_unused+num_days_train-1, 'BTC_mean']
    W = (np.random.rand(window_size*len(features)) + np.ones(window_size*len(features)))/window_size # initialize as weighted average
    W_old = np.ones(window_size*len(features))
    b = 0
    if loss_func=='RMSE_log_price':
        learning_rate = 1e-3
    else:
        learning_rate = 1e-7
    error_list = []

    timestep_random_perm = [x for x in range(num_days_unused, num_days_unused + num_days_train - window_size)]
    timestep_random_perm = random.sample(timestep_random_perm, len(timestep_random_perm))
    iteration = 0
    batch_len = 0
    gradient = 0
    while iteration < 15000 and np.abs(W_old.dot(W_old.T) - W.dot(W.T))/(W_old.dot(W_old.T)) > 0.000001:
        for timestep in timestep_random_perm:
            iteration += 1
    #         print('iteration ',iteration)
            X = df.loc[timestep:timestep+window_size-1, features].values.flatten()
#             if math.isnan(X):
#                 raise ValueError('Input has nan')
        
            if batch_len == 10:
                batch_len = 1
                W_old = W
                W = W - learning_rate * np.exp(-iteration/50000) * gradient * X
                b = b - learning_rate * np.exp(-iteration/50000) * gradient
                if loss_func == 'normalized_RMSE':
                    gradient = (W.dot(X.T) + b)*1.0/price[timestep+window_size] - 1
                else:
                    gradient = np.log((W.dot(X.T)+b)/(df.loc[timestep+window_size, 'BTC_mean']))/(W.dot(X.T)+b)
    #             print(gradient, W, b)
            else:
                batch_len += 1
                if loss_func == 'normalized_RMSE':
                    gradient += (W.dot(X.T) + b)*1.0/price[timestep+window_size] - 1
                else:
                    gradient += np.log((W.dot(X.T)+b)/(df.loc[timestep+window_size, 'BTC_mean']))/(W.dot(X.T)+b)

#             if iteration % 20 == 0:
#                 print('iteration', iteration)
#                 y_pred = []
#                 for t in range(num_days_unused, num_days_unused + num_days_train - window_size):
#                     X = df.loc[t:t+window_size-1, features].values.flatten()
#                     y_pred.append(W.dot(X.T) + b)
#                 if loss_func == 'normalized_RMSE':
#                     error = normalized_RMSE(y, y_pred)
#                 else:
#                     error = RMSE_log_price(y, y_pred)
#                 error_list.append(error)
    
    if verbose:
        plt.figure()
        plt.plot([x*20 for x in range(len(error_list))], error_list)
        plt.title('learining curve')
        plt.ylabel('training loss')
        plt.xlabel('iteration')
        print('Training loss:',error_list[-1])
        y_pred = []
        for t in range(num_days_unused, num_days_unused + num_days_train - window_size):
            X = df.loc[t:t+window_size-1, features].values.flatten()
            y_pred.append(W.dot(X.T) + b)
        print('Training SMAPE:',SMAPE(y, y_pred))

        print(W.dot(W.T))
        print(sum(W)) # should be close to 1 because it's essentially weighted average
        print('W=',W)
        print('b=',b)
    return W, b

In [50]:
# validate
def validate_with_window_size(window_size, features, W, b, verbose=True):
    y = df.loc[num_days_unused+num_days_train:num_days_unused+num_days_train+num_days_validate-1, 'BTC_mean']
    y_pred = []

    for timestep in range(num_days_unused+num_days_train-window_size, \
                          num_days_unused+num_days_train+num_days_validate-window_size):
        X = df.loc[timestep:timestep+window_size-1, features].values.flatten()
        y_pred.append(W.dot(X.T) + b)

    validation_SMAPE = SMAPE(y, y_pred)
    if verbose:
        print('Validation SMAPE:', validation_SMAPE)
    
    return validation_SMAPE

In [None]:
# dic = {0:'BTC_high',1:'BTC_low',2:'BTC_volume',3:'BTC_mean',4:'BTC_weighted_mean',5:'BTC_percent_change',\
#        6:'ETH_high',7:'ETH_low',8:'ETH_volume',9:'ETH_mean',10:'ETH_weighted_mean',11:'ETH_percent_change',\
#        12:'LTC_high',13:'LTC_low',14:'LTC_volume' 	LTC_mean 	LTC_weighted_mean 	LTC_percent_change}

In [42]:
all_features = list(df.columns.values)

In [43]:
all_features

['BTC_high',
 'BTC_low',
 'BTC_volume',
 'BTC_mean',
 'BTC_weighted_mean',
 'BTC_percent_change',
 'ETH_high',
 'ETH_low',
 'ETH_volume',
 'ETH_mean',
 'ETH_weighted_mean',
 'ETH_percent_change',
 'LTC_high',
 'LTC_low',
 'LTC_volume',
 'LTC_mean',
 'LTC_weighted_mean',
 'LTC_percent_change']

In [63]:
feature_set = []
current_best_SMAPE = 1

while len(feature_set) < len(all_features):
    performance_with_new_feature = dict()
    for feature in all_features:
        if feature not in feature_set:
            to_use = feature_set + [feature]
            W, b = train_with_window_size(1, features=to_use, loss_func='RMSE_log_price', verbose=False)
            performance_with_new_feature[feature] = validate_with_window_size(1, to_use, W, b, verbose=False)
        print(performance_with_new_feature)
    if min(performance_with_new_feature.values()) > current_best_SMAPE:
        print('Feature selection stopped.')
        print(feature_set)
        break
    feature_set.append(min(performance_with_new_feature, key=performance_with_new_feature.get))
            
        

{'BTC_high': 0.04748722945612306}
{'BTC_low': 1.9554869535468071, 'BTC_high': 0.04748722945612306}
{'BTC_volume': 0.41354622427105314, 'BTC_low': 1.9554869535468071, 'BTC_high': 0.04748722945612306}
{'BTC_volume': 0.41354622427105314, 'BTC_low': 1.9554869535468071, 'BTC_mean': 0.04148175871824706, 'BTC_high': 0.04748722945612306}
{'BTC_volume': 0.41354622427105314, 'BTC_low': 1.9554869535468071, 'BTC_weighted_mean': 0.037278815966984936, 'BTC_mean': 0.04148175871824706, 'BTC_high': 0.04748722945612306}
{'BTC_volume': 0.41354622427105314, 'BTC_percent_change': 1.994705974491497, 'BTC_mean': 0.04148175871824706, 'BTC_high': 0.04748722945612306, 'BTC_low': 1.9554869535468071, 'BTC_weighted_mean': 0.037278815966984936}
{'BTC_volume': 0.41354622427105314, 'BTC_percent_change': 1.994705974491497, 'BTC_mean': 0.04148175871824706, 'BTC_high': 0.04748722945612306, 'ETH_high': 0.2910264731079428, 'BTC_low': 1.9554869535468071, 'BTC_weighted_mean': 0.037278815966984936}
{'BTC_volume': 0.413546224



{'BTC_volume': 0.06429729830904386, 'BTC_percent_change': 0.037273175716443, 'BTC_mean': 0.039377618205351905, 'ETH_volume': nan, 'BTC_high': 0.03729399602958333, 'ETH_high': 0.06094924969057638, 'BTC_low': 0.03809846628075997, 'ETH_low': 0.05061010172563344}
{'BTC_volume': 0.06429729830904386, 'BTC_percent_change': 0.037273175716443, 'BTC_mean': 0.039377618205351905, 'ETH_volume': nan, 'BTC_high': 0.03729399602958333, 'ETH_high': 0.06094924969057638, 'BTC_low': 0.03809846628075997, 'ETH_mean': 0.07594413158329455, 'ETH_low': 0.05061010172563344}
{'BTC_volume': 0.06429729830904386, 'BTC_percent_change': 0.037273175716443, 'BTC_mean': 0.039377618205351905, 'ETH_volume': nan, 'BTC_high': 0.03729399602958333, 'ETH_high': 0.06094924969057638, 'ETH_weighted_mean': 0.07509306670840364, 'BTC_low': 0.03809846628075997, 'ETH_mean': 0.07594413158329455, 'ETH_low': 0.05061010172563344}
{'BTC_volume': 0.06429729830904386, 'BTC_percent_change': 0.037273175716443, 'BTC_mean': 0.039377618205351905, '



{'BTC_volume': 0.06429729830904386, 'BTC_percent_change': 0.037273175716443, 'LTC_high': 0.03909377730195442, 'ETH_mean': 0.07594413158329455, 'ETH_volume': nan, 'ETH_high': 0.06094924969057638, 'BTC_low': 0.03809846628075997, 'ETH_low': 0.05061010172563344, 'LTC_volume': nan, 'LTC_low': 0.03901460423487388, 'BTC_mean': 0.039377618205351905, 'BTC_high': 0.03729399602958333, 'ETH_percent_change': 0.03727636267940546, 'ETH_weighted_mean': 0.07509306670840364}
{'BTC_volume': 0.06429729830904386, 'BTC_percent_change': 0.037273175716443, 'LTC_high': 0.03909377730195442, 'ETH_mean': 0.07594413158329455, 'ETH_volume': nan, 'ETH_high': 0.06094924969057638, 'BTC_low': 0.03809846628075997, 'ETH_low': 0.05061010172563344, 'LTC_volume': nan, 'LTC_low': 0.03901460423487388, 'LTC_mean': 0.04002571940129554, 'BTC_mean': 0.039377618205351905, 'BTC_high': 0.03729399602958333, 'ETH_percent_change': 0.03727636267940546, 'ETH_weighted_mean': 0.07509306670840364}
{'BTC_volume': 0.06429729830904386, 'BTC_pe

{'BTC_volume': nan, 'BTC_mean': 0.039568154615497536, 'ETH_volume': nan, 'BTC_high': 0.03771996718572344, 'ETH_high': 0.056444204772738114, 'ETH_weighted_mean': 0.06341267252211377, 'BTC_low': 0.040811929163988556, 'LTC_high': 0.039048196725819, 'ETH_mean': 0.08092188841993729, 'ETH_low': 0.05209613052725075}
{'LTC_low': 0.03922164904511135, 'BTC_volume': nan, 'BTC_mean': 0.039568154615497536, 'ETH_volume': nan, 'BTC_high': 0.03771996718572344, 'ETH_high': 0.056444204772738114, 'ETH_weighted_mean': 0.06341267252211377, 'BTC_low': 0.040811929163988556, 'LTC_high': 0.039048196725819, 'ETH_mean': 0.08092188841993729, 'ETH_low': 0.05209613052725075}
{'BTC_volume': nan, 'LTC_high': 0.039048196725819, 'ETH_mean': 0.08092188841993729, 'ETH_volume': nan, 'ETH_high': 0.056444204772738114, 'BTC_low': 0.040811929163988556, 'ETH_low': 0.05209613052725075, 'LTC_volume': nan, 'LTC_low': 0.03922164904511135, 'BTC_mean': 0.039568154615497536, 'BTC_high': 0.03771996718572344, 'ETH_weighted_mean': 0.063

{'LTC_low': nan, 'ETH_mean': nan, 'BTC_mean': nan, 'ETH_volume': nan, 'BTC_high': nan, 'ETH_high': 0.11509284691611295, 'ETH_weighted_mean': nan, 'BTC_low': nan, 'ETH_low': nan}
{'LTC_low': nan, 'ETH_mean': nan, 'BTC_mean': nan, 'ETH_volume': nan, 'BTC_high': nan, 'ETH_high': 0.11509284691611295, 'ETH_weighted_mean': nan, 'BTC_low': nan, 'ETH_low': nan, 'LTC_volume': nan}
{'LTC_low': nan, 'ETH_mean': nan, 'BTC_mean': nan, 'ETH_volume': nan, 'BTC_high': nan, 'ETH_high': 0.11509284691611295, 'ETH_weighted_mean': nan, 'BTC_low': nan, 'ETH_low': nan, 'LTC_volume': nan}
{'LTC_low': nan, 'ETH_mean': nan, 'LTC_weighted_mean': 0.22977376562134288, 'BTC_mean': nan, 'ETH_volume': nan, 'BTC_high': nan, 'ETH_high': 0.11509284691611295, 'ETH_weighted_mean': nan, 'BTC_low': nan, 'ETH_low': nan, 'LTC_volume': nan}
{'LTC_percent_change': nan, 'ETH_volume': nan, 'ETH_mean': nan, 'ETH_high': 0.11509284691611295, 'BTC_low': nan, 'ETH_low': nan, 'LTC_volume': nan, 'LTC_low': nan, 'LTC_weighted_mean': 0.22

KeyboardInterrupt: 

0.06869539390780695

In [170]:
clean_dict = {k: performance_with_new_feature[k] for k in performance_with_new_feature if not math.isnan(performance_with_new_feature[k])}
import operator
sorted(clean_dict.items(), key=operator.itemgetter(1))

[('BTC_weighted_mean', 0.037275934724246264),
 ('BTC_mean', 0.04148287179743869),
 ('BTC_high', 0.0476803808079486),
 ('ETH_weighted_mean', 0.2859962614658656),
 ('ETH_mean', 0.2883615221118441),
 ('ETH_high', 0.2958137088587737),
 ('ETH_low', 0.36885684993752876),
 ('BTC_volume', 0.5053290892216526),
 ('ETH_volume', 0.7446427871700374),
 ('BTC_low', 1.9532046230981701),
 ('ETH_percent_change', 1.9942434948525865),
 ('BTC_percent_change', 1.9943317156315155),
 ('LTC_volume', 1.9999999639650992)]

In [64]:
feature_set

['BTC_weighted_mean',
 'BTC_percent_change',
 'ETH_percent_change',
 'BTC_volume',
 'LTC_high',
 'LTC_mean',
 'LTC_percent_change',
 'ETH_mean',
 'ETH_high',
 'LTC_low']