In [1]:
# STANDARD
import pandas as pd
import numpy as np
from datetime import datetime
from random import randint
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import xgboost as xgb
from xgboost import XGBClassifier, DMatrix



In [2]:
# helper to get a specific number of features
def set_features(df, target_col, n_features):
    """Pick the column names to keep: the leading features plus the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column order decides which features come first.
    target_col : str
        Name of the target column; never counted as a feature.
    n_features : int
        How many non-target columns to keep, taken from the left.

    Returns
    -------
    list
        The first ``n_features`` non-target column names, with
        ``target_col`` appended as the last entry.
    """
    # collect every column except the target, preserving column order
    candidates = []
    for name in df.columns:
        if name != target_col:
            candidates.append(name)

    # keep the leading n and put the target back on the end
    return candidates[:n_features] + [target_col]

# reshape data so that LSTM doesn't get mad
def reshape_data(array, time_steps):
    """Stack overlapping row windows of a 2-D array into a 3-D array.

    Window ``i`` of the result holds rows ``i`` through
    ``i + time_steps - 1`` of ``array``.

    Parameters
    ----------
    array : numpy.ndarray
        2-D array of shape ``(rows, features)``. Any memory layout is
        accepted (C-ordered, Fortran-ordered, or a sliced view).
    time_steps : int
        Number of consecutive rows in each window; must be >= 1 and no
        larger than ``rows + 1``.

    Returns
    -------
    numpy.ndarray
        Independent copy of shape
        ``(rows - time_steps + 1, time_steps, features)``.

    Raises
    ------
    ValueError
        If ``array`` is not 2-D, or ``time_steps`` is out of range.
    """
    if array.ndim != 2:
        raise ValueError('reshape_data expects a 2-D array, got %s dimension(s)' % array.ndim)
    if time_steps < 1:
        raise ValueError('time_steps must be >= 1, got %s' % time_steps)

    n_rows, n_cols = array.shape

    # number of windows that fit in the available rows
    n_windows = n_rows - time_steps + 1
    if n_windows < 0:
        raise ValueError('time_steps (%s) exceeds the %s available rows' % (time_steps, n_rows))

    # byte steps between consecutive rows / consecutive columns
    row_stride, col_stride = array.strides

    # Moving to the next window and moving to the next row inside a window
    # both advance by exactly one source row, so both use the real row
    # stride. Deriving it as n_cols * col_stride is only right for
    # C-contiguous input and reads the wrong memory otherwise.
    windows = np.lib.stride_tricks.as_strided(
        array,
        shape = (n_windows, time_steps, n_cols),
        strides = (row_stride, row_stride, col_stride)
    )

    # copy so the result owns its memory instead of aliasing `array`
    return windows.copy()

# build the shifted-target frame, its float32 matrix, and the matching row dates
def get_lstm_matrix(df, target_col, periods_ahead, drop_target = False, rate_of_change = False,
                    start_date = datetime(2015, 8, 9)):
    """Add a look-ahead target column and return the cleaned frame as a matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; its index is compared against ``start_date``. The
        caller's frame is not modified.
    target_col : str
        Column the target is derived from.
    periods_ahead : int
        How many rows ahead the target value is taken from.
    drop_target : bool, default False
        If True, remove the original ``target_col`` after the target is built.
    rate_of_change : bool, default False
        If True, the target is the relative change
        ``(future - current) / current`` instead of the future value.
    start_date : datetime, default datetime(2015, 8, 9)
        Rows with an index earlier than this are discarded. The default is
        the date the original notebook used as the start of clean BTC_ETH data.

    Returns
    -------
    df : pandas.DataFrame
        Filtered frame with ``<target_col>_target`` added and all rows
        containing nulls removed.
    dat : numpy.ndarray
        ``df`` as a float32 matrix.
    dates : list
        Index values of ``df``, row-aligned with ``dat``.
    """
    # keep rows that have a value in the target column and are on/after the
    # start date; copy so the assignments below act on our own frame rather
    # than on a slice of the caller's (avoids SettingWithCopy problems)
    df = df[df[target_col].notnull()]
    df = df[df.index >= start_date].copy()

    # the target is the value `periods_ahead` rows into the future
    target_col_name = target_col + '_target'
    df[target_col_name] = df[target_col].shift(-periods_ahead)

    # if looking at the rate of change instead of actual values
    if rate_of_change:
        df[target_col_name] = (df[target_col_name] - df[target_col]) / df[target_col]

    # drop the original target col (axis as keyword: the positional form
    # was removed from pandas)
    if drop_target:
        df = df.drop(target_col, axis = 1)

    # drop null values, including the trailing rows whose shifted target
    # is undefined
    df = df.dropna()

    # convert to a float32 matrix (`.values` replaces the removed `as_matrix`)
    dat = df.values.astype('float32')

    # get dates for QA
    dates = df.index.tolist()

    return df, dat, dates

# get lagged version of X (straight up, no RoC)
def get_lagged_x_straight(_array, timesteps):
    """Return a 2-D lagged feature matrix built from raw (non-RoC) values.

    Each output row is one window of ``timesteps`` consecutive rows of
    ``_array`` laid end to end: all features of the window's first row,
    then all features of its second row, and so on.

    Parameters
    ----------
    _array : numpy.ndarray
        2-D array of shape ``(rows, features)``.
    timesteps : int
        Number of consecutive rows combined into each output row.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, timesteps * features)``, where
        ``n_windows`` is the number of windows ``reshape_data`` produces.
    """
    # reshape to add timesteps: (n_windows, timesteps, features)
    windows = reshape_data(_array, timesteps)

    # flatten each window in one vectorised step instead of building a
    # Python list per window; the explicit width keeps the result 2-D even
    # when there are no windows
    n_windows, n_steps, n_feats = windows.shape
    return windows.reshape(n_windows, n_steps * n_feats)

# get a lagged version of the dataset for a specific n of timesteps
def get_lagged_dataset(df, target_col, periods_ahead, n_features, timesteps, roc):
    """Build the lagged feature matrix and direction labels for one configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame with a datetime index.
    target_col : str
        Column whose future rate of change defines the label.
    periods_ahead : int
        How many rows ahead the rate of change is measured.
    n_features : int
        Number of leading non-target columns used as features.
    timesteps : int
        Number of consecutive rows combined into each sample.
    roc : bool
        If truthy, lag with ``get_lagged_x_roc``; otherwise lag the raw
        values with ``get_lagged_x_straight``.

    Returns
    -------
    X : numpy.ndarray
        Lagged feature matrix, newest sample first.
    y : numpy.ndarray
        Sign of the target's rate of change, trimmed to
        ``len(rows) - timesteps + 1`` entries, newest first.
    adf : pandas.DataFrame
        The processed frame (features, ``<target_col>_target`` and
        ``direction``), sorted newest first.
    """
    # filter down to the n features needed (plus the target)
    df = df[set_features(df, target_col, n_features)]

    # pre-process data; only the frame is used here
    adf, _, _ = get_lstm_matrix(df,
                                target_col = target_col,
                                periods_ahead = periods_ahead,
                                drop_target = True,
                                rate_of_change = True
                               )

    # label = sign of the future rate of change
    adf['direction'] = np.sign(adf[target_col + '_target'])

    # reverse order of df so it is more intuitive (newest row first)
    adf = adf.sort_index(ascending = False)

    # the last two columns are the RoC target and the direction label;
    # everything before them is a feature. `.iloc` / `.values` replace the
    # removed `.ix` / `as_matrix`.
    X = adf.iloc[:, :-2].values.copy()
    y = adf.iloc[:, -1].values.copy()

    if roc:
        # NOTE(review): get_lagged_x_roc is not defined in the cells shown
        # here; it must already exist in the kernel for roc=True to work.
        X = get_lagged_x_roc(X, timesteps).copy()
    else:
        X = get_lagged_x_straight(X, timesteps).copy()

    # trim y to the number of windows. An explicit stop index is needed
    # because `y[:-timesteps+1]` turns into `y[:0]` when timesteps == 1 and
    # would discard every label.
    y = y[:len(y) - timesteps + 1].copy()

    return X, y, adf

def get_wf_start_date(df, param_dict, _target, _wfw, _periods_ahead):
    """Print the date and label at the first step of a walk-forward window.

    The dataset is built once from ``param_dict``. Earlier versions looped
    over the global ``xgboost_lagged_ens`` list and rebuilt the identical
    dataset for every entry, which made the function depend on notebook
    state and repeat the same output.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame passed through to ``get_lagged_dataset``.
    param_dict : dict
        Must contain ``'features'`` and ``'timesteps'``.
    _target : str
        Target column name.
    _wfw : int
        Walk-forward window length; the row reported is the ``_wfw``-th
        from the end of the data. Nothing is printed when it is < 1.
    _periods_ahead : int
        Look-ahead passed through to ``get_lagged_dataset``.

    Returns
    -------
    None
        The result is printed.
    """
    # an empty window has no starting row to report
    if _wfw < 1:
        return

    # set hyperparams
    _features = int(param_dict['features'])
    _timesteps = int(param_dict['timesteps'])

    # silence warnings only while the data is built, instead of installing
    # a process-wide "ignore" filter that outlives this call
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # get the data (the feature matrix is not needed for the date check)
        _, y, ndf = get_lagged_dataset(df,
                                       target_col = _target,
                                       periods_ahead = _periods_ahead,
                                       n_features = _features,
                                       timesteps = _timesteps,
                                       roc = False
                                      )

    # the get_lagged_dataset() func returns the data flipped for intuitive testing
    # but it needs to go in ascending order for walk forward validation, so flip it!
    y = np.flipud(y)
    ndf = ndf.sort_index()

    # the walk-forward loop starts `_wfw` rows from the end
    print('Starting Date:', ndf.index[-_wfw])
    print('Target:', y[-_wfw])
    print()

In [4]:
def _read_training_csv(path):
    """Read a training CSV and index it by its parsed ``date`` column."""
    frame = pd.read_csv(path)

    # use the parsed dates as the index, then remove the now-redundant
    # column (axis passed by keyword: the positional form was removed from
    # pandas)
    frame.index = pd.to_datetime(frame.date)
    return frame.drop('date', axis = 1)

# read in training data
df6 = _read_training_csv('../_trainingData/train_filtered_6hr_2017-09-27.csv')
df24 = _read_training_csv('../_trainingData/24hr_newpair_train_filtered.csv')

In [16]:
# This is the raw export from the grid-search results DataFrame; the next cell holds the same models reformatted for validation.
# top_xgb_model_records = [
    
#     # top six
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 1500, 'lag_type': 'straight-lag', 'max_depth': 20, 'learning_rate': 0.005},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 1000, 'lag_type': 'straight-lag', 'max_depth': 30, 'learning_rate': 0.01},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 1000, 'lag_type': 'straight-lag', 'max_depth': 35, 'learning_rate': 0.01},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 1000, 'lag_type': 'straight-lag', 'max_depth': 25, 'learning_rate': 0.01},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 2000, 'lag_type': 'straight-lag', 'max_depth': 20, 'learning_rate': 0.005},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 2000, 'lag_type': 'straight-lag', 'max_depth': 15, 'learning_rate': 0.005},
    
#     # next top six
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 1500, 'lag_type': 'straight-lag', 'max_depth': 15, 'learning_rate': 0.005},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 2000, 'lag_type': 'straight-lag', 'max_depth': 30, 'learning_rate': 0.005},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 2000, 'lag_type': 'straight-lag', 'max_depth': 35, 'learning_rate': 0.005},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 2000, 'lag_type': 'straight-lag', 'max_depth': 25, 'learning_rate': 0.005},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 1500, 'lag_type': 'straight-lag', 'max_depth': 35, 'learning_rate': 0.005},
#     {'timesteps': '24', 'model': '6hrs_4steps_straight-lag_20features_24timesteps', 'features': '20', 'n_estimators': 1500, 'lag_type': 'straight-lag', 'max_depth': 25, 'learning_rate': 0.005}
    
# ]

In [5]:
# model params in ensemble: every member uses 20 features and 24 timesteps;
# only (max_depth, learning_rate, n_estimators) vary between members
xgboost_lagged_ens = [
    {'features': 20, 'timesteps': 24, 'max_depth': depth, 'learning_rate': rate, 'n_estimators': trees}
    for depth, rate, trees in (
        (20, 0.005, 1500),
        (30, 0.01, 1000),
        (35, 0.01, 1000),
        (25, 0.01, 1000),
        (20, 0.005, 2000),
        (15, 0.005, 2000),
        (15, 0.005, 1500),
        (30, 0.005, 2000),
        (35, 0.005, 2000),
        (25, 0.005, 2000),
        (35, 0.005, 1500),
        (25, 0.005, 1500),
    )
]

In [1]:
# walk-forward configuration
_target = 'polo_usdteth_median_trade_price'
periods_ahead = 4
walk_forward_window = 140

# results keyed by model ID; each value is a list of
# (actual label, predicted class probabilities) tuples, one per step
iter_results = {}

# loop through each model
for current_model in xgboost_lagged_ens:

    # read the hyperparams straight from the model dict (no trailing commas:
    # those silently turn each value into a 1-tuple)
    _features = int(current_model['features'])
    _timesteps = int(current_model['timesteps'])
    _max_depth = int(current_model['max_depth'])
    _learning_rate = current_model['learning_rate']
    _n_estimators = current_model['n_estimators']

    # set the hyperparams of the xgboost model
    # NOTE(review): the labels come from np.sign in get_lagged_dataset, so they
    # can be -1 / 0 / 1 while the objective is binary — confirm that the
    # installed xgboost version handles this as intended.
    xgb_params = {
        'objective': 'binary:logistic',
        'max_depth': _max_depth,
        'n_estimators': _n_estimators,
        'learning_rate': _learning_rate
    }

    # set model ID
    id_tuple = (_features, _n_estimators, _timesteps, _max_depth, _learning_rate)
    iter_id = 'xgb_6hrs_4steps_%sfeatures_%sestimators_%stimesteps_%smaxdepth_%slearningrate' % id_tuple

    # progress print
    print('starting', iter_id)

    # get the data
    X, y, tdf = get_lagged_dataset(df6,
                                   target_col = _target,
                                   periods_ahead = periods_ahead,
                                   n_features = _features,
                                   timesteps = _timesteps,
                                   roc = False
                                  )

    # this is returned for the start date check, just delete it for actual testing
    del tdf

    # the get_lagged_dataset() func returns the data flipped for intuitive testing
    # but it needs to go in ascending order for walk forward validation, so flip it!
    X = np.flipud(X).copy()
    y = np.flipud(y).copy()
    n_rows = X.shape[0]

    # TEST ARRAY (for testing): uncomment to run against a synthetic array
    ## get X and Y
    #X = test_array[:,:-1].copy()
    #y = test_array[:,-1:].copy()
    #print('X Shape:', X.shape)
    #print('Y Shape:', y.shape)
    #print()

    prediction_list = []

    # step from the start of the window towards the end of the data, stopping
    # once fewer than `periods_ahead` rows remain (no target left to score)
    for wf in range(walk_forward_window, max(periods_ahead, 1) - 1, -1):

        # Absolute end (exclusive) of the test rows. Working with a
        # non-negative stop index avoids the `-wf + periods_ahead == 0`
        # case, where a negative-style slice would come back empty.
        test_end = n_rows - wf + periods_ahead

        # get the test y: the label of the last test row
        test_y = y[test_end - 1]

        # training set: everything before the last `wf` rows
        train_X = X[:-wf,:].copy()
        train_y = y[:-wf].copy()

        print('Last 5 of Train X:\n', train_X[-5:,:])
        print('Last 5 of Train y:\n', train_y[-5:])

        # test set: all rows up to and including the row being predicted;
        # the accuracy-check slice is the `periods_ahead` labels that follow
        # the training data
        test_X = X[:test_end,:].copy()
        test_y_acc_check = y[n_rows - wf:test_end].copy()

        print('Last 5 of Test X:\n', test_X[-5:,:])
        print('Last 5 of Test y Acc Check:\n', test_y_acc_check[-5:])

        print('Ultimate Test y:', test_y)

        # fit model on training data
        model = XGBClassifier(**xgb_params)
        model.fit(train_X, train_y)

        # make predictions for test data
        y_pred = model.predict_proba(test_X)

        print('Test X shape:', test_X.shape)
        print('predictions shape:', y_pred.shape)

        # get the last value of the predictions (the ultimate target)
        yhat = tuple(y_pred[-1])

        print('\n', wf - 1, 'steps left')
        print('predicted:', yhat)
        print('actual:', test_y, '\n')

        # create a target and sequence tuple of results for saving
        tar_tuple = (test_y, yhat)

        prediction_list.append(tar_tuple)
        if wf % 25 == 0: print(prediction_list)
        print()

    iter_results[iter_id] = prediction_list
    print('\n\n\n')
    print(iter_results)
    print('\n\n\n')

# Get the Start Date for the Data

In [63]:
# NOTE: this rebinds `xgboost_lagged_ens`, replacing the 12-model ensemble
# defined earlier in the notebook with a single model for the start-date check.
xgboost_lagged_ens = [
    dict(features = 20, timesteps = 24, max_depth = 20, learning_rate = 0.005, n_estimators = 1500),
]

In [64]:
# Report where a 425-step walk-forward window starts for the first model.
# (Pass `walk_forward_window` as `_wfw` instead to check the training window.)
get_wf_start_date(df6, param_dict = xgboost_lagged_ens[0], _target = _target, _wfw = 425, _periods_ahead = 4)

Starting Date: 2017-06-12 00:00:00
Target: 1.0

