In [52]:
# STANDARD
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from random import randint
import operator

# SKLEARN
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import PolynomialFeatures

# XGBOOST
import xgboost as xgb
from xgboost import XGBClassifier, DMatrix

# RANDOM FOREST
from sklearn.ensemble import RandomForestRegressor

# KERAS
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

# Ensemble Builder (Partially Redacted)

In [None]:
# helper to get a specific number of features
def set_features(df, target_col, n_features):
    """Return the first ``n_features`` non-target column names plus the target.

    Columns keep the order they have in ``df``; ``target_col`` is always the
    final entry of the returned list, wherever it sits in the frame.
    """
    # walk the columns in frame order, skipping the target
    candidates = []
    for name in df.columns:
        if name != target_col:
            candidates.append(name)

    # keep the leading n candidates and put the target back on the end
    return candidates[:n_features] + [target_col]

# reshape data so that LSTM doesn't get mad
def reshape_data(array, time_steps):
    """Build overlapping sliding windows over the rows of a 2-D array.

    Window ``i`` holds rows ``i`` .. ``i + time_steps - 1`` of ``array``, so the
    result has shape ``(n_rows - time_steps + 1, time_steps, n_cols)``.

    Parameters
    ----------
    array : 2-D array-like
        Any memory layout is accepted (C-ordered, F-ordered, or a strided slice).
    time_steps : int
        Number of consecutive rows per window; must be at least 1.

    Returns
    -------
    numpy.ndarray
        An independent copy, so it never aliases the memory of ``array``.

    Raises
    ------
    ValueError
        If ``array`` is not 2-D, ``time_steps`` is below 1, or ``time_steps``
        exceeds the number of rows by more than one.
    """
    array = np.asarray(array)
    if array.ndim != 2:
        raise ValueError('reshape_data expects a 2-D array, got %s dimension(s)' % array.ndim)
    if time_steps < 1:
        raise ValueError('time_steps must be at least 1')

    n_rows, n_cols = array.shape

    # set length / number of samples
    n_windows = n_rows - time_steps + 1
    if n_windows < 0:
        raise ValueError('time_steps (%s) exceeds the number of rows (%s)' % (time_steps, n_rows))

    # moving to the next window and moving to the next step inside a window
    # both advance exactly one row, so both use the array's real row stride;
    # deriving it as n_cols * itemsize is only correct for C-contiguous input
    row_stride, col_stride = array.strides
    windows = np.lib.stride_tricks.as_strided(
        array,
        shape = (n_windows, time_steps, n_cols),
        strides = (row_stride, row_stride, col_stride)
    )

    return windows.copy()

# build the target-augmented frame, its float32 matrix, and the matching row dates
def get_lstm_matrix(df, target_col, periods_ahead, drop_target = False, rate_of_change = False,
                    start_date = datetime(2015, 8, 9)):
    """Add a look-ahead target column and return the cleaned frame, matrix and dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; its index is compared against ``start_date``.
    target_col : str
        Column the target is derived from.
    periods_ahead : int
        How many rows ahead the target value is taken from.
    drop_target : bool
        If True, the original ``target_col`` is removed from the result.
    rate_of_change : bool
        If True, the target is the fractional change
        ``(future - current) / current`` instead of the future value itself.
    start_date : datetime
        Rows with an index before this are discarded. The default keeps the
        original hard-coded cut-off (clean BTC_ETH data starts 2015-08-09).

    Returns
    -------
    (DataFrame, ndarray, list)
        The processed frame with a ``<target_col>_target`` column and no NaN
        rows, the same data as a float32 matrix, and the index as a list.
    """
    # keep rows with a known target value on/after the cut-off; .copy() makes
    # the column assignments below act on an independent frame, not a slice
    df = df[df[target_col].notnull()]
    df = df[df.index >= start_date].copy()

    # select target column and create it in dataset
    target_col_name = target_col + '_target'
    df[target_col_name] = df[target_col].shift(-periods_ahead)

    # if looking at the rate of change instead of actual values
    if rate_of_change:
        df[target_col_name] = (df[target_col_name] - df[target_col]) / df[target_col]

    # drop the original target col
    if drop_target:
        df = df.drop(target_col, axis = 1)

    # drop null values (includes the trailing rows that have no future value)
    df = df.dropna()

    # convert to a float32 matrix
    dat = df.values.astype('float32')

    # get dates for QA
    dates = df.index.tolist()

    return df, dat, dates

# get lagged version of X (straight up, no RoC)
def get_lagged_x_straight(_array, timesteps):
    """Return a 2-D lagged feature matrix built from sliding windows of rows.

    ``_array`` is windowed with ``reshape_data`` and each
    ``(timesteps, n_cols)`` window is flattened row by row, so output row ``i``
    is input rows ``i`` .. ``i + timesteps - 1`` laid end to end.
    """
    # reshape to add timesteps
    windows = reshape_data(_array, timesteps)

    # flatten each window to one row in a single vectorised reshape; spelling
    # the width out (rather than -1) keeps the result 2-D when there are no windows
    n_windows, n_steps, n_cols = windows.shape
    return windows.reshape(n_windows, n_steps * n_cols)

# get a lagged version of the dataset for a specific n of timesteps
def get_lagged_dataset(df, target_col, periods_ahead, n_features, timesteps, roc):
    """Build a lagged feature matrix and a direction label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; filtered to the first ``n_features`` feature columns
        plus ``target_col``.
    target_col : str
        Column the rate-of-change target is derived from.
    periods_ahead : int
        Look-ahead, in rows, used for the target.
    n_features : int
        Number of feature columns to keep.
    timesteps : int
        Number of consecutive rows combined into each lagged X row.
    roc : bool
        If truthy, X is lagged with ``get_lagged_x_roc`` (not defined in the
        visible part of this notebook); otherwise with ``get_lagged_x_straight``.

    Returns
    -------
    (ndarray, ndarray, DataFrame)
        Lagged ``X``, the sign of the target as ``y``, and the processed frame
        sorted newest-first.
    """
    # filter down to n features needed
    df = df[set_features(df, target_col, n_features)]

    # pre-process data (only the frame is used here; the matrix and dates are not)
    adf, _, _ = get_lstm_matrix(df,
                                target_col = target_col,
                                periods_ahead = periods_ahead,
                                drop_target = True,
                                rate_of_change = True
                               )

    # add the direction (sign of the rate-of-change target) as the last column
    adf['direction'] = np.sign(adf[target_col + '_target'])

    # reverse order of df so it is more intuitive
    adf = adf.sort_index(ascending = False)

    # X: everything except the last two columns (target, direction); y: direction
    X = adf.iloc[:, :-2].values.copy()
    y = adf.iloc[:, -1].values.copy()

    # transform X to add n lag
    if roc:
        X = get_lagged_x_roc(X, timesteps).copy()
    else:
        X = get_lagged_x_straight(X, timesteps).copy()

    # trim y to match X; computing the end index explicitly matters because
    # y[:-timesteps+1] becomes y[:0] (empty) when timesteps == 1
    n_keep = max(len(y) - timesteps + 1, 0)
    y = y[:n_keep].copy()

    return X, y, adf

# helper to return the primary KPIs of a regression model from a list of predicted and true y values
def regression_kpis(prediction_list, scaler = None):
    """Print MAE, RMSE and sign accuracy for a list of ``(true, predicted)`` pairs.

    Parameters
    ----------
    prediction_list : list of (true, pred) tuples
        For example the output of ``sklearn_wfv_regression``.
    scaler : fitted scaler or None
        If given, its ``inverse_transform`` is applied to both the true and
        the predicted values before the metrics are computed.

    Raises
    ------
    ValueError
        If ``prediction_list`` is empty.
    """
    if len(prediction_list) == 0:
        raise ValueError('prediction_list is empty')

    y_true = [true for true, _ in prediction_list]
    y_pred = [pred for _, pred in prediction_list]

    # invert scale / predictions; scalers work on 2-D (n_samples, n_features)
    # data, so go through a single column and flatten back to 1-D
    if scaler is not None:
        y_true = scaler.inverse_transform(np.asarray(y_true).reshape(-1, 1)).ravel()
        y_pred = scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1)).ravel()

    print('MAE:', mae(y_true, y_pred))
    print('RMSE:', mse(y_true, y_pred) ** 0.5)

    # share of pairs where prediction and truth have the same sign
    sign_hits = sum(np.sign(true) == np.sign(pred) for true, pred in zip(y_true, y_pred))
    print('Sign Accuracy:', sign_hits / len(prediction_list))

# run walk forward validation on a sklearn model (has to be fed in as model(), meaning params must be init then)
def sklearn_wfv_regression(X, y, _model, walk_forward_window, verbose = 0):
    """Walk-forward validation for an estimator exposing ``fit`` / ``predict``.

    For each of the last ``walk_forward_window`` rows, oldest first, the model
    is fitted on every row before it and asked to predict that single row.

    Parameters
    ----------
    X : 2-D ndarray
        Feature matrix, one row per sample in time order.
    y : 1-D ndarray
        Targets aligned with ``X``.
    _model : estimator instance
        Already-constructed model. The same object is refitted at every step.
    walk_forward_window : int
        Number of trailing rows to hold out, one at a time.
    verbose : int
        1 prints the remaining step count; 2 also prints prediction and actual.

    Returns
    -------
    list of (true, predicted) tuples, one per held-out row.

    Raises
    ------
    ValueError
        If the window leaves no rows to train on.
    """
    if walk_forward_window > 0 and walk_forward_window >= min(len(X), len(y)):
        raise ValueError('walk_forward_window must be smaller than the number of samples')

    prediction_list = []

    # wf counts down from the oldest held-out row (wf = window) to the newest (wf = 1)
    for wf in range(walk_forward_window, 0, -1):

        # get the test y
        test_Y = y[-wf].copy()

        # training set: everything strictly before the held-out row
        train_X = X[:-wf, :].copy()
        train_y = y[:-wf].copy()

        # test set: only the held-out row, kept 2-D for predict()
        test_X = X[-wf:, :][:1].copy()

        # fit model on training data
        model = _model
        model.fit(train_X, train_y)

        # predict the single held-out row
        yhat = model.predict(test_X)[-1]

        if verbose in (1, 2):
            print('\n', wf - 1, 'steps left')
        if verbose == 2:
            print('predicted:', yhat)
            print('actual:', test_Y, '\n')

        # save the (actual, predicted) pair
        prediction_list.append((test_Y, yhat))

    print('complete!')
    return prediction_list

# helper to duplicate the 24hr predictions to join the 6hr predictions
def duplicate_24hr_predictions_for_6hr(df):
    """Repeat each daily row at 00:00, 06:00, 12:00 and 18:00 of its date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``'Date'`` (one day per row). It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Four copies of the remaining columns, indexed by a datetime index named
        ``'date'`` and sorted by it.
    """
    time_suffix = [' 00:00:00', ' 06:00:00', ' 12:00:00', ' 18:00:00']

    # columns carried into every copy; a pre-existing 'date' column is dropped
    # too, as the original implementation overwrote and then removed it
    payload = df.drop(['Date', 'date'], axis = 1, errors = 'ignore')

    # build all copies from the argument (not from a global frame) and
    # concatenate once instead of appending inside the loop
    frames = []
    for time in time_suffix:
        frame = payload.copy()
        frame.index = pd.DatetimeIndex(pd.to_datetime(df['Date'] + time), name = 'date')
        frames.append(frame)

    return pd.concat(frames).sort_index()

# helper to get ranked features using random forest regression
def get_ranked_features(X, Y, model_params, col_names, nb_epochs):
    """Rank columns by random-forest feature importance summed over repeated fits.

    A fresh ``RandomForestRegressor(**model_params)`` is fitted on ``(X, Y)``
    ``nb_epochs`` times, and each column's ``feature_importances_`` value is
    accumulated across the fits.

    Returns a list of ``(column_name, summed_importance)`` tuples, largest first.
    """
    totals = {}

    for epoch in range(nb_epochs):

        # create / fit new random forest model
        forest = RandomForestRegressor(**model_params)
        forest.fit(X, Y)

        importances = zip(forest.feature_importances_, col_names)

        # the first fit seeds the totals, later fits add to them
        if epoch == 0:
            for importance, name in importances:
                totals[name] = importance
        else:
            for importance, name in importances:
                totals[name] += importance

    # sort the scores in descending order
    return sorted(totals.items(), key = lambda pair: pair[1], reverse = True)

# helper to get top n features from ranked list
def get_top_rf_features(scores, n):
    """Return the first element of each of the first ``n`` entries of ``scores``."""
    top_features = []
    for entry in scores[:n]:
        top_features.append(entry[0])
    return top_features

# Load Data

In [3]:
# read in training data
df6 = pd.read_csv('_trainingData/6hr_newpair_train_filtered.csv')
df24 = pd.read_csv('_trainingData/24hr_newpair_train_filtered.csv')

# clean import: use the parsed 'date' column as the index, then remove the raw column
# (axis is passed by keyword; the positional form is not accepted by current pandas)
df6.index = pd.to_datetime(df6.date)
df24.index = pd.to_datetime(df24.date)
df6 = df6.drop('date', axis = 1)
df24 = df24.drop('date', axis = 1)

# Setup DataFrame

In [4]:
# current 6hr median price, without the rows that have no price
median_price = df6['polo_usdteth_median_trade_price'].dropna()

# filter 6hr down to the relevant cols: the price now and the price 4 rows later
df = pd.DataFrame()
df['usdteth_median_price'] = median_price
df['usdteth_median_price_t+4'] = median_price.shift(-4)
df.dropna(inplace = True)

# calc target: fractional change between now and t+4
price_now = df['usdteth_median_price']
price_ahead = df['usdteth_median_price_t+4']
df['target'] = price_ahead.sub(price_now).div(price_now)

# 2017-06-19 18:00:00 marks the start of the LSTM 6hr predictions
lstm_6hr_start = datetime(2017, 6, 19, 18)
df = df.loc[df.index >= lstm_6hr_start]

df.head()

Unnamed: 0_level_0,usdteth_median_price,usdteth_median_price_t+4,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-19 18:00:00,354.96,346.0,-0.025242
2017-06-20 00:00:00,359.91,341.82,-0.050263
2017-06-20 06:00:00,360.8,330.09,-0.085116
2017-06-20 12:00:00,362.7,320.99,-0.114999
2017-06-20 18:00:00,346.0,304.29,-0.120549


# LSTM Regression (6hr)

In [6]:
# pull each model's predicted value (record[0][1]) out of the raw output,
# keyed by model name so the dict converts straight into a frame
lstm_6hr_preds = {
    model_id: [record[0][1] for record in records]
    for model_id, records in lstm_6hr_model_raw.items()
}

# one column per model, placed on the datetime index of df
lstm_6hr_pred_df = pd.DataFrame(lstm_6hr_preds)
lstm_6hr_pred_df.index = df.index

# predictions first, then the price / target columns
df = lstm_6hr_pred_df.join(df)

# XGBoost Classification (6hr)

In [8]:
# # predictions start 2017-06-11 18:00:00
# xgb_6hr_results = pd.DataFrame(xgb_6hr_model_raw)

# # split up predicted and actual values and write to csv for inspection
# xgb_6hr_results_split = {}

# for k in xgb_6hr_results:
    
#     true_list = []
#     pred_list = []
    
#     for true, pred in xgb_6hr_results[k]:
#         true_list.append(true)
#         pred_list.append(pred)
    
#     xgb_6hr_results_split[k + '_ACTUAL'] = true_list
#     xgb_6hr_results_split[k + '_PREDICTED'] = pred_list

# # export to csv
# pd.DataFrame(xgb_6hr_results_split).to_csv('xgb_6hr_temp_export.csv')

In [9]:
# read in the binary targets to line up date
binary_y_true_6hr = pd.read_csv('binary_6hr_target_through-2017-08-28.csv')

# set empty dict for saving and easy df conversion
xgb_6hr_preds = {}

# parse out the predicted value (second element of each pair) for each model
for k in xgb_6hr_model_raw:
    xgb_6hr_preds[k] = [pred[1] for pred in xgb_6hr_model_raw[k]]

# save as df
xgb_6hr_df = pd.DataFrame(xgb_6hr_preds)

# set a dt version of the index for easy filtering
binary_y_true_6hr.index = pd.to_datetime(binary_y_true_6hr['date'])

# filter down to where the predictions start (2017-06-11 18:00:00);
# .copy() so the column assignments below write to an independent frame, not a slice
binary_y_true_6hr = binary_y_true_6hr[binary_y_true_6hr.index >= datetime(2017, 6, 11, 18)].copy()

# add the predictions to the df with the date index (aligned by position via .values)
for col in xgb_6hr_df.columns.tolist():
    binary_y_true_6hr[col] = xgb_6hr_df[col].values

# and drop the non-index date col and save
xgb_6hr_df = binary_y_true_6hr.drop('date', axis = 1)

# AND MERGE TOGETHER THE LSTM (6HR) WITH THE XGB (6HR) :D
df = df.join(xgb_6hr_df)

df.head()

Unnamed: 0_level_0,6hrs_4steps_40features_1layers_300epochs_1timesteps_0.4dropout_52units,6hrs_4steps_40features_2layers_300epochs_1timesteps_0.1dropout_128units,6hrs_4steps_40features_2layers_300epochs_1timesteps_0.1dropout_96units,6hrs_4steps_40features_2layers_300epochs_1timesteps_0.2dropout_64units,6hrs_4steps_80features_1layers_300epochs_1timesteps_0.2dropout_52units,usdteth_median_price,usdteth_median_price_t+4,target,direction,xgb_6hrs_4steps_20features_1000estimators_20timesteps_6maxdepth_0.01learningrate,xgb_6hrs_4steps_20features_1000estimators_20timesteps_8maxdepth_0.01learningrate,xgb_6hrs_4steps_20features_1500estimators_20timesteps_6maxdepth_0.01learningrate,xgb_6hrs_4steps_20features_250estimators_24timesteps_8maxdepth_0.1learningrate,xgb_6hrs_4steps_20features_500estimators_20timesteps_8maxdepth_0.01learningrate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-06-19 18:00:00,0.013479,0.034896,0.032555,0.018816,0.009536,354.96,346.0,-0.025242,-1.0,1.0,1.0,1.0,1.0,1.0
2017-06-20 00:00:00,-0.015428,-0.001259,0.001297,0.006405,-0.013744,359.91,341.82,-0.050263,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2017-06-20 06:00:00,-0.043379,-0.042296,-0.052553,-0.043515,-0.025409,360.8,330.09,-0.085116,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2017-06-20 12:00:00,-0.076908,-0.048142,-0.061239,-0.063892,-0.036947,362.7,320.99,-0.114999,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2017-06-20 18:00:00,-0.075177,-0.105758,-0.070346,-0.101881,-0.036658,346.0,304.29,-0.120549,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


# LSTM Classification (24hr)

In [11]:
lstm_24hr_preds = {}

# per model: record[0][1] is stored as the prediction, record[0][0] as the actual
for model_id, records in lstm_24hr_model_raw.items():
    lstm_24hr_preds[model_id + '_PREDICTION'] = [record[0][1] for record in records]
    lstm_24hr_preds[model_id + '_ACTUAL'] = [record[0][0] for record in records]

# convert to df
lstm_24hr_pred_df = pd.DataFrame(lstm_24hr_preds)

In [12]:
# export the results to reorg manually
#lstm_24hr_pred_df.to_csv('lstm_24hr_temp_export_to_find_starting_dates.csv')

In [13]:
# read in the manually organized 24hr predictions and drop rows containing NaNs
merged_24hr = pd.read_csv('lstm_24hr_clean.csv')
merged_24hr = merged_24hr.dropna()

# repeat every daily row on the four 6hr timestamps of its date
pred_from_24hr = duplicate_24hr_predictions_for_6hr(merged_24hr)

# same steps for the 'shift_to_predict' version of the file
# (note: the name merged_24hr is reused and now refers to this second file)
merged_24hr = pd.read_csv('lstm_24hr_clean_shift_to_predict.csv')
merged_24hr = merged_24hr.dropna()

pred_to_24hr = duplicate_24hr_predictions_for_6hr(merged_24hr)

In [14]:
# put the two 24hr prediction sets side by side on their shared datetime index;
# columns present in both get the '_TO' / '_FROM' suffix
cleaned_24hr = pred_to_24hr.join(pred_from_24hr, lsuffix = '_TO', rsuffix = '_FROM')

# Merge All Dat Data

In [15]:
# attach the 24hr predictions to the 6hr frame, then keep only rows with no missing value in any column
df = df.join(cleaned_24hr).dropna()

In [16]:
print('\nSHAPE:', df.shape)
df.tail()


SHAPE: (245, 77)


Unnamed: 0_level_0,6hrs_4steps_40features_1layers_300epochs_1timesteps_0.4dropout_52units,6hrs_4steps_40features_2layers_300epochs_1timesteps_0.1dropout_128units,6hrs_4steps_40features_2layers_300epochs_1timesteps_0.1dropout_96units,6hrs_4steps_40features_2layers_300epochs_1timesteps_0.2dropout_64units,6hrs_4steps_80features_1layers_300epochs_1timesteps_0.2dropout_52units,usdteth_median_price,usdteth_median_price_t+4,target,direction,xgb_6hrs_4steps_20features_1000estimators_20timesteps_6maxdepth_0.01learningrate,...,24hrs_8steps_50features_1layers_300epochs_1timesteps_0.3dropout_64units_FROM,24hrs_8steps_50features_1layers_300epochs_1timesteps_0.3dropout_96units_FROM,9step_actual,24hrs_9steps_50features_1layers_200epochs_1timesteps_0.1dropout_96units_FROM,24hrs_9steps_50features_1layers_300epochs_1timesteps_0.3dropout_64units_FROM,24hrs_9steps_50features_1layers_300epochs_1timesteps_0.3dropout_96units_FROM,10step_actual,24hrs_10steps_50features_1layers_200epochs_1timesteps_0.1dropout_96units_FROM,24hrs_10steps_50features_1layers_300epochs_1timesteps_0.3dropout_64units_FROM,24hrs_10steps_50features_1layers_300epochs_1timesteps_0.3dropout_96units_FROM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-18 18:00:00,-0.008593,0.010314,0.004281,0.001965,-0.053413,291.09,293.09,0.006871,1.0,-1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999998,1.0
2017-08-19 00:00:00,-0.005088,0.028011,0.011873,0.000457,-0.027544,295.32,290.35,-0.016829,-1.0,-1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999986,1.0,1.0
2017-08-19 06:00:00,-0.013192,-0.020298,0.001502,-0.03645,-0.023411,287.24979,294.26,0.024405,1.0,-1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999986,1.0,1.0
2017-08-19 12:00:00,0.018723,0.045532,0.040635,0.01621,0.022772,287.494146,294.0,0.02263,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999986,1.0,1.0
2017-08-19 18:00:00,0.001108,0.022419,0.011674,0.048761,-0.029013,293.09,296.18,0.010543,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999986,1.0,1.0


# Progress:

 - Finished gathering 6hr LSTM
 - Finished gathering 6hr XGB
 - Merged the 6hr LSTM & XGB
 - Also running the predictions for the 24hr LSTM. That should be done by the afternoon (if nothing dies)
 
 - After the above is finished:
     - Reorganize the 24hr LSTM predictions and duplicate them so they can align with 6hr
         - Do it both ways, for current and for future (staggering)
     - Align 6hr XGB predictions
 - Join all predictions
 - Clean up the dataset
 - Train models

# Predict w/ [INSERT_ALGO_TYPE_HERE]

In [18]:
#df.columns.tolist()

In [19]:
# walk-forward window: passed to sklearn_wfv_regression as the number of trailing rows to hold out, one at a time
wfw = 125

In [58]:
# groups of prediction columns that feed the ensemble
col_types = [
    lstm_6hr_regr_cols,
    xgb_6hr_clf_cols,
    lstm_24hr_clf_to_cols,
    lstm_24hr_clf_from_cols,
]

# flatten the groups in order and put the target last
cols_to_include = [col for col_type in col_types for col in col_type] + ['target']

In [59]:
# filter down to cols w/ preds only
fdf = df[cols_to_include]

# set X (every column but the last) and y (the last column, 'target')
X = fdf.iloc[:, :-1].values
y = fdf.iloc[:, -1].values

# scalers expect 2-D (n_samples, n_features) input, so y is scaled as a single
# column and flattened back to 1-D afterwards
y_column = y.reshape(-1, 1)

# NOTE(review): every scaler below is fitted on the full data set, which includes
# the rows later held out by the walk-forward validation — confirm this is acceptable


# init standard scaler
X_standard_scaler = StandardScaler()
y_standard_scaler = StandardScaler()

# standardize X and Y
standardized_X = X_standard_scaler.fit_transform(X)
standardized_y = y_standard_scaler.fit_transform(y_column).ravel()


# init minmax scaler
X_minmax_scaler = MinMaxScaler(feature_range=(0, 1))
y_minmax_scaler = MinMaxScaler(feature_range=(0, 1))

# scale X and Y
scaled_X = X_minmax_scaler.fit_transform(X)
scaled_y = y_minmax_scaler.fit_transform(y_column).ravel()

# test out polynomial features (pairwise interaction terms only, no bias column)
poly = PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False)
poly_X = poly.fit_transform(X)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [60]:
print('shape of X:', X.shape)
print('shape of poly X:', poly_X.shape)

shape of X: (245, 64)
shape of poly X: (245, 2080)


## BASELINE PERFORMANCE

In [61]:
# baseline: row-wise mean of the baseline model columns, scored against the true target
baseline_pred = df[baseline_cols].mean(axis = 1).tolist()
regression_kpis(list(zip(fdf.target, baseline_pred)))

MAE: 0.0389022390611
RMSE: 0.0506513394149
Sign Accuracy: 0.808163265306


## Linear Regression

In [62]:
print('X Shape:', X.shape)
print('y Shape:', y.shape)

X Shape: (245, 64)
y Shape: (245,)


In [63]:
# (label, X, y, scaler used to undo the y transform) for every input variant
lr_runs = [
    ('RAW INPUT', X, y, None),
    ('STANDARDIZED INPUT', standardized_X, standardized_y, y_standard_scaler),
    ('SCALED INPUT', scaled_X, scaled_y, y_minmax_scaler),
    ('POLY INPUT', poly_X, y, None),
]

for run_idx, (label, run_X, run_y, run_scaler) in enumerate(lr_runs):

    # blank line between runs (the POLY run previously had no separator)
    if run_idx > 0:
        print()

    print(label)

    # get prediction list
    pl = sklearn_wfv_regression(run_X, run_y, LinearRegression(), wfw, verbose = False)

    # return performance metrics, converted back to the original y scale where needed
    regression_kpis(pl, run_scaler)

RAW INPUT
complete!
MAE: 11906808065.2
RMSE: 49107432612.5
Sign Accuracy: 0.696

STANDARDIZED INPUT
complete!
MAE: 9274559092.04
RMSE: 35291046375.9
Sign Accuracy: 0.68

SCALED INPUT
complete!
MAE: 5494337356.19
RMSE: 19025176950.4
Sign Accuracy: 0.648
POLY INPUT




complete!
MAE: 0.125080246371
RMSE: 0.18032036061
Sign Accuracy: 0.56


## Sklearn NN Regressor

In [29]:
# (label, X, y, y-scaler) per input variant; the raw-input run stays disabled
nn_runs = [
    #('RAW INPUT', X, y, None),
    ('STANDARDIZED INPUT', standardized_X, standardized_y, y_standard_scaler),
    ('SCALED INPUT', scaled_X, scaled_y, y_minmax_scaler),
]

for run_idx, (label, run_X, run_y, run_scaler) in enumerate(nn_runs):

    # blank line between runs
    if run_idx > 0:
        print()

    print(label)

    # get prediction list
    pl = sklearn_wfv_regression(run_X, run_y, MLPRegressor(batch_size = 1), wfw, verbose = False)

    # return performance metrics on the original y scale
    regression_kpis(pl, run_scaler)

RAW INPUT
complete!
MAE: 0.114354320933
RMSE: 0.154675162788
Sign Accuracy: 0.568

STANDARDIZED INPUT
complete!
MAE: 0.038550235194
RMSE: 0.05067314203
Sign Accuracy: 0.728

SCALED INPUT
complete!
MAE: 0.0526837322194
RMSE: 0.065450698371
Sign Accuracy: 0.6




## RF Regressor

In [30]:
# get prediction list from a random forest with default hyperparameters
# (no random_state is passed, so results can differ between runs)
pl = sklearn_wfv_regression(X, y, RandomForestRegressor(), wfw, verbose = False)

# return performance metrics
regression_kpis(pl)

complete!
MAE: 0.033141612177
RMSE: 0.0427581290105
Sign Accuracy: 0.736


### Mini RF Grid Search

In [31]:
# grid: every n_estimators value is paired with every max_depth value
n_estimators = [40, 50, 60, 70, 80]
max_depth = [10, 20, 30, 40, 50]

for est in n_estimators:
    for dep in max_depth:

        rf_id = '%sEstimators_%sMaxDepth' % (est, dep)

        # walk-forward predictions for this hyperparameter pair
        forest = RandomForestRegressor(n_estimators = est, max_depth = dep)
        pl = sklearn_wfv_regression(X, y, forest, wfw, verbose = False)

        # label the result, then report its metrics
        print()
        print(rf_id)
        regression_kpis(pl)

complete!

40Estimators_10MaxDepth
MAE: 0.0297345161387
RMSE: 0.0397834987169
Sign Accuracy: 0.728
complete!

40Estimators_20MaxDepth
MAE: 0.0302394682319
RMSE: 0.0404184489603
Sign Accuracy: 0.752
complete!

40Estimators_30MaxDepth
MAE: 0.0287942270257
RMSE: 0.0387464577814
Sign Accuracy: 0.744
complete!

40Estimators_40MaxDepth
MAE: 0.0300642404448
RMSE: 0.0406639073046
Sign Accuracy: 0.752
complete!

40Estimators_50MaxDepth
MAE: 0.0298429688774
RMSE: 0.040317985352
Sign Accuracy: 0.744
complete!

50Estimators_10MaxDepth
MAE: 0.0296077625623
RMSE: 0.0395037987755
Sign Accuracy: 0.736
complete!

50Estimators_20MaxDepth
MAE: 0.0294467685121
RMSE: 0.0405240051644
Sign Accuracy: 0.736
complete!

50Estimators_30MaxDepth
MAE: 0.028511423171
RMSE: 0.0386141532792
Sign Accuracy: 0.76
complete!

50Estimators_40MaxDepth
MAE: 0.0294667930065
RMSE: 0.0398043902598
Sign Accuracy: 0.752
complete!

50Estimators_50MaxDepth
MAE: 0.0298261444218
RMSE: 0.0394517401842
Sign Accuracy: 0.784
complete!

60

KeyboardInterrupt: 

### Mini RF Grid Search (w/ Polynomial Features)

In [1]:
# same grid loop as above, run on the polynomial feature matrix with larger forests
n_estimators = [1000, 2500]
max_depth = [50]

for est in n_estimators:
    for dep in max_depth:

        rf_id = '%sEstimators_%sMaxDepth' % (est, dep)

        # walk-forward predictions; verbose = 2 prints every predicted / actual pair
        forest = RandomForestRegressor(n_estimators = est, max_depth = dep)
        pl = sklearn_wfv_regression(poly_X, y, forest, wfw, verbose = 2)

        # label the result, then report its metrics
        print()
        print(rf_id)
        regression_kpis(pl)

# Best Ensembles

**Random Forest**
 - Features: all models except "FROM"
     - 32Estimators_10MaxDepth: 2.9862% MAE
     - 32Estimators_20MaxDepth: 2.9865% MAE
 - Features: all models
     - 45Estimators_25MaxDepth: 2.8292% MAE
     - 40Estimators_40MaxDepth: 2.8421% MAE
     - 50Estimators_35MaxDepth: 2.8430% MAE
     - 40Estimators_10MaxDepth: 2.8471% MAE
     - 50Estimators_30MaxDepth: 2.8511% MAE
     - 35Estimators_10MaxDepth: 2.8516% MAE
     - 35Estimators_15MaxDepth: 2.8524% MAE
     

**Linear Regression**
 - 6hr LSTM regression
     - 3.0821% MAE
 - 6hr LSTM regression, 6hr XGB classification
     - 3.2053% MAE

# Miscellaneous Analyses
### What Happens When I Use All LSTM 6hr Models in a Random Forest?

In [4]:
predictions = {}
actual = {}

# per model: record[0][1] is kept as the prediction, record[0][0] as the actual
for model_id, records in all_lstm_6hr_regr_models.items():
    predictions[model_id] = [record[0][1] for record in records]
    actual[model_id] = [record[0][0] for record in records]

# one prediction column per model; target = row-wise mean of the models' actuals
all_lstm_6hr_df = pd.DataFrame(predictions)
all_lstm_6hr_df['target'] = pd.DataFrame(actual).mean(axis = 1)

In [5]:
# set X (all prediction columns) and y (the last column, 'target');
# positional .iloc / .values replace the removed .ix / .as_matrix()
X = all_lstm_6hr_df.iloc[:, :-1].values
y = all_lstm_6hr_df.iloc[:, -1].values

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


In [9]:
# get prediction list (default random forest on all LSTM 6hr model predictions)
# NOTE(review): wfw is not assigned in this section of the notebook — make sure the cell defining it has run
pl = sklearn_wfv_regression(X, y, RandomForestRegressor(), wfw, verbose = False)

# return performance metrics
regression_kpis(pl)

complete!
MAE: 0.0343085512257
RMSE: 0.0439898016517
Sign Accuracy: 0.696


In [10]:
# n_estimators = [25, 50, 75, 100]
# max_depth = [25, 50, 75]

# for est in n_estimators:
#     for dep in max_depth:
        
#         rf_id = '%sEstimators_%sMaxDepth' % (est, dep)
        
#         # get prediction list
#         pl = sklearn_wfv_regression(X, y, RandomForestRegressor(n_estimators = est, max_depth = dep), wfw, verbose = False)
        
#         print()
#         print(rf_id)
        
#         # return perormance metrics
#         regression_kpis(pl)

## Use Random Forest to Select the Top Models for Use in the Ensemble

In [2]:
hyperparams = {'n_estimators': 75, 'max_depth': 25}

# sum the feature importances of 250 independently fitted forests, one entry per model column
ranked_models = get_ranked_features(X, y,
                                    hyperparams,
                                    all_lstm_6hr_df.iloc[:,:-1].columns.tolist(),
                                    nb_epochs = 250
                                   )

# names of the ten highest-ranked models (uses the helper instead of a DataFrame round-trip)
get_top_rf_features(ranked_models, 10)

## Use Linear Regression to Select the Top Models for Use in the Ensemble

In [25]:
# init standard scaler
X_standard_scaler = StandardScaler()
y_standard_scaler = StandardScaler()

# standardize X and Y to analyze coefs; the scaler needs 2-D input, so y is
# scaled as a single column and flattened back to 1-D
standardized_X = X_standard_scaler.fit_transform(X)
standardized_y = y_standard_scaler.fit_transform(y.reshape(-1, 1)).ravel()

# fit a linear regression model
model = LinearRegression()
model.fit(standardized_X, standardized_y)

# get the coefficient of each model (column order matches X)
model_names = all_lstm_6hr_df.iloc[:,:-1].columns.tolist()
lr_coef = [{'model': m, 'coef': c} for m, c in zip(model_names, list(model.coef_))]



In [3]:
# ten models with the largest standardized coefficients
# NOTE(review): the sort uses the signed value, so large negative coefficients rank last — confirm this is intended
pd.DataFrame(lr_coef).sort_values('coef', ascending = False).model.tolist()[:10]

In [4]:
# top unique models, as per rf and lr (sorted so the displayed order is the same on every run)
sorted(set(top_lstm_6hr_models_as_per_rf + top_lstm_6hr_models_as_per_lr))