In [1]:
# custom
from wrappers import data_processing as dp
from wrappers import technical_analysis as ta
from wrappers import modeling as mdl

# basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import datetime
from datetime import datetime, date, timedelta
from random import randint
from time import sleep

# ml
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score as cv_score
from sklearn.ensemble import RandomForestRegressor
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

%matplotlib inline

Using TensorFlow backend.


In [2]:
# ground-truth target for the ensemble: forward rate of change of `target_col`
def get_target_values_for_ens(train_data, start_date, target_col, periods_ahead):
    """Return the forward rate of change of ``target_col`` from ``start_date`` onwards.

    ``train_data`` is modified in place: its index is replaced by the parsed
    ``date`` column, and the columns ``targ_shift`` (the value ``periods_ahead``
    rows later) and ``target`` (the rate of change) are added to it.

    Parameters
    ----------
    train_data : DataFrame holding a ``date`` column and the ``target_col`` column.
    start_date : inclusive lower bound compared against the datetime index.
    target_col : name of the column the rate of change is computed on.
    periods_ahead : number of rows to look ahead.

    Returns
    -------
    Series with ``(future - current) / current`` for every row whose index is
    on or after ``start_date``. Rows without a value ``periods_ahead`` rows
    later come out as NaN.
    """
    # index by timestamp so the start-date filter below can compare against it
    train_data.index = pd.to_datetime(train_data['date'])

    # value of the target column `periods_ahead` rows into the future
    current = train_data[target_col]
    train_data['targ_shift'] = current.shift(-periods_ahead)

    # relative change between the current and the future value
    train_data['target'] = (train_data['targ_shift'] - current) / current

    # keep only the rows from the ensemble start date onwards
    on_or_after_start = train_data.index >= start_date
    return train_data.loc[on_or_after_start, 'target']

# helper to get ranked features using random forest regression
def get_ranked_features(X, Y, model_params, col_names, nb_epochs):
    """Rank features by random-forest importance summed over repeated fits.

    Parameters
    ----------
    X, Y : training inputs and targets, passed straight to ``rf.fit``.
    model_params : dict of keyword arguments for ``RandomForestRegressor``.
    col_names : feature names, in the same order as the columns of ``X``.
    nb_epochs : number of forests to fit; importances are summed across fits.

    Returns
    -------
    List of ``(col_name, summed_importance)`` tuples sorted by importance,
    highest first. Empty when ``nb_epochs`` is 0 or ``col_names`` is empty.
    If a name occurs more than once in ``col_names`` its importances are
    summed under that single name.
    """
    scores = {}

    for _ in range(nb_epochs):

        # create / fit a new random forest model for every epoch
        rf = RandomForestRegressor(**model_params)
        rf.fit(X, Y)

        # accumulate this fit's importance onto each column's running total
        for col, score in zip(col_names, rf.feature_importances_):
            scores[col] = scores.get(col, 0.0) + score

    # sort the scores in descending order; a lambda key is used because the
    # notebook never imports `operator` (the previous itemgetter call raised NameError)
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    return sorted_scores

# helper: names of the first n entries of a ranked feature list
def get_top_rf_features(scores, n):
    """Return the first element of each of the first ``n`` entries of ``scores``.

    ``scores`` is expected to be a sequence of ``(name, score)`` pairs that is
    already ordered best-first; no sorting happens here.
    """
    top_features = []
    for entry in scores[:n]:
        top_features.append(entry[0])
    return top_features

# LOAD DATA

In [3]:
# load data
df = pd.read_csv('train_full_6hr_2017-09-28.csv')

# add dt index and filter down to dates that have twitter data
data_start_date = datetime(2017, 3, 2)
df.index = pd.to_datetime(df.date)
# .copy(): the target helper below adds columns in place, so give it an
# independent frame rather than a boolean-filtered slice of the raw data
df = df[df.index >= data_start_date].copy()

# add target col: rate of change of the median trade price 4 rows ahead
# NOTE(review): the file name suggests 6-hour rows, i.e. roughly 24h ahead - confirm
target_col = 'polo_usdteth_median_trade_price'
df['target'] = get_target_values_for_ens(df, data_start_date, target_col, 4)
# `axis` passed by keyword: the positional form was removed in pandas 2.0
df = df.drop(['date', 'targ_shift'], axis = 1)

# QA shape / NaN values
print('Init Samples:', df.shape[0])
print('NaN Samples:', df.shape[0]- df.dropna().shape[0])
df = df.dropna()

Init Samples: 841
NaN Samples: 4


In [4]:
# preview the first three rows of the prepared frame (features plus the 'target' column)
df.head(3)

Unnamed: 0_level_0,polo_btceth_open,polo_btceth_high,polo_btceth_low,polo_btceth_close,polo_btceth_volume,polo_btceth_mean_trade_price,polo_btceth_median_trade_price,polo_btceth_open_close_change,polo_btceth_candle_range,polo_btceth_high_low_ratio,...,twitter_hashETH_tweets_bear_multiprt_avg,twitter_hashETH_tweets_bull_multifav_avg,twitter_hashETH_tweets_bear_multifav_avg,twitter_hashETH_tweets_bull_multipfollow_avg,twitter_hashETH_tweets_bear_multipfollow_avg,twitter_hashETH_tweets_bull_multipverif_avg,twitter_hashETH_tweets_bear_multipverif_avg,twitter_hashETH_tweets_bull_multipcustom_avg,twitter_hashETH_tweets_bear_multipcustom_avg,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-03-02 00:00:00,0.0137,0.01425,0.013599,0.0141,363291.274791,0.013978,0.014,0.029222,0.000651,1.047869,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156534
2017-03-02 06:00:00,0.014115,0.014903,0.013901,0.014879,209002.744141,0.014381,0.014265,0.054153,0.001002,1.072079,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144889
2017-03-02 12:00:00,0.01488,0.01535,0.01453,0.014999,629504.941161,0.014988,0.015,0.007997,0.00082,1.056435,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028622


In [5]:
# set dfs for X and y ('target' is the last column, everything before it is a feature)
X_df = df.iloc[:,:-1]
# .values instead of .as_matrix(): as_matrix was removed in pandas 1.0
y = df.iloc[:,-1].values

# filter down to non-pattern features
X_df_no_paterns = X_df[[col for col in X_df.columns if '_pat_' not in col]]

# init PolynomialFeatures object (pairwise interaction terms only, no bias column)
poly = PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False).fit(X_df_no_paterns)

# newer scikit-learn only offers get_feature_names_out; fall back to the
# older get_feature_names when running on a release that predates it
poly_feature_names = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names

# create a df with the polynomial features
poly_df = pd.DataFrame(poly.transform(X_df_no_paterns),
                       columns = poly_feature_names(X_df_no_paterns.columns)
                      )

# set the polynomial X
poly_X = poly_df.values

print('poly shape:', poly_X.shape)
print('y shape:', y.shape)

poly shape: (837, 140715)
y shape: (837,)


In [6]:
# preview the first three rows of the polynomial feature frame
poly_df.head(3)

Unnamed: 0,polo_btceth_open,polo_btceth_high,polo_btceth_low,polo_btceth_close,polo_btceth_volume,polo_btceth_mean_trade_price,polo_btceth_median_trade_price,polo_btceth_open_close_change,polo_btceth_candle_range,polo_btceth_high_low_ratio,...,twitter_hashETH_tweets_bear_multipfollow_avg twitter_hashETH_tweets_bull_multipverif_avg,twitter_hashETH_tweets_bear_multipfollow_avg twitter_hashETH_tweets_bear_multipverif_avg,twitter_hashETH_tweets_bear_multipfollow_avg twitter_hashETH_tweets_bull_multipcustom_avg,twitter_hashETH_tweets_bear_multipfollow_avg twitter_hashETH_tweets_bear_multipcustom_avg,twitter_hashETH_tweets_bull_multipverif_avg twitter_hashETH_tweets_bear_multipverif_avg,twitter_hashETH_tweets_bull_multipverif_avg twitter_hashETH_tweets_bull_multipcustom_avg,twitter_hashETH_tweets_bull_multipverif_avg twitter_hashETH_tweets_bear_multipcustom_avg,twitter_hashETH_tweets_bear_multipverif_avg twitter_hashETH_tweets_bull_multipcustom_avg,twitter_hashETH_tweets_bear_multipverif_avg twitter_hashETH_tweets_bear_multipcustom_avg,twitter_hashETH_tweets_bull_multipcustom_avg twitter_hashETH_tweets_bear_multipcustom_avg
0,0.0137,0.01425,0.013599,0.0141,363291.274791,0.013978,0.014,0.029222,0.000651,1.047869,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.014115,0.014903,0.013901,0.014879,209002.744141,0.014381,0.014265,0.054153,0.001002,1.072079,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.01488,0.01535,0.01453,0.014999,629504.941161,0.014988,0.015,0.007997,0.00082,1.056435,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# sample for testing
#slice_X = poly_X[:,:1000]

In [8]:
# number of random-forest fits whose feature importances are collected in the ranking loop below
n_iterations = 10

In [9]:
# one score frame per fit; they are concatenated once after the loop
# (DataFrame.append inside a loop is quadratic and was removed in pandas 2.0)
score_frames = []

# loop through n times to reduce possibility of losing correlated features
for _ in range(n_iterations):
    
    # init model
    model = RandomForestRegressor(n_estimators = 10, verbose = 2)
    model.fit(poly_X, y)
    
    # map each column name to its importance in this fit
    scores = dict(zip(poly_df.columns.tolist(), model.feature_importances_))
    
    # add the dict of cols and their scores to a df
    score_df = pd.DataFrame(list(scores.items()),
                            columns = ['col', 'score']
                           )
    
    score_frames.append(score_df)

# master df with the scores of every fit stacked on top of each other
# (pd.concat rejects an empty list, hence the guard for n_iterations == 0)
xdf = pd.concat(score_frames) if score_frames else pd.DataFrame(columns = ['col', 'score'])

building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 11.2min finished


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 12.0min finished


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 12.5min finished


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 12.9min finished


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 12.6min finished


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 13.1min finished


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 13.0min finished


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 13.9min finished


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   59.8s remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 11.1min finished


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   59.1s remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 11.8min finished


### Note

Start: 10:10pm

End: 

In [18]:
# total each column's importance across all fits and order the columns by that total, highest first
feature_scores = xdf.groupby('col').sum().sort_values('score', ascending = False)

In [20]:
# names of the first 1000 rows of feature_scores (its index holds the column names)
top_1000_features = feature_scores[:1000].index.tolist()

## Select Top 100 Features from Top 1000

In [23]:
# restrict the polynomial feature frame to the selected columns
filtered_1000_poly_df = poly_df[top_1000_features]

In [28]:
# preview the first rows of the filtered feature frame
filtered_1000_poly_df.head()

Unnamed: 0,polo_usdtbtc_ultimate_osc polo_usdtbtc_close_roc_15,polo_btceth_open polo_usdteth_ema_180,polo_btceth_volume polo_usdteth_stoch_rsi_k,polo_usdteth_high_low_ratio2 polo_usdteth_stoch_rsi_k,polo_btceth_ema_180 polo_usdteth_ema_150,polo_btceth_ema_150 polo_usdteth_ema_150,polo_usdteth_roc_par_sar_5 twitter_hashETH_tweets_bull_count_avg,polo_usdtbtc_close_roc_15 polo_usdteth_adx_rating,polo_btceth_roc_tema_5 polo_usdtbtc_roc_mesa_fama_1,polo_btceth_ema_150 polo_usdteth_ema_180,...,polo_usdtbtc_stoch_rsi_k twitter_hashBTC_sent_neu_avg,polo_btceth_roc_dema_5 polo_btceth_roc_hilb_trans_5,polo_usdtbtc_open polo_usdteth_htp_cycleperiod,twitter_bitcoin_tweets_bull_multipfollow_avg twitter_cashETH_tweets_bull_count_avg,polo_usdteth_roc_ema_180_2 twitter_cashETH_tweets_bear_multiprt_avg,polo_usdtbtc_roc_par_sar_1 twitter_cashETH_tweets_bull_count_avg,polo_usdteth_high_low_ratio polo_usdteth_cmo,polo_btceth_roc_mesa_mama_3 polo_usdtbtc_roc_par_sar_1,polo_usdtbtc_roc_ema_60_5 polo_usdtbtc_roc_mesa_fama_5,twitter_bitcoin_sent_neg_avg twitter_ethereum_sent_pos_avg.1
0,2.598183,0.165489,23953340.0,4.941895,0.143974,0.144763,-0.0,1.628971,0.000413,0.141294,...,0.0,0.000845,32167.119665,0.0,0.0,0.0,63.256471,3.8e-05,0.00023,0.0
1,3.132187,0.17147,20900270.0,7.60656,0.145332,0.146204,-0.0,2.085499,0.000515,0.142609,...,0.0,0.001141,32773.847063,0.0,0.0,0.0,72.442434,0.00047,0.00034,0.002752
2,6.152602,0.181926,62950490.0,8.873706,0.146835,0.147793,-0.0,3.799457,5.1e-05,0.14405,...,0.0,0.001578,34624.024888,0.0,0.0,0.0,78.963114,0.000632,0.000388,0.002092
3,4.342691,0.184239,6336064.0,1.048397,0.148272,0.149316,-0.0,2.991584,6.3e-05,0.145438,...,0.0,0.002236,36971.797309,0.0,0.0,0.0,64.99822,0.001305,0.000403,0.002433
4,4.226024,0.18623,32170200.0,7.265899,0.150082,0.151241,0.0,3.253531,7.6e-05,0.147187,...,0.0,0.003122,39742.897163,0.0,0.0,0.0,76.201531,0.000618,0.000273,0.002226


In [52]:
# names of the columns that contain no negative values, in their original order
no_negs_mask = (filtered_1000_poly_df < 0).sum() == 0
cols_w_no_negs = no_negs_mask.index[no_negs_mask.values].tolist()

In [53]:
# get the log of all non-negative columns
# (zeros become -inf; numpy's divide-by-zero warning is silenced because
# those columns are removed right below)
with np.errstate(divide = 'ignore'):
    log_transformed_1000_poly_df = np.log(filtered_1000_poly_df[cols_w_no_negs])

# delete all cols with infinity (or NaN) values
log_transformed_1000_poly_df = log_transformed_1000_poly_df.replace([np.inf, -np.inf], np.nan)
# `axis` passed by keyword: the positional form was removed in pandas 2.0
log_transformed_1000_poly_df = log_transformed_1000_poly_df.dropna(axis = 1)

# add the 'log' prefix
log_transformed_1000_poly_df.columns = ['log ' + str(col) for col in log_transformed_1000_poly_df.columns.tolist()]

# reset index for easy joining
log_transformed_1000_poly_df = log_transformed_1000_poly_df.reset_index(drop = True)

  if __name__ == '__main__':


In [89]:
# (rows, columns) of the log-transformed frame after the inf/NaN columns were dropped
log_transformed_1000_poly_df.shape

(837, 100)

In [83]:
# get the square root of all non-negative columns
sqrt_transformed_1000_poly_df = np.sqrt(filtered_1000_poly_df[cols_w_no_negs])

# delete all cols with infinity (or NaN) values
sqrt_transformed_1000_poly_df = sqrt_transformed_1000_poly_df.replace([np.inf, -np.inf], np.nan)
# `axis` passed by keyword: the positional form was removed in pandas 2.0
sqrt_transformed_1000_poly_df = sqrt_transformed_1000_poly_df.dropna(axis = 1)

# add the 'sqrt' prefix
sqrt_transformed_1000_poly_df.columns = ['sqrt ' + str(col) for col in sqrt_transformed_1000_poly_df.columns.tolist()]

# reset index for easy joining
sqrt_transformed_1000_poly_df = sqrt_transformed_1000_poly_df.reset_index(drop = True)

In [84]:
# (rows, columns) of the sqrt-transformed frame
sqrt_transformed_1000_poly_df.shape

(837, 236)

In [91]:
# put the log- and sqrt-transformed frames side by side (index join; both frames
# had their index reset in their own cells) before attaching them to the main df
trans_1000_poly_df = log_transformed_1000_poly_df.join(sqrt_transformed_1000_poly_df)

In [95]:
# attach the transformed columns to the selected features; the left frame's index
# is reset first so the index join lines up with trans_1000_poly_df
_1000_df = filtered_1000_poly_df.reset_index(drop = True).join(trans_1000_poly_df)

In [99]:
# create X matrix
# (.values instead of .as_matrix(): as_matrix was removed in pandas 1.0)
poly_1000_trans_X = _1000_df.values

print('poly 1000 X:', poly_1000_trans_X.shape)
print('shape y:', y.shape)

poly 1000 X: (837, 1336)
shape y: (837,)


In [104]:
# set params for the second ranking pass:
# n_estimators is handed to RandomForestRegressor, n_iterations is the number of fits
# NOTE(review): this reassigns n_iterations, which an earlier cell set to 10
n_estimators = 1000
n_iterations = 100

In [105]:
# one score frame per fit; they are concatenated once after the loop
# (DataFrame.append inside a loop is quadratic and was removed in pandas 2.0)
score_frames_1000 = []

# loop through n times to reduce possibility of losing correlated features
for i in range(n_iterations):
    
    print('Starting iteration', i)
    
    # init model
    model = RandomForestRegressor(n_estimators = n_estimators)
    model.fit(poly_1000_trans_X, y)
    
    # map each column name to its importance in this fit
    scores = dict(zip(_1000_df.columns.tolist(), model.feature_importances_))
    
    # add the dict of cols and their scores to a df
    score_df = pd.DataFrame(list(scores.items()),
                            columns = ['col', 'score']
                           )
    
    score_frames_1000.append(score_df)

# master df with the scores of every fit stacked on top of each other
# (pd.concat rejects an empty list, hence the guard for n_iterations == 0)
xdf_1000 = pd.concat(score_frames_1000) if score_frames_1000 else pd.DataFrame(columns = ['col', 'score'])

Starting iteration 0
Starting iteration 1
Starting iteration 2
Starting iteration 3
Starting iteration 4
Starting iteration 5
Starting iteration 6
Starting iteration 7
Starting iteration 8
Starting iteration 9
Starting iteration 10
Starting iteration 11
Starting iteration 12
Starting iteration 13
Starting iteration 14
Starting iteration 15
Starting iteration 16
Starting iteration 17
Starting iteration 18
Starting iteration 19
Starting iteration 20
Starting iteration 21
Starting iteration 22
Starting iteration 23
Starting iteration 24
Starting iteration 25
Starting iteration 26
Starting iteration 27
Starting iteration 28
Starting iteration 29
Starting iteration 30
Starting iteration 31
Starting iteration 32
Starting iteration 33
Starting iteration 34
Starting iteration 35
Starting iteration 36
Starting iteration 37
Starting iteration 38
Starting iteration 39
Starting iteration 40
Starting iteration 41
Starting iteration 42
Starting iteration 43
Starting iteration 44
Starting iteration 4

In [144]:
# sort the cols by score and export to save a record of all col scores
tran_poly_feature_scores = xdf_1000.groupby('col').sum().sort_values('score', ascending = False)
# note: despite its name this file holds the summed score of every column, not just the top 100
tran_poly_feature_scores.to_csv('top_100_polynomial_features.csv')

# filter down to the 120 highest-scoring features
# NOTE(review): the section heading and the previous comment say "top 100" but 120 are kept - confirm
final_features = tran_poly_feature_scores[:120].index.tolist()
# .copy(): take ownership of the data so the index / column assignments below
# do not write into a slice of _1000_df (this raised SettingWithCopyWarning)
final_trans_poly_train_df = _1000_df[final_features].copy()

# restore the datetime index and add the target_col column
final_trans_poly_train_df.index = df.index
final_trans_poly_train_df[target_col] = df[target_col]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [145]:
# write the final df (selected features plus the target_col column) to a csv for use in training;
# the index is written as well, since to_csv keeps it by default
final_trans_poly_train_df.to_csv('filtered_trans_poly_train_2017-10-03.csv')