In [2]:
# this is still a very early version, and still working on making a framework that will get basic things going. Will enhance and polish all areas afterwards.
# todo:
#   add more features
#   add more data sources, options, vix index, 
#   implement bayesian optimization

In [3]:
# import libraries
from marketstackAPI import Marketstack
from yahooFinanceAPI import YahooFinance
from scipy import stats 
import pandas as pd
import numpy as np
import cufflinks as cf
import ta
import holidays
import matplotlib as plt
import plotly.graph_objects as go
import lightgbm as lgb
import multiprocessing
from datetime import datetime, timedelta
from imblearn.over_sampling import SMOTE
from IPython.core.display import display, HTML
import warnings
warnings.filterwarnings(action='ignore')


In [4]:
# jupyter notebook settings and chart size configs
display(HTML("<style>.container { width:100% !important; }</style>"))
plt.rcParams['figure.figsize'] = [12, 5]
plt.rcParams['figure.dpi'] = 200
pd.options.plotting.backend = "plotly"
cf.set_config_file(theme='henanigans',sharing='public',offline=True)

In [5]:
def raw_data_preprocessing(raw_data):
    """
    Clean raw_data by removing extra columns, renaming columns, order by date in descending order, reset index number.
    this data format will be used as the standard format for all other feature engineering related function calls.
    
    Parameters
    ----------
    raw_data : pandas dataframe that contains ['date','adj_high','adj_low','adj_close','adj_open','adj_volume'] columns, ordered by date in ascending order.
    
    Return:
    ----------
    standard_data: pandas dataframe that contains ['date','high','low','close','open','volume'] columns, ordered by date in descending order.
    
    """
    data = raw_data[:]
    data = data[['date','adj_high','adj_low','adj_close','adj_open','adj_volume']]
    data.columns = ['date','high','low','close','open','volume']
    data = data[::-1]
    data.reset_index(inplace=True, drop=True)
    return data

In [6]:
def get_ta_indicators(standard_data, prefix = ''):
    """
    Compute technical indicators for every period, each row within standard_data is a period.
    
    Parameters
    ----------
    standard_data : pandas dataframe that contains ['date','high','low','close','open','volume'] columns, ordered by date in descending order.
    prefix: string that will be concatennated to before all technical indicators names
    
    Return:
    ----------
    data: pandas dataframe that contains computed technical indicators with corrsponding name
    
    """
    data = standard_data[:]
    df = pd.DataFrame()
    df.insert(0, prefix+'_stochrsi_14' if prefix else 'stochrsi_14', ta.momentum.stochrsi(close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_mfi_14' if prefix else 'mfi_14', ta.volume.money_flow_index(high = data.high, low = data.low, close = data.close, volume= data.volume)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_adx_14' if prefix else 'adx_14', ta.trend.adx(high = data.high, low = data.low, close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_adx_neg_14' if prefix else 'adx_neg_14', ta.trend.adx_neg(high = data.high, low = data.low, close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_adx_pos_14' if prefix else 'adx_pos_14', ta.trend.adx_pos(high = data.high, low = data.low, close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_aroon_up_25' if prefix else 'aroon_up_25', ta.trend.aroon_up(close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_aroon_down_25' if prefix else 'aroon_down_25', ta.trend.aroon_down(close = data.close)/100) # range 0 to 100 rescaled to 0 to 1
    df.insert(0, prefix+'_aroon_25' if prefix else 'aroon_25', (ta.trend.aroon_up(close = data.close) - ta.trend.aroon_down(close = data.close))/100) # range 0 to 100 rescaled to 0 to 1
    
    return df

In [7]:
def get_percent_changes(standard_data, prefix = ''):
    """
    Compute basic % changes
    
    Parameters
    ----------
    standard_data : pandas dataframe that contains ['date','high','low','close','open','volume'] columns, ordered by date in descending order.
    prefix: string that will be concatennated to before all technical indicators names
    
    Return:
    ----------
    data: pandas dataframe that contains computed % changes indicators with corrsponding name
    
    """
    data = standard_data[:]
    df = pd.DataFrame()
    #add volume % change from yesterday to today
    df.insert(0,'volume_change',data.volume/data.volume.shift(1)-1)
    #add price % change from yesterday to today
    df.insert(0,'price_change',data.close/data.close.shift(1)-1)
    return df

In [109]:
def get_target_variable(standard_data):
    """
    Compute target variable.
    the target variable indicates three classes.
    2 : next day is going up significantly
    0: next day is going down significantly
    1 : no significant movement for the next day.
    
    How significant change is defined using more than 1% change at the moment. could be changing to something else.
    
    
    Parameters
    ----------
    standard_data : pandas dataframe that contains ['date','high','low','close','open','volume'] columns, ordered by date in descending order.
    
    Return:
    ----------
    data: pandas dataframe that contains target variable
    
    """
    data = standard_data[:]
    #creating Y
    #calculate daily % change using daily close using the NEXT day close / today close
    df = pd.DataFrame()
    target = data.close.shift(-1)/data.close-1
    target[target >  0.005] = 1 ## buy
    target[target < -0.005] = -1
    target[(target < 1) & (target > -1)] = 0
    target += 1
    df.insert(0,'target', target)
    df.insert(1,'change', data.close.shift(-1)/data.close-1)
    df.insert(2,'predictions', list(pd.get_dummies(df.target).values))
    return df

In [9]:
# remove common rows with nan from full_data and target and return new dataset
def remove_nan(full_data, target):
    to_keep = [not x for x in np.array(list(map(any,full_data.isna().values))) | np.array(list(map(any,target.isna().values)))]
    full_data = full_data[to_keep]
    target = target[to_keep]
    return full_data, target

In [67]:
def eval_strategy(results, data, prefix = ''):
    # the evaluation process would be a simulation of trading the stocks at the close.
    #   when the prediction is:
    #    0 : short/sell
    #    1 : unclear thus liquidate and wait for long/short signals
    #    2 : long/buy
    #   when there are consective singnals of buy or sell, the action would be to hold
    # assuming no commission per trade, and orders always fill at the close.
    max_balance = 10000
    eval_data = results.merge(data, left_index=True, right_index=True)
    beginning_balance = [10000]
    beginning_cash = [10000]
    shares_owned = [0]
    ending_balance = [10000]
    ending_cash = [10000]
    draw_down = [0]
    actions = ['liquidate']
    predictions = ['liquidate']
    pred_to_action = {
        0:'short',
        1:'liquidate',
        2:'long'
    }
    for idx, row in eval_data.iterrows():
        beginning_balance.append(ending_balance[-1])
        ending_balance.append(ending_cash[-1]+(shares_owned[-1])*row.close)
        beginning_cash.append(ending_cash[-1])
        ending_cash.append(ending_cash[-1])
        shares_owned.append(shares_owned[-1])
        # when VIX is in extreme outlier range would want to get out since we dont have enough training data during extrem times don't want to risk it
        # force liquidate if VIX is being an outlier in training dataset
        # doing boxcox transformation so the data can be as close to a normal distribution as possible to properly calculate mean and standard deviation to detect outlier.
        selected_VIX = raw_VIX.close[(raw_VIX.date > row.date + timedelta(days=-200)) & (raw_VIX.date < row.date)]
        normalized_VIX, fitted_lambda = stats.boxcox(selected_VIX) 
        n_VIX_mean = np.mean(normalized_VIX)
        n_VIX_std = np.std(normalized_VIX)
        transformed_VIX = (np.log(row.close_VIX) if fitted_lambda == 0 else (row.close_VIX**fitted_lambda - 1) / fitted_lambda)
#         print('len:',len(selected_VIX),'VIX:', row.close_VIX,'trans_VIX:',transformed_VIX, 'lambda: ', fitted_lambda, 'n_VIX_mean:', n_VIX_mean, 'n_VIX_std:', n_VIX_std, 'upper:', n_VIX_mean+1.5*n_VIX_std, 'lower:', n_VIX_mean-1.5*n_VIX_std)
        if ((transformed_VIX > (n_VIX_mean + 2 * n_VIX_std)) or (transformed_VIX < (n_VIX_mean - 2 * n_VIX_std))):
            ending_cash[-1] += shares_owned[-1] * row.close
            shares_owned[-1] = 0
            action = 'liquidate'
        # if same as previous
        elif pred_to_action[row.predictions.argmax()] == predictions[-1]:
            action = 'hold'
        # if liquidate 
        elif (row.predictions.argmax() == 1):
            #liquidate since vix is in outlier range don't want to extrapolate when vix is in extreme ranges
            ending_cash[-1] += shares_owned[-1] * row.close
            shares_owned[-1] = 0
            action = pred_to_action[row.predictions.argmax()]
        # if long
        elif row.predictions.argmax() == 2:
            ending_cash[-1] += shares_owned[-1] * row.close
            shares_owned[-1] = 0
#             shares_owned[-1] += np.floor((ending_cash[-1]/row.close)*(row.predictions[2]/(row.predictions[2]+row.predictions[0])))
#             ending_cash[-1] -=  np.floor((ending_cash[-1]/row.close)*(row.predictions[2]/(row.predictions[2]+row.predictions[0]))) * row.close
            shares_owned[-1] += np.floor((ending_cash[-1]/row.close))
            ending_cash[-1] -=  np.floor((ending_cash[-1]/row.close)) * row.close
            action = pred_to_action[row.predictions.argmax()]
        # if short
        elif row.predictions.argmax() == 0:
            ending_cash[-1] += shares_owned[-1] * row.close
            shares_owned[-1] = 0
#             shares_owned[-1] -= np.floor((ending_cash[-1]/row.close)*(row.predictions[0]/(row.predictions[2]+row.predictions[0])))
#             ending_cash[-1] -= -np.floor((ending_cash[-1]/row.close)*(row.predictions[0]/(row.predictions[2]+row.predictions[0]))) * row.close
            shares_owned[-1] -= np.floor((ending_cash[-1]/row.close))
            ending_cash[-1] -= -np.floor((ending_cash[-1]/row.close)) * row.close
            action = pred_to_action[row.predictions.argmax()]
        #calc
        max_balance = max(max_balance,ending_balance[-1])
        draw_down.append(ending_balance[-1]/max_balance - 1)
        actions.append(action)
        predictions.append(pred_to_action[row.predictions.argmax()])
    df = pd.DataFrame()
    df.insert(0, 'beginning_balance', beginning_balance)
    df.insert(1, 'beginning_cash', beginning_cash)
    df.insert(2, 'shares_owned', shares_owned)
    df.insert(3, 'ending_cash', ending_cash)
    df.insert(4, 'ending_balance', ending_balance)
    df.insert(5, 'draw_down', draw_down)
    df.insert(6, 'actions', actions)
    df.insert(7, 'ending_balance_percent_change', 1 - df.ending_balance.shift(1)/df.ending_balance)
    df.columns = [prefix+'_'+x if prefix else x for x in df.columns]
    return df[1:]

In [11]:
def eval_buy_and_hold(results, data, prefix = ''):
    # the evaluation process would be a simulation of trading the stocks at the close and always long/short as much as possible and liquidate the next day
    # assuming no commission per trade, and orders always fill at the close.
    max_balance = 10000
    eval_data = results.merge(data, left_index=True, right_index=True)
    beginning_balance = [10000]
    beginning_cash = [10000]
    shares_owned = [0]
    ending_balance = [10000]
    ending_cash = [10000]
    draw_down = [0]
    actions = ['liquidate']
    
    for idx, row in eval_data.iterrows():
        beginning_balance.append(ending_balance[-1])
        ending_balance.append(ending_cash[-1]+(shares_owned[-1])*row.close)
        beginning_cash.append(ending_cash[-1])
        ending_cash.append(ending_cash[-1])
        shares_owned.append(shares_owned[-1])
        
        #liquidate
        ending_cash[-1] += shares_owned[-1] * row.close
        shares_owned[-1] = 0
        action = 'liquidate'
        # long
        shares_owned[-1] += np.floor(ending_cash[-1]/row.close)
        ending_cash[-1] -= (ending_cash[-1]/row.close) * row.close
        action = 'long'

        #calc
        max_balance = max(max_balance,ending_balance[-1])
        draw_down.append(ending_balance[-1]/max_balance - 1)
        actions.append(action)
    df = pd.DataFrame()
    df.insert(0, 'beginning_balance', beginning_balance)
    df.insert(1, 'beginning_cash', beginning_cash)
    df.insert(2, 'shares_owned', shares_owned)
    df.insert(3, 'ending_cash', ending_cash)
    df.insert(4, 'ending_balance', ending_balance)
    df.insert(5, 'draw_down', draw_down)
    df.insert(6, 'actions', actions)
    df.columns = [prefix+'_'+x if prefix else x for x in df.columns]
    return df[1:]

In [69]:
# Since this is a timeseries dataset and because of my personal trading experience I'm claming/assuming the underlying relationship between features and target variables isn't stationary, 
# thus the traning method will be in walk-forward style instead of cross-validation; and for each iteration the model isn't going to using everydata available since they aren't as relevant.
# will try to play with weights in the future to see if applying less weights to "outdated" data will help the model.

def train_and_eval(full_data, target):
    training_window_size = 500 # days of data to train the model for each iteration. using about 3 years of data
    validation_window_size = 100
    predict_window_size  = 10  # days of data used to test and eval the model for each iteration. Using about 2 weeks of data 
                               # ideally the predict_window_size should be set to 1, but that would take too long to train.
                               # maybe I would try it when I have a good model with good parameters
    best_iteration = 1
    # check full_data len is more than training_window_size + predict_window_size if false throw error
    assert len(full_data) > training_window_size + predict_window_size, "full_data lenght is less than training_window_size + predict_window_size"
    predictions = []
    truths = []
    prediction_results = target[training_window_size+validation_window_size:]
    for i in range(training_window_size+validation_window_size,len(full_data),predict_window_size):
        validation_size = validation_window_size
        # setup train and test data
        train_x = full_data[i-training_window_size-validation_window_size:i-validation_window_size]
        valid_x = full_data[i-validation_window_size:i]
        test_x  = full_data[i:i+predict_window_size]
        
        train_y = target[i-training_window_size-validation_window_size:i-validation_window_size]
        valid_y = target[i-validation_window_size:i]
        test_y  = target[i:i+predict_window_size]
        
        # oversample trainning data to balance the dataset
        
        train_x['weights'] = np.array(range(1,len(train_x)+1))**1.5
        
        oversample = SMOTE(k_neighbors = 5, random_state = 0) 
        train_x, train_y = oversample.fit_resample(train_x, train_y.target)
        weights = train_x.pop('weights')
        validation_set_skip = False
        try:
            valid_x, valid_y = oversample.fit_resample(valid_x, valid_y.target)
            validation_data = lgb.Dataset(valid_x, label=valid_y, reference=train_data,free_raw_data=False)
        except:
            validation_set_skip = True # not enough samples vs neighbors, skipping validation for this iteration and keep using last iteration model parameters but trained with new data

        # create lgb.Dataset for both train and test for lightgbm library use
        train_data = lgb.Dataset(train_x, weight=weights, label=train_y)
        
        # setup lightgbm parameters
        param = {'metric': 'multi_logloss', 'objective': 'multiclass', 'num_class':3}
        param['learning_rate'] = 0.01
        param['max_depth'] = 5
        param['num_leaves'] = 5
        param['min_data_in_leaf'] = 2
        param['min_sum_hessian_in_leaf'] = 1e-3
        param['bagging_fraction'] = 0.9
        param['bagging_freq'] = 5
        param['bagging_seed'] = 3
        param['feature_fraction'] = 0.9
        param['feature_fraction_bynode'] = 0.9
        param['feature_fraction_seed'] = 2
        param['lambda_l1'] = 0.01
        param['lambda_l2'] = 0.01
        param['force_col_wise'] = True
        param['num_threads'] = multiprocessing.cpu_count()
        param['verbose'] = -1
        
        if not validation_set_skip:
#             print('using validation data to find best iteration')
            num_round = 1000
            bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data], early_stopping_rounds=5, verbose_eval=False)
            best_iteration = bst.best_iteration
#         print('using training only with last known good number of iterations')
        num_round = best_iteration
        train_x = full_data[i-training_window_size:i]
        train_y = target[i-training_window_size:i]
        train_x['weights'] = np.array(range(1,len(train_x)+1))**1.5
        train_x, train_y = oversample.fit_resample(train_x, train_y.target)
        weights = train_x.pop('weights')
        train_data = lgb.Dataset(train_x,label=train_y, weight=weights)
        
        bst = lgb.train(param, train_data, num_round, verbose_eval=False)
        pred = bst.predict(test_x, num_iteration_predict = bst.best_iteration)
        
        predictions.extend(pred)
        truths.extend(test_y)
    prediction_results.insert(2, 'predictions', predictions)
    
    return bst, prediction_results

In [13]:
def plot(standard_data):
    """
    Draw interactive candle stick chart OHLC Volume
    
    Parameters
    ----------
    standard_data : pandas dataframe that contains ['date','high','low','close','open','volume'] columns.
    
    """
    qf = cf.QuantFig(standard_data,legend='bottom')
    qf.add_volume()
    qf.iplot()

In [14]:
# get raw data

# switching from Marketstack to yahoofinance API because Marketstack doesn't have VIX info
# MS = Marketstack() # requires API key from Marketstack with basic plan to get 10 years worth of data
# raw_data2 = MS.get('IWM') 
YF = YahooFinance()
raw_data = YF.get('SPY')
raw_VIX = YF.get('^VIX')
raw_VIX3M = YF.get('^VIX3M')
# options = pd.read_csv('./QOA-MSFT.csv')

In [15]:
# msft_options

In [16]:
# # options.iloc[:,:39]
# # pd.concat([options.Date,options.iloc[:,27:39]],axis=1)
# msft_options = options.iloc[:,:4]
# columns_names = msft_options.columns.tolist()
# columns_names[0] = 'date'
# msft_options.columns = columns_names

# msft_options.date = [datetime.strptime(x, '%Y-%m-%d') for x in msft_options.date]

In [17]:
# data cleaning part 1
# changing datetime to values to have hour minute second to 0, 
# because datetime from yahoo finance has different hours and minutes due to daylight savings
# which causes problems with merging tables and it's extra info the model doesn't need

raw_data.date = raw_data.date.apply(lambda x:x.replace(hour=0, minute = 0, second = 0))
raw_VIX.date = raw_VIX.date.apply(lambda x:x.replace(hour=0, minute = 0, second = 0))
raw_VIX3M.date = raw_VIX3M.date.apply(lambda x:x.replace(hour=0, minute = 0, second = 0))


In [110]:
# runtimewarnings are produced from ta library but it's nothing to worry about for this project, will need to figure out a way to suppress this warning message.
# data prep steps then
# drop rows with nan
# naturally the last row contains nan since we don't have info from tomorrow. so we are also dropping the last row.
# but in production we will want to keep the last row so we can use it to make prediction
# for modeling and evaluation purposes it's not useful
# after this step the data is ready to use as training dataset
data = raw_data_preprocessing(raw_data)
VIX = raw_data_preprocessing(raw_VIX)
VIX3M = raw_data_preprocessing(raw_VIX3M)
indicators = get_ta_indicators(data, 'daily')
data = pd.concat([data,indicators], axis = 1) 
# concat/merge datasets to create full_data
data = pd.merge(data,VIX[['date','close']], on='date', suffixes=(None,'_VIX'))
data = pd.merge(data,VIX3M[['date','close']], on='date', suffixes=(None,'_VIX3M'))
# data = pd.merge(data,msft_options, on='date')
# creating a new feature using vix3m / vix
data['VIX3M/VIX'] = data.close_VIX3M/data.close_VIX
target = get_target_variable(data)
full_data = data.iloc[:,6:]

# last step is to remove rows with nan. i.e. first few rows that don't have enough days of data to compute averages etc, and the last row without future data to compute the targer.
full_data, target = remove_nan(full_data, target)


In [70]:
# this will take a while. To see training progress turn on verbose in lightgbm parameters
model, prediction_results = train_and_eval(full_data, target)

In [71]:
# show simulated trading vs buy and hold as benchmark
buy_and_hold_result = eval_buy_and_hold(prediction_results, data)
strategy_result = eval_strategy(prediction_results, data)
print('Max Drawdown:')
print('  - Buy and Hold:', str(round(100*min(buy_and_hold_result.draw_down),2))+'%')
print('  - LGB Strategy:', str(round(100*min(strategy_result.draw_down),2))+'%')
print('  - Longs:', sum(strategy_result.actions=='long'))
print('  - Shorts:', sum(strategy_result.actions=='short'))
print('  - Liquidate:', sum(strategy_result.actions=='liquidate'))

#align index number
strategy_result.index = prediction_results.index
buy_and_hold_result.index = prediction_results.index
#insert based on index number
def plot_result(data, benchmark, strategy):
    data = data[:] # making a copy so original data isn't altered.
    data.insert(1,'BNH_Strategy', benchmark.ending_balance)
    data.insert(1,'LGB_Strategy', strategy.ending_balance)
    data.insert(1,'LGB_Ending_Balance_Percent_Change', strategy.ending_balance_percent_change)
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=data.date[data.BNH_Strategy.notnull()], y=data.BNH_Strategy[data.BNH_Strategy.notnull()], mode='lines', name='Buy and Hold Strategy'))
    fig.add_trace(go.Scatter(x=data.date[data.BNH_Strategy.notnull()], y=data.LGB_Strategy[data.BNH_Strategy.notnull()], mode='lines', name='Lightgbm Strategy'))
    fig.show()
    return data
trading_data = plot_result(data, buy_and_hold_result, strategy_result)

Max Drawdown:
  - Buy and Hold: -33.72%
  - LGB Strategy: -48.58%
  - Longs: 126
  - Shorts: 146
  - Liquidate: 397


In [21]:
# # holiday info
# min_year = int(min(data['date'])[:4])-2
# max_year = int(max(data['date'])[:4])+2
# min_date = str(min_year)+'-01-01'
# max_date = str(max_year)+'-12-31'
# dates = pd.date_range(min_date,max_date).values
# holidays = holidays.UnitedStates(years=range(min_year,max_year))

In [None]:
trading_data.nlargest(10,'LGB_Ending_Balance_Percent_Change')

In [None]:
trading_data.nsmallest(10,'LGB_Ending_Balance_Percent_Change')

In [None]:
detailed_results = pd.concat((data,strategy_result),axis=1)
detailed_results['next_day_change'] = detailed_results.ending_balance_percent_change.shift(-1)
detailed_results.to_csv('./results.csv')

In [None]:
detailed_results.nsmallest(10,'next_day_change')

In [None]:
detailed_results.nlargest(10,'next_day_change')

(75, 77, 674)

In [83]:
p = np.array(list(map(list,prediction_results.predictions))).argmax(axis=1)

In [85]:
np.mean(p==1) ## liquidate

0.6211726384364821

In [87]:
np.mean(p==2) ## buy

0.18013029315960913

In [88]:
np.mean(p==0) ## sell

0.1986970684039088

In [89]:
np.bincount(p)

array([ 610, 1907,  553])

In [90]:
target

Unnamed: 0,target,change
0,1.0,0.005108
1,2.0,0.013874
2,1.0,-0.006842
3,1.0,-0.007050
4,2.0,0.018233
...,...,...
3665,1.0,0.007222
3666,1.0,-0.000666
3667,1.0,-0.000436
3668,1.0,0.001615


In [91]:
prediction_results

Unnamed: 0,target,change,predictions
600,2.0,0.024042,"[0.3891805432615454, 0.22261114358503092, 0.38..."
601,0.0,-0.023133,"[0.3891805432615454, 0.22261114358503092, 0.38..."
602,2.0,0.030832,"[0.39162894257651587, 0.2217188325012146, 0.38..."
603,2.0,0.034914,"[0.3898598615305706, 0.22299971341295338, 0.38..."
604,0.0,-0.016483,"[0.3898598615305706, 0.22299971341295338, 0.38..."
...,...,...,...
3665,1.0,0.007222,"[0.25793933498504407, 0.4727444133626097, 0.26..."
3666,1.0,-0.000666,"[0.25534640206112563, 0.4780446455663999, 0.26..."
3667,1.0,-0.000436,"[0.33512845210423203, 0.38715302793788536, 0.2..."
3668,1.0,0.001615,"[0.3590579302091627, 0.3433933647175502, 0.297..."


array([[0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       ...,
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0]], dtype=uint8)

array([[0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       ...,
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0]], dtype=uint8)

In [107]:
# target['predictions'] = pd.get_dummies(target.target).values
target['predictions'] = list(pd.get_dummies(target.target).values)

In [108]:
target

Unnamed: 0,target,change,predictions
0,1.0,0.005108,"[0, 1, 0]"
1,2.0,0.013874,"[0, 0, 1]"
2,1.0,-0.006842,"[0, 1, 0]"
3,1.0,-0.007050,"[0, 1, 0]"
4,2.0,0.018233,"[0, 0, 1]"
...,...,...,...
3665,1.0,0.007222,"[0, 1, 0]"
3666,1.0,-0.000666,"[0, 1, 0]"
3667,1.0,-0.000436,"[0, 1, 0]"
3668,1.0,0.001615,"[0, 1, 0]"


In [113]:
# show simulated trading of perfect trades vs buy and hold as benchmark
buy_and_hold_result = eval_buy_and_hold(prediction_results, data)
strategy_result = eval_strategy(target[600:], data)
print('Max Drawdown:')
print('  - Buy and Hold:', str(round(100*min(buy_and_hold_result.draw_down),2))+'%')
print('  - LGB Strategy:', str(round(100*min(strategy_result.draw_down),2))+'%')
print('  - Longs:', sum(strategy_result.actions=='long'))
print('  - Shorts:', sum(strategy_result.actions=='short'))
print('  - Liquidate:', sum(strategy_result.actions=='liquidate'))

#align index number
strategy_result.index = prediction_results.index
buy_and_hold_result.index = prediction_results.index
#insert based on index number
def plot_result(data, benchmark, strategy):
    data = data[:] # making a copy so original data isn't altered.
    data.insert(1,'BNH_Strategy', benchmark.ending_balance)
    data.insert(1,'LGB_Strategy', strategy.ending_balance)
    data.insert(1,'LGB_Ending_Balance_Percent_Change', strategy.ending_balance_percent_change)
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=data.date[data.BNH_Strategy.notnull()], y=data.BNH_Strategy[data.BNH_Strategy.notnull()], mode='lines', name='Buy and Hold Strategy'))
    fig.add_trace(go.Scatter(x=data.date[data.BNH_Strategy.notnull()], y=data.LGB_Strategy[data.BNH_Strategy.notnull()], mode='lines', name='Lightgbm Strategy'))
    fig.show()
    return data
trading_data = plot_result(data, buy_and_hold_result, strategy_result)

Max Drawdown:
  - Buy and Hold: -33.72%
  - LGB Strategy: 0.0%
  - Longs: 596
  - Shorts: 436
  - Liquidate: 872
