In [None]:
# ========================================================= Describe Stock Returns ============================================================= #
# Input data files are available in the read-only "../input/" directory on Kaggle
# ======================================================================================================================================== #
# # Install packages on the Kaggle notebook
# !pip install yfinance
# !pip install pandas-datareader
# !pip install seaborn
# !pip install arch
# !pip install mplfinance

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import mplfinance as fplt
import matplotlib.dates as mdates
import seaborn as sns
import yfinance as yf
import pandas_datareader.data as pdr # to access FRED
from pandas_datareader.famafrench import  get_available_datasets


# FRED API key (cf. https://fred.stlouisfed.org/docs/api/api_key.html)
def load_fred_api_key(env_var = "FRED_API_KEY", key_file = "./Test_data/.fred_apikey"):
    """
        Look up the FRED API key without hard-coding it in the notebook.

        INPUT
            env_var: name of the environment variable that is checked first
            key_file: path to a text file holding the key, used as a fallback
        OUTPUT
            the key as a string, or None when neither source provides one
    """
    # os.getenv() expects a variable NAME; handing it a file path (as before) always gave None
    key = (os.getenv(env_var) or '').strip()
    if key:
        return key

    if os.path.isfile(key_file):
        with open(key_file, encoding = 'utf-8') as key_fh:
            key = key_fh.read().strip()
        if key:
            return key

    return None

fred_api_key = load_fred_api_key()

# Define a function to download stock, bond, inflation data
def get_stock_data(ticker: str, start_date: str, end_date: str):
    """
        Download the price history of one ticker and attach returns, the dividend-price ratio,
        log volume, and the Fama-French daily risk-free rate.

        INPUT
            ticker: a Yahoo! Finance stock ticker
            start_date, end_date: start and end dates of data (forwarded to yfinance and pandas-datareader)
        OUTPUT
            a dataframe with a 'date' column, the columns delivered by yfinance, and the added columns
            'raw_return', 'log_return', 'price', 'dp', 'log_volume', and 'RF'; rows containing NaN are dropped
    """
    stock = yf.Ticker(ticker)
    stock_df = stock.history(start=start_date, end=end_date)
    stock_df.index = pd.to_datetime(stock_df.index).tz_localize(None)

    # simple total return: (P_t + D_t) / P_{t-1} - 1. The base is the previous close only;
    # adding the previous day's dividend to the denominator understated the return after every payout.
    # NOTE(review): if yfinance returns dividend-adjusted closes here, adding Dividends counts them twice -- confirm.
    stock_df['raw_return'] = ( (stock_df.Close + stock_df.Dividends) / stock_df.Close.shift(1) ) - 1.
    stock_df['log_return'] = np.log(stock_df['raw_return'] + 1.)
    stock_df['price'] = stock_df.Close

    # carry the most recent positive dividend forward (zero-dividend days are masked to NaN first);
    # .ffill() replaces the deprecated fillna(method = 'ffill')
    last_dividend = stock_df.Dividends.where(stock_df.Dividends > 0, np.nan).ffill()
    stock_df['dp'] = last_dividend / stock_df.Close

    # do the log-transform of volumes
    stock_df['log_volume'] = np.log(stock_df['Volume'])

    # get risk-free rate from Fama & French's data library (RF is divided by 100 before merging)
    ff_factors_daily_df = pdr.DataReader('F-F_Research_Data_Factors_daily', 'famafrench',  start = start_date, end = end_date, api_key = fred_api_key)
    stock_df = pd.merge(stock_df, ff_factors_daily_df[0].RF / 100, how = 'inner', left_index=True, right_index=True).drop_duplicates(keep='first').rename_axis('date').reset_index()
    stock_df.dropna(inplace = True)
    return stock_df

##### Download data
ticker, start_date, end_date = 'SPY', '2010-01-01', '2023-05-10'

# fetch the sample once; the plots further down all read from stock_df
stock_df = get_stock_data(ticker = ticker, start_date = start_date, end_date = end_date)
display( stock_df.head() )

##### Plot candlestick chart
# restrict the chart to the last 100 rows, indexed by date as mplfinance expects a DatetimeIndex
stock_small_df = stock_df.iloc[-100:, :].set_index('date')
MAs = (10, 30, 50)
kwargs = dict(type = 'candle', 
                        mav = MAs, # set sizes of moving averages
                        volume = True,
                        show_nontrading = True, 
                        returnfig = True,
                        figsize = (13, 10), 
                        figscale = 1.)
style = fplt.make_mpf_style(base_mpl_style = 'bmh', y_on_right = False, facecolor = 'w', figcolor = 'w', gridstyle = ':')
fig, axes = fplt.plot(stock_small_df, style = style, **kwargs, title = ticker, ylabel = 'Price ($)', ylabel_lower = 'Volume')

# draw a placeholder legend only to obtain one handle per artist on the price panel
axes[0].legend( [None]*(len(MAs) + 2) )
placeholder_legend = axes[0].get_legend()
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later removed the old name
if hasattr(placeholder_legend, 'legend_handles'):
    handles = placeholder_legend.legend_handles
else:
    handles = placeholder_legend.legendHandles
# skip the first two handles (presumably the candle artists -- confirm) and label the moving averages from MAs
axes[0].legend(handles=handles[2:], labels=[f'MA({window})' for window in MAs])

# savefig does not create missing folders
os.makedirs('./Results', exist_ok=True)
fig.savefig('./Results/candlestick_chart.png', dpi=300, bbox_inches="tight")

##### Line plots of close prices and returns
def plot_series_over_time(dates, values, label, color, ylabel):
    """
        Draw one time series as a line chart with quarterly minor ticks and ISO-formatted date labels.

        INPUT
            dates, values: x and y data handed to Axes.plot
            label: legend entry of the line
            color: line colour
            ylabel: text of the y-axis label
        OUTPUT
            the (figure, axes) pair that was created
    """
    fig, ax = plt.subplots( figsize=(13, 10) )
    ax.plot(dates, values, color=color, linestyle='-', linewidth=2.5, label=label)

    # set minor ticks every quarter
    ax.xaxis.set_minor_locator( mdates.MonthLocator(interval=3) )

    # set major ticks format
    ax.xaxis.set_major_formatter( mdates.DateFormatter('%Y-%m-%d') )

    ax.grid(ls=':')
    ax.set_xlabel('Date', fontsize=15)
    ax.set_ylabel(ylabel, fontsize=15)
    ax.tick_params(axis='x', labelrotation=10)
    ax.legend(fontsize=10, loc='lower right')
    return fig, ax

##### Line plot close prices
fig, ax = plot_series_over_time(stock_df.date, stock_df.Close, label='Closing Price', color='cyan', ylabel='Price ($)')
plt.show() # render the figure instead of printing the Axes repr

##### Line plot returns
fig, ax = plot_series_over_time(stock_df.date, stock_df.raw_return, label='return', color='magenta', ylabel='Raw return')
plt.show()

##### Histogram plot returns
# 20-bin histogram of raw returns with a kernel density estimate overlaid
sns.displot(stock_df.raw_return, kde=True, bins=20, color="r")
plt.show()

##### Plot autocorrelation of returns
fig, ax = plt.subplots( figsize=(10, 8) )
# ax is the current axes, so the Axes methods below act on the same axes as the pyplot calls would
ax.acorr(stock_df.raw_return)
ax.axhline(0, color ='red', linewidth = 2, ls = '--')
ax.grid(ls=':')
ax.set_xlabel('Lags', fontsize=15)
ax.set_ylabel('Autocorrelation', fontsize=15)
plt.show()


In [None]:
# ===============================Plot Importance Scores, SHAP Values, and Performance Statistics of a Trading Strategy invested in SPY over Time ===================================== #
# ====================================================================================================================================================== #
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import gc
import os
import sys
import datetime
from varname import nameof
import time

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.transforms as transforms
import seaborn as sns

#Gradient Color Bar Plots
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from matplotlib import colors as mcolors, path

from dask.distributed import Client, LocalCluster
import joblib
import multiprocessing
# import dask
# import distributed
# dask.config.set({"distributed.comm.timeouts.tcp": "100000s", "distributed.scheduler.allowed-failures": 999})
# num_cores = multiprocessing.cpu_count()
# number of Dask worker processes requested from LocalCluster(n_workers=...) further down in this cell;
# hard-coded instead of derived from multiprocessing.cpu_count() (see the disabled line above)
num_cores = 30


##### Set the current working directory
# All '../Results' and '../Data' paths used below are resolved relative to this directory.
# The location can be overridden with the FORECAST_STOCKS_NOTEBOOK_DIR environment variable,
# so the notebook is not tied to one machine; the default is the original path.
# NOTE: the name `path` also rebinds the `path` module imported from matplotlib above.
path = os.environ.get("FORECAST_STOCKS_NOTEBOOK_DIR", "e:/Copy/SCRIPTS/Forecast_Stocks/Jupyter_notebooks/")
os.chdir(path)

##### parse dates and times
def date_parser(date):
    ''' Re-format a 'month/day/year' string (pattern '%m/%d/%Y') as 'YYYY-MM-DD'. '''
    return datetime.datetime.strptime(date, '%m/%d/%Y').strftime('%Y-%m-%d')

##### parse dates and times
def date_parser2(date):
    ''' Re-format a colon-separated 'year:month:day' string (pattern '%Y:%m:%d') as 'YYYY-MM-DD'. '''
    return datetime.datetime.strptime(date, '%Y:%m:%d').strftime('%Y-%m-%d')

##### Plot importance scores, SHAP values, and cross-validation scores generated by a ML algorithm with a given loss function and a scoring function
def plot_test_scores(   algo = 'LGBM',
                                    loss_fn = 'As2',
                                    score_fn = 'Gain_to_pain_ratio_fixed_trans_cost',
                                    n_trials = 30,
                                    init_wealth = 1000,
                                    fixed_trans_cost_train = 10,
                                    variable_trans_cost_train = 0.005):
    ''' Plot importance scores, SHAP values, and cross-validation scores generated by a ML algorithm with a given loss function and a scoring function.
    INPUT
        algo, loss_fn, score_fn: select the results folder '../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/'
        n_trials, init_wealth, fixed_trans_cost_train, variable_trans_cost_train: values embedded in the names of the CSV files that are read
    OUTPUT
        True. Three PNG files (importance_scores.png, shap_vals.png, cv_scores_lineplot.png) are written to the 'graphs' sub-folder.
    '''

    results_dir = f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}'
    out_dir = f'{results_dir}/graphs'
    # exist_ok avoids the check-then-create race when several workers want the same folder
    os.makedirs( out_dir, exist_ok = True )

    # the three input files share everything but their prefix
    file_suffix = f'1000_n_trials_{n_trials}_init_wealth_{init_wealth}_fixed_trans_cost_{fixed_trans_cost_train}_variable_trans_cost_{variable_trans_cost_train}.csv'
    csv_kwargs = dict(engine = 'python', encoding = 'utf-8', skipinitialspace = True, sep = ',')
    date_columns = ['start_date', 'trans_date', 'end_date']

    importances_df = pd.read_csv(f'{results_dir}/importances_{file_suffix}', parse_dates = date_columns, index_col = 'end_date', **csv_kwargs)
    importances_df = importances_df.loc[:, 'dp':].astype(np.float64)
    # cap the cells at 50; NaN cells also become 50 because the comparison NaN <= 50 is False
    importances_df = importances_df.where(importances_df <= 50., 50.)
    importances_df.index = importances_df.index.to_series().dt.strftime("%Y-%m-%d")
    print('The shape of the importance scores dataframe is ', importances_df.shape)

    shap_vals_df = pd.read_csv(f'{results_dir}/SHAP_{file_suffix}', parse_dates = date_columns, index_col = 'end_date', **csv_kwargs)
    shap_vals_df = shap_vals_df.loc[:, 'dp':]
    # cap the cells at one (NaN cells become one as well, for the same reason as above)
    shap_vals_df = shap_vals_df.where(shap_vals_df <= 1., 1.)
    shap_vals_df.index = shap_vals_df.index.to_series().dt.strftime("%Y-%m-%d")
    print('The shape of the SHAP dataframe is ', shap_vals_df.shape)

    cv_scores_df = pd.read_csv(f'{results_dir}/scores_{file_suffix}', parse_dates = date_columns, index_col = 'end_date', **csv_kwargs)

    VIX_df = pd.read_csv(f'../Data/VIX.csv', parse_dates = ['date'], index_col = 'date', **csv_kwargs)

    # left join: keep every CV end date, VIX is NaN where no quote matches
    cv_scores_VIX_df = pd.merge(cv_scores_df, VIX_df, left_index = True, right_index = True, how = 'left')

    columns_to_remove = [  'start_date',	'trans_date',	'fit_time',	'score_time', 'test_Average_Precision', 'test_Precision',	'test_F1_score', 'test_Cross_entropy',	'test_As1_score', 'test_As2_score',	\
                                            'test_Boost_score',	'test_Brier_score', 'test_Gain_to_pain_ratio_variable_trans_cost', 'test_Calmar_ratio_fixed_trans_cost', 'test_Calmar_ratio_variable_trans_cost', \
                                            'test_Sharpe_ratio_variable_trans_cost', 'test_Sortino_ratio_variable_trans_cost', 'test_CECPP_variable_trans_cost']
    cv_scores_VIX_df = cv_scores_VIX_df.loc[:, ~cv_scores_VIX_df.columns.isin(columns_to_remove)] # drop some columns
    column_mapping = {  'test_Accuracy' : 'Accuracy', 
                                        'test_AUC': 'AUC', 
                                        'test_Gain_to_pain_ratio_fixed_trans_cost': 'Gain-to-pain ratio (fixed trans. cost)',
                                        'test_Sharpe_ratio_fixed_trans_cost': 'Sharpe ratio (fixed trans. cost)',
                                        'test_Sortino_ratio_fixed_trans_cost': 'Sortino ratio (fixed trans. cost)',
                                        'test_CECPP_fixed_trans_cost': 'CECPP (fixed trans. cost)'
                                    }
    cv_scores_VIX_df = cv_scores_VIX_df.rename(columns = column_mapping) # rename columns

    # columns_to_std = [ 'Calmar ratio (fixed trans. cost)', 'Calmar ratio (variable trans. cost)']
    # cv_scores_VIX_df[columns_to_std] = pd.DataFrame(StandardScaler(with_mean=False).fit_transform(cv_scores_VIX_df[columns_to_std].values), columns = columns_to_std, index = cv_scores_VIX_df.index)
    cv_scores_VIX_df.index = cv_scores_VIX_df.index.to_series().dt.strftime("%Y-%m-%d")
    print('The shape of the CV scores dataframe is ', cv_scores_VIX_df.shape)

    ##### plot heatmap of importance scores (features on the rows, end dates on the columns)
    fig, ax = plt.subplots( figsize=(15, 9) )
    sns.heatmap(importances_df.T, cmap = plt.get_cmap('cool'), ax = ax)
    ax.tick_params(axis='x', which='major', labelsize=9, labelrotation=70)
    ax.set_xlabel('End Date', fontsize=15)
    ax.set_ylabel('Features', fontsize=15)
    fig.savefig(f'{out_dir}/importance_scores.png', dpi=150, bbox_inches="tight")
    plt.close(fig) # close this figure explicitly rather than whichever one is current

    ##### plot heatmap of SHAP values
    fig, ax = plt.subplots( figsize=(15, 9) )
    sns.heatmap(shap_vals_df.T, cmap = plt.get_cmap('cool'), ax = ax)
    ax.tick_params(axis='x', which='major', labelsize=9, labelrotation=70)
    ax.set_xlabel('End Date', fontsize=15)
    ax.set_ylabel('Features', fontsize=15)
    fig.savefig(f'{out_dir}/shap_vals.png', dpi=150, bbox_inches="tight")
    plt.close(fig)

    ##### line plot cross-validation scores: one subplot per score column; the last column (VIX) goes on a twin axis
    axes = cv_scores_VIX_df.iloc[:, :-1].plot(kind = 'line', subplots = True, figsize = (20, 20), rot = 30, cmap = plt.get_cmap('hsv_r'), sharex = True, sharey = False)
    for i, ax in enumerate(axes):
        ax.axhline(0, ls = '--', linewidth = 3, color ='black') 
        ax.grid(ls=':')
        ax1 = ax.twinx()
        ax1.plot(cv_scores_VIX_df.index, cv_scores_VIX_df.VIX, color = "lightsteelblue")
        if i == 0:
            ax1.legend(['VIX'], loc='best', shadow = False)
        # ax.legend(fontsize=15, bbox_to_anchor=(1.0, 1.0), shadow=True)
        for tick_label in ax.xaxis.get_ticklabels() + ax.yaxis.get_ticklabels():
            tick_label.set_fontsize(13)
        ax.set_xlabel('')
    fig = axes[0].figure
    fig.text(0.5, 0.07, "End Date", ha="center", va="center", fontsize = 15)
    fig.text(0.1, 0.5, "CV Scores", ha="center", va="center", rotation=90, fontsize = 15)
    fig.savefig(f'{out_dir}/cv_scores_lineplot.png', dpi=150, bbox_inches="tight")
    # plt.show()
    plt.close(fig)

    del cv_scores_VIX_df, cv_scores_df, VIX_df
    gc.collect()

    return True

##### Plot the out-of-sample performance scores [of a trading strategy] generated by a ML algorithm with a given loss function and a scoring function
def plot_perf_scores(  algo = 'LGBM',
                                    loss_fn = 'As2',
                                    score_fn = 'Gain_to_pain_ratio_fixed_trans_cost',
                                    init_wealth = 1000,
                                    invest_window = 200,
                                    use_strategy = 'fixed_trans_cost',
                                    trans_cost = 0.5):
    ''' Plot the out-of-sample performance scores [of a trading strategy] generated by a ML algorithm with a given loss function and a scoring function.
    INPUT
        algo: a ML algorithm used to forecast
        loss_fn: a loss function used to train the model
        score_fn: a scoring function used to cross validate the model
        init_wealth: an initial wealth used to invest
        invest_window: an investment horizon
        use_strategy: the trading strategy using fixed/variable transaction costs
        trans_cost: an amount of transaction cost (i.e., fixed_trans_costs = [0.05, 0.1, 0.5, 1.0] for the fixed cost strategy 
                                                                            and  variable_trans_costs = [0.0005, 0.001, 0.005, 0.01] for the variable cost strategy)
    OUTPUT
        True. The line plot of the annualized excess return (with VIX on a twin axis) is saved as a PNG in the 'graphs' sub-folder.
    RAISES
        FileNotFoundError: when the performance CSV of the requested strategy/cost combination is missing
    '''

    results_dir = f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}'
    out_dir = f'{results_dir}/graphs'
    # parallel workers share this folder, so create it without a separate existence check
    os.makedirs( out_dir, exist_ok = True )

    perf_file = f'{results_dir}/performance/performance_hper_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}.csv'
    try:
        performance_df = pd.read_csv(perf_file, engine = 'python', encoding='utf-8', skipinitialspace=True, sep = ',', parse_dates=['start_date', 'end_date'], index_col = 'end_date')
    except FileNotFoundError as err:
        # only a missing file means "unknown strategy"; other errors (e.g. parsing) keep their own type and message
        raise FileNotFoundError(f'Strategy \'{use_strategy}\' does not exist! (no performance file at {perf_file})') from err

    performance_df = performance_df.drop(columns = ['start_date', 'ratio_profit_over_total_loss',  'annualized_return', 'annualized_return_bh'])

    VIX_df = pd.read_csv(f'../Data/VIX.csv', engine = 'python', encoding='utf-8', skipinitialspace=True, sep = ',', parse_dates = ['date'], index_col = 'date')
    # left join: keep every performance end date, VIX is NaN where no quote matches
    performance_df = pd.merge(performance_df, VIX_df, left_index = True, right_index = True, how = 'left')
    
    column_mapping = { 'average_number_of_trades': 'Average number of trades',
                                        'percentage_of_winning_trades': 'Percentage of winning trades',
                                        'largest_raw_return': 'Largest raw return',	
                                        'smallest_raw_return': 'Smallest raw return',	
                                        'ratio_win_loss': 'Win/loss ratio',
                                        'max_number_of_consecutive_winners': 'Maximum number of consecutive winners',
                                        'max_number_of_consecutive_losers': 'Maximum number of consecutive losers',
                                        'annualized_excess_return': 'Annualized excess return',
                                        'annualized_standard_deviation': 'Annualized standard deviation',	
                                        'max_drawdown': 'Maximum drawdown',
                                        'Schwager_gain-to-pain_ratio': 'Schwager\'s gain/pain ratio',
                                        'Calmar_ratio': 'Calmar ratio',
                                        'Sharpe_ratio': 'Sharpe ratio',
                                        'Sortino_ratio': 'Sortino ratio',
                                        'cecpp': 'CECPP',
                                        'mrar': 'Morningstar\'s risk-adjusted rating'
                                    }
    performance_df = performance_df.rename(columns = column_mapping)
    # display( performance_df.head() )
    print('The shape of the performance dataframe is ', performance_df.shape)
    
    ##### line plot the annualized excess return of a trading strategy
    ax = performance_df.loc[:, 'Annualized excess return'].plot(kind='line', subplots = False, figsize=(10, 7), rot=30, color = ['magenta'], sharex=True, sharey=True)
    ax.axhline(0, ls = '--', linewidth = 3, color ='black') 
    ax.grid(ls=':')
    ax1 = ax.twinx()
    ax1.plot(performance_df.index, performance_df.VIX, color = "lightsteelblue")
    ax1.legend(['VIX'], loc='best', shadow = False)
    # ax.legend(fontsize=15, bbox_to_anchor=(1.0, 1.0), shadow=True)
    for tick_label in ax.xaxis.get_ticklabels() + ax.yaxis.get_ticklabels():
        tick_label.set_fontsize(13)
    ax.set_xlabel('End Date', fontsize = 15, fontweight = 'medium')
    ax.set_ylabel('Annualized excess return', fontsize = 15, fontweight = 'medium')
    fig = ax.figure
    # fig.text(0.5, 0.15, "End Date", ha="center", va="center", fontsize = 15, fontweight = 'medium')
    # fig.text(0.09, 0.5, "Performance Statistics", ha="center", va="center", rotation = 90, fontsize = 15, fontweight = 'medium')
    fig.savefig(f'{out_dir}/OoS_perf_lineplot_AERet_{invest_window}_{use_strategy}_{trans_cost}.png', dpi=150, bbox_inches="tight")
    # plt.show()
    plt.close(fig) # close this figure explicitly rather than whichever one is current


    # ##### line plot the statistics of a trading strategy
    # axes = performance_df.iloc[:, :-1].plot(kind='line', subplots = True, figsize=(20, 20), rot=30, cmap = plt.get_cmap('gist_rainbow'), sharex=True, sharey=False)
    # for i, ax in enumerate(axes):
    #     ax.axhline(0, ls = '--', linewidth = 3, color ='black') 
    #     ax.grid(ls=':')
    #     ax1 = ax.twinx()
    #     ax1.plot(performance_df.index, performance_df.VIX, color = "lightsteelblue")
    #     if i == 0:
    #         ax1.legend(['VIX'], loc='best', shadow = False)
    #     # ax.legend(fontsize=15, bbox_to_anchor=(1.0, 1.0), shadow=True)
    #     [l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
    #     [l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
    #     ax.set_xlabel('')
    # fig=axes[0].figure
    # fig.text(0.5, 0.15, "End Date", ha="center", va="center", fontsize = 15, fontweight = 'medium')
    # fig.text(0.09, 0.5, "Performance Statistics", ha="center", va="center", rotation = 90, fontsize = 15, fontweight = 'medium')
    # fig.savefig(f'{out_dir}/OoS_perf_lineplot_{invest_window}_{use_strategy}_{trans_cost}.png', dpi=150, bbox_inches="tight")
    # # plt.show()
    # plt.close()

    # # ##### box plot the statistics of a trading strategy
    # performance_df.drop(columns = ['Average number of trades', 'Largest raw return', 'Smallest raw return', 'Maximum number of consecutive winners', \
    #                                                          'Maximum number of consecutive losers', 'Maximum drawdown', 'Calmar ratio'], axis = 1, inplace = True)

    # # scale up some performance metrics to put all the metrics roughly on the same scale                                               
    # # performance_df['Annualized standard deviation'] = 10*performance_df['Annualized standard deviation']
    # # performance_df['CECPP'] = 10*performance_df['CECPP']
    # performance_df['Morningstar\'s risk-adjusted rating'] = 10*performance_df['Morningstar\'s risk-adjusted rating']
    # performance_df['Annualized excess return'] = 10*performance_df['Annualized excess return']

    # fig, ax = plt.subplots( figsize=(13, 8) )
    # ax = sns.boxplot(data=performance_df.iloc[:, :-1])
    # ax = sns.stripplot(data=performance_df.iloc[:, :-1], jitter=0.2, dodge=True, color='gray', size=3, alpha=0.9)
    # ax.axhline(0, ls = '--', linewidth = 2, color ='red') 
    # ax.axhline(1, ls = '--', linewidth = 1, color ='red') 
    # # trans = transforms.blended_transform_factory(ax.get_yticklabels()[0].get_transform(), ax.transData)
    # ax.text(-0.62, 1, "1", ha="center", va="center")
    # ax.grid(ls=':')
    # ax.legend(fontsize=15)
    # ax.set_xlabel('Performance Statistics', fontsize=15)
    # ax.set_ylabel('')
    # ax.tick_params(axis='x', which='major', labelsize=9, labelrotation=45)
    # fig.savefig(f'{out_dir}/OoS_perf_boxplot_{invest_window}_{use_strategy}_{trans_cost}.png', dpi=150, bbox_inches="tight")
    # # print(ax)
    # plt.close()
    
    del performance_df, VIX_df
    gc.collect()
    
    return True

if __name__ == "__main__":
    # Driver: start a local Dask cluster, render the performance plots for every parameter combination
    # in parallel, and always shut the cluster down afterwards.
    startTime = time.time()

    # Define a list of algorithms employed
    # algos = ['XGB_SPY_all_vars', 'XGB_SPY_all_vars_plus_patterns']
    algos = ['RF_SPY_all_vars', 'RF_SPY_all_vars_plus_patterns']
    # algos = ['LGBM_SPY_all_vars', 'LGBM_SPY_all_vars_plus_patterns']

    # Define a list of loss functions used to train a ML model
    # loss_fns = ['CE', 'Brier', 'Boost', 'As1', 'As2']
    loss_fns = ['CE']

    # Define a list of score functions used to cross validate a ML algorithm
    score_fns = ['Accuracy', 'AUC',  'Gain_to_pain_ratio_fixed_trans_cost', 'Gain_to_pain_ratio_variable_trans_cost', 'Calmar_ratio_fixed_trans_cost', \
                        'Calmar_ratio_variable_trans_cost', 'Sharpe_ratio_fixed_trans_cost', 'Sharpe_ratio_variable_trans_cost', 'Sortino_ratio_fixed_trans_cost', \
                        'Sortino_ratio_variable_trans_cost', 'CECPP_fixed_trans_cost', 'CECPP_variable_trans_cost']    
    score_fns_fixed_trans_cost = ['Accuracy', 'AUC',  'Gain_to_pain_ratio_fixed_trans_cost', 'Calmar_ratio_fixed_trans_cost', \
                                                        'Sharpe_ratio_fixed_trans_cost',  'Sortino_ratio_fixed_trans_cost',  'CECPP_fixed_trans_cost']   
    score_fns_variable_trans_cost = ['Accuracy', 'AUC',  'Gain_to_pain_ratio_variable_trans_cost', 'Calmar_ratio_variable_trans_cost', \
                                                            'Sharpe_ratio_variable_trans_cost', 'Sortino_ratio_variable_trans_cost', 'CECPP_variable_trans_cost']  

    # Define a list of holding periods
    invest_windows = [100, 200]

    # Define transaction costs
    fixed_trans_costs = [0.05, 0.1, 0.5, 1.0] 
    variable_trans_costs = [0.0005, 0.001, 0.005, 0.01]

    cluster_kwargs = dict(n_workers=num_cores, processes=True, memory_limit='auto', threads_per_worker=1, scheduler_port=8786, dashboard_address='localhost:8787')
    cluster, client = None, None
    try:
        # client = Client('tcp://localhost:8786', timeout='2s')
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    except OSError:
        # Either object may not exist yet when the first attempt fails; closing an unbound name
        # would raise NameError and hide the original OSError.
        if client is not None:
            client.close()
        if cluster is not None:
            cluster.close()
        time.sleep(20)
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    print(client)

    try:
        # with joblib.parallel_backend('dask'):
        #     job_run = joblib.Parallel(verbose=20) (joblib.delayed(plot_test_scores)(algo = algo, 
        #                                                                             loss_fn = loss_fn, 
        #                                                                             score_fn = score_fn) \
        #                                                 for algo in algos
        #                                                     for loss_fn in loss_fns
        #                                                         for score_fn in score_fns)

        # time.sleep(30)

        with joblib.parallel_backend('dask'):
            job_run = joblib.Parallel(verbose=20) (joblib.delayed(plot_perf_scores)(algo = algo, 
                                                                                    loss_fn = loss_fn, 
                                                                                    score_fn = score_fn, 
                                                                                    invest_window = invest_window, 
                                                                                    use_strategy = 'fixed_trans_cost', 
                                                                                    trans_cost = trans_cost) \
                                                        for algo in algos
                                                            for loss_fn in loss_fns
                                                                for score_fn in score_fns_fixed_trans_cost
                                                                    for invest_window in invest_windows
                                                                        for trans_cost in fixed_trans_costs)
        # time.sleep(30)

        # with joblib.parallel_backend('dask'):
        #     job_run = joblib.Parallel(verbose=20) (joblib.delayed(plot_perf_scores)(algo = algo, 
        #                                                                             loss_fn = loss_fn, 
        #                                                                             score_fn = score_fn, 
        #                                                                             invest_window = invest_window, 
        #                                                                             use_strategy = 'variable_trans_cost', 
        #                                                                             trans_cost = trans_cost) \
        #                                                 for algo in algos
        #                                                     for loss_fn in loss_fns
        #                                                         for score_fn in score_fns_variable_trans_cost
        #                                                             for invest_window in invest_windows
        #                                                                 for trans_cost in variable_trans_costs)

        # time.sleep(30)
    finally:
        # release the workers and the fixed ports even when a plotting job raises
        client.close()
        cluster.close()

    print( 'The script took {} second !'.format(time.time() - startTime) )

In [1]:
# ===============================Plot Importance Scores, SHAP Values, and Performance Statistics of a Trading Strategy invested in BTC-USD over Time ================================= #
# ======================================================================================================================================================= #
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import gc
import os
import sys
import datetime
import time

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.transforms as transforms
import seaborn as sns

#Gradient Color Bar Plots
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from matplotlib import colors as mcolors, path

from dask.distributed import Client, LocalCluster
import joblib
import multiprocessing
# import dask
# import distributed
# dask.config.set({"distributed.comm.timeouts.tcp": "100000s", "distributed.scheduler.allowed-failures": 999})
# num_cores = multiprocessing.cpu_count()
# hard-coded worker count; the multiprocessing.cpu_count() alternative is disabled on the line above
# (presumably passed to LocalCluster(n_workers=...) later in this cell -- confirm)
num_cores = 30


# ##### Set the current working directory
# path="e:/Copy/SCRIPTS/Forecast_Stocks/Jupyter_notebooks/"
# os.chdir(path)

##### parse dates and times
def date_parser(date):
    ''' Re-format a 'month/day/year' string (pattern '%m/%d/%Y') as 'YYYY-MM-DD'. '''
    return datetime.datetime.strptime(date, '%m/%d/%Y').strftime('%Y-%m-%d')

##### parse dates and times
def date_parser2(date):
    ''' Re-format a colon-separated 'year:month:day' string (pattern '%Y:%m:%d') as 'YYYY-MM-DD'. '''
    return datetime.datetime.strptime(date, '%Y:%m:%d').strftime('%Y-%m-%d')

##### Plot importance scores, SHAP values, and cross-validation scores generated by a ML algorithm with a given loss function and a scoring function
def plot_test_scores(algo = 'LGBM',
                     loss_fn = 'As2',
                     tau: int = 1, # a forecast horizon
                     score_fn = 'Gain_to_pain_ratio_fixed_trans_cost',
                     n_trials = 30,
                     init_wealth = 1000,
                     fixed_trans_cost_train = 10,
                     variable_trans_cost_train = 0.005):
    ''' Plot importance scores, SHAP values, and cross-validation scores generated by a ML algorithm with a given loss function and a scoring function.

    The importance, SHAP and score CSV files are read from
    '../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/tau={tau}/' and the VIX series from '../Data/VIX.csv'.
    Three PNG files (two heatmaps and one multi-panel line plot) are written to
    '../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/graphs'.

    INPUT
        algo, loss_fn, score_fn, tau: select the results directory to read from / write to
        n_trials, init_wealth, fixed_trans_cost_train, variable_trans_cost_train: used here only to build
            the names of the CSV files to read
    OUTPUT
        True once all graphs have been saved
    '''

    results_dir = f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}'
    out_dir = f'{results_dir}/graphs'
    # exist_ok=True instead of "check, then create": the latter can raise FileExistsError when
    # several parallel workers try to create the same directory at the same time.
    os.makedirs(out_dir, exist_ok = True)

    tau_dir = f'{results_dir}/tau={tau}'
    # common tail of the three result file names (the leading '1000' is part of the stored file names)
    run_tag = f'1000_n_trials_{n_trials}_init_wealth_{init_wealth}_fixed_trans_cost_{fixed_trans_cost_train}_variable_trans_cost_{variable_trans_cost_train}.csv'
    csv_options = dict(engine = 'python', encoding = 'utf-8', skipinitialspace = True, sep = ',',
                       parse_dates = ['start_date', 'trans_date', 'end_date'], index_col = 'end_date')

    ##### importance scores: keep the columns from 'dp' onwards
    importances_df = pd.read_csv(f'{tau_dir}/importances_{run_tag}', **csv_options)
    importances_df = importances_df.loc[:, 'dp':].astype(np.float64)
    # cap the cells at 50; cells for which the comparison is False (including NaN) are set to 50 as well
    importances_df = importances_df.where(importances_df <= 50., 50.)
    importances_df.index = importances_df.index.to_series().dt.strftime("%Y-%m-%d")
    print('The shape of the importance scores dataframe is ', importances_df.shape)

    ##### SHAP values: keep the columns from 'dp' onwards
    shap_vals_df = pd.read_csv(f'{tau_dir}/SHAP_{run_tag}', **csv_options)
    shap_vals_df = shap_vals_df.loc[:, 'dp':]
    # cap the cells at one; cells for which the comparison is False (including NaN) are set to one as well
    shap_vals_df = shap_vals_df.where(shap_vals_df <= 1., 1.)
    shap_vals_df.index = shap_vals_df.index.to_series().dt.strftime("%Y-%m-%d")
    print('The shape of the SHAP dataframe is ', shap_vals_df.shape)

    ##### cross-validation scores, left-joined with the VIX on the end date
    cv_scores_df = pd.read_csv(f'{tau_dir}/scores_{run_tag}', **csv_options)
    VIX_df = pd.read_csv(f'../Data/VIX.csv', engine = 'python', encoding = 'utf-8', skipinitialspace = True, sep = ',',
                         parse_dates = ['date'], index_col = 'date')
    cv_scores_VIX_df = pd.merge(cv_scores_df, VIX_df, left_index = True, right_index = True, how = 'left')

    columns_to_remove = ['start_date', 'trans_date', 'fit_time', 'score_time', 'test_Average_Precision', 'test_Precision', 'test_F1_score',
                         'test_Cross_entropy', 'test_As1_score', 'test_As2_score', 'test_Boost_score', 'test_Brier_score',
                         'test_Gain_to_pain_ratio_variable_trans_cost', 'test_Calmar_ratio_fixed_trans_cost', 'test_Calmar_ratio_variable_trans_cost',
                         'test_Sharpe_ratio_variable_trans_cost', 'test_Sortino_ratio_variable_trans_cost', 'test_CECPP_variable_trans_cost']
    column_mapping = {'test_Accuracy': 'Accuracy',
                      'test_AUC': 'AUC',
                      'test_Gain_to_pain_ratio_fixed_trans_cost': 'Gain-to-pain ratio (fixed trans. cost)',
                      'test_Sharpe_ratio_fixed_trans_cost': 'Sharpe ratio (fixed trans. cost)',
                      'test_Sortino_ratio_fixed_trans_cost': 'Sortino ratio (fixed trans. cost)',
                      'test_CECPP_fixed_trans_cost': 'CECPP (fixed trans. cost)'}
    # drop the unwanted columns and rename the rest in one chained step (no in-place edit of a slice)
    cv_scores_VIX_df = cv_scores_VIX_df.loc[:, ~cv_scores_VIX_df.columns.isin(columns_to_remove)].rename(columns = column_mapping)

    # (disabled) standardize the Calmar ratios
    # columns_to_std = [ 'Calmar ratio (fixed trans. cost)', 'Calmar ratio (variable trans. cost)']
    # cv_scores_VIX_df[columns_to_std] = pd.DataFrame(StandardScaler(with_mean=False).fit_transform(cv_scores_VIX_df[columns_to_std].values), columns = columns_to_std, index = cv_scores_VIX_df.index)
    cv_scores_VIX_df.index = cv_scores_VIX_df.index.to_series().dt.strftime("%Y-%m-%d")
    print('The shape of the CV scores dataframe is ', cv_scores_VIX_df.shape)

    ##### plot heatmaps of the importance scores and of the SHAP values (features on the y-axis, end dates on the x-axis)
    for frame, file_stem in ((importances_df, 'importance_scores'), (shap_vals_df, 'shap_vals')):
        fig, ax = plt.subplots( figsize = (15, 9) )
        sns.heatmap(frame.T, cmap = plt.get_cmap('cool'), ax = ax)
        ax.tick_params(axis = 'x', which = 'major', labelsize = 9, labelrotation = 70)
        ax.set_xlabel('End Date', fontsize = 15)
        ax.set_ylabel('Features', fontsize = 15)
        fig.savefig(f'{out_dir}/{file_stem}_tau={tau}.png', dpi = 150, bbox_inches = "tight")
        plt.close(fig)

    ##### line plot cross-validation scores, one panel per score with the VIX overlaid on a secondary y-axis
    # iloc[:, :-1] leaves out the last column - assumes VIX.csv contributes a single 'VIX' column, merged last (TODO confirm)
    axes = cv_scores_VIX_df.iloc[:, :-1].plot(kind = 'line', subplots = True, figsize = (20, 20), rot = 30, cmap = plt.get_cmap('hsv_r'), sharex = True, sharey = False)
    for i, ax in enumerate(axes):
        ax.axhline(0, ls = '--', linewidth = 3, color = 'black')
        ax.grid(ls = ':')
        ax1 = ax.twinx()
        ax1.plot(cv_scores_VIX_df.index, cv_scores_VIX_df.VIX, color = "lightsteelblue")
        if i == 0:
            # label the VIX line once, on the first panel only
            ax1.legend(['VIX'], loc = 'best', shadow = False)
        for axis in (ax.xaxis, ax.yaxis):
            for tick_label in axis.get_ticklabels():
                tick_label.set_fontsize(13)
        ax.set_xlabel('')
    fig = axes[0].figure
    # shared axis titles for the whole figure
    fig.text(0.5, 0.07, "End Date", ha = "center", va = "center", fontsize = 15)
    fig.text(0.1, 0.5, "CV Scores", ha = "center", va = "center", rotation = 90, fontsize = 15)
    fig.savefig(f'{out_dir}/cv_scores_lineplot_tau={tau}.png', dpi = 150, bbox_inches = "tight")
    plt.close(fig)

    del cv_scores_VIX_df, cv_scores_df, VIX_df
    gc.collect()

    return True

##### Plot the out-of-sample performance scores [of a trading strategy] generated by a ML algorithm with a given loss function and a scoring function
def plot_perf_scores(algo = 'LGBM',
                     loss_fn = 'As2',
                     tau: int = 1, # a forecast horizon
                     score_fn = 'Gain_to_pain_ratio_fixed_trans_cost',
                     init_wealth = 1000,
                     invest_window = 200,
                     use_strategy = 'fixed_trans_cost',
                     trans_cost = 0.5):
    ''' Plot the out-of-sample performance scores [of a trading strategy] generated by a ML algorithm with a given loss function and a scoring function.

    The performance CSV file is read from
    '../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/tau={tau}/performance/' and the BTC-USD price from
    '../Data/BTC-USD/BTC-USD.csv'. Two PNG files are written to '../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/graphs'.

    INPUT
        algo: a ML algorithm used to forecast
        loss_fn: a loss function used to train the model
        tau: a forecast horizon
        score_fn: a scoring function used to cross validate the model
        init_wealth: an initial wealth used to invest
        invest_window: an investment horizon
        use_strategy: the trading strategy using fixed/variable transaction costs
        trans_cost: an amount of transaction cost (i.e., fixed_trans_costs = [0.05, 0.1, 0.5, 1.0] for the fixed cost strategy
                                                                            and  variable_trans_costs = [0.0005, 0.001, 0.005, 0.01] for the variable cost strategy)
    OUTPUT
        True once the Matplotlib graphs have been saved
    RAISES
        Exception when the performance file cannot be read (the underlying error is chained as the cause)
    '''

    out_dir = f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/graphs'
    # exist_ok=True instead of "check, then create": parallel tasks that differ only in
    # invest_window / trans_cost share this directory and would otherwise race to create it.
    os.makedirs(out_dir, exist_ok = True)

    perf_file = (f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/tau={tau}/performance/'
                 f'performance_hper_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}.csv')
    try:
        performance_df = pd.read_csv(perf_file, engine = 'python', encoding = 'utf-8', skipinitialspace = True, sep = ',',
                                     parse_dates = ['start_date', 'end_date'], index_col = 'end_date')
    except Exception as err:
        # Not a bare `except:` - KeyboardInterrupt/SystemExit must pass through untouched.
        # The original error is chained and the path reported, since any of the path components may be the culprit.
        raise Exception(f"Strategy '{use_strategy}' does not exist! (could not read {perf_file})") from err

    performance_df = performance_df.drop(columns = ['start_date', 'ratio_profit_over_total_loss', 'annualized_return', 'annualized_return_bh'])

    # append the BTC-USD price as the last column (left join on the end date)
    BTC_df = pd.read_csv(f'../Data/BTC-USD/BTC-USD.csv', engine = 'python', encoding = 'utf-8', skipinitialspace = True, sep = ',',
                         parse_dates = ['date'], index_col = 'date')
    performance_df = pd.merge(performance_df, BTC_df['price'], left_index = True, right_index = True, how = 'left')

    column_mapping = {'average_number_of_trades': 'Average number of trades',
                      'percentage_of_winning_trades': 'Percentage of winning trades',
                      'largest_raw_return': 'Largest raw return',
                      'smallest_raw_return': 'Smallest raw return',
                      'ratio_win_loss': 'Win/loss ratio',
                      'max_number_of_consecutive_winners': 'Maximum number of consecutive winners',
                      'max_number_of_consecutive_losers': 'Maximum number of consecutive losers',
                      'annualized_excess_return': 'Annualized excess return',
                      'annualized_standard_deviation': 'Annualized standard deviation',
                      'max_drawdown': 'Maximum drawdown',
                      'Schwager_gain-to-pain_ratio': 'Schwager\'s gain/pain ratio',
                      'Calmar_ratio': 'Calmar ratio',
                      'Sharpe_ratio': 'Sharpe ratio',
                      'Sortino_ratio': 'Sortino ratio',
                      'cecpp': 'CECPP',
                      'mrar': 'Morningstar\'s risk-adjusted rating',
                      'price': 'BTC-USD'}
    performance_df = performance_df.rename(columns = column_mapping)
    print('The shape of the performance dataframe is ', performance_df.shape)

    ##### line plot the annualized excess return of a trading strategy, with BTC-USD on a secondary y-axis
    ax = performance_df.loc[:, 'Annualized excess return'].plot(kind = 'line', subplots = False, figsize = (10, 7), rot = 30, color = ['magenta'], sharex = True, sharey = True)
    ax.axhline(0, ls = '--', linewidth = 3, color = 'black')
    ax.grid(ls = ':')
    ax1 = ax.twinx()
    ax1.plot(performance_df.index, performance_df['BTC-USD'], color = "lightsteelblue")
    ax1.legend(['BTC-USD'], loc = 'best', shadow = False)
    for axis in (ax.xaxis, ax.yaxis):
        for tick_label in axis.get_ticklabels():
            tick_label.set_fontsize(13)
    ax.set_xlabel('End Date', fontsize = 15, fontweight = 'medium')
    ax.set_ylabel('Annualized excess return', fontsize = 15, fontweight = 'medium')
    fig = ax.figure
    fig.savefig(f'{out_dir}/OoS_perf_lineplot_AERet_{invest_window}_{use_strategy}_{trans_cost}_tau={tau}.png', dpi = 150, bbox_inches = "tight")
    plt.close(fig)

    ##### line plot the statistics of a trading strategy, one panel per statistic
    # iloc[:, :-1] leaves out the last column, i.e. the 'BTC-USD' price merged above; it is overlaid on every panel instead
    axes = performance_df.iloc[:, :-1].plot(kind = 'line', subplots = True, figsize = (20, 20), rot = 30, cmap = plt.get_cmap('gist_rainbow'), sharex = True, sharey = False)
    for i, ax in enumerate(axes):
        ax.axhline(0, ls = '--', linewidth = 3, color = 'black')
        ax.grid(ls = ':')
        ax1 = ax.twinx()
        ax1.plot(performance_df.index, performance_df['BTC-USD'], color = "lightsteelblue")
        if i == 0:
            # label the BTC-USD line once, on the first panel only
            ax1.legend(['BTC-USD'], loc = 'best', shadow = False)
        for axis in (ax.xaxis, ax.yaxis):
            for tick_label in axis.get_ticklabels():
                tick_label.set_fontsize(13)
        ax.set_xlabel('')
    fig = axes[0].figure
    # shared axis titles for the whole figure
    fig.text(0.5, 0.15, "End Date", ha = "center", va = "center", fontsize = 15, fontweight = 'medium')
    fig.text(0.09, 0.5, "Performance Statistics", ha = "center", va = "center", rotation = 90, fontsize = 15, fontweight = 'medium')
    # NOTE(review): unlike the file saved above, this file name does not contain tau, so runs with
    # different forecast horizons overwrite each other - left unchanged in case other material depends on the name.
    fig.savefig(f'{out_dir}/OoS_perf_lineplot_{invest_window}_{use_strategy}_{trans_cost}.png', dpi = 150, bbox_inches = "tight")
    plt.close(fig)

    # # ##### (disabled) box plot the statistics of a trading strategy
    # performance_df.drop(columns = ['Average number of trades', 'Largest raw return', 'Smallest raw return', 'Maximum number of consecutive winners', \
    #                                                          'Maximum number of consecutive losers', 'Maximum drawdown', 'Calmar ratio'], axis = 1, inplace = True)

    # # scale up some performance metrics to put all the metrics roughly on the same scale
    # # performance_df['Annualized standard deviation'] = 10*performance_df['Annualized standard deviation']
    # # performance_df['CECPP'] = 10*performance_df['CECPP']
    # performance_df['Morningstar\'s risk-adjusted rating'] = 10*performance_df['Morningstar\'s risk-adjusted rating']
    # performance_df['Annualized excess return'] = 10*performance_df['Annualized excess return']

    # fig, ax = plt.subplots( figsize=(13, 8) )
    # ax = sns.boxplot(data=performance_df.iloc[:, :-1])
    # ax = sns.stripplot(data=performance_df.iloc[:, :-1], jitter=0.2, dodge=True, color='gray', size=3, alpha=0.9)
    # ax.axhline(0, ls = '--', linewidth = 2, color ='red')
    # ax.axhline(1, ls = '--', linewidth = 1, color ='red')
    # # trans = transforms.blended_transform_factory(ax.get_yticklabels()[0].get_transform(), ax.transData)
    # ax.text(-0.62, 1, "1", ha="center", va="center")
    # ax.grid(ls=':')
    # ax.legend(fontsize=15)
    # ax.set_xlabel('Performance Statistics', fontsize=15)
    # ax.set_ylabel('')
    # ax.tick_params(axis='x', which='major', labelsize=9, labelrotation=45)
    # fig.savefig(f'{out_dir}/OoS_perf_boxplot_{invest_window}_{use_strategy}_{trans_cost}.png', dpi=150, bbox_inches="tight")
    # plt.close()

    del performance_df, BTC_df
    gc.collect()

    return True

if __name__ == "__main__":
    # Driver: start a local Dask cluster, render the out-of-sample performance graphs in parallel
    # for every (algorithm, loss function, score function, holding period, transaction cost)
    # combination, then shut the cluster down and report the elapsed time.
    startTime = time.time()

    # Define a list of algorithms employed
    # algos = ['XGB_SPY_all_vars', 'XGB_SPY_all_vars_plus_patterns']
    algos = ['RF_BTC_all_vars', 'RF_BTC_all_vars_plus_patterns']
    # algos = ['LGBM_SPY_all_vars', 'LGBM_SPY_all_vars_plus_patterns']

    # Define a list of loss functions used to train a ML model
    # loss_fns = ['CE', 'Brier', 'Boost', 'As1', 'As2']
    loss_fns = ['CE']

    # Define the forecast horizon
    tau = 1

    # Define a list of score functions used to cross validate a ML algorithm
    # (score_fns and the variable-cost lists are only used by the disabled runs further down)
    score_fns = ['Accuracy', 'AUC', 'Gain_to_pain_ratio_fixed_trans_cost', 'Gain_to_pain_ratio_variable_trans_cost', 'Calmar_ratio_fixed_trans_cost',
                 'Calmar_ratio_variable_trans_cost', 'Sharpe_ratio_fixed_trans_cost', 'Sharpe_ratio_variable_trans_cost', 'Sortino_ratio_fixed_trans_cost',
                 'Sortino_ratio_variable_trans_cost', 'CECPP_fixed_trans_cost', 'CECPP_variable_trans_cost']
    score_fns_fixed_trans_cost = ['Accuracy', 'AUC', 'Gain_to_pain_ratio_fixed_trans_cost', 'Calmar_ratio_fixed_trans_cost',
                                  'Sharpe_ratio_fixed_trans_cost', 'Sortino_ratio_fixed_trans_cost', 'CECPP_fixed_trans_cost']
    score_fns_variable_trans_cost = ['Accuracy', 'AUC', 'Gain_to_pain_ratio_variable_trans_cost', 'Calmar_ratio_variable_trans_cost',
                                     'Sharpe_ratio_variable_trans_cost', 'Sortino_ratio_variable_trans_cost', 'CECPP_variable_trans_cost']

    # Define a list of holding periods
    invest_windows = [100, 200]

    # Define transaction costs
    fixed_trans_costs = [0.05, 0.1, 0.5, 1.0]
    variable_trans_costs = [0.0005, 0.001, 0.005, 0.01]

    # Settings shared by the first attempt and the retry
    cluster_kwargs = dict(n_workers = num_cores, processes = True, memory_limit = 'auto', threads_per_worker = 1,
                          scheduler_port = 8786, dashboard_address = 'localhost:8787')
    try:
        # client = Client('tcp://localhost:8786', timeout='2s')
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    except OSError:
        # Close whatever client/cluster objects already exist (e.g. left over from an earlier cell
        # execution), wait, and retry once. The names are looked up defensively: on a fresh kernel
        # they are not defined yet and referencing them directly would raise NameError.
        for stale_name in ('client', 'cluster'):
            stale = globals().get(stale_name)
            if stale is not None:
                stale.close()
        time.sleep(20)
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    print(client)

    try:
        # (disabled) importance / SHAP / CV-score graphs
        # with joblib.parallel_backend('dask'):
        #     job_run = joblib.Parallel(verbose=20) (joblib.delayed(plot_test_scores)(algo = algo,
        #                                                                             tau = tau,
        #                                                                             loss_fn = loss_fn,
        #                                                                             score_fn = score_fn) \
        #                                             for algo in algos
        #                                                 for loss_fn in loss_fns
        #                                                     for score_fn in score_fns)

        # time.sleep(30)

        # performance graphs of the strategy with fixed transaction costs
        with joblib.parallel_backend('dask'):
            job_run = joblib.Parallel(verbose = 20)(
                joblib.delayed(plot_perf_scores)(algo = algo,
                                                 tau = tau,
                                                 loss_fn = loss_fn,
                                                 score_fn = score_fn,
                                                 invest_window = invest_window,
                                                 use_strategy = 'fixed_trans_cost',
                                                 trans_cost = trans_cost)
                for algo in algos
                for loss_fn in loss_fns
                for score_fn in score_fns_fixed_trans_cost
                for invest_window in invest_windows
                for trans_cost in fixed_trans_costs)
        # time.sleep(30)

        # (disabled) performance graphs of the strategy with variable transaction costs
        # with joblib.parallel_backend('dask'):
        #     job_run = joblib.Parallel(verbose=20) (joblib.delayed(plot_perf_scores)(algo = algo,
        #                                                                             loss_fn = loss_fn,
        #                                                                             score_fn = score_fn,
        #                                                                             invest_window = invest_window,
        #                                                                             use_strategy = 'variable_trans_cost',
        #                                                                             trans_cost = trans_cost) \
        #                                             for algo in algos
        #                                                 for loss_fn in loss_fns
        #                                                     for score_fn in score_fns_variable_trans_cost
        #                                                         for invest_window in invest_windows
        #                                                             for trans_cost in variable_trans_costs)

        # time.sleep(30)
    finally:
        # Always release the workers and the scheduler/dashboard ports, even when a task fails;
        # otherwise the next run cannot bind to the same ports.
        client.close()
        cluster.close()

    print( 'The script took {} second !'.format(time.time() - startTime) )

<Client: 'tcp://127.0.0.1:8786' processes=30 threads=30, memory=89.95 GiB>


[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 30 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done  15 tasks      |

The script took 93.76251888275146 second !


In [None]:
# ============================================= Compare the Performance of a Trading Strategy Across Various Score Functions ==================================== #
# =========================================================================================================================================== #
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import re
import os
import sys
import datetime
from varname import nameof
import time
import gc

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

#Gradient Color Bar Plots
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from matplotlib import colors as mcolors, path

from dask.distributed import Client, LocalCluster
import joblib
import multiprocessing
# import dask
# import distributed
# dask.config.set({"distributed.comm.timeouts.tcp": "100000s", "distributed.scheduler.allowed-failures": 999})
# num_cores = multiprocessing.cpu_count()
# NOTE(review): hard-coded rather than derived from multiprocessing.cpu_count() (see the commented-out
# line above); presumably the Dask worker count, as in the previous cell - confirm against its use further down.
num_cores = 30

##### Set the current working directory
# NOTE(review): absolute, machine-specific path - os.chdir raises if the directory does not exist,
# and every relative path used afterwards is resolved against it.
# NOTE(review): this assignment also rebinds the name `path` that was imported from matplotlib above.
path="e:/Copy/SCRIPTS/Forecast_Stocks/Jupyter_notebooks/"
os.chdir(path)

##### Joint-boxplot a performance statistic of a trading strategy over different values of the fixed/variable transaction cost with hue = score function
def joint_plot( algo = 'LGBM_SPY_all_vars',
                loss_fn = 'CE',
                score_fns_dict = {  'Accuracy': 'Accuracy', 'AUC': 'AUC',  'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_variable_trans_cost',
                                    'Calmar ratio': 'Calmar_ratio_variable_trans_cost', 'Sharpe ratio': 'Sharpe_ratio_variable_trans_cost',
                                    'Sortino ratio': 'Sortino_ratio_variable_trans_cost',  'CECPP': 'CECPP_variable_trans_cost'},
                init_wealth = 1000,
                invest_window = 100,
                trans_costs = [0.0005, 0.005],
                perf_stats_dict = {  'Average Number of Trades': 'average_number_of_trades',
                                     'Percentage of Winning Trades': 'percentage_of_winning_trades',
                                     'Largest Simple Return':   'largest_raw_return',
                                     'Smallest Simple Return':    'smallest_raw_return',
                                     'Ratio of Average Winning Trade to Average Losing Trade': 'ratio_win_loss',
                                     'Maximum Number of Consecutive Winners': 'max_number_of_consecutive_winners',
                                     'Maximum Number of Consecutive Losers':    'max_number_of_consecutive_losers',
                                     'Annualized Excess Return':    'annualized_excess_return',
                                     'Annualized Standard Deviation':    'annualized_standard_deviation',
                                     'Maximum Drawdown':   'max_drawdown',
                                     'Schwager\'s Gain to Pain Ratio': 'Schwager_gain-to-pain_ratio',
                                     'Calmar Ratio':  'Calmar_ratio',
                                     'Sharpe Ratio':    'Sharpe_ratio',
                                     'Sortino Ratio': 'Sortino_ratio',
                                     'CECPP': 'cecpp',
                                     'Morningstar\'s risk-adjusted rating': 'mrar'}):
    ''' Box-plot every performance statistic against the transaction cost, with hue = score function.

    One PNG per entry of perf_stats_dict is written to '../Results/{algo}/loss_fn={loss_fn}/graphs'
    (relative to the current working directory).

    INPUT
        algo: name of the ML algorithm; used as the results sub-directory name
        loss_fn: name of the loss function; used in the 'loss_fn=...' sub-directory name
        score_fns_dict: maps a legend label to the score-function name used in the 'score_fn=...' sub-directory.
            The strategy suffix of the CSV file names (e.g. 'fixed_trans_cost' or 'variable_trans_cost') is taken
            from the first value that contains 'ratio_' (the text following it).
        init_wealth: initial wealth; part of the CSV and PNG file names
        invest_window: trading window; part of the CSV and PNG file names
        trans_costs: list of transaction costs; one CSV is read per (score function, transaction cost) pair
        perf_stats_dict: maps a y-axis label to the CSV column holding that performance statistic
    OUTPUT
        True once all figures have been saved
    RAISES
        ValueError: if no value of score_fns_dict contains 'ratio_' (the strategy suffix cannot be inferred)
        FileNotFoundError / KeyError / other pandas errors: if a performance CSV is missing or lacks an expected column.
            These are no longer printed and ignored, because every later step needs the complete set of dataframes.
    '''

    out_dir = f'../Results/{algo}/loss_fn={loss_fn}/graphs'
    # create the directory if it does not exist
    os.makedirs(out_dir, exist_ok=True)

    score_fns_dict_keys = list( score_fns_dict.keys() )
    score_fns_dict_values = list( score_fns_dict.values() )
    perf_stats_keys = list( perf_stats_dict.keys() )
    perf_stats_values = list( perf_stats_dict.values() )

    # infer the strategy suffix from the first score function whose name embeds it,
    # instead of relying on it sitting at a fixed position in the dictionary
    use_strategy = None
    for score_fn_value in score_fns_dict_values:
        strategy_match = re.search(r'(?<=ratio\_)\w+', score_fn_value, flags=re.IGNORECASE | re.VERBOSE)
        if strategy_match is not None:
            use_strategy = strategy_match.group()
            break
    if use_strategy is None:
        raise ValueError(f'cannot infer the trading strategy: no value of score_fns_dict contains "ratio_" ({score_fns_dict_values})')

    # load every (score function, transaction cost) CSV and melt it straight into long format
    list_metric_dfs = []
    for score_label, score_fn_value in zip(score_fns_dict_keys, score_fns_dict_values):
        for trans_cost in trans_costs:
            performance_df = pd.read_csv(f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn_value}/performance/' \
                                         f'performance_hper_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}.csv', \
                                         engine = 'python', encoding='utf-8', skipinitialspace=True, sep = ',',
                                         parse_dates=['start_date', 'end_date'], index_col = 'end_date')
            # dropping validates that the expected bookkeeping columns exist, as the original code did
            performance_df = performance_df.drop(columns = ['start_date', 'ratio_profit_over_total_loss',  'annualized_return', 'annualized_return_bh'])
            metric_df = performance_df[perf_stats_values].reset_index(drop = False)
            metric_melted_df = pd.melt(metric_df, id_vars = 'end_date', var_name='perf_stats', value_name='value')
            metric_melted_df['score_fn'] = score_label
            metric_melted_df['trans_cost'] = trans_cost
            list_metric_dfs.append(metric_melted_df)
    merged_df = pd.concat(list_metric_dfs, axis = 0)

    # joint boxplot each performance statistic
    n_hue_levels = len(score_fns_dict_keys)
    for stat_label, stat_column in zip(perf_stats_keys, perf_stats_values):
        fig, ax = plt.subplots( figsize=(13, 8) )
        try:
            perf_stat_df = merged_df[ merged_df['perf_stats'] == stat_column ]
            sns.boxplot(x = 'trans_cost', y = 'value', hue='score_fn', data=perf_stat_df, ax = ax)
            sns.stripplot(x = 'trans_cost', y = 'value', hue='score_fn', data=perf_stat_df, jitter=0.12, dodge=True, color='gray', size=2, alpha=0.8, ax = ax)
            handles, labels = ax.get_legend_handles_labels()
            ax.axhline(0, ls = '--', linewidth = 2, color ='red')
            ax.grid(ls=':')
            # keep one legend entry per score function (the strip plot may repeat the hue entries)
            ax.legend(handles[:n_hue_levels], labels[:n_hue_levels], fontsize=10, loc = 'upper right', title = 'Scoring functions')
            ax.set_ylabel(stat_label, fontsize=15)
            ax.set_xlabel('Transaction Cost', fontsize=15)
            ax.tick_params(axis='x', which='major', labelsize=13, labelrotation=0)
            out_file = f'{out_dir}/joint_boxplot_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{stat_column}.png'
            fig.savefig(out_file, dpi=150, bbox_inches="tight")
        finally:
            # always release the figure, even when plotting or saving fails
            plt.close(fig)

    # the local dataframes go out of scope on return; just ask for a collection pass
    gc.collect()

    return True

if __name__ == "__main__":
    # Driver: render the score-function comparison plots for every (algorithm, loss function, holding period)
    # combination, once for the fixed-cost strategy and once for the variable-cost strategy, on a local dask cluster.
    startTime = time.time()

    # Define a list of algorithms employed
    algos = ['RF_SPY_all_vars', 'RF_SPY_all_vars_plus_patterns', 'XGB_SPY_all_vars', 'XGB_SPY_all_vars_plus_patterns']
    # algos = ['LGBM_SPY_all_vars', 'LGBM_SPY_all_vars_plus_patterns']

    # Define a list of loss functions used to train a ML model
    # loss_fns = ['CE', 'Brier', 'Boost', 'As1', 'As2']
    loss_fns = ['CE']

    # Define a list of score functions used to cross validate a ML algorithm
    score_fns_fixed_trans_cost_dict = {'Accuracy': 'Accuracy', 'AUC': 'AUC',  'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_fixed_trans_cost', \
                                       'Calmar ratio': 'Calmar_ratio_fixed_trans_cost', 'Sharpe ratio': 'Sharpe_ratio_fixed_trans_cost', \
                                       'Sortino ratio': 'Sortino_ratio_fixed_trans_cost',  'CECPP': 'CECPP_fixed_trans_cost'}
    score_fns_variable_trans_cost_dict = {'Accuracy': 'Accuracy', 'AUC': 'AUC',  'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_variable_trans_cost', \
                                          'Calmar ratio': 'Calmar_ratio_variable_trans_cost', 'Sharpe ratio': 'Sharpe_ratio_variable_trans_cost', \
                                          'Sortino ratio': 'Sortino_ratio_variable_trans_cost',  'CECPP': 'CECPP_variable_trans_cost'}
    # Define a list of holding periods
    invest_windows = [100, 200]

    # one place for the cluster settings so the first attempt and the retry cannot drift apart
    cluster_kwargs = dict(n_workers=num_cores, processes=True, memory_limit='auto', threads_per_worker=1,
                          scheduler_port=8786, dashboard_address='localhost:8787')
    try:
        # client = Client('tcp://localhost:8786', timeout='2s')
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    except OSError:
        # Presumably the port is still held by a cluster left over from an earlier run of this cell.
        # Close whatever client/cluster objects exist; on a fresh kernel there are none, and referencing
        # the bare names (as the original handler did) would raise NameError instead of retrying.
        for stale_name in ('client', 'cluster'):
            stale_obj = globals().get(stale_name)
            if stale_obj is not None:
                stale_obj.close()
        time.sleep(20)
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    print(client)

    try:
        with joblib.parallel_backend('dask'):
            job_run = joblib.Parallel(verbose=20) (joblib.delayed(joint_plot)(  algo = algo,
                                                                                loss_fn = loss_fn,
                                                                                score_fns_dict = score_fns_fixed_trans_cost_dict,
                                                                                invest_window = invest_window,
                                                                                trans_costs = [0.05, 0.5]) \
                                                   for algo in algos
                                                       for loss_fn in loss_fns
                                                           for invest_window in invest_windows)
        # NOTE(review): fixed pause kept from the original script; its purpose is not visible here — confirm it is needed
        time.sleep(30)

        with joblib.parallel_backend('dask'):
            job_run = joblib.Parallel(verbose=20) (joblib.delayed(joint_plot)(  algo = algo,
                                                                                loss_fn = loss_fn,
                                                                                score_fns_dict = score_fns_variable_trans_cost_dict,
                                                                                invest_window = invest_window,
                                                                                trans_costs = [0.0005, 0.005]) \
                                                   for algo in algos
                                                       for loss_fn in loss_fns
                                                           for invest_window in invest_windows)

        time.sleep(30)
    finally:
        # release the scheduler/dashboard ports even when a batch fails, so the next run can bind them
        client.close()
        cluster.close()

    print( 'The script took {} second !'.format(time.time() - startTime) )

In [None]:
# ============================================= Compare the Performance of a Trading Strategy Across Various Loss Functions ==================================== #
# =========================================================================================================================================== #
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import sys
import datetime
import time
import re
import gc

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

#Gradient Color Bar Plots
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from matplotlib import colors as mcolors, path

from dask.distributed import Client, LocalCluster
import joblib
import multiprocessing
# import dask
# import distributed
# dask.config.set({"distributed.comm.timeouts.tcp": "100000s", "distributed.scheduler.allowed-failures": 999})
# num_cores = multiprocessing.cpu_count()
# Number of dask worker processes requested by the LocalCluster(n_workers=num_cores) calls in this cell.
# Hard-coded; the multiprocessing.cpu_count() alternative is commented out above.
num_cores = 30

##### Set the current working directory
# NOTE(review): assigning to `path` rebinds the name imported by `from matplotlib import colors as mcolors, path` above.
# The relative '../Results/...' paths built in joint_plot are resolved against this directory.
# NOTE(review): machine-specific absolute path; os.chdir raises if the directory does not exist.
path="e:/Copy/SCRIPTS/Forecast_Stocks/Jupyter_notebooks/"
os.chdir(path)

##### joint-boxplot a performance statistic of a trading strategy over different values of the fixed/variable transaction cost with hue = loss function
def joint_plot( algo = 'LGBM_SPY_all_vars',
                loss_fns = ['CE', 'Brier', 'Boost', 'As1', 'As2'],
                score_fn = 'AUC',
                init_wealth = 1000,
                invest_window = 100,
                use_strategy = 'fixed_trans_cost',
                trans_costs = [0.05, 0.5],
                perf_stats_dict = {  'Average Number of Trades': 'average_number_of_trades',
                                     'Percentage of Winning Trades': 'percentage_of_winning_trades',
                                     'Largest Simple Return':   'largest_raw_return',
                                     'Smallest Simple Return':    'smallest_raw_return',
                                     'Ratio of Average Winning Trade to Average Losing Trade': 'ratio_win_loss',
                                     'Maximum Number of Consecutive Winners': 'max_number_of_consecutive_winners',
                                     'Maximum Number of Consecutive Losers':    'max_number_of_consecutive_losers',
                                     'Annualized Excess Return':    'annualized_excess_return',
                                     'Annualized Standard Deviation':    'annualized_standard_deviation',
                                     'Maximum Drawdown':   'max_drawdown',
                                     'Schwager\'s Gain to Pain Ratio': 'Schwager_gain-to-pain_ratio',
                                     'Calmar Ratio':  'Calmar_ratio',
                                     'Sharpe Ratio':    'Sharpe_ratio',
                                     'Sortino Ratio': 'Sortino_ratio',
                                     'CECPP': 'cecpp',
                                     'Morningstar\'s risk-adjusted rating': 'mrar'}):
    ''' Box-plot every performance statistic against the transaction cost, with hue = loss function.

    One PNG per entry of perf_stats_dict is written to '../Results/{algo}/graphs'
    (relative to the current working directory).

    INPUT
        algo: name of the ML algorithm; used as the results sub-directory name
        loss_fns: list of loss-function names; one 'loss_fn=...' sub-directory is read per entry
        score_fn: score-function name used in the 'score_fn=...' sub-directory; the text before its first
            underscore (or the whole name if it has none) is used in the PNG file names
        init_wealth: initial wealth; part of the CSV and PNG file names
        invest_window: investment horizon; part of the CSV and PNG file names
        use_strategy: strategy suffix of the CSV and PNG file names (e.g. 'fixed_trans_cost' or 'variable_trans_cost')
        trans_costs: list of transaction costs; one CSV is read per (loss function, transaction cost) pair
        perf_stats_dict: maps a y-axis label to the CSV column holding that performance statistic
    OUTPUT
        True once all figures have been saved
    RAISES
        FileNotFoundError / KeyError / other pandas errors: if a performance CSV is missing or lacks an expected column.
            These are no longer printed and ignored, because every later step needs the complete set of dataframes.
    '''
    out_dir = f'../Results/{algo}/graphs'
    # create the directory if it does not exist
    os.makedirs(out_dir, exist_ok=True)

    perf_stats_keys = list( perf_stats_dict.keys() )
    perf_stats_values = list( perf_stats_dict.values() )

    # load every (loss function, transaction cost) CSV and melt it straight into long format
    list_metric_dfs = []
    for loss_fn in loss_fns:
        for trans_cost in trans_costs:
            performance_df = pd.read_csv(f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/performance/' \
                                         f'performance_hper_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}.csv', \
                                         engine = 'python', encoding='utf-8', skipinitialspace=True, sep = ',',
                                         parse_dates=['start_date', 'end_date'], index_col = 'end_date')
            # dropping validates that the expected bookkeeping columns exist, as the original code did
            performance_df = performance_df.drop(columns = ['start_date', 'ratio_profit_over_total_loss',  'annualized_return', 'annualized_return_bh'])
            metric_df = performance_df[perf_stats_values].reset_index(drop = False)
            metric_melted_df = pd.melt(metric_df, id_vars = 'end_date', var_name='perf_stats', value_name='value')
            metric_melted_df['loss_fn'] = loss_fn
            metric_melted_df['trans_cost'] = trans_cost
            list_metric_dfs.append(metric_melted_df)
    merged_df = pd.concat(list_metric_dfs, axis = 0)

    # short tag for the file names: the part of score_fn before its first underscore, else the whole name
    score_word_match = re.search(r'^[^\_]+(?=\_)', score_fn, flags=re.IGNORECASE | re.VERBOSE)
    score_word = score_word_match.group() if score_word_match is not None else score_fn

    n_hue_levels = len(loss_fns)
    for stat_label, stat_column in zip(perf_stats_keys, perf_stats_values):
        fig, ax = plt.subplots( figsize=(13, 8) )
        try:
            perf_stat_df = merged_df[ merged_df['perf_stats'] == stat_column ]
            sns.boxplot(x = 'trans_cost', y = 'value', hue='loss_fn', data=perf_stat_df, ax = ax)
            sns.stripplot(x = 'trans_cost', y = 'value', hue='loss_fn', data=perf_stat_df, jitter=0.12, dodge=True, color='gray', size=2, alpha=0.8, ax = ax)
            handles, labels = ax.get_legend_handles_labels()
            ax.axhline(0, ls = '--', linewidth = 2, color ='red')
            ax.grid(ls=':')
            # one legend entry per loss function actually plotted; a fixed slice of 5 duplicated entries
            # whenever fewer than five loss functions were passed
            ax.legend(handles[:n_hue_levels], labels[:n_hue_levels], fontsize=10, loc = 'upper right', title = 'Loss functions')
            ax.set_ylabel(stat_label, fontsize=15)
            ax.set_xlabel('Transaction Cost', fontsize=15)
            ax.tick_params(axis='x', which='major', labelsize=13, labelrotation=0)
            out_file = f'{out_dir}/joint_boxplot_{invest_window}_init_wealth_{init_wealth}_score_fn_{score_word}_{use_strategy}_{stat_column}.png'
            fig.savefig(out_file, dpi=150, bbox_inches="tight")
        finally:
            # always release the figure, even when plotting or saving fails
            plt.close(fig)

    # the local dataframes go out of scope on return; just ask for a collection pass
    gc.collect()

    return True

if __name__ == "__main__":
    # Driver: render the loss-function comparison plots for every (algorithm, score function, holding period)
    # combination, once for the fixed-cost strategy and once for the variable-cost strategy, on a local dask cluster.
    startTime = time.time()

    # Define a list of algorithms employed
    algos = ['RF_SPY_all_vars', 'RF_SPY_all_vars_plus_patterns', 'XGB_SPY_all_vars', 'XGB_SPY_all_vars_plus_patterns']
    # algos = ['LGBM_SPY_all_vars', 'LGBM_SPY_all_vars_plus_patterns']

    # Define a list of loss functions used to train a ML model
    # loss_fns = ['CE', 'Brier', 'Boost', 'As1', 'As2']
    loss_fns = ['CE']

    # Define a list of score functions used to cross validate a ML algorithm
    score_fns_fixed_trans_cost = ['Accuracy', 'AUC',  'Gain_to_pain_ratio_fixed_trans_cost', 'Calmar_ratio_fixed_trans_cost', \
                                  'Sharpe_ratio_fixed_trans_cost',  'Sortino_ratio_fixed_trans_cost',  'CECPP_fixed_trans_cost']
    score_fns_variable_trans_cost = ['Accuracy', 'AUC',  'Gain_to_pain_ratio_variable_trans_cost', 'Calmar_ratio_variable_trans_cost', \
                                     'Sharpe_ratio_variable_trans_cost', 'Sortino_ratio_variable_trans_cost', 'CECPP_variable_trans_cost']

    # Define a list of holding periods
    invest_windows = [100, 200]

    # one place for the cluster settings so the first attempt and the retry cannot drift apart
    cluster_kwargs = dict(n_workers=num_cores, processes=True, memory_limit='auto', threads_per_worker=1,
                          scheduler_port=8786, dashboard_address='localhost:8787')
    try:
        # client = Client('tcp://localhost:8786', timeout='2s')
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    except OSError:
        # Presumably the port is still held by a cluster left over from an earlier run of this cell.
        # Close whatever client/cluster objects exist; on a fresh kernel there are none, and referencing
        # the bare names (as the original handler did) would raise NameError instead of retrying.
        for stale_name in ('client', 'cluster'):
            stale_obj = globals().get(stale_name)
            if stale_obj is not None:
                stale_obj.close()
        time.sleep(20)
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    print(client)

    try:
        with joblib.parallel_backend('dask'):
            job_run = joblib.Parallel(verbose=20) (joblib.delayed(joint_plot)(  algo = algo,
                                                                                loss_fns = loss_fns,
                                                                                score_fn = score_fn,
                                                                                invest_window = invest_window,
                                                                                use_strategy = 'fixed_trans_cost',
                                                                                trans_costs = [0.05, 0.5]) \
                                                   for algo in algos
                                                       for score_fn in score_fns_fixed_trans_cost
                                                           for invest_window in invest_windows)
        # NOTE(review): fixed pause kept from the original script; its purpose is not visible here — confirm it is needed
        time.sleep(30)

        with joblib.parallel_backend('dask'):
            job_run = joblib.Parallel(verbose=20) (joblib.delayed(joint_plot)(  algo = algo,
                                                                                loss_fns = loss_fns,
                                                                                score_fn = score_fn,
                                                                                invest_window = invest_window,
                                                                                use_strategy = 'variable_trans_cost',
                                                                                trans_costs = [0.0005, 0.005]) \
                                                   for algo in algos
                                                       for score_fn in score_fns_variable_trans_cost
                                                           for invest_window in invest_windows)

        time.sleep(30)
    finally:
        # release the scheduler/dashboard ports even when a batch fails, so the next run can bind them
        client.close()
        cluster.close()

    print( 'The script took {} second !'.format(time.time() - startTime) )

In [None]:
# ==================================== Compare the Performance of  a Trading Strategy based on Technical Indicators vs. Price Patterns ==================================== #
# =========================================================================================================================================== #
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import sys
import datetime
import time
import re
import gc

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

#Gradient Color Bar Plots
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from matplotlib import colors as mcolors, path

from dask.distributed import Client, LocalCluster
import joblib
import multiprocessing
# import dask
# import distributed
# dask.config.set({"distributed.comm.timeouts.tcp": "100000s", "distributed.scheduler.allowed-failures": 999})
# num_cores = multiprocessing.cpu_count()
# Worker-process count for the dask cluster; hard-coded (the multiprocessing.cpu_count() alternative is commented out above).
# NOTE(review): the code that consumes num_cores in this cell is outside the visible part of the file — confirm usage.
num_cores = 30

##### Set the current working directory
# NOTE(review): assigning to `path` rebinds the name imported by `from matplotlib import colors as mcolors, path` above.
# The relative '../Results/...' paths built in joint_plot1 are resolved against this directory.
# NOTE(review): machine-specific absolute path; os.chdir raises if the directory does not exist.
path="e:/Copy/SCRIPTS/Forecast_Stocks/Jupyter_notebooks/"
os.chdir(path)

##### joint-boxplot a performance statistic of a trading strategy over different loss functions for a given scoring function with hue = 'algo'
def joint_plot1(algos = ['LGBM_SPY_all_vars', 'LGBM_SPY_all_vars_plus_patterns'],
                loss_fns = ['CE', 'Brier', 'Boost', 'As1', 'As2'],
                score_fn = 'AUC',
                init_wealth = 1000,
                invest_window = 100,
                use_strategy = 'fixed_trans_cost',
                trans_cost = 0.05,
                perf_stats_dict = {  'Average Number of Trades': 'average_number_of_trades',
                                     'Percentage of Winning Trades': 'percentage_of_winning_trades',
                                     'Largest Simple Return':   'largest_raw_return',
                                     'Smallest Simple Return':    'smallest_raw_return',
                                     'Ratio of Average Winning Trade to Average Losing Trade': 'ratio_win_loss',
                                     'Maximum Number of Consecutive Winners': 'max_number_of_consecutive_winners',
                                     'Maximum Number of Consecutive Losers':    'max_number_of_consecutive_losers',
                                     'Annualized Excess Return':    'annualized_excess_return',
                                     'Annualized Standard Deviation':    'annualized_standard_deviation',
                                     'Maximum Drawdown':   'max_drawdown',
                                     'Schwager\'s Gain to Pain Ratio': 'Schwager_gain-to-pain_ratio',
                                     'Calmar Ratio':  'Calmar_ratio',
                                     'Sharpe Ratio':    'Sharpe_ratio',
                                     'Sortino Ratio': 'Sortino_ratio',
                                     'CECPP': 'cecpp',
                                     'Morningstar\'s risk-adjusted rating': 'mrar'}):
    ''' Box-plot every performance statistic against the loss function, with hue = algorithm / predictor set.

    One PNG per entry of perf_stats_dict is written to '../Results/graphs/{prefix}', where {prefix} is the
    part of algos[0] that precedes '_all' (relative to the current working directory).

    INPUT
        algos: list of ML model names; each is a results sub-directory. The legend labels are fixed to
            'Dataset I' and 'Dataset II', so the plot is meant for exactly two entries.
        loss_fns: list of loss-function names; one 'loss_fn=...' sub-directory is read per entry
        score_fn: score-function name used in the 'score_fn=...' sub-directory; the text before its first
            underscore (or the whole name if it has none) is used in the PNG file names
        init_wealth: initial wealth; part of the CSV and PNG file names
        invest_window: investment horizon; part of the CSV and PNG file names
        use_strategy: strategy suffix of the CSV and PNG file names (e.g. 'fixed_trans_cost' or 'variable_trans_cost')
        trans_cost: a single transaction cost; part of the CSV and PNG file names
        perf_stats_dict: maps a y-axis label to the CSV column holding that performance statistic
    OUTPUT
        True once all figures have been saved
    RAISES
        ValueError: if algos[0] does not contain '_all' (the output directory name cannot be derived)
        FileNotFoundError / KeyError / other pandas errors: if a performance CSV is missing or lacks an expected column.
            These are no longer printed and ignored, because every later step needs the complete set of dataframes.
    '''

    algo_word_match = re.search(r'\w+(?=\_all)',  algos[0], flags=re.IGNORECASE | re.VERBOSE)
    if algo_word_match is None:
        # fail with a readable message instead of AttributeError on None
        raise ValueError(f'cannot derive the output directory: "{algos[0]}" does not contain "_all"')
    algo_word = algo_word_match.group()
    out_dir = f'../Results/graphs/{algo_word}'
    # create the directory if it does not exist
    os.makedirs(out_dir, exist_ok=True)

    perf_stats_keys = list( perf_stats_dict.keys() )
    perf_stats_values = list( perf_stats_dict.values() )

    # load every (loss function, algorithm) CSV and melt it straight into long format
    list_metric_dfs = []
    for loss_fn in loss_fns:
        for algo in algos:
            performance_df = pd.read_csv(f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/performance/' \
                                         f'performance_hper_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}.csv', \
                                         engine = 'python', encoding='utf-8', skipinitialspace=True, sep = ',',
                                         parse_dates=['start_date', 'end_date'], index_col = 'end_date')
            # dropping validates that the expected bookkeeping columns exist, as the original code did
            performance_df = performance_df.drop(columns = ['start_date', 'ratio_profit_over_total_loss',  'annualized_return', 'annualized_return_bh'])
            metric_df = performance_df[perf_stats_values].reset_index(drop = False)
            metric_melted_df = pd.melt(metric_df, id_vars = 'end_date', var_name='perf_stats', value_name='value')
            metric_melted_df['loss_fn'] = loss_fn
            metric_melted_df['predictor_set'] = algo
            list_metric_dfs.append(metric_melted_df)
    merged_df = pd.concat(list_metric_dfs, axis = 0)

    # short tag for the file names: the part of score_fn before its first underscore, else the whole name
    score_word_match = re.search(r'^[^\_]+(?=\_)', score_fn, flags=re.IGNORECASE | re.VERBOSE)
    score_word = score_word_match.group() if score_word_match is not None else score_fn

    legend_labels = ['Dataset I', 'Dataset II']
    for stat_label, stat_column in zip(perf_stats_keys, perf_stats_values):
        fig, ax = plt.subplots( figsize=(13, 8) )
        try:
            perf_stat_df = merged_df[ merged_df['perf_stats'] == stat_column ]
            sns.boxplot(x = 'loss_fn', y = 'value', hue='predictor_set', data=perf_stat_df, ax = ax)
            sns.stripplot(x = 'loss_fn', y = 'value', hue='predictor_set', data=perf_stat_df, jitter=0.12, dodge=True, color='gray', size=2, alpha=0.8, ax = ax)
            handles, labels = ax.get_legend_handles_labels()
            ax.axhline(0, ls = '--', linewidth = 2, color ='red')
            ax.grid(ls=':')
            # pass exactly as many handles as labels; the extra handles were previously dropped implicitly
            ax.legend(handles = handles[:len(legend_labels)], labels = legend_labels, loc = 'upper right', title = 'Set of Predictors')
            ax.set_ylabel(stat_label, fontsize=15)
            ax.set_xlabel('Loss Function', fontsize=15)
            ax.tick_params(axis='x', which='major', labelsize=13, labelrotation=0)
            out_file = f'{out_dir}/joint_boxplot_{score_word}_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}_perf_metric_{stat_column}.png'
            fig.savefig(out_file, dpi=150, bbox_inches="tight")
        finally:
            # always release the figure, even when plotting or saving fails
            plt.close(fig)

    # the local dataframes go out of scope on return; just ask for a collection pass
    gc.collect()

    return True

##### joint-boxplot a performance statistic of a trading strategy over different scoring functions for a given loss function with hue = 'algo'
def joint_plot2(algos = ['LGBM_SPY_all_vars', 'LGBM_SPY_all_vars_plus_patterns'],
                        loss_fn = 'CE',
                        score_fns_dict = {'Accuracy': 'Accuracy', 'AUC': 'AUC',  'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_fixed_trans_cost', \
                                                    'Calmar ratio': 'Calmar_ratio_fixed_trans_cost', 'Sharpe ratio': 'Sharpe_ratio_fixed_trans_cost', \
                                                    'Sortino ratio': 'Sortino_ratio_fixed_trans_cost',  'CECPP': 'CECPP_fixed_trans_cost'},
                        init_wealth = 1000,
                        invest_window = 100,
                        trans_cost = 0.05,
                        perf_stats_dict = {  'Average Number of Trades': 'average_number_of_trades',
                                                        'Percentage of Winning Trades': 'percentage_of_winning_trades',
                                                        'Largest Simple Return':   'largest_raw_return',
                                                        'Smallest Simple Return':    'smallest_raw_return',
                                                        'Ratio of Average Winning Trade to Average Losing Trade': 'ratio_win_loss',
                                                        'Maximum Number of Consecutive Winners': 'max_number_of_consecutive_winners',
                                                        'Maximum Number of Consecutive Losers':    'max_number_of_consecutive_losers',
                                                        'Annualized Excess Return':    'annualized_excess_return',
                                                        'Annualized Standard Deviation':    'annualized_standard_deviation',
                                                        'Maximum Drawdown':   'max_drawdown',
                                                        'Schwager\'s Gain to Pain Ratio': 'Schwager_gain-to-pain_ratio',
                                                        'Calmar Ratio':  'Calmar_ratio',
                                                        'Sharpe Ratio':    'Sharpe_ratio',
                                                        'Sortino Ratio': 'Sortino_ratio',
                                                        'CECPP': 'cecpp',
                                                        'Morningstar\'s risk-adjusted rating': 'mrar'}):
    ''' Joint-boxplot every performance statistic of a trading strategy over different scoring functions
        for a given loss function, with hue = the entries of `algos`. One PNG is written per statistic.
     INPUT
        algos: a list of result-folder names of the ML models to compare; algos[0] must contain '_all'
               because the text before it names the output folder
        loss_fn: the loss function used to train the models (part of the input path and output file name)
        score_fns_dict: a dictionary {axis label: folder name} of scoring functions used for cross validation;
                        at least one folder name must contain 'ratio_<strategy>' so the strategy suffix can be inferred
        init_wealth: an initial wealth used to invest
        invest_window: an investment horizon
        trans_cost: a transaction cost (i.e., trans_cost = 0.05 for the fixed cost strategy and trans_costs = 0.0005 for the variable cost strategy)
        perf_stats_dict: a dictionary {axis label: CSV column name} of performance statistics
    OUTPUT
        True once all graphs have been saved under '../Results/graphs/<algo_word>'
    RAISES
        ValueError: if `algos` is empty, algos[0] has no '_all' marker, or no strategy suffix can be inferred
        Any error raised while reading a performance CSV is propagated unchanged.
    NOTE
        The list/dict defaults are only read, never mutated, so sharing them between calls is safe.
    '''

    # ---- validate everything that drives path construction before touching the filesystem ----
    if len(algos) == 0:
        raise ValueError("'algos' must contain at least one model name.")

    algo_match = re.search(r'\w+(?=\_all)',  algos[0], flags=re.IGNORECASE | re.VERBOSE)
    if algo_match is None:
        raise ValueError(f"Cannot derive the output folder from algos[0] = {algos[0]!r}: expected a name containing '_all'.")
    algo_word = algo_match.group()

    score_fns_keys = list(score_fns_dict.keys())
    score_fns_values = list(score_fns_dict.values())

    # Infer the strategy suffix (e.g. 'fixed_trans_cost'). The third and then the first scoring function
    # are tried first, as before; the remaining ones are a fallback so that a different dictionary
    # ordering no longer leaves `use_strategy` undefined.
    use_strategy = None
    for candidate in score_fns_values[2:3] + score_fns_values[:1] + score_fns_values:
        strategy_match = re.search(r'(?<=ratio\_)\w+', candidate, flags=re.IGNORECASE | re.VERBOSE)
        if strategy_match is not None:
            use_strategy = strategy_match.group()
            break
    if use_strategy is None:
        raise ValueError("Cannot infer the trading strategy: no value of 'score_fns_dict' contains 'ratio_<strategy>'.")

    # exist_ok avoids a check-then-create race when several workers call this function at once
    out_dir = f'../Results/graphs/{algo_word}'
    os.makedirs( out_dir, exist_ok = True )

    # ---- import the data and melt every dataframe to a long dataframe ----
    # A failed read is no longer swallowed: previously the error was only printed and the function
    # crashed later with an unrelated IndexError.
    perf_stats_keys = list( perf_stats_dict.keys() )
    perf_stats_values = list( perf_stats_dict.values() )
    list_metric_dfs = []
    for score_label, score_fn in zip(score_fns_keys, score_fns_values):
        for algo in algos:
            csv_path = f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/performance/' \
                            f'performance_hper_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}.csv'
            performance_df = pd.read_csv(csv_path, engine = 'python', encoding='utf-8', skipinitialspace=True, sep = ',',
                                                            parse_dates=['start_date', 'end_date'], index_col = 'end_date')
            performance_df = performance_df.drop(columns = ['start_date', 'ratio_profit_over_total_loss',  'annualized_return', 'annualized_return_bh'])

            metric_df = performance_df[perf_stats_values].reset_index(drop = False)
            metric_melted_df = pd.melt(metric_df, id_vars = 'end_date', var_name='perf_stats', value_name='value')
            metric_melted_df['score_fn'] = score_label
            metric_melted_df['predictor_set'] = algo
            list_metric_dfs.append(metric_melted_df)
    merged_df = pd.concat(list_metric_dfs, axis = 0)

    # ---- one figure per performance statistic ----
    paper_legend_labels = ['Dataset I', 'Dataset II']
    for perf_stat_label, perf_stat in zip(perf_stats_keys, perf_stats_values):
        fig, ax = plt.subplots( figsize=(13, 8) )
        try:
            perf_stat_df = merged_df[ merged_df['perf_stats'] == perf_stat ]
            sns.boxplot(x = 'score_fn', y = 'value', hue='predictor_set', data=perf_stat_df, ax = ax)
            sns.stripplot(x = 'score_fn', y = 'value', hue='predictor_set', data=perf_stat_df, jitter=0.12, dodge=True, color='gray', size=2, alpha=0.8, ax = ax)
            handles, labels = ax.get_legend_handles_labels()
            ax.axhline(0, ls = '--', linewidth = 2, color ='red')
            ax.grid(ls=':')

            # Both seaborn calls may register legend entries, so keep only as many handles as labels
            # (the original relied on matplotlib silently truncating the longer list).
            # The 'Dataset I/II' wording only applies to the two-model comparison; otherwise fall back to the raw hue labels.
            if len(algos) == len(paper_legend_labels):
                legend_labels = paper_legend_labels
            else:
                legend_labels = labels[:len(algos)]
            ax.legend(handles = handles[:len(legend_labels)], labels = legend_labels, loc = 'upper right', title = 'Set of Predictors')
            ax.set_ylabel(perf_stat_label, fontsize=15)
            ax.set_xlabel('Scoring Function', fontsize=15)
            ax.tick_params(axis='x', which='major', labelsize=10, labelrotation=10)

            out_file = f'{out_dir}/joint_boxplot_{loss_fn}_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}_perf_metric_{perf_stat}.png'
            fig.savefig(out_file, dpi=150, bbox_inches="tight")
        finally:
            # close this specific figure even when plotting/saving fails, so figures do not pile up in a worker
            plt.close(fig)

    # local dataframes are released on return; the explicit collection is kept from the original
    gc.collect()

    return True


##### joint-boxplot a performance statistic of a trading strategy over different scoring functions for a given loss function with hue = four different ML models
def joint_plot3(algos = ['RF_SPY_all_vars', 'RF_SPY_all_vars_plus_patterns', 'RF_RW_all_vars', 'RF_RW_all_vars_plus_patterns'],
                        loss_fn = 'CE',
                        score_fns_dict = {'Accuracy': 'Accuracy', 'AUC': 'AUC',  'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_fixed_trans_cost', \
                                                        'Calmar ratio': 'Calmar_ratio_fixed_trans_cost', 'Sharpe ratio': 'Sharpe_ratio_fixed_trans_cost', \
                                                        'Sortino ratio': 'Sortino_ratio_fixed_trans_cost',  'CECPP': 'CECPP_fixed_trans_cost'},
                        init_wealth = 1000,
                        invest_window = 100,
                        trans_cost = 0.05,
                        perf_stats_dict = {  'Average Number of Trades': 'average_number_of_trades',
                                                        'Percentage of Winning Trades': 'percentage_of_winning_trades',
                                                        'Largest Simple Return':   'largest_raw_return',
                                                        'Smallest Simple Return':    'smallest_raw_return',
                                                        'Ratio of Average Winning Trade to Average Losing Trade': 'ratio_win_loss',
                                                        'Maximum Number of Consecutive Winners': 'max_number_of_consecutive_winners',
                                                        'Maximum Number of Consecutive Losers':    'max_number_of_consecutive_losers',
                                                        'Annualized Excess Return':    'annualized_excess_return',
                                                        'Risk-Adjusted Annualized Excess Return': 'risk_adj_annualized_excess_return',
                                                        'Annualized Standard Deviation':    'annualized_standard_deviation',
                                                        'Maximum Drawdown':   'max_drawdown',
                                                        'Schwager\'s Gain to Pain Ratio': 'Schwager_gain-to-pain_ratio',
                                                        'Calmar Ratio':  'Calmar_ratio',
                                                        'Sharpe Ratio':    'Sharpe_ratio',
                                                        'Sortino Ratio': 'Sortino_ratio',
                                                        'CECPP': 'cecpp',
                                                        'Morningstar\'s risk-adjusted rating': 'mrar'}):
    ''' Joint-boxplot every performance statistic of a trading strategy over different scoring functions
        for a given loss function, with hue = the entries of `algos`. One PNG is written per statistic.
     INPUT
        algos: a list of result-folder names of the ML models to compare; it needs at least three entries
               because algos[0] and algos[2] (the text before '_all' in each) name the output folder
        loss_fn: the loss function used to train the models (part of the input path and output file name)
        score_fns_dict: a dictionary {axis label: folder name} of scoring functions used for cross validation;
                        at least one folder name must contain 'ratio_<strategy>' so the strategy suffix can be inferred
        init_wealth: an initial wealth used to invest
        invest_window: an investment horizon
        trans_cost: a transaction cost (i.e., trans_cost = 0.05 for the fixed cost strategy and trans_costs = 0.0005 for the variable cost strategy)
        perf_stats_dict: a dictionary {axis label: column name} of performance statistics; the column
                         'risk_adj_annualized_excess_return' is computed here, all others are read from the CSV
    OUTPUT
        True once all graphs have been saved under '../Results/graphs/<algo_word1>_and_<algo_word2>'
    RAISES
        ValueError: if `algos` has fewer than three entries, algos[0] or algos[2] has no '_all' marker,
                    or no strategy suffix can be inferred
        Any error raised while reading a performance CSV is propagated unchanged.
    NOTE
        The list/dict defaults are only read, never mutated, so sharing them between calls is safe.
    '''

    # ---- validate everything that drives path construction before touching the filesystem ----
    if len(algos) < 3:
        raise ValueError(f"'algos' needs at least three entries (algos[0] and algos[2] name the output folder); got {len(algos)}.")

    algo_words = []
    for position in (0, 2):
        algo_match = re.search(r'\w+(?=\_all)',  algos[position], flags=re.IGNORECASE | re.VERBOSE)
        if algo_match is None:
            raise ValueError(f"Cannot derive the output folder from algos[{position}] = {algos[position]!r}: expected a name containing '_all'.")
        algo_words.append( algo_match.group() )
    algo_word1, algo_word2 = algo_words

    score_fns_keys = list(score_fns_dict.keys())
    score_fns_values = list(score_fns_dict.values())

    # Infer the strategy suffix (e.g. 'fixed_trans_cost'). The third and then the first scoring function
    # are tried first, as before; the remaining ones are a fallback so that a different dictionary
    # ordering no longer leaves `use_strategy` undefined.
    use_strategy = None
    for candidate in score_fns_values[2:3] + score_fns_values[:1] + score_fns_values:
        strategy_match = re.search(r'(?<=ratio\_)\w+', candidate, flags=re.IGNORECASE | re.VERBOSE)
        if strategy_match is not None:
            use_strategy = strategy_match.group()
            break
    if use_strategy is None:
        raise ValueError("Cannot infer the trading strategy: no value of 'score_fns_dict' contains 'ratio_<strategy>'.")

    # exist_ok avoids a check-then-create race when several workers call this function at once
    out_dir = f'../Results/graphs/{algo_word1}_and_{algo_word2}'
    os.makedirs( out_dir, exist_ok = True )

    # ---- import the data and melt every dataframe to a long dataframe ----
    # A failed read is no longer swallowed: previously the error was only printed and the function
    # crashed later with an unrelated IndexError.
    perf_stats_keys = list( perf_stats_dict.keys() )
    perf_stats_values = list( perf_stats_dict.values() )
    list_metric_dfs = []
    for score_label, score_fn in zip(score_fns_keys, score_fns_values):
        for algo in algos:
            csv_path = f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn}/performance/' \
                            f'performance_hper_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}.csv'
            performance_df = pd.read_csv(csv_path, engine = 'python', encoding='utf-8', skipinitialspace=True, sep = ',',
                                                            parse_dates=['start_date', 'end_date'], index_col = 'end_date')
            performance_df = performance_df.drop(columns = ['start_date', 'ratio_profit_over_total_loss',  'annualized_return', 'annualized_return_bh'])
            # excess return per unit of volatility, derived from two columns of the CSV
            performance_df['risk_adj_annualized_excess_return'] = performance_df['annualized_excess_return'] / performance_df['annualized_standard_deviation']

            metric_df = performance_df[perf_stats_values].reset_index(drop = False)
            metric_melted_df = pd.melt(metric_df, id_vars = 'end_date', var_name='perf_stats', value_name='value')
            metric_melted_df['score_fn'] = score_label
            metric_melted_df['predictor_set'] = algo
            list_metric_dfs.append(metric_melted_df)
    merged_df = pd.concat(list_metric_dfs, axis = 0)

    # ---- one figure per performance statistic ----
    paper_legend_labels = ['SPY: Dataset I', 'SPY: Dataset II', 'RW: Dataset I', 'RW: Dataset II']
    for perf_stat_label, perf_stat in zip(perf_stats_keys, perf_stats_values):
        fig, ax = plt.subplots( figsize=(13, 8) )
        try:
            perf_stat_df = merged_df[ merged_df['perf_stats'] == perf_stat ]
            sns.boxplot(x = 'score_fn', y = 'value', hue='predictor_set', data=perf_stat_df, ax = ax)
            sns.stripplot(x = 'score_fn', y = 'value', hue='predictor_set', data=perf_stat_df, jitter=0.12, dodge=True, color='gray', size=2, alpha=0.8, ax = ax)
            handles, labels = ax.get_legend_handles_labels()
            ax.axhline(0, ls = '--', linewidth = 2, color ='red')
            ax.grid(ls=':')

            # Both seaborn calls may register legend entries, so keep only as many handles as labels
            # (the original relied on matplotlib silently truncating the longer list).
            # The SPY/RW wording only applies to the four-model comparison; otherwise fall back to the raw hue labels.
            if len(algos) == len(paper_legend_labels):
                legend_labels = paper_legend_labels
            else:
                legend_labels = labels[:len(algos)]
            ax.legend(handles = handles[:len(legend_labels)], labels = legend_labels, loc = 'upper right', title = 'Data & Predictors')
            ax.set_ylabel(perf_stat_label, fontsize=15)
            ax.set_xlabel('Scoring Function', fontsize=15)
            ax.tick_params(axis='x', which='major', labelsize=10, labelrotation=10)

            out_file = f'{out_dir}/joint_boxplot_{loss_fn}_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}_perf_metric_{perf_stat}.png'
            fig.savefig(out_file, dpi=150, bbox_inches="tight")
        finally:
            # close this specific figure even when plotting/saving fails, so figures do not pile up in a worker
            plt.close(fig)

    # local dataframes are released on return; the explicit collection is kept from the original
    gc.collect()

    return True



if __name__ == "__main__":
    # Driver: start a local Dask cluster, render the joint boxplots in parallel, then shut the cluster down.
    startTime = time.time()

    # Define a list of algorithms employed
    # algos = ['RF_SPY_all_vars', 'RF_SPY_all_vars_plus_patterns']
    # algos = ['XGB_SPY_all_vars', 'XGB_SPY_all_vars_plus_patterns']
    # algos = ['LGBM_SPY_all_vars', 'LGBM_SPY_all_vars_plus_patterns']

    # NOTE(review): only the commented-out joint_plot1/joint_plot2 runs below read `algos`;
    # the active joint_plot3 run passes its own four-model list.
    algos = ['RF_SPY_all_vars_plus_patterns', 'RF_RW_all_vars_plus_patterns']

    # Define a list of loss functions used to train a ML model
    # loss_fns = ['CE', 'Brier', 'Boost', 'As1', 'As2']
    loss_fns = ['CE']

    # Define a list of score functions used to cross validate a ML algorithm
    score_fns_fixed_trans_cost_dict = {'Accuracy': 'Accuracy', 'AUC': 'AUC',  'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_fixed_trans_cost', \
                                                                'Calmar ratio': 'Calmar_ratio_fixed_trans_cost', 'Sharpe ratio': 'Sharpe_ratio_fixed_trans_cost', \
                                                                'Sortino ratio': 'Sortino_ratio_fixed_trans_cost',  'CECPP': 'CECPP_fixed_trans_cost'}

    score_fns_variable_trans_cost_dict = {'Accuracy': 'Accuracy', 'AUC': 'AUC',  'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_variable_trans_cost', \
                                                                    'Calmar ratio': 'Calmar_ratio_variable_trans_cost', 'Sharpe ratio': 'Sharpe_ratio_variable_trans_cost', \
                                                                    'Sortino ratio': 'Sortino_ratio_variable_trans_cost',  'CECPP': 'CECPP_variable_trans_cost'}

    # Define a list of holding periods
    invest_windows = [100, 200]

    # joint_plot3( score_fns_dict = score_fns_fixed_trans_cost_dict)

    # Start the local Dask cluster. `cluster` and `client` are pre-set to None so that the OSError
    # handler can tell which of them actually exists: the original handler closed both unconditionally
    # and raised NameError whenever LocalCluster(...) itself was the call that failed.
    cluster_kwargs = dict(n_workers=num_cores, processes=True, memory_limit='auto', threads_per_worker=1, scheduler_port=8786, dashboard_address='localhost:8787')
    cluster = None
    client = None
    try:
        # client = Client('tcp://localhost:8786', timeout='2s')
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    except OSError:
        # presumably the fixed scheduler/dashboard port is still in use: release what was created, wait, retry once
        if client is not None:
            client.close()
        if cluster is not None:
            cluster.close()
        time.sleep(20)
        cluster = LocalCluster(**cluster_kwargs)
        client = Client(cluster)
    print(client)

    try:
        # # joint-boxplot a performance statistic of a trading strategy over different loss functions for a given scoring function with hue = 'algo'
        # if algos[0] == 'LGBM_SPY_all_vars':
        #     with joblib.parallel_backend('dask'):
        #         job_run = joblib.Parallel(verbose=20) (joblib.delayed(joint_plot1)( score_fn = score_fn,
        #                                                                             invest_window = invest_window,
        #                                                                             use_strategy = 'fixed_trans_cost',
        #                                                                             trans_cost = 0.05) \
        #                                                                 for score_fn in list( score_fns_fixed_trans_cost_dict.values() )
        #                                                                     for invest_window in invest_windows)
        #     time.sleep(30)

        #     with joblib.parallel_backend('dask'):
        #         job_run = joblib.Parallel(verbose=20) (joblib.delayed(joint_plot1)( score_fn = score_fn,
        #                                                                             invest_window = invest_window,
        #                                                                             use_strategy = 'variable_trans_cost',
        #                                                                             trans_cost = 0.0005) \
        #                                                                 for score_fn in list( score_fns_variable_trans_cost_dict.values() )
        #                                                                     for invest_window in invest_windows)

        #     time.sleep(30)
        # else:
        #     pass

        # # joint-boxplot a performance statistic of a trading strategy over different scoring functions for a given loss function with hue = 'algo'
        # with joblib.parallel_backend('dask'):
        #     job_run = joblib.Parallel(verbose=20) (joblib.delayed(joint_plot2)( algos = algos,
        #                                                                         loss_fn = loss_fn,
        #                                                                         score_fns_dict = score_fns_fixed_trans_cost_dict,
        #                                                                         invest_window = invest_window,
        #                                                                         trans_cost = 0.05) \
        #                                                             for loss_fn in loss_fns
        #                                                                 for invest_window in invest_windows)
        # time.sleep(30)

        # with joblib.parallel_backend('dask'):
        #     job_run = joblib.Parallel(verbose=20) (joblib.delayed(joint_plot2)( algos = algos,
        #                                                                         loss_fn = loss_fn,
        #                                                                         score_fns_dict = score_fns_variable_trans_cost_dict,
        #                                                                         invest_window = invest_window,
        #                                                                         trans_cost = 0.0005) \
        #                                                             for loss_fn in loss_fns
        #                                                                 for invest_window in invest_windows)

        # time.sleep(30)

        ##### joint-boxplot a performance statistic of a trading strategy over different scoring functions for a given loss function with hue = four different ML models
        with joblib.parallel_backend('dask'):
            job_run = joblib.Parallel(verbose=20) (joblib.delayed(joint_plot3)( algos = ['RF_SPY_all_vars', 'RF_SPY_all_vars_plus_patterns', 'RF_RW_all_vars', 'RF_RW_all_vars_plus_patterns'],
                                                                                loss_fn = loss_fn,
                                                                                score_fns_dict = score_fns_fixed_trans_cost_dict,
                                                                                invest_window = invest_window,
                                                                                trans_cost = 0.05) \
                                                                    for loss_fn in loss_fns
                                                                        for invest_window in invest_windows)
        time.sleep(30)
    finally:
        # shut the cluster down even when a plotting job fails, so the fixed ports are not left occupied
        client.close()
        cluster.close()

    print( 'The script took {} second !'.format(time.time() - startTime) )

In [None]:
# ================================================= Plot Reality Check (RC) p-values ================================================================= #
# =========================================================================================================================================== #
import pandas as pd
import numpy as np
from pathlib import Path
import os
import sys
import datetime
import time
import re
import gc

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from cycler import cycler

# ------------------------------------------------------------------------------- Plot the RC p-values from RF trained on SPY ------------------------------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- #
# # Set path to the RC p-values generated by RF trained on SPY
# folder_path = Path(f'../Results/bootstrap/RF_SPY/')
# column_mapping = {  'RF_SPY_all_vars_CE_Accuracy': 'Dataset I: Accuracy',
#                                     'RF_SPY_all_vars_CE_AUC': 'Dataset I: AUC',
#                                     'RF_SPY_all_vars_CE_Gain_to_pain_ratio_fixed_trans_cost': 'Dataset I: Schwager\'s gain/pain ratio',
#                                     'RF_SPY_all_vars_CE_Calmar_ratio_fixed_trans_cost': 'Dataset I: Calmar ratio',
#                                     'RF_SPY_all_vars_CE_Sharpe_ratio_fixed_trans_cost': 'Dataset I: Sharpe ratio',
#                                     'RF_SPY_all_vars_CE_Sortino_ratio_fixed_trans_cost': 'Dataset I: Sortino ratio',
#                                     'RF_SPY_all_vars_CE_CECPP_fixed_trans_cost': 'Dataset I: CECPP',
#                                     'RF_SPY_all_vars_plus_patterns_CE_Accuracy': 'Dataset II: Accuracy',
#                                     'RF_SPY_all_vars_plus_patterns_CE_AUC': 'Dataset II: AUC',
#                                     'RF_SPY_all_vars_plus_patterns_CE_Gain_to_pain_ratio_fixed_trans_cost':  'Dataset II: Schwager\'s gain/pain ratio',
#                                     'RF_SPY_all_vars_plus_patterns_CE_Calmar_ratio_fixed_trans_cost': 'Dataset II: Calmar ratio',
#                                     'RF_SPY_all_vars_plus_patterns_CE_Sharpe_ratio_fixed_trans_cost': 'Dataset II: Sharpe ratio',
#                                     'RF_SPY_all_vars_plus_patterns_CE_Sortino_ratio_fixed_trans_cost': 'Dataset II: Sortino ratio',
#                                     'RF_SPY_all_vars_plus_patterns_CE_CECPP_fixed_trans_cost': 'Dataset II: CECPP'
#                                 }

# # Read all RC p-values to a dataframe
# file_path = folder_path.joinpath('all_p_values.csv')
# p_values_df = pd.read_csv(file_path, encoding='utf-8', sep = ',', low_memory=False, header = 0, skiprows = 0, skipinitialspace=True)
# p_values_df = p_values_df.rename(columns = column_mapping)
# display(p_values_df.shape )

# fig, ax = plt.subplots( figsize=(13, 8) )
# ax.set_prop_cycle(cycler('color', ['r', 'g', 'b', 'y']) * cycler('marker', ['o', 'v', '^', '>'])) #  cycler('linestyle', ['-', '--', ':', '-.']) 
# p_values_df.plot(kind='line', ax = ax)

# # set line widths, styles, and markers
# linewidths = [(i+2)/4. for i in range(p_values_df.shape[1])]
# for i, line in enumerate(ax.get_lines()):
#     line.set_linewidth(linewidths[i])
# #    line.set_linestyle(lines[i])
# #     line.set_marker(markers[i])

# # add a horizontal line
# ax.axhline(y = 0.05, linestyle='--', linewidth = 2, color ='black')
# ax.text(-1.1, 0.05, "0.05")
# ax.grid(ls=':')
# ax.legend(fontsize=10, loc='upper center', bbox_to_anchor=(0.5, -0.0), ncol = 2)
# ax.set_ylabel('RC p-value', fontsize=15)
# fig.savefig(folder_path.joinpath('rc_p_values.png'), dpi=150, bbox_inches="tight")

# ---------------------------------------------------------------------------------------- Plot the RC p-values from LGBM trained on SPY ------------------------------------------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ #

# Set path to the RC p-values generated by LGBM trained on SPY
folder_path = Path('../Results/bootstrap/LGBM_SPY/')

# Map the raw CSV column names, "<model prefix>_<loss>_<score>", to readable legend labels.
# The mapping is generated instead of being typed out entry by entry, so the
# 2 predictor sets x 5 loss functions x 7 scoring functions = 70 entries cannot drift apart.
# Insertion order: predictor set, then loss function, then scoring function.
_predictor_set_labels = {'LGBM_SPY_all_vars': 'Dataset I',
                         'LGBM_SPY_all_vars_plus_patterns': 'Dataset II'}

# Only the cross-entropy loss is typeset in math mode in the labels
_loss_fn_labels = {'CE': '$CE$',
                   'Brier': 'Brier',
                   'Boost': 'Boost',
                   'As1': 'As1',
                   'As2': 'As2'}

_score_fn_labels = {'Accuracy': 'Accuracy',
                    'AUC': 'AUC',
                    'Gain_to_pain_ratio_fixed_trans_cost': 'Schwager\'s gain/pain ratio',
                    'Calmar_ratio_fixed_trans_cost': 'Calmar ratio',
                    'Sharpe_ratio_fixed_trans_cost': 'Sharpe ratio',
                    'Sortino_ratio_fixed_trans_cost': 'Sortino ratio',
                    'CECPP_fixed_trans_cost': 'CECPP'}

column_mapping = {
    f'{prefix}_{loss_fn_name}_{score_fn_name}': f'{dataset} (loss: {loss_label}, score: {score_label})'
    for prefix, dataset in _predictor_set_labels.items()
    for loss_fn_name, loss_label in _loss_fn_labels.items()
    for score_fn_name, score_label in _score_fn_labels.items()
}

# Load every RC p-value and switch the columns to their human-readable labels
file_path = folder_path / 'all_p_values.csv'
p_values_df = pd.read_csv(
    file_path,
    encoding='utf-8',
    sep=',',
    low_memory=False,
    header=0,
    skiprows=0,
    skipinitialspace=True,
).rename(columns=column_mapping)
display(p_values_df.shape)

# Keep only the columns whose label contains the substring stored in `like`
like = 'Accuracy'
p_values_df = p_values_df.filter(like=like, axis=1)
display(p_values_df.head())

# One line per remaining column, drawn on an axes cycling through colour/marker combinations
fig, ax = plt.subplots(figsize=(13, 8))
color_cycle = cycler('color', ['r', 'g', 'b', 'y'])
marker_cycle = cycler('marker', ['o', 'v', '^', '>'])
ax.set_prop_cycle(color_cycle * marker_cycle)
p_values_df.plot(kind='line', ax=ax)

# The i-th line gets width (i + 2) / 4, so later columns are drawn progressively thicker
n_series = p_values_df.shape[1]
linewidths = [(i + 2) / 4. for i in range(n_series)]
for i, line in enumerate(ax.get_lines()):
    line.set_linewidth(linewidths[i])

# Dashed reference line at a p-value of 0.10, with its text label placed at x = -6.1 (data coordinates)
ax.axhline(y=0.10, linestyle='--', linewidth=2, color='black')
ax.text(-6.1, 0.10, "0.1")
ax.grid(ls=':')
ax.legend(fontsize=10, loc='upper center', bbox_to_anchor=(0.5, -0.0), ncol=2)
ax.set_ylabel('RC p-value', fontsize=15)

# The figure is stored next to the p-value file, one image per `like` filter
fig.savefig(folder_path.joinpath(f'rc_p_values_{like}.png'), dpi=150, bbox_inches="tight")


# colors = ['b', 'darkorange', 'y', 'c', 'm', 'k', 'crimson', 'limegreen', 'steelblue', 'gold', 'hotpink', 'maroon']
#     markers = ['o', 'v', '^', 'p', '>', '*', 'x', 'h', '+', 'p', "D", "d"] 
#     lines = ["-", "--", "-.", (0, (5, 1) ), "-", "--", (0, (3, 1, 1, 1) ), (0, (5, 1) ), (0, (5, 1)), (0, (3, 1, 1, 1, 1, 1)), (0, (3, 10, 1, 10, 1, 10)), "-"]
#     linewidths = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
#     ax = rmse_df.set_index('date').plot(kind='line', figsize=(10, 8), rot=10, color=colors)

#     # set line width and line style
#     for i, line in enumerate(ax.get_lines()):
#         line.set_linewidth(linewidths[i])
#         line.set_linestyle(lines[i])
#     #     line.set_marker(markers[i])
#     ax.grid(ls=':')
#     ax.legend(fontsize=15)
#     [l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
#     [l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
#     ax.set_ylabel('RMSE', fontsize=15)

#     plt.savefig(f'./US_df_big/graphs/rmse_fhorizon_{fhorizon}.pdf', dpi=500)
#     print(ax)


In [None]:
# =========================================================================================================================================== #
# Compare the Performance of a Trading Strategy based on Technical Indicators vs. Price Patterns across Multiple Forecasting Horizons for a given Loss Function and Scoring Function 
# =========================================================================================================================================== #
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import re
import os
import sys
import datetime
from varname import nameof
import time
import gc

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

#Gradient Color Bar Plots
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from matplotlib import colors as mcolors, path

from dask.distributed import Client, LocalCluster
import joblib
import multiprocessing
# import dask
# import distributed
# dask.config.set({"distributed.comm.timeouts.tcp": "100000s", "distributed.scheduler.allowed-failures": 999})
# num_cores = multiprocessing.cpu_count()
# Worker count fixed by hand; the cpu_count()-based alternative is commented out just above.
# NOTE(review): not referenced anywhere in the visible part of this cell - confirm it is still needed.
num_cores = 30

##### Set the current working directory
# The relative '../Results/...' paths used by joint_plot() below resolve against this directory.
# NOTE(review): this absolute path is machine-specific, and rebinding `path` here replaces the
# `matplotlib.path` module imported under the same name earlier in this cell.
path="e:/Copy/SCRIPTS/Forecast_Stocks/Jupyter_notebooks/"
os.chdir(path)

def _strategy_from_score_fn(score_fn_label, score_fn_name, default = 'fixed_trans_cost'):
    ''' Extract the trading-strategy suffix encoded in the name of a scoring function
    INPUT
        score_fn_label: the display label of the scoring function (e.g., 'Sharpe ratio')
        score_fn_name: the name of the scoring function used in the result paths (e.g., 'Sharpe_ratio_fixed_trans_cost')
        default: the value returned when no suffix can be extracted
    OUTPUT
        the lower-cased text that follows '<last word of score_fn_label>_' in score_fn_name,
        or `default` when there is no such text (e.g., for 'Accuracy' and 'AUC')
    '''
    words = score_fn_label.split()
    if not words:
        return default
    # the last word is escaped so that it is always matched literally
    found = re.search(r'(?<=' + re.escape(words[-1]) + r'_)\w+', score_fn_name, flags=re.IGNORECASE | re.VERBOSE)
    return found.group().lower() if found else default


##### Joint-boxplot a performance statistic of a trading strategy over forecasting horizons with hue = 'algos'
def joint_plot( algos = ['RF_SPY_all_vars', 'RF_SPY_all_vars_plus_patterns'],
                        loss_fn = 'CE',
                        score_fn_dict = {'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_fixed_trans_cost'},
                        taus = [1, 2, 4, 6, 8, 10, 12],
                        init_wealth = 1000,
                        invest_window = 200,
                        trans_cost = 0.5,
                        perf_stats_dict = {  'Average Number of Trades': 'average_number_of_trades',
                                                        'Percentage of Winning Trades': 'percentage_of_winning_trades',
                                                        'Largest Simple Return':   'largest_raw_return',
                                                        'Smallest Simple Return':    'smallest_raw_return',
                                                        'Ratio of Average Winning Trade to Average Losing Trade': 'ratio_win_loss',
                                                        'Maximum Number of Consecutive Winners': 'max_number_of_consecutive_winners',
                                                        'Maximum Number of Consecutive Losers':    'max_number_of_consecutive_losers',
                                                        'Risk-Adjusted Annualized Excess Return':    'annualized_excess_return',
                                                        'Annualized Standard Deviation':    'annualized_standard_deviation',
                                                        'Maximum Drawdown':   'max_drawdown',
                                                        'Schwager\'s Gain to Pain Ratio': 'Schwager_gain-to-pain_ratio',
                                                        'Calmar Ratio':  'Calmar_ratio',
                                                        'Sharpe Ratio':    'Sharpe_ratio',
                                                        'Sortino Ratio': 'Sortino_ratio',
                                                        'CECPP': 'cecpp',
                                                        'Morningstar\'s risk-adjusted rating': 'mrar'}
                        ):

    ''' Joint plot performance statistics of a trading strategy over different values of the forecasting horizon
    INPUT
        algos: ML algorithms used to forecast price movement directions (one box per algorithm and horizon);
            the legend is labelled 'Dataset I' / 'Dataset II', i.e. two entries are expected
        loss_fn: a loss function used to train a ML model
        score_fn_dict: a one-entry dictionary {label: name} of the scoring function used to cross-validate a ML model
            (i.e., one of  'Accuracy': 'Accuracy', 'AUC': 'AUC',  'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_fixed_trans_cost',
                                  'Calmar ratio': 'Calmar_ratio_fixed_trans_cost', 'Sharpe ratio': 'Sharpe_ratio_fixed_trans_cost',
                                  'Sortino ratio': 'Sortino_ratio_fixed_trans_cost',  'CECPP': 'CECPP_fixed_trans_cost' for the fixed cost strategy)
        taus: a list of forecasting horizons
        init_wealth: an initial wealth
        invest_window: a trading window
        trans_cost: a  value of transaction cost (i.e., {0.05, 0.1, 0.5, 1.0} for the fixed cost strategy and {0.0005, 0.001, 0.005, 0.01} for the variable cost strategy)
        perf_stats_dict: a dictionary {axis label: column name} of performance statistics
    OUTPUT
        True once one joint boxplot per performance statistic has been saved under '../Results/graphs/<algorithm>/';
        False (after printing the reason) when the performance files cannot be loaded, in which case nothing is written
    '''
    # NOTE: the list/dict defaults above are only ever read, never modified, so sharing them between calls is harmless.

    # retrieve the label and the name of the scoring function
    score_fn_dict_key, score_fn_dict_value = list(score_fn_dict.items())[0]

    # the text preceding '_all' in the first algorithm (e.g., 'LGBM_SPY') names the output folder
    algo_word = re.search(r'\w+(?=\_all)',  algos[0], flags=re.IGNORECASE | re.VERBOSE).group()
    out_dir = f'../Results/graphs/{algo_word}/'

    use_strategy = _strategy_from_score_fn(score_fn_dict_key, score_fn_dict_value)

    # import data into dataframes: list_dfs[i][j] holds the performance table of algos[i] at taus[j]
    list_dfs = []
    try:
        for algo in algos:
            algo_dfs = []
            for tau in taus:
                performance_df = pd.read_csv(f'../Results/{algo}/loss_fn={loss_fn}/score_fn={score_fn_dict_value}/tau={tau}/performance/' +
                                                                    f'performance_hper_{invest_window}_init_wealth_{init_wealth}_{use_strategy}_{trans_cost}.csv',
                                                                    engine = 'python', encoding='utf-8', skipinitialspace=True, sep = ',',
                                                                    parse_dates=['start_date', 'end_date'], index_col = 'end_date')
                # express the annualized excess return per unit of annualized standard deviation
                performance_df['annualized_excess_return'] = ( performance_df['annualized_excess_return'].values
                                                                                        / performance_df['annualized_standard_deviation'].values )
                performance_df = performance_df.drop(columns = ['start_date', 'ratio_profit_over_total_loss',  'annualized_return', 'annualized_return_bh'])
                algo_dfs.append(performance_df)
            list_dfs.append(algo_dfs)
    except (OSError, KeyError, ValueError) as er:
        # a missing or malformed file makes the comparison impossible: report it and stop cleanly
        # instead of failing later on with an unrelated IndexError
        print(f'joint_plot skipped (loss_fn={loss_fn}, score_fn={score_fn_dict_value}, '
                 f'invest_window={invest_window}, trans_cost={trans_cost}): {er}')
        return False

    # create the output directory only once there is something to plot
    # (exist_ok avoids a race when several workers create it at the same time)
    os.makedirs( out_dir, exist_ok = True )

    # melt all the dataframes to a long dataframe
    perf_stats_values = list( perf_stats_dict.values() )
    list_metric_dfs = []
    for algo, algo_dfs in zip(algos, list_dfs):
        for tau, performance_df in zip(taus, algo_dfs):
            metric_df = performance_df[perf_stats_values].reset_index(drop = False)
            metric_melted_df = pd.melt(metric_df, id_vars = 'end_date', var_name='perf_stats', value_name='value')
            metric_melted_df['tau'] = tau
            metric_melted_df['predictor_set'] = algo
            list_metric_dfs.append(metric_melted_df)
    merged_df = pd.concat(list_metric_dfs, axis = 0)

    # joint boxplot each performance statistic
    legend_labels = ['Dataset I', 'Dataset II']
    for perf_stat_key, perf_stat_value in perf_stats_dict.items():
        print('Performance metric: %s' % perf_stat_value)
        fig, ax = plt.subplots( figsize=(13, 8) )
        perf_stat_df = merged_df[ merged_df['perf_stats'] == perf_stat_value ].reset_index(drop = True)
        sns.boxplot(x = 'tau', y = 'value', hue = 'predictor_set', data=perf_stat_df, ax = ax)
        sns.stripplot(x = 'tau', y = 'value',   hue = 'predictor_set', data=perf_stat_df, jitter=0.12, dodge=True, palette='dark:gray', size=2, alpha=0.8, ax = ax)
        # both plots register legend handles; keep only as many (leading) handles as there are labels
        handles, _ = ax.get_legend_handles_labels()
        ax.axhline(0, ls = '--', linewidth = 2, color ='red')
        ax.grid(ls=':')
        ax.legend(handles = handles[:len(legend_labels)], labels = legend_labels, loc = 'upper right', title = 'Set of Predictors')
        ax.set_ylabel(perf_stat_key, fontsize=15)
        ax.set_xlabel('Forecasting Horizon', fontsize=15)
        ax.tick_params(axis='x', which='major', labelsize=13, labelrotation=0)
        out_file = f'{out_dir}/joint_boxplot_all_fhorizons_{loss_fn}_{score_fn_dict_value}_{invest_window}_{perf_stat_value}_{use_strategy}_{trans_cost}.png'
        fig.savefig(out_file, dpi=150, bbox_inches="tight")
        # close this figure explicitly so that repeated calls do not accumulate open figures
        plt.close(fig)

    # the local dataframes are released on return; this only reclaims leftover reference cycles
    gc.collect()

    return True

if __name__ == "__main__":
    startTime = time.time()

    # Predictor sets to compare (the random-forest pair is kept as a commented alternative)
    # algos = ['RF_SPY_all_vars', 'RF_SPY_all_vars_plus_patterns']
    algos = ['LGBM_SPY_all_vars', 'LGBM_SPY_all_vars_plus_patterns']

    # Loss functions used to train a ML model
    loss_fns = ['CE', 'Brier', 'Boost', 'As1', 'As2']
    # loss_fns = ['CE']

    # Scoring functions used to cross validate a ML algorithm, one {label: name} dictionary each
    score_fns_dict = [
        {'Accuracy': 'Accuracy'},
        {'AUC': 'AUC'},
        {'Schwager\'s gain/pain ratio': 'Gain_to_pain_ratio_fixed_trans_cost'},
        # {'Calmar ratio': 'Calmar_ratio_fixed_trans_cost'},      # left out of this run
        {'Sharpe ratio': 'Sharpe_ratio_fixed_trans_cost'},
        # {'Sortino ratio': 'Sortino_ratio_fixed_trans_cost'},    # left out of this run
        {'CECPP': 'CECPP_fixed_trans_cost'},
    ]

    # Holding periods
    invest_windows = [100, 200]

    # Transaction costs
    trans_costs = [0.05, 0.1, 0.5, 1.0]

    # Every combination of the settings above; the transaction cost varies fastest and the loss function slowest
    run_grid = [ (loss_fn, score_fn_dict, invest_window, trans_cost)
                        for loss_fn in loss_fns
                        for score_fn_dict in score_fns_dict
                        for invest_window in invest_windows
                        for trans_cost in trans_costs ]

    for loss_fn, score_fn_dict, invest_window, trans_cost in run_grid:
        joint_plot(algos = algos,
                        loss_fn = loss_fn,
                        score_fn_dict = score_fn_dict,
                        taus = [1, 2, 4, 6, 8, 10, 12],
                        invest_window = invest_window,
                        trans_cost = trans_cost
                        )

    # joint_plot()

    print( 'The script took {} second !'.format(time.time() - startTime) )