# Data Analysis
Model selection and analysis of the prediction results.

## Libraries and imports

In [1]:
import sys
sys.path.append('..')
from scripts.utils import initialize_bucket, plot_importance

from datetime import date

import pandas as pd
import numpy as np
import re
import ast

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold, cross_val_score,cross_validate
from sklearn.metrics import r2_score, mean_squared_error, make_scorer, explained_variance_score,mean_absolute_percentage_error
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.decomposition import PCA


from sklearn.preprocessing import StandardScaler, Normalizer

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import shap

import pickle
import json

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [None]:
# Let pandas display up to 400 columns before truncating wide frames.
pd.set_option("display.max_columns", 400)

# Credentials JSON file that is passed to initialize_bucket in the next cell.
credentials_path = '../datascience-capstone-project-05b1642f45c3.json'

In [None]:
# initialize_bucket is a project helper from scripts.utils; presumably it
# returns a storage client and a handle to the named bucket — confirm there.
client, bucket = initialize_bucket(credentials_path,'storage-barsianize')

# Base dataset: later cells split it into train/test to compute SHAP values.
path = "gs://storage-barsianize/05_datasets/df_base_dataset.parquet"
df =  pd.read_parquet(path)

# Dataset to score: later cells feed it to each saved pipeline's predict().
path = "gs://storage-barsianize/05_datasets/df_to_pred.parquet"
df_to_pred =  pd.read_parquet(path)

## Methods and functions

In [None]:
def load_model(path, model_name):
    """Load a pickled pipeline and extract its regressor and preprocessing step.

    The pipeline's ``named_steps`` are walked in order. The walk stops at the
    first step that is an ``LGBMRegressor`` or ``XGBRegressor``; a step named
    ``'preprocessing'`` seen before that point is kept as the transformer.

    Parameters
    ----------
    path : str
        Directory prefix; concatenated directly with ``model_name``.
    model_name : str
        File name of the pickled pipeline.

    Returns
    -------
    tuple
        ``(loaded_model, transformer)``. Either element is ``None`` when the
        corresponding step was not found.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # model files produced by this project.
    with open(path + model_name, 'rb') as file:
        pipeline = pickle.load(file)

    loaded_model = None
    # Initialised up front so a pipeline without a 'preprocessing' step (or
    # with the regressor listed first) returns None instead of raising
    # UnboundLocalError at the return statement.
    transformer = None
    for step_name, step_model in pipeline.named_steps.items():
        if isinstance(step_model, (LGBMRegressor, XGBRegressor)):
            loaded_model = step_model
            break
        if step_name == 'preprocessing':
            transformer = step_model

    return loaded_model, transformer

In [None]:
def make_predictions(path, model_name, results_file, X_to_pred):
    """Score ``X_to_pred`` with a saved pipeline.

    Parameters
    ----------
    path : str
        Directory prefix; concatenated directly with the file names.
    model_name : str
        File name of the pickled pipeline.
    results_file : str
        File name of the JSON results file; its ``'features'`` entry lists the
        columns the pipeline is fed.
    X_to_pred : pandas.DataFrame
        Frame holding (at least) those feature columns.

    Returns
    -------
    tuple
        ``(predictions, pipeline, model, transformer, features)`` where
        ``model`` and ``transformer`` come from ``load_model``.
    """
    with open(path + model_name, 'rb') as pipeline_fh:
        pipeline = pickle.load(pipeline_fh)

    with open(path + results_file, 'rb') as results_fh:
        results = json.load(results_fh)

    # load_model re-reads the same pickle to pull out the individual steps.
    model, transformer = load_model(path, model_name)

    features = results['features']
    predictions = pipeline.predict(X_to_pred[features])

    return predictions, pipeline, model, transformer, features

In [None]:
def get_feature_importances(model, model_class, features):
    """Return gain-based feature importances as a sorted one-column frame.

    Parameters
    ----------
    model : fitted LightGBM or XGBoost sklearn-style regressor
    model_class : str
        Name containing ``'LGB'`` or ``'XGB'``; selects the extraction path.
    features : sequence of str
        Feature names, in the order the model was trained on.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name with a single ``'Total gain'`` column, sorted
        in descending order.

    Raises
    ------
    ValueError
        If ``model_class`` names neither a LightGBM nor an XGBoost model.
    """
    if 'LGB' in model_class:
        feature_importances = model.booster_.feature_importance(importance_type='gain')
    elif 'XGB' in model_class:
        booster = model.get_booster()
        scores = booster.get_score(importance_type='gain')
        # get_score() leaves out features that were never used in a split, so
        # its values cannot be zipped positionally with `features`. Look each
        # feature up by the booster's own name for that position instead
        # ('f0', 'f1', ... when the booster was trained without names).
        booster_names = getattr(booster, 'feature_names', None) or \
            [f'f{position}' for position in range(len(features))]
        feature_importances = [scores.get(name, 0.0) for name in booster_names]
        # NOTE(review): XGBoost's 'gain' is the average gain per split while
        # LightGBM's 'gain' is the summed gain; 'total_gain' may be the
        # comparable XGBoost metric for the 'Total gain' label — confirm.
    else:
        raise ValueError(f"Unsupported model class: {model_class!r}")

    # Associate feature names with their importance scores.
    feature_importance_dict = dict(zip(features, feature_importances))
    df_feature_importance = pd.DataFrame(feature_importance_dict,
                                         index=['Total gain']).T.sort_values(by='Total gain', ascending=False)
    return df_feature_importance

In [None]:
def calculate_shap_values(model, transformer, X_train, X_test, path, filename, plot=False):
    """Compute SHAP values for ``X_test``, pickle them, and optionally plot.

    Parameters
    ----------
    model : fitted estimator or None
        Model handed to ``shap.Explainer``. When ``None`` a message is printed
        and nothing else happens.
    transformer : object with ``transform``
        Applied to both ``X_train`` (explainer background data) and ``X_test``.
    X_train, X_test : pandas.DataFrame
        Untransformed feature frames.
    path, filename : str
        Concatenated to form the pickle output location.
    plot : bool, default False
        When true, draw ``shap.summary_plot`` against the untransformed
        ``X_test``.

    Returns
    -------
    The SHAP values, or ``None`` when ``model`` is ``None``.
    """
    # Guard clause: nothing to explain without a model.
    if model is None:
        print("LightGBM model not found in the pipeline.")
        return None

    # Background data for the explainer is the transformed training set.
    explainer = shap.Explainer(model, transformer.transform(X_train))

    transformed_test = transformer.transform(X_test)
    shap_values = explainer(transformed_test, check_additivity=False)

    # Persist the values so they can be reloaded without recomputing.
    with open(path + filename, 'wb') as out_file:
        pickle.dump(shap_values, out_file)

    if plot:
        shap.summary_plot(shap_values, X_test)

    return shap_values
    

In [None]:
def get_dy_metrics(data, sort_col, filter_col,value_filter):
    """Summarise the top-5/10/20 rows of ``data`` ranked by ``sort_col``.

    Rows are ranked by ``sort_col`` in descending order. For each cut-off the
    mean of ``dy_label``, ``pl_mean_last`` and ``cotacao_mean_last`` is taken,
    first over all rows and then over the rows where
    ``data[filter_col] == value_filter``.

    Returns
    -------
    pandas.DataFrame
        Columns ``top5``, ``top10``, ``top20``, ``top5_<filter_col>``,
        ``top10_<filter_col>``, ``top20_<filter_col>``; rows ``dy_mean``,
        ``pl_mean_last``, ``cotacao_mean_last``.
    """
    ranked = data.sort_values(by=sort_col, ascending=False)
    ranked_filtered = ranked[ranked[filter_col] == value_filter]

    # (output row label, source column) pairs, in output order.
    metric_sources = (
        ('dy_mean', 'dy_label'),
        ('pl_mean_last', 'pl_mean_last'),
        ('cotacao_mean_last', 'cotacao_mean_last'),
    )

    summary = {}
    for frame, suffix in ((ranked, ''), (ranked_filtered, f'_{filter_col}')):
        for top_n in (5, 10, 20):
            summary[f'top{top_n}{suffix}'] = {
                label: frame[column].iloc[:top_n].mean()
                for label, column in metric_sources
            }

    return pd.DataFrame(summary)

In [None]:
def plot_results(data, y, title, palette):
    """Draw an annotated bar chart of column ``y``, one bar per index label.

    Bars are ordered by ascending ``y`` and each is labelled with its height
    formatted to three decimals.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index supplies the x-axis labels.
    y : str
        Column to plot.
    title : str
        Figure title.
    palette : str
        Seaborn palette name.
    """
    ordered = data.sort_values(y)

    plt.figure(figsize=(20, 20), dpi=400)
    ax = sns.barplot(x=ordered.index, y=y, palette=palette, data=ordered)

    plt.xlabel('Model Output', fontsize=24)
    plt.ylabel('DY [-]', fontsize=20)
    plt.title(title, fontsize=24)
    plt.xticks(rotation=90, fontsize=20)
    plt.yticks(fontsize=20)

    # Write each bar's value just above it.
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(f'{height:,.3f}', (bar.get_x() + 0.25, height + 0.005), fontsize=20, rotation=0)

    sns.despine()
    plt.tight_layout()


def create_syled_sheet(data, filter_word_in, sort_col, cmap):
    return data.sort_values(sort_col, ascending=False)[[col for col in data.columns if filter_word_in in col]]\
        .style.background_gradient(cmap=cmap,axis=0)


## Analyse the prediction outcomes for each model
### Best models (Transformer + Algorithm + Feature Selection)

In [None]:
# separating the train and target features
X_to_pred = df_to_pred.drop(['year_month','Papel','Empresa','dy_label'], axis=1)
y_to_pred = df_to_pred['dy_label']

In [None]:
# For each saved 2023-08-07 pipeline: score df_to_pred into its own prediction
# column, then compute SHAP values on a 70/30 split (random_state=32) of the
# base dataset restricted to the features that pipeline was trained on.
path = '../data/03_models/out/'

# '2023-08-07_LGBMRegressor_Normalizer()_feat_selection.pkl'

model_name = '2023-08-07_LGBMRegressor_Normalizer()_feat_selection.pkl'
results_file = '2023-08-07_LGBMRegressor_Normalizer()_results_feat_selection.json'
file_name_SHAP = '2023-08-07_LGBMRegressor_Normalizer()_results_feat_selection_SHAP.pkl'

df_to_pred['dy_pred_N_FS'], N_FS_0807_pipeline, N_FS_0807_model, N_FS_0807_transformer, N_FS_0807_features = make_predictions(path, model_name, results_file, X_to_pred)

X = df[N_FS_0807_features]
y = df['dy_label']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=32)

N_FS_0807_shap = calculate_shap_values(N_FS_0807_model, N_FS_0807_transformer, X_train, X_test, path, file_name_SHAP, plot=True)



# '2023-08-07_LGBMRegressor_StandardScaler()_feat_selection.pkl' 

model_name = '2023-08-07_LGBMRegressor_StandardScaler()_feat_selection.pkl'
results_file = '2023-08-07_LGBMRegressor_StandardScaler()_results_feat_selection.json'
file_name_SHAP = '2023-08-07_LGBMRegressor_StandardScaler()_results_feat_selection_SHAP.pkl'

df_to_pred['dy_pred_S_FS'], S_FS_0807_pipeline, S_FS_0807_model, S_FS_0807_transformer, S_FS_0807_features = make_predictions(path, model_name, results_file, X_to_pred)

X = df[S_FS_0807_features]
y = df['dy_label']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=32)

# The SHAP inputs must go through this pipeline's own StandardScaler step,
# not the Normalizer that belongs to the previous model.
S_FS_0807_shap = calculate_shap_values(S_FS_0807_model, S_FS_0807_transformer, X_train, X_test, path, file_name_SHAP, plot=True)



path = '../data/03_models/out/'

# '2023-08-07_LGBMRegressor_Normalizer()_feat_selection_lgbm.pkl'

model_name = '2023-08-07_LGBMRegressor_Normalizer()_feat_selection_lgbm.pkl'
results_file = '2023-08-07_LGBMRegressor_Normalizer()_results_feat_selection_lgbm.json'
file_name_SHAP = '2023-08-07_LGBMRegressor_Normalizer()_results_feat_selection_lgbm_SHAP.pkl'

df_to_pred['dy_pred_N_FS_LGBM'], N_FS_LGBM_0807_pipeline, N_FS_LGBM_0807_model, N_FS_LGBM_0807_transformer, N_FS_LGBM_0807_features = make_predictions(path, model_name, results_file, X_to_pred)

X = df[N_FS_LGBM_0807_features]
y = df['dy_label']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=32)

N_FS_LGBM_0807_shap = calculate_shap_values(N_FS_LGBM_0807_model, N_FS_LGBM_0807_transformer, X_train, X_test, path, file_name_SHAP, plot=True)



# '2023-08-07_LGBMRegressor_StandardScaler()_feat_selection_lgbm.pkl' 

model_name = '2023-08-07_LGBMRegressor_StandardScaler()_feat_selection_lgbm.pkl'
results_file = '2023-08-07_LGBMRegressor_StandardScaler()_results_feat_selection_lgbm.json'
file_name_SHAP = '2023-08-07_LGBMRegressor_StandardScaler()_results_feat_selection_lgbm_SHAP.pkl'

df_to_pred['dy_pred_S_FS_LGBM'], S_FS_LGBM_0807_pipeline, S_FS_LGBM_0807_model, S_FS_LGBM_0807_transformer, S_FS_LGBM_0807_features = make_predictions(path, model_name, results_file, X_to_pred)

X = df[S_FS_LGBM_0807_features]
y = df['dy_label']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=32)

S_FS_LGBM_0807_shap = calculate_shap_values(S_FS_LGBM_0807_model, S_FS_LGBM_0807_transformer, X_train, X_test, path, file_name_SHAP, plot=True)

In [None]:
# For each saved 2023-08-08 pipeline: score df_to_pred into its own prediction
# column, then compute SHAP values on a 70/30 split (random_state=32) of the
# base dataset restricted to the features that pipeline was trained on.
path = '../data/03_models/out/'

# '2023-08-08_LGBMRegressor_Normalizer()_feat_selection.pkl'

model_name = '2023-08-08_LGBMRegressor_Normalizer()_feat_selection.pkl'
results_file = '2023-08-08_LGBMRegressor_Normalizer()_results_feat_selection.json'
file_name_SHAP = '2023-08-08_LGBMRegressor_Normalizer()_results_feat_selection_SHAP.pkl'

df_to_pred['dy_pred_N_FS_0808'], N_FS_0808_pipeline, N_FS_0808_model, N_FS_0808_transformer, N_FS_0808_features = make_predictions(path, model_name, results_file, X_to_pred)

X = df[N_FS_0808_features]
y = df['dy_label']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=32)

N_FS_0808_shap = calculate_shap_values(N_FS_0808_model, N_FS_0808_transformer, X_train, X_test, path, file_name_SHAP, plot=True)



# '2023-08-08_LGBMRegressor_StandardScaler()_feat_selection.pkl' 

model_name = '2023-08-08_LGBMRegressor_StandardScaler()_feat_selection.pkl'
results_file = '2023-08-08_LGBMRegressor_StandardScaler()_results_feat_selection.json'
file_name_SHAP = '2023-08-08_LGBMRegressor_StandardScaler()_results_feat_selection_SHAP.pkl'

df_to_pred['dy_pred_S_FS_0808'], S_FS_0808_pipeline, S_FS_0808_model, S_FS_0808_transformer, S_FS_0808_features = make_predictions(path, model_name, results_file, X_to_pred)

X = df[S_FS_0808_features]
y = df['dy_label']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=32)

# The SHAP inputs must go through this pipeline's own StandardScaler step,
# not the Normalizer that belongs to the previous model.
S_FS_0808_shap = calculate_shap_values(S_FS_0808_model, S_FS_0808_transformer, X_train, X_test, path, file_name_SHAP, plot=True)



path = '../data/03_models/out/'

# '2023-08-08_LGBMRegressor_Normalizer()_feat_selection_lgbm.pkl'

model_name = '2023-08-08_LGBMRegressor_Normalizer()_feat_selection_lgbm.pkl'
results_file = '2023-08-08_LGBMRegressor_Normalizer()_results_feat_selection_lgbm.json'
file_name_SHAP = '2023-08-08_LGBMRegressor_Normalizer()_results_feat_selection_lgbm_SHAP.pkl'

df_to_pred['dy_pred_N_FS_LGBM_0808'], N_FS_LGBM_0808_pipeline, N_FS_LGBM_0808_model, N_FS_LGBM_0808_transformer, N_FS_LGBM_0808_features = make_predictions(path, model_name, results_file, X_to_pred)

X = df[N_FS_LGBM_0808_features]
y = df['dy_label']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=32)

N_FS_LGBM_0808_shap = calculate_shap_values(N_FS_LGBM_0808_model, N_FS_LGBM_0808_transformer, X_train, X_test, path, file_name_SHAP, plot=True)



# '2023-08-08_LGBMRegressor_StandardScaler()_feat_selection_lgbm.pkl' 

model_name = '2023-08-08_LGBMRegressor_StandardScaler()_feat_selection_lgbm.pkl'
results_file = '2023-08-08_LGBMRegressor_StandardScaler()_results_feat_selection_lgbm.json'
file_name_SHAP = '2023-08-08_LGBMRegressor_StandardScaler()_results_feat_selection_lgbm_SHAP.pkl'

df_to_pred['dy_pred_S_FS_LGBM_0808'], S_FS_LGBM_0808_pipeline, S_FS_LGBM_0808_model, S_FS_LGBM_0808_transformer, S_FS_LGBM_0808_features = make_predictions(path, model_name, results_file, X_to_pred)

X = df[S_FS_LGBM_0808_features]
y = df['dy_label']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=32)

S_FS_LGBM_0808_shap = calculate_shap_values(S_FS_LGBM_0808_model, S_FS_LGBM_0808_transformer, X_train, X_test, path, file_name_SHAP, plot=True)


## Recommend the 20 best stocks

In [None]:
# First four characters of the ticker; used below to keep a single row per
# root (presumably one per company — confirm against the ticker format).
df_to_pred['Papel_root'] = df_to_pred['Papel'].str.slice(stop=4)

# Descriptive columns followed by every model's prediction column.
descriptive_columns = ['Empresa', 'Papel', 'Papel_root', 'besst_1', 'besst_2',
                       'dy_label', 'pl_mean_last', 'cotacao_mean_last']
model_prediction_columns = ['dy_pred_N_FS',
                            'dy_pred_S_FS',
                            'dy_pred_N_FS_LGBM',
                            'dy_pred_S_FS_LGBM',
                            'dy_pred_N_FS_0808',
                            'dy_pred_S_FS_0808',
                            'dy_pred_N_FS_LGBM_0808',
                            'dy_pred_S_FS_LGBM_0808']

# Rank by the 'dy_pred_N_FS' prediction and keep the best-ranked row per root.
ranked_candidates = df_to_pred[descriptive_columns + model_prediction_columns].sort_values(
    by='dy_pred_N_FS', ascending=False)
df_recommend = ranked_candidates.drop_duplicates(subset=['Papel_root'], keep='first')

In [None]:
# Ten rows with the highest 'dy_pred_N_FS_LGBM' prediction.
df_recommend.sort_values(by='dy_pred_N_FS_LGBM', ascending=False).head(10)

In [None]:
# Prediction columns to evaluate, in the order the result dictionaries are keyed.
prediction_columns = [
    'dy_pred_N_FS_0808',
    'dy_pred_S_FS_0808',
    'dy_pred_N_FS_LGBM_0808',
    'dy_pred_S_FS_LGBM_0808',
    'dy_pred_N_FS',
    'dy_pred_S_FS',
    'dy_pred_N_FS_LGBM',
    'dy_pred_S_FS_LGBM',
]

# Top-N metrics per model, with the filtered variant taken on besst_1 == 1 ...
dict_results = {column: get_dy_metrics(df_recommend, column, 'besst_1', 1)
                for column in prediction_columns}

# ... and on besst_2 == 1.
dict_results2 = {column: get_dy_metrics(df_recommend, column, 'besst_2', 1)
                 for column in prediction_columns}

                

In [None]:
# Combined score: for every model, log(1 + max(prediction, 0) * w) where w is
# the realised mean DY of that model's top-10 picks; the per-model terms are
# then added up in the order listed here.
score_columns = (
    'dy_pred_N_FS_0808',
    'dy_pred_S_FS_0808',
    'dy_pred_N_FS_LGBM_0808',
    'dy_pred_S_FS_LGBM_0808',
    'dy_pred_N_FS',
    'dy_pred_S_FS',
    'dy_pred_N_FS_LGBM',
    'dy_pred_S_FS_LGBM',
)

score_terms = [
    # .where keeps non-negative predictions and replaces everything else with 0.
    np.log(1 + df_recommend[column].where(df_recommend[column] >= 0, 0)
           * dict_results[column]['top10'].loc['dy_mean'])
    for column in score_columns
]

# Left-to-right accumulation, starting from the first term.
df_recommend['score'] = sum(score_terms[1:], score_terms[0])

In [None]:
# Evaluate the combined score with the same top-N metrics as the single models.
dict_results['score'] = get_dy_metrics(data=df_recommend, sort_col='score', filter_col='besst_1', value_filter=1)
dict_results2['score'] = get_dy_metrics(data=df_recommend, sort_col='score', filter_col='besst_2', value_filter=1)

In [None]:
# Realised DY of the 50 best-ranked stocks for every ranking column, over all
# stocks and over the besst_1 == 1 / besst_2 == 1 subsets.
besst_1_rows = df_recommend[df_recommend['besst_1']==1]
besst_2_rows = df_recommend[df_recommend['besst_2']==1]

top_results = {}
for key in dict_results.keys():
    top_results[key] = df_recommend.sort_values(by=key, ascending=False)['dy_label'].iloc[:50].to_list()

for key in dict_results.keys():
    top_results[f'{key}_besst_1'] = besst_1_rows.sort_values(by=key, ascending=False)['dy_label'].iloc[:50].to_list()

for key in dict_results.keys():
    top_results[f'{key}_besst_2'] = besst_2_rows.sort_values(by=key, ascending=False)['dy_label'].iloc[:50].to_list()

# A subset with fewer than 50 rows yields a shorter list, and a dict of
# unequal-length lists makes pd.DataFrame raise ValueError. Wrapping each list
# in a Series pads the short columns with NaN instead.
df_top_results = pd.DataFrame({name: pd.Series(values) for name, values in top_results.items()})

In [None]:
# One single-column frame per (statistic, cut-off): melt the first N rows to
# long form, group by model name and aggregate. Statistic-major order keeps the
# columns as mean 5/10/20, std 5/10/20, median 5/10/20.
stat_frames = []
for stat_name in ('mean', 'std', 'median'):
    for top_n in (5, 10, 20):
        melted = pd.melt(df_top_results.iloc[:top_n],
                         value_name=f'DY_{stat_name}_top{top_n}', var_name='Model')
        stat_frames.append(getattr(melted.groupby('Model'), stat_name)())

df_top_results_melt_full = pd.concat(stat_frames, axis=1)

# Keep the per-cut-off names bound (to the median frames, the last computed).
df_top_results_melt_top5, df_top_results_melt_top10, df_top_results_melt_top20 = stat_frames[-3:]

df_top_results_melt_full.head()

### Analysis for the overall stocks

In [None]:
# Rows for the overall rankings only (no 'besst_' suffix in the model name).
overall_models = [label for label in df_top_results_melt_full.index if 'besst_' not in label]
df_top_results_melt = df_top_results_melt_full.loc[overall_models]

# Mean / std per cut-off; the two column groups are paired by position.
mean_values = df_top_results_melt.filter(like='mean').to_numpy()
std_values = df_top_results_melt.filter(like='std').to_numpy()
df_top_results_melt[['DY_sharpe_ratio_top5','DY_sharpe_ratio_top10','DY_sharpe_ratio_top20']] = mean_values / std_values

df_top_results_melt

#### Top 5 Stocks

In [None]:
plot_results(df_top_results_melt, 'DY_mean_top5', 'DY Mean Value - Top 5','rocket')

In [None]:
plot_results(df_top_results_melt, 'DY_std_top5', 'DY Std Value - Top 5','viridis')

In [None]:
# The plotted column is the top-10 mean, so the title must say "Mean".
# NOTE(review): this cell sits under "Top 5 Stocks" yet plots a top-10 metric,
# duplicating the first plot of the "Top 10 Stocks" section — confirm whether a
# top-5 metric was intended here.
plot_results(df_top_results_melt, 'DY_mean_top10', 'DY Mean Value - Top 10','rocket')

#### Top 10 Stocks

In [None]:
# Mean DY of each model's top-10 picks (the title previously said "Std").
plot_results(df_top_results_melt, 'DY_mean_top10', 'DY Mean Value - Top 10','rocket')

In [None]:
plot_results(df_top_results_melt, 'DY_std_top10', 'DY Std Value - Top 10','viridis')

#### Top 20 Stocks

In [None]:
plot_results(df_top_results_melt, 'DY_mean_top20', 'DY Mean Value - Top 20','rocket')

In [None]:
plot_results(df_top_results_melt, 'DY_std_top20', 'DY Std Value - Top 20','viridis')

#### Overall results

In [None]:
create_syled_sheet(df_top_results_melt, 'mean', 'DY_mean_top10', 'vlag')

In [None]:
create_syled_sheet(df_top_results_melt, 'sharpe_ratio', 'DY_sharpe_ratio_top10', 'vlag')

### Analysis for the BESST_1 Stocks (Sectors: A, B, C ...)

In [None]:
# Rows for the rankings restricted to besst_1 stocks.
besst_1_models = [label for label in df_top_results_melt_full.index if 'besst_1' in label]
df_top_results_melt = df_top_results_melt_full.loc[besst_1_models]

# Mean / std per cut-off; the two column groups are paired by position.
mean_values = df_top_results_melt.filter(like='mean').to_numpy()
std_values = df_top_results_melt.filter(like='std').to_numpy()
df_top_results_melt[['DY_sharpe_ratio_top5','DY_sharpe_ratio_top10','DY_sharpe_ratio_top20']] = mean_values / std_values

df_top_results_melt

In [None]:
create_syled_sheet(df_top_results_melt, 'mean', 'DY_mean_top10', 'vlag')

In [None]:
create_syled_sheet(df_top_results_melt, 'sharpe_ratio', 'DY_sharpe_ratio_top10', 'vlag')

### Analysis for the BESST_2 Stocks (Sectors: A, B, C ...)

In [None]:
# Rows for the rankings restricted to besst_2 stocks.
besst_2_models = [label for label in df_top_results_melt_full.index if 'besst_2' in label]
df_top_results_melt = df_top_results_melt_full.loc[besst_2_models]

# Mean / std per cut-off; the two column groups are paired by position.
mean_values = df_top_results_melt.filter(like='mean').to_numpy()
std_values = df_top_results_melt.filter(like='std').to_numpy()
df_top_results_melt[['DY_sharpe_ratio_top5','DY_sharpe_ratio_top10','DY_sharpe_ratio_top20']] = mean_values / std_values

df_top_results_melt

In [None]:
create_syled_sheet(df_top_results_melt, 'mean', 'DY_mean_top10', 'vlag')

In [None]:
create_syled_sheet(df_top_results_melt, 'sharpe_ratio', 'DY_sharpe_ratio_top10', 'vlag')

### Analysis for the overall stocks ranked 11 to 30

In [None]:
# Windows that start right after the first ten ranks. The 'top5' / 'top10' /
# 'top20' column labels are kept, but here they cover positions 10:15, 10:20
# and 10:30 of each ranking.
next_twenty = df_top_results.iloc[10:30]
next_ten = df_top_results.iloc[10:20]
next_five = df_top_results.iloc[10:15]

df_top_results_melt_top20 = next_twenty.melt(var_name='Model', value_name='DY_mean_top20').groupby('Model').mean()
df_top_results_melt_top10 = next_ten.melt(var_name='Model', value_name='DY_mean_top10').groupby('Model').mean()
df_top_results_melt_top5 = next_five.melt(var_name='Model', value_name='DY_mean_top5').groupby('Model').mean()

In [None]:
# Hold on to the mean frames before the per-window names are rebound to the
# std frames below.
mean_frames = [df_top_results_melt_top5, df_top_results_melt_top10, df_top_results_melt_top20]

df_top_results_melt_top20 = df_top_results.iloc[10:30].melt(
    var_name='Model', value_name='DY_std_top20').groupby('Model').std()
df_top_results_melt_top10 = df_top_results.iloc[10:20].melt(
    var_name='Model', value_name='DY_std_top10').groupby('Model').std()
df_top_results_melt_top5 = df_top_results.iloc[10:15].melt(
    var_name='Model', value_name='DY_std_top5').groupby('Model').std()

# Columns: mean 5/10/20 followed by std 5/10/20.
std_frames = [df_top_results_melt_top5, df_top_results_melt_top10, df_top_results_melt_top20]
df_top_results_melt_full = pd.concat(mean_frames + std_frames, axis=1)

In [None]:
# Rows for the overall rankings only (no 'besst_' suffix in the model name).
overall_models = [label for label in df_top_results_melt_full.index if 'besst_' not in label]
df_top_results_melt = df_top_results_melt_full.loc[overall_models]

# Mean / std per window; the two column groups are paired by position.
mean_values = df_top_results_melt.filter(like='mean').to_numpy()
std_values = df_top_results_melt.filter(like='std').to_numpy()
df_top_results_melt[['DY_sharpe_ratio_top5','DY_sharpe_ratio_top10','DY_sharpe_ratio_top20']] = mean_values / std_values

df_top_results_melt

In [None]:
plot_results(df_top_results_melt, 'DY_mean_top10', 'DY Mean Value - Top 10','rocket')

In [None]:
plot_results(df_top_results_melt, 'DY_std_top10', 'DY Std Value - Top 10','viridis')

In [None]:
create_syled_sheet(df_top_results_melt, 'mean', 'DY_mean_top10', 'vlag')

In [None]:
create_syled_sheet(df_top_results_melt, 'sharpe_ratio', 'DY_sharpe_ratio_top10', 'vlag')