In [1]:
## This file performs alternative machine learning models

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tseries.offsets import *
from tqdm import tqdm
from functools import reduce
import statsmodels.api as sm
import scipy.stats as stats
from linearmodels import PanelOLS

from functions import utils
from functions import summary2

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.model_selection import ParameterGrid

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

plt.rcParams['font.sans-serif']=['Times New Roman']
plt.rcParams.update({'font.size':13})
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['grid.color'] = 'gray'
plt.rcParams['grid.linestyle'] = '--'
%config InlineBackend.figure_format = 'retina'

# ML Forecasts

In [None]:
ratio_chars = ['CAPEI', 'bm',
       'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',
       'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe',
       'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',
       'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',
       'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt',
       'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at',
       'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct',
       'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',
       'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio',
       'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',
       'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc',
       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'PEG_trailing',
       'divyield']

per_share_chars = ['dividend_p','BE_p','Liability_p','cur_liability_p','LT_debt_p',
                  'cash_p', 'total_asset_p', 'tot_debt_p', 'accrual_p', 'EBIT_p', 
                   'cur_asset_p', 'pbda_p', 'ocf_p', 'inventory_p', 'receivables_p',
                   'Cur_debt_p', 'interest_p', 'fcf_ocf_p', 'evm_p',
                   'sales_p', 'invcap_p', 'c_equity_p', 'rd_p', 'opmad_p', 'gpm_p','ptpm_p'
                  ]

macro_chars = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']

fundamental_chars = ['ret', 'prc',
                    'EPS_true_l1_q1','EPS_true_l1_q2','EPS_true_l1_q3',
                    'EPS_true_l1_y1','EPS_true_l1_y2',
                    ]

analyst_chars = ['EPS_ana_q1','EPS_ana_q2','EPS_ana_q3','EPS_ana_y1','EPS_ana_y2']

targets = ['EPS_true_q1', 'EPS_true_q2', 'EPS_true_q3', 'EPS_true_y1', 'EPS_true_y2']

df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')
df_tmp['Year'] = df_tmp['YearMonth'].dt.year

In [None]:
def get_data(train_data,validation_data,test_data,X_col,Y_col):
    
    train_X = train_data.dropna(subset=X_col+[Y_col])[X_col]
    train_y = train_data.dropna(subset=X_col+[Y_col])[Y_col]
    
    valid_X = validation_data.dropna(subset=X_col+[Y_col])[X_col]
    valid_y = validation_data.dropna(subset=X_col+[Y_col])[Y_col]
    
    train_valid_X = pd.concat([train_X, valid_X],axis=0)
    train_valid_y = pd.concat([train_y, valid_y],axis=0)
    
    test_X = test_data.dropna(subset=X_col+[Y_col])[X_col]
    test_y = test_data.dropna(subset=X_col+[Y_col])[Y_col]
    
    return train_X, train_y, valid_X, valid_y, train_valid_X, train_valid_y, test_X, test_y

def GridSearch(mdl_class, param_grid, 
               train_X, train_y, valid_X, valid_y, 
               metrics, higher_better=True):
    '''
    GridSearch using given validation data for sklearn-type models.
    mdl_class: e.g., RandomForestRegressor
    param_grid: e.g.,
        param_grid = {
                    'n_estimators': [200,500],
                    'max_depth' : [2,],#3,4,5,6],
                    'max_features' : [3, 5,],# 10, 15]
                    'random_state': [0]
                    }
    train_X, train_y, valid_X, valid_y: Train and Validation data
    metrics: evaluation metrics
    higher_better: if True, return the model with highest evaluation score
    
    Output: best_param(dict); best_mdl(the trained model)
    '''
    ## Function for Validation
    validation_scores = []
    # for each parameter, train a model and test on validation set
    for params in ParameterGrid(param_grid):
        mdl = mdl_class().set_params(**params)
        mdl.fit(train_X, train_y)
        validation_scores.append((params,mdl,metrics(valid_y, mdl.predict(valid_X))))
        # break
    # sort based on validation score    
    validation_scores = sorted(validation_scores, key=lambda x: x[2])

    if higher_better:
        best_param = validation_scores[-1][0]
        best_mdl = validation_scores[-1][1]
    else:
        best_param = validation_scores[0][0]
        best_mdl = validation_scores[0][1]
    return best_param, best_mdl

In [None]:
X_col_qtr = ratio_chars + ['ret','prc','EPS_true_l1_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
X_col_ann = ratio_chars + ['ret','prc','EPS_true_l1_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
# Rolling Window
train_window = 36
validation_window = 12
# Output path
output_dir = '../data/Results/ML_variants/'

In [None]:
models = [
         ('OLS',LinearRegression,{}),   
         ('RF',RandomForestRegressor,{'n_estimators': [1000],'max_depth' : [4,6,8,10],
                                                   'max_samples' : [0.05],'min_samples_leaf': [5],
                                                   'random_state': [0],'n_jobs': [32]
                                                  }),
         ('LGBM',LGBMRegressor,{'n_estimators': [100, 200, 300],'learning_rate' : [0.01, 0.03, 0.07, 0.1],
                                             'max_depth': [3, 4, 5, 6],'random_state': [0],'verbose':[-1]
                                            }),
        #  ('PLS',PLSRegression,{'n_components': np.arange(1,20)}),
        #  ('LASSO',Lasso,{'alpha': np.logspace(-4,-1,20),}),
        #  ('ENet',ElasticNet,{'alpha': np.logspace(-4,-1,20),}),
        ]

In [None]:
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1989-01-01')]
for (mdl_abbr, mdl_class, param_grid) in models:
    pred_value = []
    for t in tqdm(time_idx):
        pred_value_t = []
        for q in [1,2,3]:
            X_col = X_col_qtr + [f'EPS_ana_q{q}']
            y_col = f'EPS_true_q{q}'
            
            ### sample splitting ###
            train_data = df_tmp[(df_tmp['YearMonth'] >= t - MonthEnd(validation_window) - MonthEnd(train_window)) & \
                                (df_tmp['YearMonth'] < t - MonthEnd(validation_window)) & \
                                (df_tmp[f'ANNDATS_q{q}'] + MonthEnd(0) < t - MonthEnd(validation_window))
                               ].set_index(['permno','YearMonth'])

            validation_data = df_tmp[(df_tmp['YearMonth'] >= t - MonthEnd(validation_window)) & \
                                     (df_tmp['YearMonth'] < t) & \
                                     (df_tmp[f'ANNDATS_q{q}'] < t)
                                    ].set_index(['permno','YearMonth'])

            test_data = df_tmp[(df_tmp[f'ANNDATS_q{q}']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].set_index(['permno','YearMonth'])

            train_X, train_y, valid_X, valid_y, train_valid_X, train_valid_y, test_X, test_y = get_data(train_data, validation_data, test_data, X_col, y_col)
            
            ## Validation to choose Best Parameter
            best_param, best_mdl = GridSearch(mdl_class, param_grid, 
                                          train_X, train_y, valid_X, valid_y, 
                                          r2_score)
            # print(best_param)
            best_mdl = mdl_class().set_params(**best_param).fit(train_valid_X, train_valid_y)
            if mdl_abbr == 'PLS':
                pred_value_t.append(pd.Series(best_mdl.predict(test_X)[:,0], name=f'{mdl_abbr}_EPS_Q{q}', index=test_X.index))
            else:
                pred_value_t.append(pd.Series(best_mdl.predict(test_X), name=f'{mdl_abbr}_EPS_Q{q}', index=test_X.index))
            # break
        # break
        for y in [1,2]:
            X_col = X_col_ann + [f'EPS_ana_y{y}']
            y_col = f'EPS_true_y{y}'
            if y == 2:
                validation_window = 24

            ### sample splitting ###
            train_data = df_tmp[(df_tmp['YearMonth'] >= t - MonthEnd(validation_window) - MonthEnd(train_window)) & \
                            (df_tmp['YearMonth'] < t - MonthEnd(validation_window)) & \
                            (df_tmp[f'ANNDATS_y{y}'] + MonthEnd(0) < t - MonthEnd(validation_window))
                           ].set_index(['permno','YearMonth'])

            validation_data = df_tmp[(df_tmp['YearMonth'] >= t - MonthEnd(validation_window)) & \
                                     (df_tmp['YearMonth'] < t) & \
                                     (df_tmp[f'ANNDATS_y{y}'] < t)
                                    ].set_index(['permno','YearMonth'])

            test_data = df_tmp[(df_tmp[f'ANNDATS_y{y}']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].set_index(['permno','YearMonth'])

            train_X, train_y, valid_X, valid_y, train_valid_X, train_valid_y, test_X, test_y = get_data(train_data, validation_data, test_data, X_col, y_col)

            ## Validation to choose Best Parameter
            best_param, best_mdl = GridSearch(mdl_class, param_grid, 
                                          train_X, train_y, valid_X, valid_y, 
                                          r2_score)

            best_mdl = mdl_class().set_params(**best_param).fit(train_valid_X, train_valid_y)
            if mdl_abbr == 'PLS':
                pred_value_t.append(pd.Series(best_mdl.predict(test_X)[:,0], name=f'{mdl_abbr}_EPS_Y{y}', index=test_X.index))
            else:
                pred_value_t.append(pd.Series(best_mdl.predict(test_X), name=f'{mdl_abbr}_EPS_Y{y}', index=test_X.index))
            # break

        pred_value_t = pd.concat(pred_value_t,axis=1,)
        pred_value.append(pred_value_t)
        # break
        
    pred_value = pd.concat(pred_value, axis=0)
    pred_value.reset_index().to_parquet(f'{output_dir}{mdl_abbr}_pred.parquet')
    
    # break

In [None]:
## For LASSO, ElasticNet, and PLS, we need to standardize the data
###############################
### Standardization: scale to the same cross-sectional std of last EPS ##
###############################
df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')
df_tmp['Year'] = df_tmp['YearMonth'].dt.year
cols = ratio_chars
df_tmp[cols] = df_tmp.groupby('YearMonth',group_keys=False)[cols]\
                             .transform(lambda x: x / x.std()) 
df_tmp[cols] = df_tmp[cols].mul(df_tmp.groupby('YearMonth',group_keys=False)['EPS_true_l1_q1'].transform('std'), axis=0)
models = [
         ('PLS',PLSRegression,{'n_components': np.arange(1,20)}),
         ('LASSO',Lasso,{'alpha': np.logspace(-4,0,20),}),
         ('ENet',ElasticNet,{'alpha': np.logspace(-4,0,20),}),
        ]

In [None]:
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1989-01-01')]
for (mdl_abbr, mdl_class, param_grid) in models:
    pred_value = []
    for t in tqdm(time_idx):
        pred_value_t = []
        for q in [1,2,3]:
            X_col = X_col_qtr + [f'EPS_ana_q{q}']
            y_col = f'EPS_true_q{q}'
            
            ### sample splitting ###
            train_data = df_tmp[(df_tmp['YearMonth'] >= t - MonthEnd(validation_window) - MonthEnd(train_window)) & \
                                (df_tmp['YearMonth'] < t - MonthEnd(validation_window)) & \
                                (df_tmp[f'ANNDATS_q{q}'] + MonthEnd(0) < t - MonthEnd(validation_window))
                               ].set_index(['permno','YearMonth'])

            validation_data = df_tmp[(df_tmp['YearMonth'] >= t - MonthEnd(validation_window)) & \
                                     (df_tmp['YearMonth'] < t) & \
                                     (df_tmp[f'ANNDATS_q{q}'] < t)
                                    ].set_index(['permno','YearMonth'])

            test_data = df_tmp[(df_tmp[f'ANNDATS_q{q}']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].set_index(['permno','YearMonth'])

            train_X, train_y, valid_X, valid_y, train_valid_X, train_valid_y, test_X, test_y = get_data(train_data, validation_data, test_data, X_col, y_col)
            
            ## Validation to choose Best Parameter
            best_param, best_mdl = GridSearch(mdl_class, param_grid, 
                                          train_X, train_y, valid_X, valid_y, 
                                          r2_score)
            # print(best_param)
            best_mdl = mdl_class().set_params(**best_param).fit(train_valid_X, train_valid_y)
            if mdl_abbr == 'PLS':
                pred_value_t.append(pd.Series(best_mdl.predict(test_X)[:,0], name=f'{mdl_abbr}_EPS_Q{q}', index=test_X.index))
            else:
                pred_value_t.append(pd.Series(best_mdl.predict(test_X), name=f'{mdl_abbr}_EPS_Q{q}', index=test_X.index))
            # break
        # break
        for y in [1,2]:
            X_col = X_col_ann + [f'EPS_ana_y{y}']
            y_col = f'EPS_true_y{y}'
            if y == 2:
                validation_window = 24

            ### sample splitting ###
            train_data = df_tmp[(df_tmp['YearMonth'] >= t - MonthEnd(validation_window) - MonthEnd(train_window)) & \
                            (df_tmp['YearMonth'] < t - MonthEnd(validation_window)) & \
                            (df_tmp[f'ANNDATS_y{y}'] + MonthEnd(0) < t - MonthEnd(validation_window))
                           ].set_index(['permno','YearMonth'])

            validation_data = df_tmp[(df_tmp['YearMonth'] >= t - MonthEnd(validation_window)) & \
                                     (df_tmp['YearMonth'] < t) & \
                                     (df_tmp[f'ANNDATS_y{y}'] < t)
                                    ].set_index(['permno','YearMonth'])

            test_data = df_tmp[(df_tmp[f'ANNDATS_y{y}']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].set_index(['permno','YearMonth'])

            train_X, train_y, valid_X, valid_y, train_valid_X, train_valid_y, test_X, test_y = get_data(train_data, validation_data, test_data, X_col, y_col)

            ## Validation to choose Best Parameter
            best_param, best_mdl = GridSearch(mdl_class, param_grid, 
                                          train_X, train_y, valid_X, valid_y, 
                                          r2_score)

            best_mdl = mdl_class().set_params(**best_param).fit(train_valid_X, train_valid_y)
            if mdl_abbr == 'PLS':
                pred_value_t.append(pd.Series(best_mdl.predict(test_X)[:,0], name=f'{mdl_abbr}_EPS_Y{y}', index=test_X.index))
            else:
                pred_value_t.append(pd.Series(best_mdl.predict(test_X), name=f'{mdl_abbr}_EPS_Y{y}', index=test_X.index))
            # break

        pred_value_t = pd.concat(pred_value_t,axis=1,)
        pred_value.append(pred_value_t)
        # break
        
    pred_value = pd.concat(pred_value, axis=0)
    pred_value.reset_index().to_parquet(f'{output_dir}{mdl_abbr}_pred.parquet')
    
    # break

# Table F.3

In [3]:
df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')
f_abbr_list = [('OLS_pred','OLS'),
               ('PLS_pred','PLS'),
               ('LASSO_pred','LASSO'),
               ('ENet_pred','ENet'),
               ('RF_pred','RF'),
               ('LGBM_pred','LGBM'),
              ]
abbr_list = list(map(lambda x: x[1], f_abbr_list))

forecast_all = []
for f,abbr in f_abbr_list:
    RF = pd.read_parquet(f'../data/Results/ML_variants/{f}.parquet')
    RF = RF[['permno','YearMonth',f'{abbr}_EPS_Q1',f'{abbr}_EPS_Q2',f'{abbr}_EPS_Q3',
             f'{abbr}_EPS_Y1',f'{abbr}_EPS_Y2',]].set_index(['permno','YearMonth'])
    forecast_all.append(RF)
forecast_all = reduce(lambda x,y: pd.merge(x,y,on=['permno','YearMonth'],how='outer'),
                      forecast_all)
forecast_all.reset_index(inplace=True)

## Composite
for i in ['Q1','Q2','Q3','Y1','Y2']:
    forecast_all[f'Composite_EPS_{i}'] = forecast_all[[f'OLS_EPS_{i}',f'PLS_EPS_{i}',f'LASSO_EPS_{i}',
                                                   f'ENet_EPS_{i}',f'RF_EPS_{i}',f'LGBM_EPS_{i}']].mean(axis=1)
    
df = df_tmp.merge(forecast_all, on=['permno','YearMonth'])

In [4]:
# test = pd.read_parquet(f'../data/Results/RF_wo_lookahead_raw_005.parquet')
# test = test[test['YearMonth'] >= '1989-01-31']
# for h in ['q1','q2','q3','y1','y2']:
#     print(((test[f'RF_{h}'] - test[f'AE_{h}'])**2).groupby(test['YearMonth']).mean().mean())

In [5]:
## EX-POST Realization
df['REAL_EPS_Q1'] = df['EPS_true_q1']
df['REAL_EPS_Q2'] = df['EPS_true_q2']
df['REAL_EPS_Q3'] = df['EPS_true_q3']
df['REAL_EPS_Y1'] = df['EPS_true_y1']
df['REAL_EPS_Y2'] = df['EPS_true_y2']
## Analyst Forecast
df['ANA_EPS_Q1'] = df['EPS_ana_q1']
df['ANA_EPS_Q2'] = df['EPS_ana_q2']
df['ANA_EPS_Q3'] = df['EPS_ana_q3']
df['ANA_EPS_Y1'] = df['EPS_ana_y1']
df['ANA_EPS_Y2'] = df['EPS_ana_y2']

In [6]:
### 1. Forecast Performance
idx = ['Q1','Q2','Q3','Y1','Y2']
col = abbr_list + ['Composite']
MSE = pd.DataFrame(index=idx, columns=col)
## To make sure we have the same sample
N_obs = pd.DataFrame(index=idx, columns=col)
for c in col:
    for i in idx:
        df_ = df.dropna(subset=[f'REAL_EPS_{i}',f'{c}_EPS_{i}'])
        MSE.loc[i, c] = df_.groupby('YearMonth').apply(lambda x: np.mean((x[f'REAL_EPS_{i}']-x[f'{c}_EPS_{i}'])**2)
                                                        ).mean()
        N_obs.loc[i,c] = df_.shape[0]
(MSE).to_clipboard()
(MSE)

Unnamed: 0,OLS,PLS,LASSO,ENet,RF,LGBM,Composite
Q1,0.055145,0.056209,0.055249,0.055306,0.05369,0.052977,0.053554
Q2,0.07673,0.079135,0.076983,0.077242,0.074344,0.074603,0.074628
Q3,0.10634,0.107235,0.103841,0.104392,0.101053,0.102142,0.101084
Y1,0.545001,0.541063,0.537739,0.538338,0.525791,0.528357,0.521918
Y2,1.846411,1.67349,1.633948,1.634529,1.593215,1.607809,1.576186


In [7]:
## Panel A of Table F.3
## MSE Compared to RF model
rlts = []
for i in idx:
    MSE = pd.DataFrame(columns=col, dtype=float)
    # MSE for each model at each t
    for c in col:
        df_ = df.dropna(subset=[f'REAL_EPS_{i}',f'{c}_EPS_{i}'])
        MSE.loc[:, c] = ((df_[f'REAL_EPS_{i}']-df_[f'{c}_EPS_{i}'])**2).groupby(df_['YearMonth']).mean()
        
    # Accuracy improvement compared to RF   
    MSE_diff = -MSE.sub(MSE['RF'],axis=0)

    # test for difference
    mdls = MSE_diff[[i for i in MSE_diff.columns if i != 'RF']].apply(lambda x: sm.OLS(endog=x, exog=[1]*len(x)).fit(cov_type='HAC',cov_kwds={'maxlags':12})).to_list()
    rlt = summary2.summary_col(mdls, float_format='%0.3f' )

    # MSE of RF model
    rlt.insert(0, 'RF', [round(MSE['RF'].mean(), 3),''])
    rlt.index = [i,'']
    rlts.append(rlt)
    
    # break
rlts = pd.concat(rlts, axis=0)

In [8]:
rlts.to_clipboard()
rlts

Unnamed: 0,RF,OLS,PLS,LASSO,ENet,LGBM,Composite
Q1,0.054,-0.001,-0.003,-0.002,-0.002,0.001,0.000
,,(-2.55),(-2.53),(-2.10),(-2.32),(2.48),(0.44)
Q2,0.074,-0.002,-0.005,-0.003,-0.003,-0.000,-0.000
,,(-2.56),(-2.83),(-2.29),(-2.50),(-0.66),(-0.47)
Q3,0.101,-0.005,-0.006,-0.003,-0.003,-0.001,-0.000
,,(-3.37),(-3.51),(-2.33),(-2.93),(-1.58),(-0.04)
Y1,0.526,-0.019,-0.015,-0.012,-0.013,-0.003,0.004
,,(-2.82),(-1.93),(-1.40),(-1.46),(-0.81),(0.75)
Y2,1.593,-0.253,-0.080,-0.041,-0.041,-0.015,0.017
,,(-2.08),(-2.59),(-1.65),(-1.66),(-1.17),(1.68)


In [9]:
## Panel B of Table F.3
all_factor = pd.read_csv('../data/Other/ff5_factors_m.CSV')
all_factor['YearMonth'] = pd.to_datetime(all_factor['yyyymm'], format='%Y%m') + MonthEnd(0)
all_factor['YearMonth'] = all_factor['YearMonth'] + MonthEnd(-1)

In [10]:
idx = ['Q1','Q2','Q3','Y1','Y2']
col = abbr_list + ['Composite'] 
num_level = 5
factor_dict = {'Ret': ['ones'],
               'CAPM':['ones','Mkt_RF'],
               'FF3': ['ones','Mkt_RF','SMB','HML'],
               'FF5': ['ones','Mkt_RF','SMB', 'HML', 'RMW', 'CMA'],
               'FFC6':['ones','Mkt_RF','SMB', 'HML', 'RMW', 'CMA','MOM'],
               'HXZ':['ones','R_MKT','R_ME','R_IA','R_ROE'],
               'HMXZ':['ones','R_MKT','R_ME','R_IA','R_ROE','R_EG'],
               'SY':['ones','Mkt_RF','SMB_SY','MGMT', 'PERF'],
               'DHS':['ones','Mkt_RF','PEAD', 'FIN'],
               }

rlts = []
for c in col:
    for i in idx:
        df[f'{c}_Bias_{i}'] = (df[f'ANA_EPS_{i}'] - df[f'{c}_EPS_{i}'])/df['prc_l1']
        
    # Average Bias
    df[f'{c}_Bias_Avg'] = df[[f'{c}_Bias_Q1',f'{c}_Bias_Q2',f'{c}_Bias_Q3',
                              f'{c}_Bias_Y1',f'{c}_Bias_Y2']].mean(axis=1)
    
    nonNA = (~df[[f'{c}_Bias_Q1',f'{c}_Bias_Q2',f'{c}_Bias_Q3',
                  f'{c}_Bias_Y1',f'{c}_Bias_Y2']].isna()).sum(axis=1)
    df[f'{c}_Bias_Avg'] = np.where(nonNA > 1,
                                   df[f'{c}_Bias_Avg'],
                                   np.nan)

    sort_var = f'{c}_Bias_Avg'
    _,vwret1 = utils.SingleSort(df,'PERMNO', 'YearMonth', 
                                    sort_var, 'bh1m', num_level, 
                                    'ME', quantile_filter=None)
    result = utils.SingleSort_RetAna(_,vwret1,'YearMonth',factor_data=all_factor,factor_dict=factor_dict,lag=12)
    result = result['H-L']

    result.name = c
    rlts.append(result)
    # break
rlts = pd.concat(rlts,axis=1)
rlts.to_clipboard()
rlts

Var:OLS_Bias_Avg, Delete 75798 rows due to missing values, raw data 1273458 rows --> new data 1197660 rows
Var:PLS_Bias_Avg, Delete 75798 rows due to missing values, raw data 1273458 rows --> new data 1197660 rows
Var:LASSO_Bias_Avg, Delete 75798 rows due to missing values, raw data 1273458 rows --> new data 1197660 rows
Var:ENet_Bias_Avg, Delete 75798 rows due to missing values, raw data 1273458 rows --> new data 1197660 rows
Var:RF_Bias_Avg, Delete 75798 rows due to missing values, raw data 1273458 rows --> new data 1197660 rows
Var:LGBM_Bias_Avg, Delete 75798 rows due to missing values, raw data 1273458 rows --> new data 1197660 rows
Var:Composite_Bias_Avg, Delete 75798 rows due to missing values, raw data 1273458 rows --> new data 1197660 rows


Unnamed: 0,OLS,PLS,LASSO,ENet,RF,LGBM,Composite
Ret,-0.16,-0.16,-0.15,-0.22,-0.29,-0.38,-0.17
,(-0.46),(-0.44),(-0.41),(-0.61),(-0.77),(-1.14),(-0.44)
CAPM,-0.58,-0.57,-0.59,-0.67,-0.78,-0.78,-0.61
,(-2.07),(-1.81),(-1.88),(-2.20),(-2.40),(-2.58),(-1.84)
FF3,-0.65,-0.65,-0.65,-0.75,-0.89,-0.89,-0.71
,(-3.15),(-2.76),(-2.92),(-3.48),(-4.22),(-4.47),(-3.11)
FF5,-0.18,-0.19,-0.21,-0.30,-0.46,-0.51,-0.27
,(-0.61),(-0.65),(-0.72),(-1.04),(-1.57),(-1.83),(-0.86)
FFC6,0.34,0.32,0.30,0.21,0.03,0.01,0.27
,(1.55),(1.31),(1.32),(0.86),(0.15),(0.02),(1.17)
