In [7]:
## This notebook trains alternative random forest specifications (sample fraction, feature fraction, training window)

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tseries.offsets import *
from tqdm import tqdm
from functools import reduce
import statsmodels.api as sm
import scipy.stats as stats
from linearmodels import PanelOLS
import os
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestRegressor

from functions import utils
from functions import summary2

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

plt.rcParams['font.sans-serif']=['Times New Roman']
plt.rcParams.update({'font.size':13})
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['grid.color'] = 'gray'
plt.rcParams['grid.linestyle'] = '--'
%config InlineBackend.figure_format = 'retina'

# Train

In [None]:
# Ratio characteristics; these enter the feature set of every model
# fitted in the training cells of this notebook.
ratio_chars = [
    'CAPEI', 'bm', 'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',
    'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm',
    'roa', 'roe', 'roce', 'efftax',
    'aftret_eq', 'aftret_invcapx', 'aftret_equity',
    'pretret_noa', 'pretret_earnat', 'GProf',
    'equity_invcap', 'debt_invcap', 'totdebt_invcap', 'capital_ratio',
    'int_debt', 'int_totdebt', 'cash_lt', 'invt_act', 'rect_act',
    'debt_at', 'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt',
    'profit_lct', 'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent',
    'dltt_be', 'debt_assets', 'debt_capital', 'de_ratio',
    'intcov', 'intcov_ratio', 'cash_ratio', 'quick_ratio', 'curr_ratio',
    'cash_conversion', 'inv_turn', 'at_turn', 'rect_turn', 'pay_turn',
    'sale_invcap', 'sale_equity', 'sale_nwc',
    'rd_sale', 'adv_sale', 'staff_sale',
    'accrual', 'ptb', 'PEG_trailing', 'divyield',
]

# Per-share characteristics.
per_share_chars = [
    'dividend_p', 'BE_p', 'Liability_p', 'cur_liability_p', 'LT_debt_p',
    'cash_p', 'total_asset_p', 'tot_debt_p', 'accrual_p', 'EBIT_p',
    'cur_asset_p', 'pbda_p', 'ocf_p', 'inventory_p', 'receivables_p',
    'Cur_debt_p', 'interest_p', 'fcf_ocf_p', 'evm_p',
    'sales_p', 'invcap_p', 'c_equity_p', 'rd_p',
    'opmad_p', 'gpm_p', 'ptpm_p',
]

# Macro series.
macro_chars = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']

# Return, price and the lagged realised-EPS columns.
fundamental_chars = [
    'ret', 'prc',
    'EPS_true_l1_q1', 'EPS_true_l1_q2', 'EPS_true_l1_q3',
    'EPS_true_l1_y1', 'EPS_true_l1_y2',
]

# Analyst EPS forecast columns, one per horizon.
analyst_chars = [
    'EPS_ana_q1', 'EPS_ana_q2', 'EPS_ana_q3',
    'EPS_ana_y1', 'EPS_ana_y2',
]

# Realised EPS columns used as prediction targets, one per horizon.
targets = [
    'EPS_true_q1', 'EPS_true_q2', 'EPS_true_q3',
    'EPS_true_y1', 'EPS_true_y2',
]

# Panel read by every training cell below.
df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')

In [None]:
#############################
### Sample Fraction: 0.01 ###
#############################
# Rolling random-forest EPS forecasts. For every month t after 1986-01-01 one
# forest per horizon (q1, q2, q3, y1, y2) is fitted on the trailing window and
# used to predict the observations of month t. Each tree is grown on a
# `max_samples` fraction of the training rows.
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
max_samples = 0.01
window = 12
n_jobs = 16
num_trees = 2000

# Create the output folder up front so the final write cannot fail on a
# missing directory after the whole loop has already been run.
out_path = f'../data/Results/RF_variants/RF_{max_samples}_{window}.parquet'
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# One entry per horizon:
#   (column suffix, lagged realised-EPS feature, training window in months).
# The y2 horizon always trains on at least 24 months.
horizon_specs = [
    ('q1', 'EPS_true_l1_q1', window),
    ('q2', 'EPS_true_l1_q1', window),
    ('q3', 'EPS_true_l1_q1', window),
    ('y1', 'EPS_true_l1_y1', window),
    ('y2', 'EPS_true_l1_y1', max(window, 24)),
]
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']

year_month = df_tmp['YearMonth']
# The month-end of each announcement date does not depend on t: compute once.
ann_month_end = {spec[0]: df_tmp[f'ANNDATS_{spec[0]}'] + MonthEnd(0)
                 for spec in horizon_specs}

forecast = []
for t in tqdm(time_idx):
    # Masks shared by all five horizons of this month.
    before_t = year_month < t
    at_t = year_month == t

    forecast_t = []
    for h, lag_col, train_months in horizon_specs:
        x_cols = ratio_chars + ['ret', 'prc', lag_col, f'EPS_ana_{h}'] + macro_cols
        y_col = f'EPS_true_{h}'

        # Training rows: inside the trailing window, with the month-end of the
        # announcement date strictly before t.
        train_mask = (
            before_t
            & (year_month >= t - MonthEnd(train_months))
            & (ann_month_end[h] < t)
        )
        df_train = df_tmp[train_mask].dropna(subset=x_cols + [y_col])

        # Test rows: month-t observations whose announcement date is later
        # than the observation date.
        test_mask = (df_tmp[f'ANNDATS_{h}'] > year_month) & at_t
        df_test = df_tmp[test_mask].dropna(subset=x_cols + [y_col])

        mdl = RandomForestRegressor(n_estimators=num_trees,
                                    random_state=0,
                                    max_depth=7,
                                    min_samples_leaf=5,
                                    max_samples=max_samples,
                                    n_jobs=n_jobs)
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = forest prediction, AF_* = analyst forecast, AE_* = realised EPS.
        forecast_t.append(pd.DataFrame({'permno': df_test['permno'],
                                        'YearMonth': df_test['YearMonth'],
                                        f'RF_{h}': y_pred,
                                        f'AF_{h}': df_test[f'EPS_ana_{h}'],
                                        f'AE_{h}': df_test[y_col]}))

    # Outer merge keeps an observation even if it is missing for some horizons.
    forecast.append(reduce(lambda left, right: pd.merge(left, right,
                                                        on=['permno', 'YearMonth'],
                                                        how='outer'),
                           forecast_t))

forecast_all = pd.concat(forecast, axis=0).reset_index(drop=True)
forecast_all.to_parquet(out_path)

In [None]:
#############################
### Sample Fraction: 0.10 ###
#############################
# Rolling random-forest EPS forecasts. For every month t after 1986-01-01 one
# forest per horizon (q1, q2, q3, y1, y2) is fitted on the trailing window and
# used to predict the observations of month t. Each tree is grown on a
# `max_samples` fraction of the training rows.
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
max_samples = 0.1
window = 12
n_jobs = 16
num_trees = 2000

# Create the output folder up front so the final write cannot fail on a
# missing directory after the whole loop has already been run.
out_path = f'../data/Results/RF_variants/RF_{max_samples}_{window}.parquet'
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# One entry per horizon:
#   (column suffix, lagged realised-EPS feature, training window in months).
# The y2 horizon always trains on at least 24 months.
horizon_specs = [
    ('q1', 'EPS_true_l1_q1', window),
    ('q2', 'EPS_true_l1_q1', window),
    ('q3', 'EPS_true_l1_q1', window),
    ('y1', 'EPS_true_l1_y1', window),
    ('y2', 'EPS_true_l1_y1', max(window, 24)),
]
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']

year_month = df_tmp['YearMonth']
# The month-end of each announcement date does not depend on t: compute once.
ann_month_end = {spec[0]: df_tmp[f'ANNDATS_{spec[0]}'] + MonthEnd(0)
                 for spec in horizon_specs}

forecast = []
for t in tqdm(time_idx):
    # Masks shared by all five horizons of this month.
    before_t = year_month < t
    at_t = year_month == t

    forecast_t = []
    for h, lag_col, train_months in horizon_specs:
        x_cols = ratio_chars + ['ret', 'prc', lag_col, f'EPS_ana_{h}'] + macro_cols
        y_col = f'EPS_true_{h}'

        # Training rows: inside the trailing window, with the month-end of the
        # announcement date strictly before t.
        train_mask = (
            before_t
            & (year_month >= t - MonthEnd(train_months))
            & (ann_month_end[h] < t)
        )
        df_train = df_tmp[train_mask].dropna(subset=x_cols + [y_col])

        # Test rows: month-t observations whose announcement date is later
        # than the observation date.
        test_mask = (df_tmp[f'ANNDATS_{h}'] > year_month) & at_t
        df_test = df_tmp[test_mask].dropna(subset=x_cols + [y_col])

        mdl = RandomForestRegressor(n_estimators=num_trees,
                                    random_state=0,
                                    max_depth=7,
                                    min_samples_leaf=5,
                                    max_samples=max_samples,
                                    n_jobs=n_jobs)
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = forest prediction, AF_* = analyst forecast, AE_* = realised EPS.
        forecast_t.append(pd.DataFrame({'permno': df_test['permno'],
                                        'YearMonth': df_test['YearMonth'],
                                        f'RF_{h}': y_pred,
                                        f'AF_{h}': df_test[f'EPS_ana_{h}'],
                                        f'AE_{h}': df_test[y_col]}))

    # Outer merge keeps an observation even if it is missing for some horizons.
    forecast.append(reduce(lambda left, right: pd.merge(left, right,
                                                        on=['permno', 'YearMonth'],
                                                        how='outer'),
                           forecast_t))

forecast_all = pd.concat(forecast, axis=0).reset_index(drop=True)
forecast_all.to_parquet(out_path)

In [None]:
##############################
### Feature Fraction: sqrt ###
##############################
# Rolling random-forest EPS forecasts with max_features='sqrt'. For every
# month t after 1986-01-01 one forest per horizon (q1, q2, q3, y1, y2) is
# fitted on the trailing window and used to predict the observations of
# month t. Feature importances of every fitted forest are collected as well.
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
max_samples = 0.05
window = 12
n_jobs = 32
num_trees = 2000

# Create the output folder up front so the final write cannot fail on a
# missing directory after the whole loop has already been run.
out_path = f'../data/Results/RF_variants/RF_{max_samples}_{window}_feature_sqrt.parquet'
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# One entry per horizon:
#   (column suffix, lagged realised-EPS feature, training window in months).
# The y2 horizon always trains on at least 24 months.
horizon_specs = [
    ('q1', 'EPS_true_l1_q1', window),
    ('q2', 'EPS_true_l1_q1', window),
    ('q3', 'EPS_true_l1_q1', window),
    ('y1', 'EPS_true_l1_y1', window),
    ('y2', 'EPS_true_l1_y1', max(window, 24)),
]
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']

year_month = df_tmp['YearMonth']
# The month-end of each announcement date does not depend on t: compute once.
ann_month_end = {spec[0]: df_tmp[f'ANNDATS_{spec[0]}'] + MonthEnd(0)
                 for spec in horizon_specs}

forecast = []
# Kept in memory only; this cell does not write the importances to disk.
# NOTE(review): confirm they are saved or consumed before being overwritten.
feature_importance = []
for t in tqdm(time_idx):
    # Masks shared by all five horizons of this month.
    before_t = year_month < t
    at_t = year_month == t

    # The importance table is indexed by the Q1 feature names. Values of the
    # other horizons are stored by position, so the rows labelled
    # 'EPS_true_l1_q1' / 'EPS_ana_q1' hold the importance of that horizon's own
    # lagged-EPS / analyst-forecast feature.
    q1_cols = ratio_chars + ['ret', 'prc', 'EPS_true_l1_q1', 'EPS_ana_q1'] + macro_cols
    feature_importance_t = pd.DataFrame(index=q1_cols,
                                        columns=['q1', 'q2', 'q3', 'y1', 'y2'])

    forecast_t = []
    for h, lag_col, train_months in horizon_specs:
        x_cols = ratio_chars + ['ret', 'prc', lag_col, f'EPS_ana_{h}'] + macro_cols
        y_col = f'EPS_true_{h}'

        # Training rows: inside the trailing window, with the month-end of the
        # announcement date strictly before t.
        train_mask = (
            before_t
            & (year_month >= t - MonthEnd(train_months))
            & (ann_month_end[h] < t)
        )
        df_train = df_tmp[train_mask].dropna(subset=x_cols + [y_col])

        # Test rows: month-t observations whose announcement date is later
        # than the observation date.
        test_mask = (df_tmp[f'ANNDATS_{h}'] > year_month) & at_t
        df_test = df_tmp[test_mask].dropna(subset=x_cols + [y_col])

        mdl = RandomForestRegressor(n_estimators=num_trees,
                                    random_state=0,
                                    max_depth=7,
                                    min_samples_leaf=5,
                                    max_samples=max_samples,
                                    max_features='sqrt',
                                    n_jobs=n_jobs)
        mdl.fit(df_train[x_cols], df_train[y_col])
        feature_importance_t[h] = mdl.feature_importances_

        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = forest prediction, AF_* = analyst forecast, AE_* = realised EPS.
        forecast_t.append(pd.DataFrame({'permno': df_test['permno'],
                                        'YearMonth': df_test['YearMonth'],
                                        f'RF_{h}': y_pred,
                                        f'AF_{h}': df_test[f'EPS_ana_{h}'],
                                        f'AE_{h}': df_test[y_col]}))

    # Outer merge keeps an observation even if it is missing for some horizons.
    forecast.append(reduce(lambda left, right: pd.merge(left, right,
                                                        on=['permno', 'YearMonth'],
                                                        how='outer'),
                           forecast_t))

    feature_importance_t['YearMonth'] = t
    feature_importance.append(feature_importance_t.reset_index())

forecast_all = pd.concat(forecast, axis=0).reset_index(drop=True)
forecast_all.to_parquet(out_path)

In [None]:
##############################
### Feature Fraction: 0.5  ###
##############################
# Rolling random-forest EPS forecasts with max_features=0.5. For every month t
# after 1986-01-01 one forest per horizon (q1, q2, q3, y1, y2) is fitted on
# the trailing window and used to predict the observations of month t.
# Feature importances of every fitted forest are collected as well.
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
max_samples = 0.05
window = 12
n_jobs = 32
num_trees = 2000

# Create the output folder up front so the final write cannot fail on a
# missing directory after the whole loop has already been run.
out_path = f'../data/Results/RF_variants/RF_{max_samples}_{window}_feature_0.5.parquet'
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# One entry per horizon:
#   (column suffix, lagged realised-EPS feature, training window in months).
# The y2 horizon always trains on at least 24 months.
horizon_specs = [
    ('q1', 'EPS_true_l1_q1', window),
    ('q2', 'EPS_true_l1_q1', window),
    ('q3', 'EPS_true_l1_q1', window),
    ('y1', 'EPS_true_l1_y1', window),
    ('y2', 'EPS_true_l1_y1', max(window, 24)),
]
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']

year_month = df_tmp['YearMonth']
# The month-end of each announcement date does not depend on t: compute once.
ann_month_end = {spec[0]: df_tmp[f'ANNDATS_{spec[0]}'] + MonthEnd(0)
                 for spec in horizon_specs}

forecast = []
# Kept in memory only; this cell does not write the importances to disk.
# NOTE(review): confirm they are saved or consumed by a later cell.
feature_importance = []
for t in tqdm(time_idx):
    # Masks shared by all five horizons of this month.
    before_t = year_month < t
    at_t = year_month == t

    # The importance table is indexed by the Q1 feature names. Values of the
    # other horizons are stored by position, so the rows labelled
    # 'EPS_true_l1_q1' / 'EPS_ana_q1' hold the importance of that horizon's own
    # lagged-EPS / analyst-forecast feature.
    q1_cols = ratio_chars + ['ret', 'prc', 'EPS_true_l1_q1', 'EPS_ana_q1'] + macro_cols
    feature_importance_t = pd.DataFrame(index=q1_cols,
                                        columns=['q1', 'q2', 'q3', 'y1', 'y2'])

    forecast_t = []
    for h, lag_col, train_months in horizon_specs:
        x_cols = ratio_chars + ['ret', 'prc', lag_col, f'EPS_ana_{h}'] + macro_cols
        y_col = f'EPS_true_{h}'

        # Training rows: inside the trailing window, with the month-end of the
        # announcement date strictly before t.
        train_mask = (
            before_t
            & (year_month >= t - MonthEnd(train_months))
            & (ann_month_end[h] < t)
        )
        df_train = df_tmp[train_mask].dropna(subset=x_cols + [y_col])

        # Test rows: month-t observations whose announcement date is later
        # than the observation date.
        test_mask = (df_tmp[f'ANNDATS_{h}'] > year_month) & at_t
        df_test = df_tmp[test_mask].dropna(subset=x_cols + [y_col])

        mdl = RandomForestRegressor(n_estimators=num_trees,
                                    random_state=0,
                                    max_depth=7,
                                    min_samples_leaf=5,
                                    max_samples=max_samples,
                                    max_features=0.5,
                                    n_jobs=n_jobs)
        mdl.fit(df_train[x_cols], df_train[y_col])
        feature_importance_t[h] = mdl.feature_importances_

        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = forest prediction, AF_* = analyst forecast, AE_* = realised EPS.
        forecast_t.append(pd.DataFrame({'permno': df_test['permno'],
                                        'YearMonth': df_test['YearMonth'],
                                        f'RF_{h}': y_pred,
                                        f'AF_{h}': df_test[f'EPS_ana_{h}'],
                                        f'AE_{h}': df_test[y_col]}))

    # Outer merge keeps an observation even if it is missing for some horizons.
    forecast.append(reduce(lambda left, right: pd.merge(left, right,
                                                        on=['permno', 'YearMonth'],
                                                        how='outer'),
                           forecast_t))

    feature_importance_t['YearMonth'] = t
    feature_importance.append(feature_importance_t.reset_index())

forecast_all = pd.concat(forecast, axis=0).reset_index(drop=True)
forecast_all.to_parquet(out_path)

In [None]:
##############################
### Training Window: 3 years #
##############################
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
max_samples = 0.05
window = 36
n_jobs = 24
num_trees = 2000

# Macro series appended to every feature set.
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
# (forecast horizon, horizon suffix of the lagged realized-EPS feature),
# listed in the column order of the merged output.
horizon_specs = [('q1', 'q1'), ('q2', 'q1'), ('q3', 'q1'), ('y1', 'y1'), ('y2', 'y1')]

forecast = []
for t in tqdm(time_idx):
    horizon_forecasts = []
    for hz, lag_hz in horizon_specs:
        x_cols = ratio_chars + ['ret', 'prc', f'EPS_true_l1_{lag_hz}', f'EPS_ana_{hz}'] + macro_cols
        y_col = f'EPS_true_{hz}'
        ann_col = f'ANNDATS_{hz}'
        # The two-year horizon never trains on fewer than 24 months of history.
        train_window = max(window, 24) if hz == 'y2' else window

        # Train on months in [t - train_window, t) whose announcement date,
        # rolled forward to month end, is strictly before t.
        train_mask = ((df_tmp['YearMonth'] < t)
                      & (df_tmp['YearMonth'] >= t - MonthEnd(train_window))
                      & (df_tmp[ann_col] + MonthEnd(0) < t))
        df_train = df_tmp[train_mask].dropna(subset=x_cols + [y_col])
        # Predict for month-t rows whose announcement date is later than YearMonth.
        test_mask = (df_tmp[ann_col] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)
        df_test = df_tmp[test_mask].dropna(subset=x_cols + [y_col])

        # A fresh forest with a fixed seed is fitted for every (month, horizon) pair.
        mdl = RandomForestRegressor(n_estimators=num_trees,
                                    random_state=0,
                                    max_depth=7,
                                    min_samples_leaf=5,
                                    max_samples=max_samples,
                                    n_jobs=n_jobs)
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = forest prediction, AF_* = the EPS_ana_* input, AE_* = the EPS_true_* target.
        horizon_forecasts.append(pd.DataFrame({'permno': df_test['permno'],
                                               'YearMonth': df_test['YearMonth'],
                                               f'RF_{hz}': y_pred,
                                               f'AF_{hz}': df_test[f'EPS_ana_{hz}'],
                                               f'AE_{hz}': df_test[y_col]}))

    # Outer-merge so a firm-month missing some horizons still keeps the others.
    forecast.append(reduce(lambda x, y: pd.merge(x, y,
                                                 on=['permno', 'YearMonth'],
                                                 how='outer'),
                           horizon_forecasts))
forecast_all = pd.concat(forecast, axis=0).reset_index(drop=True)
forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}.parquet')

In [None]:
##############################
### Training Window: 5 years #
##############################
# NOTE(review): the banner says 5 years, but `window = 50` months (5 years would be 60).
# The value is left unchanged because it is embedded in the output filename; confirm
# which of the two is intended.
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
max_samples = 0.05
window = 50
n_jobs = 24
num_trees = 2000

# Macro series appended to every feature set.
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
# (forecast horizon, horizon suffix of the lagged realized-EPS feature),
# listed in the column order of the merged output.
horizon_specs = [('q1', 'q1'), ('q2', 'q1'), ('q3', 'q1'), ('y1', 'y1'), ('y2', 'y1')]

forecast = []
for t in tqdm(time_idx):
    horizon_forecasts = []
    for hz, lag_hz in horizon_specs:
        x_cols = ratio_chars + ['ret', 'prc', f'EPS_true_l1_{lag_hz}', f'EPS_ana_{hz}'] + macro_cols
        y_col = f'EPS_true_{hz}'
        ann_col = f'ANNDATS_{hz}'
        # The two-year horizon never trains on fewer than 24 months of history.
        train_window = max(window, 24) if hz == 'y2' else window

        # Train on months in [t - train_window, t) whose announcement date,
        # rolled forward to month end, is strictly before t.
        train_mask = ((df_tmp['YearMonth'] < t)
                      & (df_tmp['YearMonth'] >= t - MonthEnd(train_window))
                      & (df_tmp[ann_col] + MonthEnd(0) < t))
        df_train = df_tmp[train_mask].dropna(subset=x_cols + [y_col])
        # Predict for month-t rows whose announcement date is later than YearMonth.
        test_mask = (df_tmp[ann_col] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)
        df_test = df_tmp[test_mask].dropna(subset=x_cols + [y_col])

        # A fresh forest with a fixed seed is fitted for every (month, horizon) pair.
        mdl = RandomForestRegressor(n_estimators=num_trees,
                                    random_state=0,
                                    max_depth=7,
                                    min_samples_leaf=5,
                                    max_samples=max_samples,
                                    n_jobs=n_jobs)
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = forest prediction, AF_* = the EPS_ana_* input, AE_* = the EPS_true_* target.
        horizon_forecasts.append(pd.DataFrame({'permno': df_test['permno'],
                                               'YearMonth': df_test['YearMonth'],
                                               f'RF_{hz}': y_pred,
                                               f'AF_{hz}': df_test[f'EPS_ana_{hz}'],
                                               f'AE_{hz}': df_test[y_col]}))

    # Outer-merge so a firm-month missing some horizons still keeps the others.
    forecast.append(reduce(lambda x, y: pd.merge(x, y,
                                                 on=['permno', 'YearMonth'],
                                                 how='outer'),
                           horizon_forecasts))
forecast_all = pd.concat(forecast, axis=0).reset_index(drop=True)
forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}.parquet')

In [None]:
###############################
## With Per-share variables ###
###############################
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
max_samples = 0.05
window = 12
n_jobs = 16
num_trees = 2000

# Macro series appended to every feature set.
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
# (forecast horizon, horizon suffix of the lagged realized-EPS feature),
# listed in the column order of the merged output.
horizon_specs = [('q1', 'q1'), ('q2', 'q1'), ('q3', 'q1'), ('y1', 'y1'), ('y2', 'y1')]

forecast = []
for t in tqdm(time_idx):
    horizon_forecasts = []
    for hz, lag_hz in horizon_specs:
        # This variant adds the per-share characteristics right after the ratio characteristics.
        x_cols = (ratio_chars + per_share_chars
                  + ['ret', 'prc', f'EPS_true_l1_{lag_hz}', f'EPS_ana_{hz}']
                  + macro_cols)
        y_col = f'EPS_true_{hz}'
        ann_col = f'ANNDATS_{hz}'
        # The two-year horizon never trains on fewer than 24 months of history.
        train_window = max(window, 24) if hz == 'y2' else window

        # Train on months in [t - train_window, t) whose announcement date,
        # rolled forward to month end, is strictly before t.
        train_mask = ((df_tmp['YearMonth'] < t)
                      & (df_tmp['YearMonth'] >= t - MonthEnd(train_window))
                      & (df_tmp[ann_col] + MonthEnd(0) < t))
        df_train = df_tmp[train_mask].dropna(subset=x_cols + [y_col])
        # Predict for month-t rows whose announcement date is later than YearMonth.
        test_mask = (df_tmp[ann_col] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)
        df_test = df_tmp[test_mask].dropna(subset=x_cols + [y_col])

        # A fresh forest with a fixed seed is fitted for every (month, horizon) pair.
        mdl = RandomForestRegressor(n_estimators=num_trees,
                                    random_state=0,
                                    max_depth=7,
                                    min_samples_leaf=5,
                                    max_samples=max_samples,
                                    n_jobs=n_jobs)
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = forest prediction, AF_* = the EPS_ana_* input, AE_* = the EPS_true_* target.
        horizon_forecasts.append(pd.DataFrame({'permno': df_test['permno'],
                                               'YearMonth': df_test['YearMonth'],
                                               f'RF_{hz}': y_pred,
                                               f'AF_{hz}': df_test[f'EPS_ana_{hz}'],
                                               f'AE_{hz}': df_test[y_col]}))

    # Outer-merge so a firm-month missing some horizons still keeps the others.
    forecast.append(reduce(lambda x, y: pd.merge(x, y,
                                                 on=['permno', 'YearMonth'],
                                                 how='outer'),
                           horizon_forecasts))
forecast_all = pd.concat(forecast, axis=0).reset_index(drop=True)
forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}_chars.parquet')

In [None]:
##############################
### Standardization: Rank ####
##############################
# Reload the panel so the ranking is applied to the stored data. Note that this rebinds
# the notebook-wide `df_tmp`.
df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')
## CS-Rank
# Within each YearMonth, replace every characteristic by its percentile rank rescaled
# into (-1, 1].
cols = ratio_chars + per_share_chars + fundamental_chars + analyst_chars
df_tmp[cols] = df_tmp.groupby('YearMonth',group_keys=False)[cols]\
                             .transform(lambda x: x.rank(pct=True)*2 - 1)
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
max_samples = 0.05
window = 12
n_jobs = 16
num_trees = 2000

# Macro series appended to every feature set.
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
# (forecast horizon, horizon suffix of the lagged realized-EPS feature),
# listed in the column order of the merged output.
horizon_specs = [('q1', 'q1'), ('q2', 'q1'), ('q3', 'q1'), ('y1', 'y1'), ('y2', 'y1')]

forecast = []
for t in tqdm(time_idx):
    horizon_forecasts = []
    for hz, lag_hz in horizon_specs:
        x_cols = ratio_chars + ['ret', 'prc', f'EPS_true_l1_{lag_hz}', f'EPS_ana_{hz}'] + macro_cols
        y_col = f'EPS_true_{hz}'
        ann_col = f'ANNDATS_{hz}'
        # The two-year horizon never trains on fewer than 24 months of history.
        train_window = max(window, 24) if hz == 'y2' else window

        # Train on months in [t - train_window, t) whose announcement date,
        # rolled forward to month end, is strictly before t.
        train_mask = ((df_tmp['YearMonth'] < t)
                      & (df_tmp['YearMonth'] >= t - MonthEnd(train_window))
                      & (df_tmp[ann_col] + MonthEnd(0) < t))
        df_train = df_tmp[train_mask].dropna(subset=x_cols + [y_col])
        # Predict for month-t rows whose announcement date is later than YearMonth.
        test_mask = (df_tmp[ann_col] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)
        df_test = df_tmp[test_mask].dropna(subset=x_cols + [y_col])

        # A fresh forest with a fixed seed is fitted for every (month, horizon) pair.
        mdl = RandomForestRegressor(n_estimators=num_trees,
                                    random_state=0,
                                    max_depth=7,
                                    min_samples_leaf=5,
                                    max_samples=max_samples,
                                    n_jobs=n_jobs)
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = forest prediction, AF_* = the EPS_ana_* input, AE_* = the EPS_true_* target.
        horizon_forecasts.append(pd.DataFrame({'permno': df_test['permno'],
                                               'YearMonth': df_test['YearMonth'],
                                               f'RF_{hz}': y_pred,
                                               f'AF_{hz}': df_test[f'EPS_ana_{hz}'],
                                               f'AE_{hz}': df_test[y_col]}))

    # Outer-merge so a firm-month missing some horizons still keeps the others.
    forecast.append(reduce(lambda x, y: pd.merge(x, y,
                                                 on=['permno', 'YearMonth'],
                                                 how='outer'),
                           horizon_forecasts))
forecast_all = pd.concat(forecast, axis=0).reset_index(drop=True)
forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}_rank.parquet')

In [None]:
###############################
### Standardization: Z-score ##
###############################
# Reload the panel so the z-scoring is applied to the stored data. Note that this rebinds
# the notebook-wide `df_tmp`.
df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')
## Z-score
# Within each YearMonth, demean every characteristic and divide by its standard deviation.
cols = ratio_chars + per_share_chars + fundamental_chars + analyst_chars
df_tmp[cols] = df_tmp.groupby('YearMonth',group_keys=False)[cols]\
                             .transform(lambda x: (x - x.mean()) / x.std())
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
max_samples = 0.05
window = 12
n_jobs = 16
num_trees = 2000

# Macro series appended to every feature set.
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
# (forecast horizon, horizon suffix of the lagged realized-EPS feature),
# listed in the column order of the merged output.
horizon_specs = [('q1', 'q1'), ('q2', 'q1'), ('q3', 'q1'), ('y1', 'y1'), ('y2', 'y1')]

forecast = []
for t in tqdm(time_idx):
    horizon_forecasts = []
    for hz, lag_hz in horizon_specs:
        x_cols = ratio_chars + ['ret', 'prc', f'EPS_true_l1_{lag_hz}', f'EPS_ana_{hz}'] + macro_cols
        y_col = f'EPS_true_{hz}'
        ann_col = f'ANNDATS_{hz}'
        # The two-year horizon never trains on fewer than 24 months of history.
        train_window = max(window, 24) if hz == 'y2' else window

        # Train on months in [t - train_window, t) whose announcement date,
        # rolled forward to month end, is strictly before t.
        train_mask = ((df_tmp['YearMonth'] < t)
                      & (df_tmp['YearMonth'] >= t - MonthEnd(train_window))
                      & (df_tmp[ann_col] + MonthEnd(0) < t))
        df_train = df_tmp[train_mask].dropna(subset=x_cols + [y_col])
        # Predict for month-t rows whose announcement date is later than YearMonth.
        test_mask = (df_tmp[ann_col] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)
        df_test = df_tmp[test_mask].dropna(subset=x_cols + [y_col])

        # A fresh forest with a fixed seed is fitted for every (month, horizon) pair.
        mdl = RandomForestRegressor(n_estimators=num_trees,
                                    random_state=0,
                                    max_depth=7,
                                    min_samples_leaf=5,
                                    max_samples=max_samples,
                                    n_jobs=n_jobs)
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = forest prediction, AF_* = the EPS_ana_* input, AE_* = the EPS_true_* target.
        horizon_forecasts.append(pd.DataFrame({'permno': df_test['permno'],
                                               'YearMonth': df_test['YearMonth'],
                                               f'RF_{hz}': y_pred,
                                               f'AF_{hz}': df_test[f'EPS_ana_{hz}'],
                                               f'AE_{hz}': df_test[y_col]}))

    # Outer-merge so a firm-month missing some horizons still keeps the others.
    forecast.append(reduce(lambda x, y: pd.merge(x, y,
                                                 on=['permno', 'YearMonth'],
                                                 how='outer'),
                           horizon_forecasts))
forecast_all = pd.concat(forecast, axis=0).reset_index(drop=True)
forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}_zscore.parquet')

# All Permutations

In [None]:
# Load the unstandardized panel once; both standardized variants are derived from it.
df_tmp_raw = pd.read_parquet('../data/Results/df_train_new.parquet')
cols = ratio_chars + per_share_chars + fundamental_chars + analyst_chars

# Per-YearMonth groups of the columns to standardize, shared by both transforms.
by_month = df_tmp_raw.groupby('YearMonth', group_keys=False)[cols]

# Rank variant: within-month percentile rank rescaled into (-1, 1].
df_tmp_rank = df_tmp_raw.copy()
df_tmp_rank[cols] = by_month.transform(lambda col: 2 * col.rank(pct=True) - 1)

# Z-score variant: within-month demeaning divided by the within-month standard deviation.
df_tmp_zscore = df_tmp_raw.copy()
df_tmp_zscore[cols] = by_month.transform(lambda col: (col - col.mean()) / col.std())

In [None]:
# Specification grid for the permutation run; each key is read by the training loop.
grid_spec = {
    'sample': [0.05, 0.10],              # passed as max_samples
    'feature': ['sqrt', 1.0],            # passed as max_features
    'window': [12, 24],                  # training window in months
    'std': ['zscore', 'raw', 'rank'],    # which standardized frame to train on
    'eps': [True, False],                # whether per_share_chars are added as features
}
param_grid = ParameterGrid(grid_spec)

In [None]:
# Months for which forecasts are produced: every panel month after 1986-01-01.
time_idx = sorted(df_tmp_raw['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]

n_jobs = 16
num_trees = 2000

# Panel to train on for each value of the 'std' grid parameter. The panels are
# only read (boolean filtering and dropna return new frames), so no per-run
# copy is needed. An unknown key raises KeyError instead of silently reusing
# the panel of the previous grid point.
panels_by_std = {'raw': df_tmp_raw, 'rank': df_tmp_rank, 'zscore': df_tmp_zscore}

# (horizon suffix, lagged realised-EPS predictor, minimum training window in months)
# Only y2 has a floor: its training window is never shorter than 24 months.
horizon_specs = [('q1', 'EPS_true_l1_q1', 0),
                 ('q2', 'EPS_true_l1_q1', 0),
                 ('q3', 'EPS_true_l1_q1', 0),
                 ('y1', 'EPS_true_l1_y1', 0),
                 ('y2', 'EPS_true_l1_y1', 24)]

# Create the output directory up front so a missing folder cannot make
# to_parquet fail after a full training run.
out_dir = '../data/Results/RF_variants_ALL'
os.makedirs(out_dir, exist_ok=True)


def _forecast_horizon(df_tmp, t, horizon, lag_col, window, params):
    """Fit one random forest for a single horizon and predict month ``t``.

    Parameters
    ----------
    df_tmp : pandas.DataFrame
        Panel to train on (raw, rank or z-score version).
    t : element of ``time_idx``
        Month being forecast.
    horizon : str
        Suffix ('q1', 'q2', 'q3', 'y1', 'y2') selecting the columns
        ``EPS_ana_<horizon>``, ``EPS_true_<horizon>`` and ``ANNDATS_<horizon>``.
    lag_col : str
        Name of the lagged realised-EPS predictor column.
    window : int
        Number of months of history used for training.
    params : dict
        Current grid point; reads 'eps', 'sample' and 'feature'.

    Returns
    -------
    pandas.DataFrame
        Columns permno, YearMonth, ``RF_<horizon>`` (forest prediction),
        ``AF_<horizon>`` (``EPS_ana_<horizon>``) and ``AE_<horizon>``
        (``EPS_true_<horizon>``) for the test rows of month ``t``.
    """
    x_cols = ratio_chars + ['ret','prc',lag_col,f'EPS_ana_{horizon}'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
    if params['eps']:
        x_cols = x_cols + per_share_chars
    y_col = f'EPS_true_{horizon}'
    ann_col = f'ANNDATS_{horizon}'

    # Training rows: months in [t - window, t) whose announcement month-end is
    # strictly before t (presumably so the target was already public at t).
    in_window = (df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window))
    announced = df_tmp[ann_col] + MonthEnd(0) < t
    df_train = df_tmp[in_window & announced].dropna(subset=x_cols+[y_col])

    # Test rows: month t, announcement date still after the row's YearMonth.
    df_test = df_tmp[(df_tmp[ann_col] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)]\
              .dropna(subset=x_cols+[y_col])

    mdl = RandomForestRegressor(n_estimators=num_trees,
                                random_state=0,
                                max_depth=7,
                                min_samples_leaf=5,
                                max_samples=params['sample'],
                                max_features=params['feature'],
                                n_jobs=n_jobs)
    mdl.fit(df_train[x_cols], df_train[y_col])
    y_pred = mdl.predict(df_test[x_cols])

    return pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],
                         f'RF_{horizon}':y_pred,
                         f'AF_{horizon}':df_test[f'EPS_ana_{horizon}'],
                         f'AE_{horizon}':df_test[y_col]})


for params in param_grid:

    print(f"Current Parameter: sample={params['sample']}; feature={params['feature']}; window={params['window']}; std={params['std']}; eps={params['eps']}")

    forecast = []
    df_tmp = panels_by_std[params['std']]

    for t in tqdm(time_idx):
        # One forecast frame per horizon, in the order q1, q2, q3, y1, y2.
        horizon_forecasts = [
            _forecast_horizon(df_tmp, t, horizon, lag_col,
                              max(params['window'], min_window), params)
            for horizon, lag_col, min_window in horizon_specs
        ]
        # Outer merge keeps a firm-month even if only some horizons are available.
        forecast.append(reduce(lambda x,y: pd.merge(x,y,
                                                    on=['permno','YearMonth'],
                                                    how='outer'),
                               horizon_forecasts))

    forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)
    forecast_all.to_parquet(f"{out_dir}/RF_sample_{params['sample']}_feature_{params['feature']}_window_{params['window']}_std_{params['std']}_eps_{params['eps']}.parquet")

# Table F.1

In [9]:
# Panel with realised values, to be joined with the forecasts of each RF variant.
df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')

# (file stem under ../data/Results/RF_variants, short label used as column suffix)
f_abbr_list = [('RF_wo_lookahead_raw_005','baseline'),
               ('RF_0.01_12','001'),
               ('RF_0.1_12','010'),
               ('RF_0.05_12_feature_sqrt','sqrt'),
               ('RF_0.05_12_feature_0.5','0.5'),
               ('RF_0.05_36','36m'),
               ('RF_0.05_60','60m'),
               ('RF_0.05_12_rank','rank'),
               ('RF_0.05_12_zscore','zscore'),
               ('RF_0.05_12_chars','chars'),
              ]
abbr_list = [abbr for _, abbr in f_abbr_list]

# Keep only the RF forecast columns of each variant and tag them with its label,
# e.g. RF_q1 -> RF_q1_baseline.
rf_cols = ['RF_q1','RF_q2','RF_q3','RF_y1','RF_y2']
forecast_all = []
for f, abbr in f_abbr_list:
    RF = (pd.read_parquet(f'../data/Results/RF_variants/{f}.parquet')
            [['permno','YearMonth'] + rf_cols]
            .set_index(['permno','YearMonth'])
            .add_suffix(f'_{abbr}'))
    forecast_all.append(RF)

# Outer-join all variants on (permno, YearMonth), then turn the keys back into columns.
forecast_all = reduce(
    lambda x, y: pd.merge(x, y, on=['permno','YearMonth'], how='outer'),
    forecast_all,
).reset_index()

df = df_tmp.merge(forecast_all, on=['permno','YearMonth'])

In [None]:
### Panel A of Table F.1
### 1. Forecast Performance (RF-AE)**2
# For every RF variant and horizon: mean squared forecast error within each
# month, then averaged over months.
idx = ['q1','q2','q3','y1','y2']
col = abbr_list
# Allocate as float. A frame created without data has object dtype, and
# DataFrame.round leaves object columns untouched, so MSE.round(3) would
# copy unrounded values to the clipboard.
MSE = pd.DataFrame(index=idx, columns=col, dtype=float)
## To make sure we have the same sample
# N_obs records how many firm-months enter each cell so samples can be compared.
N_obs = pd.DataFrame(index=idx, columns=col, dtype=float)
for c in col:
    for i in idx:
        df_ = df.dropna(subset=[f'EPS_true_{i}',f'RF_{i}_{c}'])
        sq_err = (df_[f'EPS_true_{i}'] - df_[f'RF_{i}_{c}'])**2
        # Vectorised equivalent of groupby('YearMonth').apply(mean of squared error).mean()
        MSE.loc[i, c] = sq_err.groupby(df_['YearMonth']).mean().mean()
        N_obs.loc[i,c] = df_.shape[0]
N_obs = N_obs.astype(int)
MSE.round(3).to_clipboard()
MSE

Unnamed: 0,baseline,001,010,sqrt,0.5,36m,60m,rank,zscore,chars
q1,0.065128,0.068079,0.065149,0.101361,0.067243,0.065417,0.065554,0.06683,0.066569,0.065326
q2,0.08479,0.089357,0.084822,0.117751,0.087014,0.085699,0.085603,0.085311,0.085178,0.085254
q3,0.113587,0.122386,0.113586,0.14453,0.116149,0.115032,0.114798,0.112824,0.112602,0.113826
y1,0.592634,0.689291,0.590008,1.016876,0.632256,0.591358,0.590508,0.620745,0.617552,0.594245
y2,1.787721,1.891715,1.790494,2.113883,1.806513,1.784167,1.746952,1.765156,1.76692,1.794085


In [11]:
# Monthly factor file. YearMonth is the month-end of `yyyymm` moved back by one
# month-end (e.g. 200001 -> 1999-12-31).
# NOTE(review): presumably this aligns factors with the next-month return `bh1m` — confirm.
all_factor = pd.read_csv('../data/Other/ff5_factors_m.CSV').assign(
    YearMonth=lambda f: pd.to_datetime(f['yyyymm'], format='%Y%m') + MonthEnd(0) + MonthEnd(-1)
)

In [12]:
## Panel B of Table F.1
## NOTE(review): this comment previously read "Table E.1", while the section
## header and Panel A refer to Table F.1 — confirm the table number.
### 2. Forecast Return
# For every RF variant: build the average bias (analyst forecast minus RF
# forecast, scaled by lagged price), sort stocks into portfolios on it and
# keep the high-minus-low results.
idx = ['q1','q2','q3','y1','y2']
col = abbr_list
num_level = 5
# Regressor sets passed to utils.SingleSort_RetAna; 'ones' is the intercept.
factor_dict = {'Ret': ['ones'],
               'CAPM':['ones','Mkt_RF'],
               'FF3': ['ones','Mkt_RF','SMB','HML'],
               'FF5': ['ones','Mkt_RF','SMB', 'HML', 'RMW', 'CMA'],
               'FFC6':['ones','Mkt_RF','SMB', 'HML', 'RMW', 'CMA','MOM'],
               'HXZ':['ones','R_MKT','R_ME','R_IA','R_ROE'],
               'HMXZ':['ones','R_MKT','R_ME','R_IA','R_ROE','R_EG'],
               'SY':['ones','Mkt_RF','SMB_SY','MGMT', 'PERF'],
               'DHS':['ones','Mkt_RF','PEAD', 'FIN'],
               }

rlts = []
for c in col:
    # Per-horizon bias columns for this variant.
    bias_cols = [f'{c}_Bias_{i}' for i in idx]
    for i, bias_col in zip(idx, bias_cols):
        df[bias_col] = (df[f'EPS_ana_{i}'] - df[f'RF_{i}_{c}'])/df['prc_l1']

    # Average bias across horizons, kept only when at least two horizons are available.
    sort_var = f'{c}_Bias_Avg'
    n_available = df[bias_cols].notna().sum(axis=1)
    df[sort_var] = df[bias_cols].mean(axis=1).where(n_available > 1)

    sorted_panel, vwret1 = utils.SingleSort(df, 'PERMNO', 'YearMonth',
                                            sort_var, 'bh1m', num_level,
                                            'ME', quantile_filter=None)
    table = utils.SingleSort_RetAna(sorted_panel, vwret1, 'YearMonth',
                                    factor_data=all_factor, factor_dict=factor_dict, lag=12)
    # One column per variant, labelled with the variant's short name.
    rlts.append(table['H-L'].rename(c))
rlts = pd.concat(rlts,axis=1)
rlts.to_clipboard()
rlts

Var:baseline_Bias_Avg, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:001_Bias_Avg, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:010_Bias_Avg, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:sqrt_Bias_Avg, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:0.5_Bias_Avg, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:36m_Bias_Avg, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:60m_Bias_Avg, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:rank_Bias_Avg, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:zscore_Bias_Avg, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:chars_Bias_Avg, Delete 

Unnamed: 0,baseline,001,010,sqrt,0.5,36m,60m,rank,zscore,chars
Ret,-0.27,-0.09,-0.25,0.17,-0.12,-0.25,-0.20,-0.20,-0.11,-0.23
,(-0.71),(-0.27),(-0.70),(0.75),(-0.35),(-0.67),(-0.60),(-0.52),(-0.31),(-0.60)
CAPM,-0.68,-0.48,-0.67,0.06,-0.52,-0.70,-0.64,-0.53,-0.45,-0.66
,(-1.95),(-1.55),(-2.00),(0.24),(-1.61),(-2.02),(-2.09),(-1.36),(-1.24),(-1.90)
FF3,-0.77,-0.54,-0.77,-0.09,-0.61,-0.79,-0.74,-0.70,-0.62,-0.75
,(-3.30),(-2.49),(-3.55),(-0.45),(-2.79),(-3.43),(-3.57),(-2.63),(-2.45),(-3.14)
FF5,-0.44,-0.13,-0.42,0.09,-0.23,-0.43,-0.31,-0.51,-0.39,-0.39
,(-1.51),(-0.44),(-1.56),(0.41),(-0.84),(-1.56),(-1.19),(-1.83),(-1.27),(-1.37)
FFC6,-0.05,0.26,-0.03,0.30,0.14,-0.05,0.07,-0.12,0.00,0.01
,(-0.21),(1.13),(-0.14),(1.63),(0.68),(-0.24),(0.35),(-0.49),(0.02),(0.02)


# Table F.2

In [13]:
# Inputs for Table F.2: the panel, the factor file and the full grid of
# specifications whose saved forecasts are evaluated in the next cell.
df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')

# YearMonth is the month-end of `yyyymm` moved back by one month-end.
all_factor = pd.read_csv('../data/Other/ff5_factors_m.CSV').assign(
    YearMonth=lambda f: pd.to_datetime(f['yyyymm'], format='%Y%m') + MonthEnd(0) + MonthEnd(-1)
)

param_grid = ParameterGrid(dict(
    sample=[0.01, 0.05, 0.10],
    feature=['sqrt', 1.0],
    window=[12, 24, 36, 48, 60],
    std=['zscore', 'raw', 'rank'],
    eps=[True, False],
))

In [14]:
# For every grid point with a saved forecast file: build the average bias BE,
# sort stocks on it and record the FF5 / FFC6 alpha of the H-L portfolio.
rlts = []
for params in param_grid:

    ## 1. Data: read in and construct average BE
    file_path = f"../data/Results/RF_variants_ALL/RF_sample_{params['sample']}_feature_{params['feature']}_window_{params['window']}_std_{params['std']}_eps_{params['eps']}.parquet"
    # Grid points that were never trained have no file and are skipped.
    if not os.path.exists(file_path):
        continue
    forecast = pd.read_parquet(file_path)

    panel_cols = ['permno','YearMonth','bh1m','ME',
                  'EPS_ana_q1','EPS_ana_q2','EPS_ana_q3',
                  'EPS_ana_y1','EPS_ana_y2',
                  'prc_l1']
    df = df_tmp[panel_cols].merge(forecast, on=['permno','YearMonth'])

    idx = ['q1','q2','q3','y1','y2']
    bias_cols = [f'Bias_{i}' for i in idx]
    for i, bias_col in zip(idx, bias_cols):
        # Analyst forecast minus RF forecast, scaled by lagged price.
        df[bias_col] = (df[f'EPS_ana_{i}'] - df[f'RF_{i}'])/df['prc_l1']

    # Average Bias BE, kept only when at least two horizons are available.
    n_available = df[bias_cols].notna().sum(axis=1)
    df['BE'] = df[bias_cols].mean(axis=1).where(n_available > 1)

    ## 2. Analysis: Single Sort and FF5 alpha
    sort_var = 'BE'
    num_level = 5
    # NOTE(review): 'PERMNO' is passed although the columns selected above only
    # include lowercase 'permno' — verify how utils.SingleSort uses this argument.
    _, vwret1 = utils.SingleSort(df, 'PERMNO', 'YearMonth',
                                 sort_var, 'bh1m', num_level,
                                 'ME', quantile_filter=None)
    vwret1 = vwret1.merge(all_factor, left_index=True, right_on='YearMonth')
    vwret1['ones'] = 1

    # Intercept ('ones') and its t-stat from an OLS of H-L on each factor set,
    # with HAC standard errors using 12 lags.
    factor_sets = (('FF5', ['Mkt_RF','SMB', 'HML', 'RMW', 'CMA']),
                   ('FFC6', ['Mkt_RF','SMB', 'HML', 'RMW', 'CMA', 'MOM']))
    for label, factors in factor_sets:
        mdl = sm.OLS(vwret1['H-L'], vwret1[['ones'] + factors])\
                .fit(cov_type = 'HAC', cov_kwds = {'maxlags':12})
        params[f'{label} alpha'] = mdl.params['ones']
        params[f'{label} t'] = mdl.tvalues['ones']

    rlts.append(params)

Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due to missing values, raw data 1350764 rows --> new data 1260645 rows
Var:BE, Delete 90119 rows due 

In [15]:
# One row per evaluated specification, ordered by FF5 t-statistic (most negative first).
rlts = pd.DataFrame(rlts).sort_values(by='FF5 t', ascending=True)
# Assign the recoded column back. Calling replace(inplace=True) on the
# rlts['eps'] selection is a chained in-place update: it warns in recent
# pandas and leaves rlts unchanged under Copy-on-Write.
rlts['eps'] = rlts['eps'].replace({True:'T',False:'F'})
rlts.to_clipboard()
rlts

Unnamed: 0,eps,feature,sample,std,window,FF5 alpha,FF5 t,FFC6 alpha,FFC6 t
22,T,1.0,0.1,rank,12,-0.55415,-2.128032,-0.200831,-0.816423
16,T,1.0,0.05,rank,12,-0.505817,-1.927607,-0.134541,-0.552755
40,F,1.0,0.05,rank,12,-0.511753,-1.828477,-0.124351,-0.487605
46,F,1.0,0.1,rank,12,-0.480574,-1.789767,-0.110914,-0.448205
23,T,1.0,0.1,rank,24,-0.425659,-1.697969,-0.133393,-0.535319
45,F,1.0,0.1,raw,24,-0.454986,-1.66644,-0.074775,-0.361197
44,F,1.0,0.1,raw,12,-0.420868,-1.564753,-0.030947,-0.142652
41,F,1.0,0.05,rank,24,-0.399293,-1.562574,-0.095277,-0.392359
17,T,1.0,0.05,rank,24,-0.385857,-1.544798,-0.090226,-0.363269
38,F,1.0,0.05,raw,12,-0.438535,-1.510949,-0.050251,-0.212513
