In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tseries.offsets import *
from tqdm import tqdm
from functools import reduce
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Column names of the firm-level ratio characteristics. The random-forest
# cells below prepend this list to their predictor sets, so order matters
# only in that it fixes the feature order passed to the model.
ratio_chars = """
    CAPEI bm
    evm pe_exi pe_inc ps pcf
    dpr npm opmbd opmad gpm ptpm cfm roa roe
    roce efftax aftret_eq aftret_invcapx aftret_equity
    pretret_noa pretret_earnat GProf equity_invcap
    debt_invcap totdebt_invcap capital_ratio int_debt
    int_totdebt cash_lt invt_act rect_act debt_at
    debt_ebitda short_debt curr_debt lt_debt profit_lct
    ocf_lct cash_debt fcf_ocf lt_ppent dltt_be debt_assets
    debt_capital de_ratio intcov intcov_ratio cash_ratio
    quick_ratio curr_ratio cash_conversion inv_turn at_turn
    rect_turn pay_turn sale_invcap sale_equity sale_nwc
    rd_sale adv_sale staff_sale accrual ptb PEG_trailing
    divyield
""".split()

# RF with look-ahead bias

In [None]:
# Panel used by both random-forest loops below; they read 'YearMonth',
# 'permno', the ANNDATS_*/EPS_* columns and the predictor columns from it.
df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')

In [None]:
## Rolling Window:
# One forecast date per distinct month in the panel after January 1986.
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
num_trees = 2000

# (horizon suffix, lagged realized-EPS predictor, training window in months).
# In this variant every horizon uses its own 'EPS_true_l1_<horizon>' column;
# the "without look-ahead bias" loop swaps some of them.
horizons = [
    ('q1', 'EPS_true_l1_q1', 12),
    ('q2', 'EPS_true_l1_q2', 12),
    ('q3', 'EPS_true_l1_q3', 12),
    ('y1', 'EPS_true_l1_y1', 12),
    ('y2', 'EPS_true_l1_y2', 24),
]
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
# Identical forest settings for every month and horizon (fixed seed).
rf_settings = dict(n_estimators=num_trees,
                   random_state=0,
                   max_depth=7,
                   min_samples_leaf=5,
                   max_samples=0.05,
                   n_jobs=14)

forecast = []
for t in tqdm(time_idx):
    before_t = df_tmp['YearMonth'] < t
    in_month = df_tmp['YearMonth'] == t

    horizon_frames = []
    for h, lag_col, window in horizons:
        x_cols = ratio_chars + ['ret', 'prc', lag_col, f'EPS_ana_{h}'] + macro_cols
        y_col = f'EPS_true_{h}'
        ann_col = f'ANNDATS_{h}'

        # Train on the `window` months before t, keeping only rows whose
        # announcement month-end is strictly before t.
        df_train = df_tmp[before_t
                          & (df_tmp['YearMonth'] >= t - MonthEnd(window))
                          & (df_tmp[ann_col] + MonthEnd(0) < t)]\
                   .dropna(subset=x_cols + [y_col])
        # Predict for month-t rows whose announcement date is after YearMonth.
        df_test = df_tmp[(df_tmp[ann_col] > df_tmp['YearMonth']) & in_month]\
                  .dropna(subset=x_cols + [y_col])

        mdl = RandomForestRegressor(**rf_settings)
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = model forecast, AF_* = analyst forecast, AE_* = realized EPS.
        horizon_frames.append(pd.DataFrame({'permno': df_test['permno'],
                                            'YearMonth': df_test['YearMonth'],
                                            f'RF_{h}': y_pred,
                                            f'AF_{h}': df_test[f'EPS_ana_{h}'],
                                            f'AE_{h}': df_test[y_col]}))

    # Outer-join the five horizons so a firm missing one horizon keeps the rest.
    forecast.append(reduce(lambda x, y: pd.merge(x, y,
                                                 on=['permno', 'YearMonth'],
                                                 how='outer'),
                           horizon_frames))

In [None]:
# Stack the per-month forecast frames and write them to disk.
# NOTE(review): reset_index() without drop=True keeps the old per-month row
# labels as an extra 'index' column in the saved file, whereas the So (2013)
# export uses drop=True — confirm whether any reader relies on that column.
forecast_all = pd.concat(forecast,axis=0).reset_index()
forecast_all.to_parquet('../data/Results/RF_with_lookahead_raw_005.parquet')

# RF without look-ahead bias

In [None]:
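##############################
## Difference in predictors ##
##############################
# Relative to the "RF with look-ahead bias" loop, only the lagged realized-EPS
# predictor changes; all other inputs and model settings are the same.
# Q1: same
# Q2: 'EPS_true_l1_q2' --> 'EPS_true_l1_q1'
# Q3: 'EPS_true_l1_q3' --> 'EPS_true_l1_q1'
# Y1: same
# Y2: 'EPS_true_l1_y2' --> 'EPS_true_l1_y1'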

In [None]:
## Rolling Window:
# One forecast date per distinct month in the panel after January 1986.
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]
# Defined here as well so this cell runs on a fresh kernel without first
# executing the look-ahead cell.
num_trees = 2000

# (horizon suffix, lagged realized-EPS predictor, training window in months).
# q2/q3 reuse the q1 lag and y2 reuses the y1 lag; q1 and y1 are unchanged
# from the look-ahead variant.
horizons = [
    ('q1', 'EPS_true_l1_q1', 12),
    ('q2', 'EPS_true_l1_q1', 12),
    ('q3', 'EPS_true_l1_q1', 12),
    ('y1', 'EPS_true_l1_y1', 12),
    ('y2', 'EPS_true_l1_y1', 24),
]
macro_cols = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
# Identical forest settings for every month and horizon (fixed seed).
rf_settings = dict(n_estimators=num_trees,
                   random_state=0,
                   max_depth=7,
                   min_samples_leaf=5,
                   max_samples=0.05,
                   n_jobs=14)

forecast = []
for t in tqdm(time_idx):
    before_t = df_tmp['YearMonth'] < t
    in_month = df_tmp['YearMonth'] == t

    horizon_frames = []
    for h, lag_col, window in horizons:
        x_cols = ratio_chars + ['ret', 'prc', lag_col, f'EPS_ana_{h}'] + macro_cols
        y_col = f'EPS_true_{h}'
        ann_col = f'ANNDATS_{h}'

        # Train on the `window` months before t, keeping only rows whose
        # announcement month-end is strictly before t.
        df_train = df_tmp[before_t
                          & (df_tmp['YearMonth'] >= t - MonthEnd(window))
                          & (df_tmp[ann_col] + MonthEnd(0) < t)]\
                   .dropna(subset=x_cols + [y_col])
        # Predict for month-t rows whose announcement date is after YearMonth.
        df_test = df_tmp[(df_tmp[ann_col] > df_tmp['YearMonth']) & in_month]\
                  .dropna(subset=x_cols + [y_col])

        mdl = RandomForestRegressor(**rf_settings)
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        # RF_* = model forecast, AF_* = analyst forecast, AE_* = realized EPS.
        horizon_frames.append(pd.DataFrame({'permno': df_test['permno'],
                                            'YearMonth': df_test['YearMonth'],
                                            f'RF_{h}': y_pred,
                                            f'AF_{h}': df_test[f'EPS_ana_{h}'],
                                            f'AE_{h}': df_test[y_col]}))

    # Outer-join the five horizons so a firm missing one horizon keeps the rest.
    forecast.append(reduce(lambda x, y: pd.merge(x, y,
                                                 on=['permno', 'YearMonth'],
                                                 how='outer'),
                           horizon_frames))

In [None]:
# Stack the per-month forecast frames and write them to disk.
# NOTE(review): reset_index() without drop=True keeps the old per-month row
# labels as an extra 'index' column in the saved file, whereas the So (2013)
# export uses drop=True — confirm whether any reader relies on that column.
forecast_all = pd.concat(forecast,axis=0).reset_index()
forecast_all.to_parquet('../data/Results/RF_wo_lookahead_raw_005.parquet')

# Hughes et al. (2008)

## Prepare Data

In [None]:
###  Characteristics ###

# Compustat inputs for acc, dPPE, dOLA and sg_5y.
compa = (
    pd.read_parquet('../data/WRDS/compa.parquet')
      # gvkey as float and datadate snapped to month-end, presumably to match
      # the merge keys of the training panel — TODO confirm.
      .assign(gvkey=lambda d: d['gvkey'].astype(float),
              datadate=lambda d: d['datadate'] + MonthEnd(0))
      # Scaling below divides by at_avg, so keep strictly positive values only.
      .loc[lambda d: d['at_avg'] > 0]
      .assign(acc=lambda d: d['acc'] / d['at_avg'],
              dPPE=lambda d: d['ppegt_diff'] / d['at_avg'],
              dOLA=lambda d: d['ao_diff'] / d['at_avg'],
              # Turn +/-inf into NaN; existing NaNs stay NaN.
              sg_5y=lambda d: np.nan_to_num(d['sg_5y'], nan=np.nan,
                                            posinf=np.nan, neginf=np.nan))
)

In [None]:
# LTG from IBES
consensus = pd.read_parquet('../data/WRDS/EPS_summary.parquet')
# Month-end of the statistics date; used as the monthly merge key.
consensus['YearMonth'] = consensus['statpers'] + MonthEnd(0)

# Rows with fpi == '0' supply LTG, taken from the `meanest` column.
is_ltg = consensus['fpi'] == '0'
consensus_ltg = (consensus.loc[is_ltg, ['ticker', 'YearMonth', 'meanest']]
                          .rename(columns={'meanest': 'LTG'}))

In [None]:
# Forecast Revision from IBES

# Analyst Forecast Earnings:
crsp = pd.read_parquet('../data/WRDS/crsp_m.parquet')
crsp['ME'] = crsp['prc'].abs() * crsp['shrout']

# Attach permno and shares outstanding to every consensus row (inner join on
# cusip/month), then scale the per-share estimate by shares outstanding.
# Note: this reassigns `consensus`, so the cell is not re-runnable without
# first re-running the cell that loads it.
crsp_link = crsp[['permno', 'ncusip', 'YearMonth', 'shrout']]
consensus = pd.merge(consensus, crsp_link,
                     left_on=['cusip', 'YearMonth'],
                     right_on=['ncusip', 'YearMonth'])
consensus['AF'] = consensus['meanest'] * consensus['shrout']

## Revision for each Annual horizon
result = []
for i in [1, 2]:
    rev_col = f'FRevision_A{i}'

    # Current-month forecast for horizon i, one row per (month, ticker).
    current = consensus[consensus['fpi'] == str(i)]\
              .drop_duplicates(subset=['YearMonth', 'ticker'])

    # Candidates for last month's forecast: rows with fpi i or i+1, matched
    # below on the same fiscal period end (fpedats).
    lagged = consensus.loc[consensus['fpi'].isin([str(i), str(i + 1)]),
                           ['YearMonth', 'ticker', 'fpedats', 'AF']].copy()

    # Forecast in last month: F_{t-1}[x_t] — shift the date forward one
    # month-end so it lines up with the current row.
    lagged['YearMonth'] = lagged['YearMonth'] + MonthEnd(1)
    lagged = lagged.rename(columns={'AF': 'AF_l1'})

    change = current[['YearMonth', 'permno', 'ticker', 'fpedats', 'AF']]\
             .merge(lagged, on=['YearMonth', 'ticker', 'fpedats'], how='left')
    change[rev_col] = change['AF'] - change['AF_l1']
    change = change.drop_duplicates(subset=['YearMonth', 'permno'])
    result.append(change[['permno', 'YearMonth', rev_col]])

rev_a1, rev_a2 = result
FR = pd.merge(rev_a1, rev_a2, on=['permno', 'YearMonth'], how='outer')
FR = FR.sort_values(by=['permno', 'YearMonth'], ignore_index=True)
# Revision in recent 3months: sum over the last three rows of each permno
# (rolling counts rows, not calendar months).
rolling_sum = FR.groupby('permno')[['FRevision_A1', 'FRevision_A2']]\
                .rolling(3).sum().reset_index(level=0, drop=True)
FR[['FRevision_A1_3m', 'FRevision_A2_3m']] = rolling_sum

In [None]:
# Earnings Surprise
# Relies on crsp['ME'] and consensus['AF'] created in the forecast-revision cell.

## Most recent Quarterly Earnings Announcement SUE
# 1. Actual
# IBES actual
EPS_true = pd.read_stata('../data/WRDS/EPS_unadjusted_actual_full.dta')
# Match CRSP on the announcement month to pick up shares and market equity.
EPS_true['YearMonth'] = EPS_true['ANNDATS'] + MonthEnd(0)
crsp_cols = crsp[['ncusip', 'YearMonth', 'shrout', 'cfacshr', 'ME']]
EPS_true = pd.merge(EPS_true, crsp_cols,
                    left_on=['CUSIP', 'YearMonth'],
                    right_on=['ncusip', 'YearMonth'])
EPS_true['AE'] = EPS_true['VALUE'] * EPS_true['shrout']
# Re-key to the month-end before the announcement; use this to merge with forecast.
EPS_true['YearMonth'] = EPS_true['ANNDATS'] + MonthEnd(-1)
is_quarterly = EPS_true['PDICITY'] == 'QTR'
EPS_true_qtr = EPS_true[is_quarterly].sort_values(by=['TICKER', 'PENDS'])

# 2. Forecast in last month
consensus_1q = consensus[consensus['fpi'] == '6']
actual_part = EPS_true_qtr[['TICKER', 'ANNDATS', 'YearMonth', 'PENDS', 'AE', 'ME']]
consensus_part = consensus_1q[['ticker', 'YearMonth', 'fpedats', 'AF', 'statpers']]
FE_last = actual_part.merge(consensus_part,
                            left_on=['TICKER', 'YearMonth', 'PENDS'],
                            right_on=['ticker', 'YearMonth', 'fpedats'])
FE_last['SUE'] = (FE_last['AE'] - FE_last['AF']) / FE_last['ME']

# These SUE should be used after the announcement, so date them at the
# announcement month-end and keep the last row per ticker-month.
FE_last['YearMonth'] = FE_last['ANNDATS'] + MonthEnd(0)
FE_last = FE_last.drop_duplicates(subset=['TICKER', 'YearMonth'], keep='last')

In [None]:
# Momentum
# CRSP is re-read here, so the 'ME' column added in an earlier cell is not
# present on this copy.
crsp = pd.read_parquet('../data/WRDS/crsp_m.parquet')
crsp = crsp.sort_values(by=['permno', 'YearMonth'])

# Compound return over the last 12 rows of each permno (current row
# included), computed as a rolling sum of log returns.
log_ret = np.log(1 + crsp['ret'])
log_ret_12m = log_ret.groupby(crsp['permno'])\
                     .rolling(12).sum()\
                     .reset_index(level=0, drop=True)
crsp['ret_12m'] = np.exp(log_ret_12m) - 1

In [None]:
# Rebuild the panel from disk and left-join each Hughes et al. input onto it,
# so the row set of the base panel drives the result.
df_tmp = (
    pd.read_parquet('../data/Results/df_train_new.parquet')
      .merge(compa, on=['gvkey', 'datadate'], how='left')
      .merge(FR, on=['permno', 'YearMonth'], how='left')
      .merge(consensus_ltg,
             left_on=['TICKER', 'YearMonth'],
             right_on=['ticker', 'YearMonth'],
             how='left')
      .merge(FE_last[['TICKER', 'YearMonth', 'SUE']],
             on=['TICKER', 'YearMonth'],
             how='left')
      .merge(crsp[['permno', 'YearMonth', 'ret_12m']],
             on=['permno', 'YearMonth'],
             how='left')
      .sort_values(by=['permno', 'YearMonth'], ignore_index=True)
)
# SUE is only observed in announcement months; carry it forward up to six
# rows within each permno.
df_tmp['SUE'] = df_tmp.groupby('permno')['SUE'].ffill(limit=6)

## Preprocess and Make Forecasts

In [None]:
# Scale the 3-month forecast revision by market equity.
df_tmp['FRevision_A1_3m_std'] = df_tmp['FRevision_A1_3m']/df_tmp['ME']

# winsorization period-by-period: clip each month's cross-section at its own
# 1st and 99th percentiles.
cols = [
        'acc','LTG', 'sg_5y','dPPE', 'dOLA','ret_12m',
        'SUE','FRevision_A1_3m_std'
       ]

df_tmp[cols] = df_tmp.groupby('YearMonth',group_keys=False)[cols]\
                     .transform(lambda x: x.clip(x.quantile(0.01),x.quantile(0.99)))

# ## FillNA with Industry Median
fillNA = ['acc','LTG', 'sg_5y','dPPE', 'dOLA','ret_12m',
        'SUE','FRevision_A1_3m_std'
       ]
# Fill only the missing cells. groupby excludes rows whose group key is NaN,
# so writing a groupby-apply result back to the column would replace the
# observed values of firms without a 'fama49' code by NaN. Computing the
# group median with transform and passing it to fillna leaves every observed
# value untouched.
for v in tqdm(fillNA):
    industry_median = df_tmp.groupby(['YearMonth','fama49'])[v].transform('median')
    df_tmp[v] = df_tmp[v].fillna(industry_median)
## In case some characteristics are all NA in some industry
## (or the industry code itself is missing): fall back to the month median.
for v in tqdm(fillNA):
    month_median = df_tmp.groupby('YearMonth')[v].transform('median')
    df_tmp[v] = df_tmp[v].fillna(month_median)

In [None]:
## Rolling Window:
# One forecast date per distinct month in the panel after January 1986.
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]

# Every horizon shares these regressors; only the analyst forecast column
# ('EPS_ana_<horizon>') differs.
base_cols = ['acc','LTG', 'sg_5y','dPPE', 'dOLA','SUE','ret_12m','FRevision_A1_3m_std']
horizons = ['q1', 'q2', 'q3', 'y1', 'y2']
train_window = 60  # months of history used for every horizon

forecast = []
for t in tqdm(time_idx):
    before_t = df_tmp['YearMonth'] < t
    in_month = df_tmp['YearMonth'] == t

    horizon_frames = []
    for h in horizons:
        x_cols = [f'EPS_ana_{h}'] + base_cols
        y_col = f'EPS_true_{h}'
        ann_col = f'ANNDATS_{h}'

        # Train on the preceding window, keeping only rows whose announcement
        # month-end is strictly before t.
        df_train = df_tmp[before_t
                          & (df_tmp['YearMonth'] >= t - MonthEnd(train_window))
                          & (df_tmp[ann_col] + MonthEnd(0) < t)]\
                   .dropna(subset=x_cols + [y_col])
        # Predict for month-t rows whose announcement date is after YearMonth.
        df_test = df_tmp[(df_tmp[ann_col] > df_tmp['YearMonth']) & in_month]\
                  .dropna(subset=x_cols + [y_col])

        # has_constant='add' always prepends the intercept column. The default
        # ('skip') omits it whenever some regressor happens to be constant in
        # the frame (e.g. a one-row test month), which would leave the test
        # design matrix out of step with the fitted parameters.
        X_train = sm.add_constant(df_train[x_cols], has_constant='add')
        X_test = sm.add_constant(df_test[x_cols], has_constant='add')
        mdl = sm.OLS(df_train[y_col], X_train).fit()
        y_pred = mdl.predict(X_test)

        # LF_* = linear forecast, AF_* = analyst forecast, AE_* = realized EPS.
        horizon_frames.append(pd.DataFrame({'permno': df_test['permno'],
                                            'YearMonth': df_test['YearMonth'],
                                            f'LF_{h}': y_pred,
                                            f'AF_{h}': df_test[f'EPS_ana_{h}'],
                                            f'AE_{h}': df_test[y_col]}))

    # Outer-join the five horizons so a firm missing one horizon keeps the rest.
    forecast.append(reduce(lambda x, y: pd.merge(x, y,
                                                 on=['permno', 'YearMonth'],
                                                 how='outer'),
                           horizon_frames))

In [None]:
# Stack the per-month forecast frames and write them to disk.
# NOTE(review): reset_index() without drop=True keeps the old per-month row
# labels as an extra 'index' column in the saved file, whereas the So (2013)
# export uses drop=True — confirm whether any reader relies on that column.
forecast_all = pd.concat(forecast,axis=0).reset_index()
forecast_all.to_parquet('../data/Results/Hughes_eps.parquet')

# So (2013)

## Prepare Data

In [None]:
# Start again from the base panel so nothing from the Hughes section carries over.
df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')
# compustat data
compa = pd.read_parquet('../data/WRDS/compa.parquet')
compa['gvkey'] = compa['gvkey'].astype(float)
compa['datadate'] = compa['datadate'] + MonthEnd(0)
compa = compa[compa['at_avg'] > 0].copy()
# Per-share accruals and dividends (both divided by csho).
compa['acc'] = compa['acc']/compa['csho']
compa['div'] = compa['dvc']/compa['csho']
# dd = 1 when dvc is exactly zero; a missing dvc yields 0.
compa['dd'] = np.where(compa['dvc']==0, 1, 0)
# Split accruals into the magnitude of the negative part and the positive part.
compa['acc_n'] = compa['acc'].clip(upper=0).abs()
compa['acc_p'] = compa['acc'].clip(lower=0)

df_tmp = df_tmp.merge(compa[['gvkey','datadate','dd','div',
                             'acc_n','acc_p','ag',
                            ]], 
                        left_on=['gvkey','adate'], 
                        right_on=['gvkey','datadate'], 
                        how='left')

# Lagged earnings split into a non-negative level and a loss dummy
# (the dummy is 0 when the lagged EPS is missing).
df_tmp['earnings_pos_l1_y1'] = df_tmp['EPS_true_l1_y1'].clip(lower=0)
df_tmp['nege_l1_y1'] = np.where(df_tmp['EPS_true_l1_y1'] < 0, 1, 0)

df_tmp['earnings_pos_l1_q1'] = df_tmp['EPS_true_l1_q1'].clip(lower=0)
df_tmp['nege_l1_q1'] = np.where(df_tmp['EPS_true_l1_q1'] < 0, 1, 0)

## winsorize period-by-period: clip each month's cross-section at its own
## 1st and 99th percentiles.
cols = ['earnings_pos_l1_y1','earnings_pos_l1_q1',
        'acc_n','acc_p','ag','div',
       ]
df_tmp[cols] = df_tmp.groupby('YearMonth')[cols]\
                    .transform(lambda x: x.clip(x.quantile(0.01),x.quantile(0.99)))

# ## FillNA with Industry Median
fillNA = ['earnings_pos_l1_q1','nege_l1_q1',
          'earnings_pos_l1_y1','nege_l1_y1',
          'acc_n','acc_p','ag','dd','bm','prc','div']
# Fill only the missing cells. groupby excludes rows whose group key is NaN,
# so writing a groupby-apply result back to the column would replace the
# observed values (including the 0/1 dummies) of firms without a 'fama49'
# code by NaN. Computing the group median with transform and passing it to
# fillna leaves every observed value untouched.
for v in tqdm(fillNA):
    industry_median = df_tmp.groupby(['YearMonth','fama49'])[v].transform('median')
    df_tmp[v] = df_tmp[v].fillna(industry_median)
## In case some characteristics are all NA in some industry
## (or the industry code itself is missing): fall back to the month median.
for v in tqdm(fillNA):
    month_median = df_tmp.groupby('YearMonth')[v].transform('median')
    df_tmp[v] = df_tmp[v].fillna(month_median)

## Forecasts

In [None]:
## Rolling Window:
# One forecast date per distinct month in the panel after January 1986.
time_idx = sorted(df_tmp['YearMonth'].unique())
time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]

# (horizon suffix, suffix of the lagged-earnings regressors, training window
# in months). Quarterly horizons all use the q1 lag; annual ones the y1 lag.
horizons = [
    ('q1', 'q1', 12),
    ('q2', 'q1', 12),
    ('q3', 'q1', 12),
    ('y1', 'y1', 12),
    ('y2', 'y1', 24),
]
common_cols = ['acc_n','acc_p','ag','dd','bm','prc','div']

# Y1 regression coefficients and t-statistics, keyed by forecast month.
params = {}
t_values = {}

forecast = []
for t in tqdm(time_idx):
    before_t = df_tmp['YearMonth'] < t
    in_month = df_tmp['YearMonth'] == t

    horizon_frames = []
    for h, lag, window in horizons:
        x_cols = [f'EPS_ana_{h}', f'earnings_pos_l1_{lag}', f'nege_l1_{lag}'] + common_cols
        y_col = f'EPS_true_{h}'
        ann_col = f'ANNDATS_{h}'

        # Train on the `window` months before t, keeping only rows whose
        # announcement month-end is strictly before t.
        df_train = df_tmp[before_t
                          & (df_tmp['YearMonth'] >= t - MonthEnd(window))
                          & (df_tmp[ann_col] + MonthEnd(0) < t)]\
                   .dropna(subset=x_cols + [y_col])
        # Predict for month-t rows whose announcement date is after YearMonth.
        df_test = df_tmp[(df_tmp[ann_col] > df_tmp['YearMonth']) & in_month]\
                  .dropna(subset=x_cols + [y_col])

        mdl = LinearRegression()
        mdl.fit(df_train[x_cols], df_train[y_col])
        y_pred = mdl.predict(df_test[x_cols])

        if h == 'y1':
            # Same regression refit with statsmodels, only to record the
            # coefficients and t-statistics; forecasts come from `mdl`.
            mdl_sm = sm.OLS(df_train[y_col], sm.add_constant(df_train[x_cols])).fit()
            params[t] = mdl_sm.params
            t_values[t] = mdl_sm.tvalues

        # So_* = model forecast, AF_* = analyst forecast, AE_* = realized EPS.
        horizon_frames.append(pd.DataFrame({'permno': df_test['permno'],
                                            'YearMonth': df_test['YearMonth'],
                                            f'So_{h}': y_pred,
                                            f'AF_{h}': df_test[f'EPS_ana_{h}'],
                                            f'AE_{h}': df_test[y_col]}))

    # Outer-join the five horizons so a firm missing one horizon keeps the rest.
    forecast.append(reduce(lambda x, y: pd.merge(x, y,
                                                 on=['permno', 'YearMonth'],
                                                 how='outer'),
                           horizon_frames))

In [None]:
# Stack the per-month forecast frames, discard the per-month row labels
# (drop=True) and write the result to disk.
forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)
forecast_all.to_parquet('../data/Results/So_eps_AF.parquet')