In [1]:
import os

import pandas as pd
from pandas import DataFrame
import numpy as np
from scipy.stats.mstats import winsorize

from Constants import Constants as const

In [10]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20240825_stock_act_reg_data.dta'))
new_year_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'fromZGY', 'new_shock_year_data_compustat.csv'),
                                     usecols=[const.GVKEY, 'year', 'post', 'freq']).rename(
    columns={'post': 'post60min', 'freq': 'freq60min', 'year': 'fiscal_year'})
# new_quarterly_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'fromZGY', 'new_year_quarter_data.csv'))

In [14]:
reg_df_60: DataFrame = reg_df.merge(new_year_df, on=[const.GVKEY, const.YEAR], how='left').drop(['busdesc'], axis=1)
event_index = reg_df_60.loc[reg_df_60[const.YEAR].apply(lambda x: 2008 < x < 2015)].index
reg_df_60.loc[event_index, 'freq60min'] = reg_df_60['freq60min'].fillna(0)
reg_df_60.loc[event_index, 'ln_freq60min'] = reg_df_60['freq60min'].apply(lambda x: np.log(x + 1))
reg_df_60.loc[event_index, 'freq60min'] = (reg_df_60.loc[event_index, const.YEAR] > 2011).astype(int)
reg_df_60.to_stata(os.path.join(const.RESULT_PATH, '20240918_stock_act_reg_data.dta'),
                   write_index=False)

In [20]:
# merge quarterly data
new_quarterly_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'fromZGY', 'new_year_quarter_data_gvkey.csv'))
majgovcustomer_gvkey = reg_df.loc[reg_df['MajorGovCustomer'] == 1, 'gvkey'].unique()
new_quarterly_df.loc[:, 'MajorGovCustomer'] = 0
new_quarterly_df.loc[new_quarterly_df[const.GVKEY].isin(majgovcustomer_gvkey), 'MajorGovCustomer'] = 1

new_quarterly_df['yearquarter'] = new_quarterly_df.apply(lambda x: '{}q{}'.format(int(x['Year']), int(x['Quarter'])), axis=1)
new_quarterly_df.loc[:, 'post'] = (new_quarterly_df['Year'] > 2011).astype(int)
new_quarterly_df['ln_freq'] = new_quarterly_df['freq'].apply(lambda x: np.log(x + 1))

int_terms = list()
for year in range(2008, 2015):
    for quarter in range(1, 5):
        new_quarterly_df.loc[:, f'd{year}q{quarter}'] = (new_quarterly_df['yearquarter'] == f'{year}q{quarter}').astype(int)
        int_terms.append(f'1.d{year}q{quarter}#1.MajorGovCustomer')

new_quarterly_df.to_stata(os.path.join(const.RESULT_PATH, '20240918_stock_act_reg_data_quarterly.dta'), write_index=False)

print(' '.join(int_terms))

1.d2008q1#1.MajorGovCustomer 1.d2008q2#1.MajorGovCustomer 1.d2008q3#1.MajorGovCustomer 1.d2008q4#1.MajorGovCustomer 1.d2009q1#1.MajorGovCustomer 1.d2009q2#1.MajorGovCustomer 1.d2009q3#1.MajorGovCustomer 1.d2009q4#1.MajorGovCustomer 1.d2010q1#1.MajorGovCustomer 1.d2010q2#1.MajorGovCustomer 1.d2010q3#1.MajorGovCustomer 1.d2010q4#1.MajorGovCustomer 1.d2011q1#1.MajorGovCustomer 1.d2011q2#1.MajorGovCustomer 1.d2011q3#1.MajorGovCustomer 1.d2011q4#1.MajorGovCustomer 1.d2012q1#1.MajorGovCustomer 1.d2012q2#1.MajorGovCustomer 1.d2012q3#1.MajorGovCustomer 1.d2012q4#1.MajorGovCustomer 1.d2013q1#1.MajorGovCustomer 1.d2013q2#1.MajorGovCustomer 1.d2013q3#1.MajorGovCustomer 1.d2013q4#1.MajorGovCustomer 1.d2014q1#1.MajorGovCustomer 1.d2014q2#1.MajorGovCustomer 1.d2014q3#1.MajorGovCustomer 1.d2014q4#1.MajorGovCustomer


In [24]:
inter_terms = '1.d2009q1#1.MajorGovCustomer 1.d2009q2#1.MajorGovCustomer 1.d2009q3#1.MajorGovCustomer 1.d2009q4#1.MajorGovCustomer 1.d2010q1#1.MajorGovCustomer 1.d2010q2#1.MajorGovCustomer 1.d2010q3#1.MajorGovCustomer 1.d2010q4#1.MajorGovCustomer 1.d2011q1#1.MajorGovCustomer 1.d2011q2#1.MajorGovCustomer 1.d2011q3#1.MajorGovCustomer 1.d2011q4#1.MajorGovCustomer 1.d2012q1#1.MajorGovCustomer 1.d2012q2#1.MajorGovCustomer 1.d2012q3#1.MajorGovCustomer 1.d2012q4#1.MajorGovCustomer 1.d2013q1#1.MajorGovCustomer 1.d2013q2#1.MajorGovCustomer 1.d2013q3#1.MajorGovCustomer 1.d2013q4#1.MajorGovCustomer 1.d2014q1#1.MajorGovCustomer 1.d2014q2#1.MajorGovCustomer 1.d2014q3#1.MajorGovCustomer 1.d2014q4#1.MajorGovCustomer'.split(' ')
xlabel = list()
for i, term in enumerate(inter_terms):
    year = term.split('q')[0][-4:]
    quarter = term.split('q')[1][0]
    xlabel.append(f'{i+1} "{year}q{quarter}"')
    
print(' '.join(xlabel))

1 "2009q1" 2 "2009q2" 3 "2009q3" 4 "2009q4" 5 "2010q1" 6 "2010q2" 7 "2010q3" 8 "2010q4" 9 "2011q1" 10 "2011q2" 11 "2011q3" 12 "2011q4" 13 "2012q1" 14 "2012q2" 15 "2012q3" 16 "2012q4" 17 "2013q1" 18 "2013q2" 19 "2013q3" 20 "2013q4" 21 "2014q1" 22 "2014q2" 23 "2014q3" 24 "2014q4"


# Merge Analysts FCSTERROR DISPERSION market information data.

In [26]:
# load regression data
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241009_stock_act_reg_data_v2.dta'))

# load sue123 data
sue123_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20241006_sue123_data_annual.pkl'))

# load analysts fcsterror dispersion data
adf_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20241006_analysts_dispersion_fcsterror.pkl'))

# load synchrony data
synchrony_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20241006_synchrony_weekly.pkl'))
synchrony_df_monthly: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20241010_synchrony_monthly.pkl'))
synchrony_df_daily: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20241010_synchrony_daily.pkl'))
synchrony_df: DataFrame = synchrony_df.merge(synchrony_df_monthly, on=['PERMNO', 'year'], how='left', suffixes=('', '_MONTH')).merge(
    synchrony_df_daily, on=['PERMNO', 'year'], how='left')

In [8]:
# load regression data
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20250323_stock_act_idiosyn_v1.dta'))
synchrony_df_monthly: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20241010_synchrony_monthly.pkl'))
key_to_drop = 'SYNCHRONICITY SYNCHRONICITY_MKT SYNCHRONICITY_IND SYNCHRONICITY_MONTH SYNCHRONICITY_MKT_MONTH SYNCHRONICITY_IND_MONTH SYNCHRONICITY_D SYNCHRONICITY_MKT_D SYNCHRONICITY_IND_D IDIOSYN_MONTH IDIOSYN_MKT_MONTH IDIOSYN_IND_MONTH PERMNO year'.split(' ')
reg_df.drop(key_to_drop, axis=1, inplace=True)
reg_df2: DataFrame = reg_df.merge(synchrony_df_monthly, left_on=['LPERMNO', const.YEAR], right_on=['PERMNO', 'year'], how='left')
reg_df2.drop(['PERMNO', 'year'], axis=1, inplace=True)
reg_df3: DataFrame = reg_df2.merge(synchrony_df_monthly, left_on=['LPERMCO', const.YEAR], right_on=['PERMNO', 'year'], how='left')

key_to_drop = ['PERMNO', 'year']
for key in ['SYNCHRONICITY', 'SYNCHRONICITY_MKT', 'SYNCHRONICITY_IND']:
    reg_df3[key] = reg_df3[f'{key}_x'].fillna(reg_df3[f'{key}_y'])
    key_to_drop.append(f'{key}_x')
    key_to_drop.append(f'{key}_y')

reg_df3.drop(key_to_drop, axis=1, inplace=True)

In [12]:
reg_df3[['SYNCHRONICITY', 'SYNCHRONICITY_MKT', 'SYNCHRONICITY_IND']].describe()

Unnamed: 0,SYNCHRONICITY,SYNCHRONICITY_MKT,SYNCHRONICITY_IND
count,11665.0,11722.0,11664.0
mean,1.425473,-0.1772,1.606128
std,1.54641,1.235163,1.453076
min,-3.04163,-7.19836,-4.871541
25%,0.559375,-0.918655,0.715606
50%,1.338421,-0.113104,1.327711
75%,2.158434,0.641813,2.178471
max,36.736801,5.661633,34.931621


In [11]:
reg_df3['SYNCHRONICITY_IND'] = reg_df3['SYNCHRONICITY'] - reg_df3['SYNCHRONICITY_MKT']
reg_df3['IDIOSYN'] = reg_df3['SYNCHRONICITY']  * -1
reg_df3['IDIOSYN_IND'] = reg_df3['SYNCHRONICITY_IND']  * -1
reg_df3['IDIOSYN_MKT'] = reg_df3['SYNCHRONICITY_MKT']  * -1

In [14]:
reg_df3.loc[reg_df3['IDIOSYN_IND'].isnull(), 'IDIOSYN'] = np.nan
reg_df3.loc[reg_df3['IDIOSYN_IND'].isnull(), 'IDIOSYN_MKT'] = np.nan

In [15]:
reg_df3[['IDIOSYN', 'IDIOSYN_MKT', 'IDIOSYN_IND']].describe()

Unnamed: 0,IDIOSYN,IDIOSYN_MKT,IDIOSYN_IND
count,11664.0,11664.0,11664.0
mean,-1.423013,0.183115,-1.606128
std,1.523484,1.227859,1.453076
min,-36.736801,-5.212837,-34.931621
25%,-2.158357,-0.632604,-2.178471
50%,-1.338381,0.114983,-1.327711
75%,-0.559374,0.920745,-0.715606
max,3.04163,7.19836,4.871541


In [17]:
reg_df3.to_stata(os.path.join(const.RESULT_PATH, '20250323_stock_act_idiosyn_v2.dta'), write_index=False, version=119)

In [35]:
# merge with sue 123 data
sue123_df['gvkey'] = sue123_df['gvkey'].astype(int)
sue123_df['fyearq'] = sue123_df['fyearq'].astype(int)
sue123_df.rename(columns={'fyearq': const.YEAR}, inplace=True)

reg_df2: DataFrame = reg_df.merge(sue123_df, on=[const.GVKEY, const.YEAR], how='left')

In [23]:
adf_df[adf_df[['ticker', 'fyear']].duplicated()]

Unnamed: 0,ticker,fpedats,anndats_act,meanest,numest,stdev,actual,prcc_f,fyear,ANALYSTS,lnANALYSTS,DISPERSION,FCSTERROR,fiscal_year
9,EXPD,2013-12-31,2014-02-25,1.96,1.0,,1.68,44.25,2013.0,1.0,0.693147,,0.006328,2013.0
10,JJSF,2010-03-31,2010-04-22,0.46,3.0,0.03,0.48,43.19,2009.0,3.0,1.386294,0.000695,0.000463,2009.0
11,NSM,2009-05-31,2009-06-11,-0.42,7.0,0.02,-0.28,,2008.0,7.0,2.079442,,,2008.0
25,NSM,2010-08-31,2010-09-09,0.21,14.0,0.03,0.36,,2009.0,14.0,2.708050,,,2009.0
27,WGL,2016-06-30,2016-08-03,0.11,4.0,0.19,0.33,57.67,2015.0,4.0,1.609438,0.003295,0.003815,2015.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155663,FOSL,2006-09-30,2006-11-14,0.34,9.0,0.02,0.32,21.51,2005.0,9.0,2.302585,0.000930,0.000930,2005.0
155664,LF,2010-03-31,2010-05-03,-0.29,5.0,0.04,-0.37,3.91,2009.0,5.0,1.791759,0.010230,0.020460,2009.0
155665,OXPS,2006-12-31,2007-01-31,0.95,4.0,0.06,1.15,22.69,2006.0,4.0,1.609438,0.002644,0.008814,2006.0
155666,STGN,2006-12-31,2007-03-06,0.40,2.0,0.03,0.25,7.44,2006.0,2.0,1.098612,0.004032,0.020161,2006.0


In [36]:
# merge adf data
# adf_df.rename(columns={'tic': 'ticker'}, inplace=True)
adf_df.loc[:, const.YEAR] = adf_df['fyear']
adf_valid: DataFrame = adf_df.loc[:, ['ticker', const.YEAR, 'ANALYSTS', 'lnANALYSTS', 'DISPERSION', 'FCSTERROR']]

reg_df3: DataFrame = reg_df2.merge(adf_valid, on=['ticker', const.YEAR], how='left')
reg_df3.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,QFM_f_annual,predicted_costcap,predicted_hurdle,annual_sue1,annual_sue2,annual_sue3,ANALYSTS,lnANALYSTS,DISPERSION,FCSTERROR
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,11630.0,8505.0,8505.0,6726.0,6726.0,6032.0,3948.0,3948.0,3778.0,3828.0
mean,6036.5,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,...,2.679648,0.092023,0.117034,0.00031,0.000311,-0.03255,11.545593,2.289246,552.9143,40.643852
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,-0.879652,0.04955,0.060928,-0.580058,-0.580058,-6.885936,1.0,0.693147,0.0,0.0
25%,3018.25,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,...,0.207093,0.083509,0.105448,0.0,0.0,-0.364511,5.0,1.791759,0.001275257,0.003183
50%,6036.5,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,...,0.434483,0.092779,0.118525,0.0,0.0,0.0,9.0,2.302585,0.003047777,0.008994
75%,9054.75,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,...,0.895383,0.101097,0.129915,0.0,0.0,0.3653,16.0,2.833213,0.009602861,0.029951
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,4931.937993,0.124678,0.167867,1.412761,1.412761,7.976155,56.0,4.043051,1899558.0,134482.758621
std,3485.60791,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,...,59.815111,0.012435,0.017691,0.023666,0.023675,0.970669,8.677086,0.719222,31028.71,2178.351333


In [37]:
# Define the columns to winsorize
columns_to_winsorize = ['DISPERSION', 'FCSTERROR']

# Winsorize each column in the dataset reg_df3 at the 1st and 99th percentiles
for column in columns_to_winsorize:
    non_na_data = reg_df3[column].dropna()
    winsorized_data = winsorize(non_na_data, limits=[0.01, 0.01])
    reg_df3.loc[non_na_data.index, column] = winsorized_data

# Display the updated dataframe
reg_df3.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,QFM_f_annual,predicted_costcap,predicted_hurdle,annual_sue1,annual_sue2,annual_sue3,ANALYSTS,lnANALYSTS,DISPERSION,FCSTERROR
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,11630.0,8505.0,8505.0,6726.0,6726.0,6032.0,3948.0,3948.0,3778.0,3828.0
mean,6036.5,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,...,2.679648,0.092023,0.117034,0.00031,0.000311,-0.03255,11.545593,2.289246,0.239984,0.357989
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,-0.879652,0.04955,0.060928,-0.580058,-0.580058,-6.885936,1.0,0.693147,0.0001,0.000244
25%,3018.25,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,...,0.207093,0.083509,0.105448,0.0,0.0,-0.364511,5.0,1.791759,0.001275,0.003183
50%,6036.5,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,...,0.434483,0.092779,0.118525,0.0,0.0,0.0,9.0,2.302585,0.003048,0.008994
75%,9054.75,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,...,0.895383,0.101097,0.129915,0.0,0.0,0.3653,16.0,2.833213,0.009603,0.029951
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,4931.937993,0.124678,0.167867,1.412761,1.412761,7.976155,56.0,4.043051,15.540258,20.199805
std,3485.60791,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,...,59.815111,0.012435,0.017691,0.023666,0.023675,0.970669,8.677086,0.719222,1.672322,2.191517


In [27]:
# merge sync data
synchrony_df.rename(columns={'TICKER': 'ticker', 'year': const.YEAR}, inplace=True)
crsp_comp_link: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, 'crsp_comp.pkl')).rename(columns={'permco': 'PERMNO'})
synchrony_df_gvkey = synchrony_df.merge(crsp_comp_link, on=['PERMNO'], how='left')
synchrony_df_gvkey['gvkey'] = pd.to_numeric(synchrony_df_gvkey['gvkey'], errors='coerce')

In [29]:
synchrony_df_gvkey.keys()

Index(['PERMNO', 'fiscal_year', 'SYNCHRONICITY', 'SYNCHRONICITY_MKT',
       'SYNCHRONICITY_IND', 'ticker', 'SYNCHRONICITY_MONTH',
       'SYNCHRONICITY_MKT_MONTH', 'SYNCHRONICITY_IND_MONTH', 'SYNCHRONICITY_D',
       'SYNCHRONICITY_MKT_D', 'SYNCHRONICITY_IND_D', 'gvkey'],
      dtype='object')

In [30]:
reg_df2: DataFrame = reg_df.drop(['SYNCHRONICITY', 'SYNCHRONICITY_MKT', 'SYNCHRONICITY_IND'], axis=1).merge(
    synchrony_df_gvkey.drop(['ticker', 'PERMNO'], axis=1).dropna(subset=['gvkey']), on=['gvkey', const.YEAR], how='left', suffixes=('', '_gvkey')).merge(
    synchrony_df_gvkey.drop(['gvkey', 'PERMNO'], axis=1).dropna(subset=['ticker']), on=['ticker', const.YEAR], how='left', suffixes=('', '_ticker')).merge(
    synchrony_df_gvkey.drop(['gvkey', 'ticker'], axis=1).dropna(subset=['PERMNO']), on=['PERMNO', const.YEAR], how='left', suffixes=('', '_PERMNO'))

key_to_drop = list()
for key in ['SYNCHRONICITY', 'SYNCHRONICITY_MKT', 'SYNCHRONICITY_IND', 'SYNCHRONICITY_MONTH','SYNCHRONICITY_MKT_MONTH', 
            'SYNCHRONICITY_IND_MONTH', 'SYNCHRONICITY_D', 'SYNCHRONICITY_MKT_D', 'SYNCHRONICITY_IND_D']:
    reg_df2[key] = reg_df2[key].fillna(reg_df2[f'{key}_ticker']).fillna(reg_df2[f'{key}_PERMNO'])
    key_to_drop.extend([f'{key}_ticker', f'{key}_PERMNO'])
    
reg_df2.drop(key_to_drop, axis=1, inplace=True)
reg_df2.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,numest,SYNCHRONICITY,SYNCHRONICITY_MKT,SYNCHRONICITY_IND,SYNCHRONICITY_MONTH,SYNCHRONICITY_MKT_MONTH,SYNCHRONICITY_IND_MONTH,SYNCHRONICITY_D,SYNCHRONICITY_MKT_D,SYNCHRONICITY_IND_D
count,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,...,7298.0,7702.0,7716.0,7716.0,7576.0,7629.0,7629.0,7735.0,7737.0,7737.0
mean,5917.137085,78643.466183,2011.542578,0.350485,4.630258,0.836249,0.147818,0.508168,6.555644,0.180124,...,102.521102,-0.646085,-1.035348,-2.472007,1.479553,-0.07786,-0.460027,-0.983703,-1.24903,-1.325857
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,1.0,-4.990253,-7.275702,-10.594919,-2.934787,-7.19836,-7.505549,-5.942492,-7.276499,-8.105904
25%,3026.0,12215.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.081295,0.000892,...,38.0,-1.263271,-1.6964,-3.115373,0.64425,-0.774217,-1.211535,-1.721788,-1.909017,-2.1122
50%,5773.0,61587.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.608994,0.148389,...,80.0,-0.614289,-0.938245,-2.354456,1.387043,-0.015227,-0.387299,-0.837935,-0.981284,-1.116047
75%,8931.5,150699.0,2014.0,1.0,6.0,1.94591,0.0,1.0,7.969589,0.26784,...,151.0,0.028338,-0.211519,-1.75094,2.228142,0.742137,0.386116,-0.078576,-0.316189,-0.301435
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,571.0,6.829389,3.986925,3.496632,36.043653,10.738283,15.020359,2.880722,1.724813,2.586106
std,3461.817696,73399.399565,2.280434,0.477141,8.419172,1.23213,0.354933,0.499953,2.13188,0.269232,...,83.724776,0.963188,1.157417,1.095382,1.581823,1.228575,1.285444,1.24898,1.32681,1.427007


In [14]:
reg_df2.keys()

Index(['index', 'gvkey', 'fiscal_year', 'mf_indicator', 'frequency',
       'log_frequency', 'MajorGovCustomer', 'post', 'log_market_value', 'lev',
       ...
       'DISPERSION', 'FCSTERROR', 'SYNCHRONICITY', 'SYNCHRONICITY_MKT',
       'SYNCHRONICITY_IND', 'PERMNO', 'cpie_gpin', 'cpie_owr', 'coverage',
       'numest'],
      dtype='object', length=191)

In [31]:
columns_to_winsorize = ['DISPERSION', 'FCSTERROR', 'SYNCHRONICITY', 'SYNCHRONICITY_MKT',
       'SYNCHRONICITY_IND', 'cpie_gpin', 'cpie_owr', 'coverage',
       'numest', 'SYNCHRONICITY_MONTH','SYNCHRONICITY_MKT_MONTH', 'SYNCHRONICITY_IND_MONTH',
                        'SYNCHRONICITY_D', 'SYNCHRONICITY_MKT_D', 'SYNCHRONICITY_IND_D']
reg_df3: DataFrame = reg_df2.drop_duplicates([const.GVKEY, const.YEAR], keep='first')
# Winsorize each column in the dataset reg_df3 at the 1st and 99th percentiles
for column in columns_to_winsorize:
    non_na_data = reg_df3[column].dropna()
    winsorized_data = winsorize(non_na_data, limits=[0.01, 0.01])
    reg_df3.loc[non_na_data.index, column] = winsorized_data

# Display the updated dataframe
reg_df3.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,numest,SYNCHRONICITY,SYNCHRONICITY_MKT,SYNCHRONICITY_IND,SYNCHRONICITY_MONTH,SYNCHRONICITY_MKT_MONTH,SYNCHRONICITY_IND_MONTH,SYNCHRONICITY_D,SYNCHRONICITY_MKT_D,SYNCHRONICITY_IND_D
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,6706.0,7109.0,7122.0,7122.0,6992.0,7037.0,7037.0,7138.0,7140.0,7140.0
mean,6036.5,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,...,98.629884,-0.670551,-1.064322,-2.490808,1.435795,-0.108291,-0.464465,-1.015247,-1.293022,-1.361813
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,3.0,-3.101548,-4.330298,-5.564165,-1.286456,-3.487743,-3.837932,-4.285619,-5.212167,-5.379391
25%,3018.25,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,...,36.0,-1.308171,-1.765061,-3.146918,0.615697,-0.839259,-1.20962,-1.799696,-1.999417,-2.196078
50%,6036.5,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,...,72.0,-0.634087,-0.961595,-2.378148,1.386042,-0.036902,-0.417715,-0.863187,-1.057522,-1.174833
75%,9054.75,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,...,137.75,0.006029,-0.238802,-1.750827,2.193494,0.714125,0.358543,-0.100233,-0.339558,-0.325174
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,374.0,1.319524,1.131258,-0.237203,4.726006,2.596473,2.385805,1.394683,0.957713,1.317516
std,3485.60791,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,...,83.42929,0.945929,1.139099,1.069094,1.200927,1.202594,1.213881,1.260805,1.329277,1.436088


In [32]:
reg_df3.to_stata(os.path.join(const.RESULT_PATH, '20241010_stock_act_reg_data_v3.dta'), write_index=False, version=117)

In [85]:
import zipfile

# Define the path to the zip file
zip_file_path = os.path.join(const.DATA_PATH, 'cpie_data.zip')
csv_file_name = 'cpie_daily.csv'

# Open the zip file and read the CSV file
with zipfile.ZipFile(zip_file_path, 'r') as z:
    # Extract the CSV file from the zip and load it into a pandas DataFrame
    with z.open(csv_file_name) as f:
        df = pd.read_csv(f, dtype={'date': str})

cpin_daily = df.loc[:, ['permno', 'year', 'cpie_gpin', 'cpie_owr', 'date']].copy()

In [33]:
cpin_daily_1316 = pd.read_pickle(os.path.join(const.TEMP_PATH, '2013_2016_gpin_owr.pkl')).loc[:, ['date', 'permno', 'year', 'gpin', 'owr']].rename(
    columns={'gpin': 'cpie_gpin', 'owr': 'cpie_owr'})

cpin_daily_full = pd.concat([cpin_daily, cpin_daily_1316], axis=0, ignore_index=True)

cpin_daily_full['date'] = pd.to_datetime(cpin_daily_full['date'])

# Display the first few rows of the DataFrame to ensure it loaded correctly
print(cpin_daily_full.head())

   permno  year  cpie_gpin  cpie_owr       date
0   10057  1993   0.010702  0.848057 1993-01-04
1   10057  1993   0.070915  0.875569 1993-01-05
2   10057  1993   0.013788  0.911562 1993-01-06
3   10057  1993   0.432916  0.815852 1993-01-07
4   10057  1993   0.002560  0.724871 1993-01-08


In [108]:
cpin_daily_full.shape

(9126580, 6)

In [110]:
cpin_daily_full.drop_duplicates(subset=['permno', 'date'], keep='first').shape

(8538019, 6)

In [111]:
cpin_daily_1319 = pd.read_pickle(os.path.join(const.TEMP_PATH, '2013_2019_gpin.pkl')).loc[:, ['date', 'permno', 'year', 'gpin']].rename(
    columns={'gpin': 'cpie_gpin'})

cpin_daily['date'] = pd.to_datetime(cpin_daily['date'], format='%Y%m%d')
cpin_daily_full = pd.concat([cpin_daily, cpin_daily_1319], axis=0, ignore_index=True).drop_duplicates(subset=['permno', 'date'], keep='first')

# cpin_daily_full['date'] = pd.to_datetime(cpin_daily_full['date'])
#
# # Display the first few rows of the DataFrame to ensure it loaded correctly
# print(cpin_daily_full.head())

In [113]:
cpin_daily_full.to_csv(os.path.join(const.RESULT_PATH, '1993_2019_cpie_daily.csv'), index=False)

In [83]:
cpin_daily_full

Unnamed: 0,permno,year,cpie_gpin,date
0,10057,1993,1.070173e-02,1970-01-01 00:00:00.019930104
1,10057,1993,7.091512e-02,1970-01-01 00:00:00.019930105
2,10057,1993,1.378791e-02,1970-01-01 00:00:00.019930106
3,10057,1993,4.329160e-01,1970-01-01 00:00:00.019930107
4,10057,1993,2.560344e-03,1970-01-01 00:00:00.019930108
...,...,...,...,...
9126575,13788,2019,9.869342e-01,2019-06-25 00:00:00.000000000
9126576,90715,2019,3.318165e-18,2019-06-25 00:00:00.000000000
9126577,17382,2019,1.307089e-13,2019-06-25 00:00:00.000000000
9126578,92902,2019,1.000000e+00,2019-06-25 00:00:00.000000000


In [89]:
cpin_daily_full['fqtr'] = cpin_daily_full['date'].dt.to_period('Q')

In [90]:
annual_gpin_owr = cpin_daily_full.groupby(['permno', 'year']).agg({
    'cpie_gpin': 'mean',
    # 'cpie_owr': 'mean',
}).reset_index(drop=False)
annual_gpin_owr.rename(columns={'year': const.YEAR}, inplace=True)

In [91]:
quarter_gpin_owr = cpin_daily_full.groupby(['permno', 'fqtr']).agg({
    'cpie_gpin': 'mean',
    # 'cpie_owr': 'mean',
}).reset_index(drop=False)

In [51]:
hw_linkage_file = pd.read_csv(os.path.join(const.DATA_PATH, 'HWBanksforGPIN.csv'))
hw_annual_gpin = hw_linkage_file.merge(annual_gpin_owr, right_on=['permno'], left_on=['permco'], how='left')
hw_qtr_gpin = hw_linkage_file.merge(quarter_gpin_owr, right_on=['permno'], left_on=['permco'], how='left')

In [54]:
hw_annual_gpin.drop_duplicates(subset=['permco', const.YEAR], keep='first')

Unnamed: 0,rssd,name,permco,gvkey,linkprim,permno,fiscal_year,cpie_gpin
0,1022764,CENTRAL PACIFIC FINANCIAL CORP.,9449,16705,C,,,
1,1023239,"MERCHANTS BANCSHARES, INC.",7724,16929,P,,,
2,1025309,BANK OF HAWAII CORPORATION,589,2005,C,,,
3,1025541,WESTAMERICA BANCORPORATION,2253,14253,P,,,
4,1025608,"FIRST HAWAIIAN, INC.",55670,27665,P,,,
...,...,...,...,...,...,...,...,...
1065,5423510,BOGOTA FINANCIAL CORP.,56998,35646,P,,,
1066,5587775,"CATALYST BANCORP, INC.",58840,39376,P,,,
1067,5651452,CFSB BANCORP INC,59122,39622,P,,,
1068,5653018,PONCE FINANCIAL GROUP INC,56103,32253,P,,,


In [99]:
crsp_comp_link: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, 'crsp_comp.pkl'))
annual_gpin_owr_link = annual_gpin_owr.merge(crsp_comp_link, left_on='permno', right_on='permco', how='left')

In [101]:
annual_gpin_owr_link['gvkey'] = pd.to_numeric(annual_gpin_owr_link['gvkey'], errors='coerce')

In [96]:
quarter_gpin_owr_link = quarter_gpin_owr.merge(crsp_comp_link, left_on='permno', right_on='permco', how='left')
quarter_gpin_owr_link['gvkey'] = pd.to_numeric(quarter_gpin_owr_link['gvkey'], errors='coerce')
quarter_gpin_owr_link['permno'] = pd.to_numeric(quarter_gpin_owr_link['permno'], errors='coerce')

# quarter_gpin_owr_link.to_pickle(os.path.join(const.TEMP_PATH, 'cpin_gpin_owr_quarterly.pkl'))

In [74]:
quarter_gpin_owr_link.loc[quarter_gpin_owr_link['fqtr'].dt.year > 2000, 'fqtr'].dt.year.describe()

count    55434.000000
mean      2015.728271
std          1.927117
min       2013.000000
25%       2014.000000
50%       2016.000000
75%       2017.000000
max       2019.000000
Name: fqtr, dtype: float64

In [104]:
hw_ann_gpin.head()

Unnamed: 0,rssd,name,permco,gvkey,linkprim,fiscal_year,cpie_gpin,fiscal_year_1,cpie_gpin_1,permno,fiscal_year_2,cpie_gpin_2
0,1022764,CENTRAL PACIFIC FINANCIAL CORP.,9449,16705,C,,,,,,,
1,1023239,"MERCHANTS BANCSHARES, INC.",7724,16929,P,,,,,,,
2,1025309,BANK OF HAWAII CORPORATION,589,2005,C,,,,,,,
3,1025541,WESTAMERICA BANCORPORATION,2253,14253,P,,,,,,,
4,1025608,"FIRST HAWAIIAN, INC.",55670,27665,P,,,,,,,


In [107]:
hw_ann_gpin = hw_linkage_file.merge(annual_gpin_owr_link.drop(['permno', 'permco'], axis=1).dropna(how='any'), right_on=['gvkey'], left_on=['gvkey'], how='left').merge(
    annual_gpin_owr_link.drop(['permno', 'gvkey'], axis=1).dropna(how='any'), right_on=['permco'], left_on=['permco'], how='left', suffixes=('', '_1')).merge(
    annual_gpin_owr_link.drop(['permco', 'gvkey'], axis=1).dropna(how='any'), right_on=['permno'], left_on=['permco'], how='left', suffixes=('', '_2'))

key_to_drop = list()
for key in [const.YEAR, 'cpie_gpin']:
    hw_ann_gpin[key] = hw_ann_gpin[key].fillna(hw_ann_gpin[f'{key}_1']).fillna(hw_ann_gpin[f'{key}_2'])
    key_to_drop.extend([f'{key}_1', f'{key}_2'])

hw_ann_gpin.drop(key_to_drop, axis=1, inplace=True)

hw_ann_gpin_valid = hw_ann_gpin.drop_duplicates(subset=['permco', const.YEAR], keep='first').drop(['permno'], axis=1)

hw_ann_gpin_valid.to_csv(os.path.join(const.RESULT_PATH, '2010_2019_HW_ANN_GPIN.csv'), index=False)


In [97]:
hw_qtr_gpin = hw_linkage_file.merge(quarter_gpin_owr_link.drop(['permno', 'permco'], axis=1).dropna(how='any'), right_on=['gvkey'], left_on=['gvkey'], how='left').merge(
    quarter_gpin_owr_link.drop(['permno', 'gvkey'], axis=1).dropna(how='any'), right_on=['permco'], left_on=['permco'], how='left', suffixes=('', '_1')).merge(
    quarter_gpin_owr_link.drop(['permco', 'gvkey'], axis=1).dropna(how='any'), right_on=['permno'], left_on=['permco'], how='left', suffixes=('', '_2'))

key_to_drop = list()
for key in ['fqtr', 'cpie_gpin']:
    hw_qtr_gpin[key] = hw_qtr_gpin[key].fillna(hw_qtr_gpin[f'{key}_1']).fillna(hw_qtr_gpin[f'{key}_2'])
    key_to_drop.extend([f'{key}_1', f'{key}_2'])

hw_qtr_gpin.drop(key_to_drop, axis=1, inplace=True)

hw_qtr_gpin_valid = hw_qtr_gpin.drop_duplicates(subset=['permco', 'fqtr'], keep='first')
hw_qtr_gpin_valid[const.YEAR] = hw_qtr_gpin_valid['fqtr'].dt.year
hw_qtr_gpin_valid[const.YEAR] = hw_qtr_gpin_valid[const.YEAR].replace(-1, np.nan)

hw_qtr_gpin_valid.loc[hw_qtr_gpin_valid[const.YEAR] == 1970, 'fqtr'] = np.nan
hw_qtr_gpin_valid.loc[hw_qtr_gpin_valid[const.YEAR] == 1970, 'cpie_gpin'] = np.nan
hw_qtr_gpin_valid2 = hw_qtr_gpin_valid.drop(['permno', const.YEAR], axis=1).drop_duplicates(subset=['permco', 'fqtr'], keep='first')
hw_qtr_gpin_valid2.to_csv(os.path.join(const.RESULT_PATH, '2010_2019_HW_QTR_GPIN.csv'), index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hw_qtr_gpin_valid[const.YEAR] = hw_qtr_gpin_valid['fqtr'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hw_qtr_gpin_valid[const.YEAR] = hw_qtr_gpin_valid[const.YEAR].replace(-1, np.nan)


In [98]:
hw_qtr_gpin['fqtr'].min()

Period('1993Q1', 'Q-DEC')

In [46]:
quarter_gpin_owr_link.loc[quarter_gpin_owr_link['permno'] == 11691]

Unnamed: 0,permno,fqtr,cpie_gpin,permco,gvkey
1041,11691,1970Q1,0.516297,11691.0,18358.0
1042,11691,2013Q1,0.227801,11691.0,18358.0
1043,11691,2013Q2,0.251667,11691.0,18358.0
1044,11691,2013Q3,0.245589,11691.0,18358.0
1045,11691,2013Q4,0.418306,11691.0,18358.0
1046,11691,2014Q1,0.536361,11691.0,18358.0
1047,11691,2014Q2,0.698007,11691.0,18358.0
1048,11691,2014Q3,0.581169,11691.0,18358.0
1049,11691,2014Q4,0.713639,11691.0,18358.0
1050,11691,2015Q2,0.462525,11691.0,18358.0


In [58]:
key_to_drop = list()
for key in ['fqtr', 'cpie_gpin']:
    hw_qtr_gpin[key] = hw_qtr_gpin[key].fillna(hw_qtr_gpin[f'{key}_1']).fillna(hw_qtr_gpin[f'{key}_2'])
    key_to_drop.extend([f'{key}_1', f'{key}_2'])

hw_qtr_gpin.drop(key_to_drop, axis=1, inplace=True)

In [64]:
hw_qtr_gpin_valid = hw_qtr_gpin.drop_duplicates(subset=['permco', 'fqtr'], keep='first')
hw_qtr_gpin_valid[const.YEAR] = hw_qtr_gpin_valid['fqtr'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hw_qtr_gpin_valid[const.YEAR] = hw_qtr_gpin_valid['fqtr'].dt.year


In [66]:
hw_qtr_gpin_valid[const.YEAR] = hw_qtr_gpin_valid[const.YEAR].replace(-1, np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hw_qtr_gpin_valid[const.YEAR] = hw_qtr_gpin_valid[const.YEAR].replace(-1, np.nan)


In [71]:
hw_qtr_gpin_valid.loc[hw_qtr_gpin_valid[const.YEAR] == 1970, 'fqtr'] = np.nan
hw_qtr_gpin_valid.loc[hw_qtr_gpin_valid[const.YEAR] == 1970, 'cpie_gpin'] = np.nan
hw_qtr_gpin_valid2 = hw_qtr_gpin_valid.drop(['permno', const.YEAR], axis=1).drop_duplicates(subset=['permco', 'fqtr'], keep='first')
hw_qtr_gpin_valid2.to_csv(os.path.join(const.RESULT_PATH, '2010_2016_HW_QTR_GPIN.csv'), index=False)

In [25]:
reg_df4: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241011_stock_act_reg_data_v2.dta'))

reg_df5: DataFrame = reg_df4.merge(
    annual_gpin_owr_link.drop(['permno', 'permco'], axis=1).dropna(subset=['gvkey']), on=[const.GVKEY, const.YEAR], 
    how='left', suffixes=("", "_gvkey")).merge(
    annual_gpin_owr_link.drop(['gvkey', 'permco'], axis=1).dropna(subset=['permno']), left_on=['PERMNO', const.YEAR], 
    right_on=['permno', const.YEAR], how='left', suffixes=('', '_permno')).merge(
    annual_gpin_owr_link.drop(['gvkey', 'permno'], axis=1), left_on=['PERMNO', const.YEAR], 
    right_on=['permco', const.YEAR], how='left', suffixes=('', '_permco'))

key_to_drop = ['permco', 'permno']
for key in ['cpie_gpin', 'cpie_owr']:
    reg_df5[key] = reg_df5[key].fillna(reg_df5[f'{key}_gvkey']).fillna(reg_df5[f'{key}_permno']).fillna(reg_df5[f'{key}_permco'])
    key_to_drop.append(f'{key}_permno')
    key_to_drop.append(f'{key}_permco')
    key_to_drop.append(f'{key}_gvkey')

reg_df5.drop(key_to_drop, axis=1, inplace=True)

In [28]:
reg_df5.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,SYNCHRONICITY_MKT_D,SYNCHRONICITY_IND_D,numest,numest_last,DISPERSION,DISPERSION_last,FCSTERROR,FCSTERROR_last,permno,permco
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,7140.0,7140.0,6637.0,6637.0,6219.0,6219.0,6570.0,6570.0,2754.0,256.0
mean,6036.5,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,...,-1.293022,-1.361813,8.781616,8.777158,0.084595,0.057665,0.146156,0.118434,61329.095861,36007.84375
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,-5.212167,-5.379391,1.0,1.0,9.4e-05,0.0,3.5e-05,0.0,10104.0,10104.0
25%,3018.25,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,...,-1.999417,-2.196078,3.166667,3.0,0.001137,0.000652,0.00168,0.00092,30940.0,15684.75
50%,6036.5,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,...,-1.057522,-1.174833,6.333333,6.0,0.002691,0.001626,0.005238,0.003289,75735.5,41080.0
75%,9054.75,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,...,-0.339558,-0.325174,12.583333,13.0,0.007994,0.005217,0.018693,0.01274,86799.0,54704.0
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,0.957713,1.317516,54.666667,54.0,4.209722,3.08,7.08739,5.981308,93436.0,59619.0
std,3485.60791,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,...,1.329277,1.436088,7.529601,7.763877,0.475061,0.341575,0.792531,0.676268,28727.498355,19106.167438


In [27]:
reg_df5.drop_duplicates(subset=[const.GVKEY, const.YEAR], keep='first', inplace=True)

In [29]:
reg_df5.to_stata(os.path.join(const.RESULT_PATH, '20241014_stock_act_reg_data_v1.dta'),
                 write_index=False, version=117)

In [21]:
[i for i in reg_df5.keys() if i.startswith('ticker')]

['ticker']

In [20]:
annual_analyst_df: DataFrame = pd.read_csv(os.path.join(const.TEMP_PATH, '2007_2016_annual_analyst_coverage.csv')).rename(columns={'year': const.YEAR})
df_df: DataFrame = pd.read_csv(os.path.join(const.TEMP_PATH, '2007_2016_annual_dispersion_fcsterror.csv'), 
                               usecols=['ticker', 'year', 'numest', 'permno', 'DISPERSION', 'FCSTERROR', 'numest_last'])
df_df_gvkey: DataFrame = df_df.merge(crsp_comp_link, left_on='permno', right_on='permco', how='left').rename(columns={'year': const.YEAR})
df_df_gvkey['gvkey'] = pd.to_numeric(df_df_gvkey['gvkey'], errors='coerce')

In [24]:
df_df_gvkey.keys()

Index(['ticker', 'fiscal_year', 'numest', 'permno', 'DISPERSION', 'FCSTERROR',
       'permco', 'gvkey'],
      dtype='object')

In [29]:
reg_df6: DataFrame = reg_df5.merge(annual_analyst_df.drop(['ticker', 'permno'], axis=1).dropna(subset=[const.GVKEY]), on=[const.GVKEY, const.YEAR], how='left').merge(
    annual_analyst_df.drop(['gvkey', 'ticker'], axis=1).dropna(subset=['permno']), left_on=['PERMNO', const.YEAR], right_on=['permno', const.YEAR], how='left', suffixes=('', '_permno')).merge(annual_analyst_df.drop(['gvkey', 'permno'], axis=1).dropna(subset=['ticker']), left_on=['ticker', const.YEAR], right_on=['ticker', const.YEAR], how='left', suffixes=('', '_ticker'))

key_to_drop = list()
for key in ['coverage']:
    reg_df6[key] = reg_df6[key].fillna(reg_df6[f'{key}_permno']).fillna(reg_df6[f'{key}_ticker'])
    key_to_drop.append(f'{key}_permno')
    key_to_drop.append(f'{key}_ticker')


In [32]:
reg_df6.drop(['permno', 'permco'], axis=1, inplace=True)

In [41]:
reg_df7: DataFrame = reg_df6.merge(df_df_gvkey.drop(['ticker', 'permno', 'permco'], axis=1).dropna(subset=[const.GVKEY]), on=[const.GVKEY, const.YEAR], how='left', suffixes=("", "_gvkey")).merge(
    df_df_gvkey.drop(['gvkey', 'ticker', 'permco'], axis=1).dropna(subset=['permno']), left_on=['PERMNO', const.YEAR], right_on=['permno', const.YEAR], how='left', suffixes=('', '_permno')).merge(
    df_df_gvkey.drop(['gvkey', 'permno', 'permco'], axis=1).dropna(subset=['ticker']), left_on=['ticker', const.YEAR], right_on=['ticker', const.YEAR], how='left', suffixes=('', '_ticker')).merge(
    df_df_gvkey.drop(['gvkey', 'permno', 'ticker'], axis=1).dropna(subset=['permco']), left_on=['PERMNO', const.YEAR], right_on=['permco', const.YEAR], how='left', suffixes=('', '_permco'))

for key in ['numest', 'DISPERSION', 'FCSTERROR']:
    if key != 'numest':
        reg_df7[key] = reg_df7[f'{key}_gvkey'].fillna(reg_df7[key])
        key_to_drop.append(f'{key}_gvkey')
    reg_df7[key] = reg_df7[key].fillna(reg_df7[f'{key}_permno']).fillna(reg_df7[f'{key}_ticker']).fillna(reg_df7[f'{key}_permco'])
    key_to_drop.append(f'{key}_permno')
    key_to_drop.append(f'{key}_ticker')
    key_to_drop.append(f'{key}_permco')

reg_df7['PERMNO'] = reg_df7['PERMNO'].fillna(reg_df7['permco']).fillna(reg_df7['permno'])
key_to_drop.append('permco')
key_to_drop.append('permno')
key_to_drop.append('permno_permno')
reg_df7.drop(key_to_drop, axis=1, inplace=True)

In [42]:
reg_df7.to_stata(os.path.join(const.RESULT_PATH, '20241009_stock_act_reg_data_v2.dta'), write_index=False, version=117)

In [7]:
# Merge the regression data with FPI = 1 IBES coverage data
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241010_stock_act_reg_data_v3.dta')).drop(
    ['DISPERSION', 'FCSTERROR', 'numest'], axis=1)
crsp_comp_link: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, 'crsp_comp.pkl'))

df_df: DataFrame = pd.read_csv(os.path.join(const.TEMP_PATH, '2007_2016_annual_dispersion_fcsterror.csv'), 
                               usecols=['ticker', 'year', 'numest', 'permno', 'DISPERSION', 'FCSTERROR', 'numest_last', 'DISPERSION_last', 'FCSTERROR_last'])
df_df_gvkey: DataFrame = df_df.merge(crsp_comp_link, left_on='permno', right_on='permco', how='left').rename(columns={'year': const.YEAR})
df_df_gvkey['gvkey'] = pd.to_numeric(df_df_gvkey['gvkey'], errors='coerce')

reg_df2: DataFrame = reg_df.merge(df_df_gvkey.drop(['ticker', 'permno', 'permco'], axis=1).dropna(subset=[const.GVKEY]), on=[const.GVKEY, const.YEAR], how='left').merge(
    df_df_gvkey.drop(['gvkey', 'ticker', 'permco'], axis=1).dropna(subset=['permno']), left_on=['PERMNO', const.YEAR], right_on=['permno', const.YEAR], how='left', suffixes=('', '_permno')).merge(
    df_df_gvkey.drop(['gvkey', 'permno', 'permco'], axis=1).dropna(subset=['ticker']), left_on=['ticker', const.YEAR], right_on=['ticker', const.YEAR], how='left', suffixes=('', '_ticker')).merge(
    df_df_gvkey.drop(['gvkey', 'permno', 'ticker'], axis=1).dropna(subset=['permco']), left_on=['PERMNO', const.YEAR], right_on=['permco', const.YEAR], how='left', suffixes=('', '_permco'))

key_to_drop = list()
for key in ['numest', 'DISPERSION', 'FCSTERROR', 'numest_last', 'DISPERSION_last', 'FCSTERROR_last']:
    reg_df2[key] = reg_df2[key].fillna(reg_df2[f'{key}_permno']).fillna(reg_df2[f'{key}_ticker']).fillna(reg_df2[f'{key}_permco'])
    key_to_drop.append(f'{key}_permno')
    key_to_drop.append(f'{key}_ticker')
    key_to_drop.append(f'{key}_permco')
    
reg_df2.drop(key_to_drop, axis=1, inplace=True)
reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20241010_stock_act_reg_data_v4.dta'), write_index=False, version=117)

In [2]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241010_stock_act_reg_data_v4.dta'))
reg_df.drop_duplicates([const.GVKEY, const.YEAR], keep='first').to_stata(os.path.join(const.RESULT_PATH, '20241010_stock_act_reg_data_v5.dta'), write_index=False, version=117)

# Merge CCC data

In [18]:
# sort original lee's data
lee_merged_df = DataFrame()
for file_name in ['erp_public_240107', 'erp_public_annual_240107']:
    lee_df: DataFrame = pd.read_csv(
        os.path.join(const.DATABASE_PATH, 'Cost of Capital', f'{file_name}.zip')).drop(['permno'], axis=1)
    lee_df['yearmonth'] = pd.to_datetime(lee_df['yearmonth'], format='%Y%m')
    lee_df[const.YEAR] = lee_df['yearmonth'].dt.year
    lee_df = lee_df[lee_df[const.YEAR] > 2000].copy()

    lee_df_year_mean = lee_df.drop(['yearmonth'], axis=1).groupby([const.GVKEY, const.YEAR]).mean().reset_index(
        drop=False)
    lee_df[const.YEAR] = lee_df['yearmonth'].apply(lambda x: int(x.year) if x.month > 4 else x.year - 1)
    lee_df_fyear_mean = lee_df.drop(['yearmonth'], axis=1).groupby([const.GVKEY, const.YEAR]).mean().reset_index(
        drop=False)
    lee_annual_df: DataFrame = lee_df_year_mean.merge(lee_df_fyear_mean, on=[const.GVKEY, const.YEAR], how='outer',
                                                      suffixes=('', '_f'))
    if lee_merged_df.empty:
        lee_merged_df = lee_annual_df.copy()
    else:
        lee_merged_df: DataFrame = lee_merged_df.merge(lee_annual_df, on=[const.GVKEY, const.YEAR], how='outer',
                                                       suffixes=('_month', '_annual'))

In [19]:
lee_merged_df.describe()

Unnamed: 0,gvkey,fiscal_year,CCC_month,ICCA_month,FIC_month,FBM_month,GLS_mech_month,OJM_mech_month,CAT_mech_month,PEG_mech_month,...,PEG_mech_f_annual,GLS_an_f_annual,OJM_an_f_annual,CAT_an_f_annual,PEG_an_f_annual,JLR_f_annual,LPV_f_annual,CER_f_annual,FF6_f_annual,QFM_f_annual
count,146423.0,146423.0,80950.0,53104.0,23842.0,127591.0,80092.0,65052.0,79653.0,80805.0,...,89628.0,55774.0,54993.0,55744.0,55757.0,119022.0,108042.0,119271.0,133747.0,117752.0
mean,70256.839028,2010.825506,0.005081,0.008331,0.004809,0.010305,0.0076,0.006667,-0.006975,0.010284,...,0.141053,0.096521,0.136138,0.087709,0.111895,0.117365,0.133779,0.124432,63554380.0,196456800.0
std,68057.655043,7.063588,0.008927,0.003674,0.007222,0.044137,0.004996,0.008338,0.019543,0.010415,...,0.152245,0.040238,0.088439,0.055094,0.076666,0.059251,0.057237,0.05431,23185120000.0,67410110000.0
min,1004.0,2000.0,-0.043607,-0.010509,-0.020493,-0.609848,-0.069955,-0.214025,-0.166263,0.0,...,0.0,-0.32697,-0.065804,-0.816707,0.0,-0.307594,-1.237563,-0.722296,-1.0,-1.0
25%,15343.0,2005.0,-0.000745,0.006263,0.000212,-0.004359,0.0049,0.002731,-0.01799,0.002002,...,0.032374,0.074909,0.088802,0.062401,0.072868,0.075944,0.099196,0.087408,0.1371167,0.1272966
50%,31549.0,2010.0,0.00338,0.007543,0.003985,0.010161,0.006729,0.004443,-0.006269,0.007797,...,0.099822,0.091855,0.111518,0.079282,0.095282,0.112574,0.128965,0.11815,0.3834588,0.3496569
75%,133515.5,2017.0,0.009011,0.009404,0.008334,0.025481,0.009163,0.007462,0.004857,0.014921,...,0.195093,0.111183,0.154139,0.099235,0.131071,0.154653,0.164866,0.158329,0.9538336,0.8657716
max,353444.0,2023.0,0.059215,0.057044,0.034654,5.24666,0.059265,0.059353,0.0591,0.059393,...,0.995763,0.954749,0.997459,0.977912,0.994184,0.452321,0.718949,0.452321,8479124000000.0,23131790000000.0


In [30]:
# append coc data to regression data
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20250226_stock_act_data_v1.dta'))
# drop_key = [i for i in lee_merged_df.keys() if i not in [const.GVKEY, const.YEAR] and i in reg_df.keys()]
# reg_df.drop(drop_key, axis=1, inplace=True)
# reg_df = reg_df.merge(lee_merged_df, on=[const.GVKEY, const.YEAR], how='left')

for lag_year in range(1, 5):
    tmp_df: DataFrame = lee_merged_df.copy()
    tmp_df.loc[:, const.YEAR] -= lag_year
    reg_df = reg_df.merge(tmp_df, on=[const.GVKEY, const.YEAR], how='left', suffixes=('', f'_{lag_year}'))

reg_df.to_stata(os.path.join(const.RESULT_PATH, '20250226_stock_act_data_v2.dta'), write_index=False,
                version=119)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20250226_stock_act_data_v1.dta'))


In [26]:
tmp_df.loc[: const.YEAR] -= lag_year

4

In [27]:
tmp_df: DataFrame = lee_merged_df.copy()

In [28]:
tmp_df[tmp_df[const.GVKEY] == 1004]


Unnamed: 0,gvkey,fiscal_year,CCC_month,ICCA_month,FIC_month,FBM_month,GLS_mech_month,OJM_mech_month,CAT_mech_month,PEG_mech_month,...,PEG_mech_f_annual,GLS_an_f_annual,OJM_an_f_annual,CAT_an_f_annual,PEG_an_f_annual,JLR_f_annual,LPV_f_annual,CER_f_annual,FF6_f_annual,QFM_f_annual
0,1004,2000,,,,,,,,,...,0.0,0.114858,0.195181,0.10013,0.179299,0.169598,0.191878,0.180738,0.708568,0.081715
1,1004,2001,0.006436,0.011225,,0.010167,0.008862,0.005951,0.007287,0.003421,...,0.068267,0.114735,0.197855,0.085209,0.180122,0.17793,0.19326,0.185595,0.79157,0.763279
2,1004,2002,0.008524,0.011162,,-0.031971,0.007793,0.006363,0.005246,0.012794,...,0.225126,0.119747,0.220773,0.073214,0.214893,0.170685,0.191952,0.181318,2.055469,0.773539
3,1004,2003,0.009196,0.012236,,0.050966,0.00757,,0.004362,0.01524,...,0.162827,0.111222,0.166229,0.102591,0.160515,0.145563,0.160814,0.153189,2.01963,1.41938
4,1004,2004,0.003473,0.010255,,-0.001592,0.006466,0.001468,-0.003577,0.007665,...,0.069361,0.103224,0.147176,0.087442,0.141912,0.1639,0.181857,0.172878,0.282041,0.078498
5,1004,2005,0.001524,0.0083,0.004458,-0.00618,0.006158,0.001924,-0.007439,0.004807,...,0.055393,0.087962,0.131618,0.073541,0.126502,0.13838,0.167229,0.152804,1.079943,1.269554
6,1004,2006,-0.001248,0.009017,0.001741,0.032023,0.004832,0.001564,-0.017075,0.003962,...,0.049211,0.09289,0.139927,0.088229,0.134127,0.127504,0.152262,0.139883,0.389782,0.400409
7,1004,2007,-0.000621,0.008367,0.001779,-0.001763,0.005732,0.003205,-0.012482,0.0,...,0.0,0.094287,0.130329,0.085245,0.126522,0.116169,0.144663,0.130416,0.082771,0.110006
8,1004,2008,0.003496,0.010368,,-0.050908,0.007866,0.005965,-0.000258,0.0,...,0.0,0.130842,0.12762,0.128846,0.127009,0.125067,0.146771,0.135919,0.1538,0.663297
9,1004,2009,0.005231,0.008597,,0.039472,0.008751,0.006907,0.005005,0.0,...,0.0,0.114237,0.12038,0.097346,0.120551,0.070852,0.110993,0.090922,1.651102,1.038756


In [25]:
lee_merged_df[lee_merged_df[const.GVKEY] == 1004]

Unnamed: 0,gvkey,fiscal_year,CCC_month,ICCA_month,FIC_month,FBM_month,GLS_mech_month,OJM_mech_month,CAT_mech_month,PEG_mech_month,...,PEG_mech_f_annual,GLS_an_f_annual,OJM_an_f_annual,CAT_an_f_annual,PEG_an_f_annual,JLR_f_annual,LPV_f_annual,CER_f_annual,FF6_f_annual,QFM_f_annual
0,1004,2000,,,,,,,,,...,0.0,0.114858,0.195181,0.10013,0.179299,0.169598,0.191878,0.180738,0.708568,0.081715
1,1004,2001,0.006436,0.011225,,0.010167,0.008862,0.005951,0.007287,0.003421,...,0.068267,0.114735,0.197855,0.085209,0.180122,0.17793,0.19326,0.185595,0.79157,0.763279
2,1004,2002,0.008524,0.011162,,-0.031971,0.007793,0.006363,0.005246,0.012794,...,0.225126,0.119747,0.220773,0.073214,0.214893,0.170685,0.191952,0.181318,2.055469,0.773539
3,1004,2003,0.009196,0.012236,,0.050966,0.00757,,0.004362,0.01524,...,0.162827,0.111222,0.166229,0.102591,0.160515,0.145563,0.160814,0.153189,2.01963,1.41938
4,1004,2004,0.003473,0.010255,,-0.001592,0.006466,0.001468,-0.003577,0.007665,...,0.069361,0.103224,0.147176,0.087442,0.141912,0.1639,0.181857,0.172878,0.282041,0.078498
5,1004,2005,0.001524,0.0083,0.004458,-0.00618,0.006158,0.001924,-0.007439,0.004807,...,0.055393,0.087962,0.131618,0.073541,0.126502,0.13838,0.167229,0.152804,1.079943,1.269554
6,1004,2006,-0.001248,0.009017,0.001741,0.032023,0.004832,0.001564,-0.017075,0.003962,...,0.049211,0.09289,0.139927,0.088229,0.134127,0.127504,0.152262,0.139883,0.389782,0.400409
7,1004,2007,-0.000621,0.008367,0.001779,-0.001763,0.005732,0.003205,-0.012482,0.0,...,0.0,0.094287,0.130329,0.085245,0.126522,0.116169,0.144663,0.130416,0.082771,0.110006
8,1004,2008,0.003496,0.010368,,-0.050908,0.007866,0.005965,-0.000258,0.0,...,0.0,0.130842,0.12762,0.128846,0.127009,0.125067,0.146771,0.135919,0.1538,0.663297
9,1004,2009,0.005231,0.008597,,0.039472,0.008751,0.006907,0.005005,0.0,...,0.0,0.114237,0.12038,0.097346,0.120551,0.070852,0.110993,0.090922,1.651102,1.038756


In [23]:
reg_df[drop_key].describe()

Unnamed: 0,CCC_month,GLS_mech_month,OJM_mech_month,CAT_mech_month,PEG_mech_month,GLS_an_month,OJM_an_month,CAT_an_month,PEG_an_month,CCC_f_month,...,PEG_an_annual,CCC_f_annual,GLS_mech_f_annual,OJM_mech_f_annual,CAT_mech_f_annual,PEG_mech_f_annual,GLS_an_f_annual,OJM_an_f_annual,CAT_an_f_annual,PEG_an_f_annual
count,8458.0,8401.0,7048.0,8378.0,8452.0,5926.0,5863.0,5922.0,5926.0,8997.0,...,5926.0,8997.0,8947.0,7619.0,8930.0,8991.0,6021.0,5976.0,6019.0,6021.0
mean,0.004327,0.007559,0.006046,-0.008369,0.009615,0.007689,0.009913,0.006715,0.008582,0.004422,...,0.110052,0.05989,0.096764,0.082076,-0.074422,0.133442,0.096741,0.128107,0.084949,0.110433
std,0.008327,0.004792,0.009462,0.017892,0.010236,0.002547,0.004822,0.003304,0.004583,0.008264,...,0.0641,0.112788,0.067506,0.126675,0.196326,0.15015,0.035088,0.070107,0.050185,0.065261
min,-0.029441,-0.054025,-0.214025,-0.113768,0.0,-0.008767,-0.002616,-0.000959,0.0,-0.028432,...,0.0,-0.292465,-0.486484,-0.889861,-0.757861,0.0,-0.221117,-0.029669,-0.010476,0.0
25%,-0.000773,0.005083,0.002605,-0.017767,0.001316,0.006197,0.007029,0.005015,0.006078,-0.000679,...,0.075699,-0.007988,0.062607,0.032228,-0.190739,0.027931,0.076175,0.087138,0.061189,0.074953
50%,0.002696,0.006651,0.004068,-0.007678,0.00692,0.00752,0.008601,0.006274,0.007707,0.002761,...,0.096739,0.034021,0.082542,0.050098,-0.088269,0.088311,0.093769,0.108559,0.077601,0.096167
75%,0.007244,0.00872,0.006141,0.001633,0.013715,0.008909,0.011455,0.007755,0.010167,0.007361,...,0.12943,0.09244,0.110296,0.076597,0.020276,0.180352,0.112324,0.147229,0.097444,0.130426
max,0.053367,0.05875,0.059331,0.059047,0.059054,0.035057,0.056564,0.043702,0.04937,0.052492,...,0.789173,0.848594,0.920133,0.997,0.999727,0.995763,0.406627,0.935309,0.89953,0.718845


# merge monthly statistic data

In [2]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20250316_stock_act_idiosyn_v2.dta'))
daily_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20250323_daily_stats.pkl')).reset_index(drop=False)

In [3]:
daily_df.keys()

Index(['PERMNO', 'fiscal_year', 'sigma', 'skewness', 'kurtosis'], dtype='object')

In [6]:
reg_df2: DataFrame = reg_df.merge(daily_df, left_on=['LPERMNO', const.YEAR], right_on=['PERMNO', const.YEAR], how='left', suffixes=('', '_d'))

In [8]:
daily_df[const.YEAR] -= 1
reg_df3: DataFrame = reg_df2.drop(['PERMNO'], axis=1).merge(
    daily_df, left_on=['LPERMNO', const.YEAR], right_on=['PERMNO', const.YEAR], how='left', suffixes=('', '_d1'))

key_to_drop = list()
for key in ['sigma', 'skewness', 'kurtosis']:
    key_to_drop.append(f'{key}_d1')
    reg_df3[f'{key}_d'] = reg_df3[f'{key}_d'].fillna(reg_df3[f'{key}_d1'])

reg_df3.drop(key_to_drop, axis=1, inplace=True)

In [9]:
reg_df3[['sigma_d', 'skewness_d', 'kurtosis_d']].describe()

Unnamed: 0,sigma_d,skewness_d,kurtosis_d
count,11830.0,11830.0,11830.0
mean,0.033091,0.413264,7.851011
std,0.021108,1.512949,15.053352
min,0.004538,-10.520995,-2.0
25%,0.019308,-0.167832,1.722131
50%,0.028412,0.249566,3.491931
75%,0.040694,0.785252,7.609703
max,0.436616,14.790606,226.206868


In [10]:
columns_to_winsorize = ['sigma_d', 'skewness_d', 'kurtosis_d']
for column in columns_to_winsorize:
    non_na_data = reg_df3[column].dropna()
    winsorized_data = winsorize(non_na_data, limits=[0.01, 0.01])
    reg_df3.loc[non_na_data.index, column] = winsorized_data

In [12]:
reg_df3.to_stata(os.path.join(const.RESULT_PATH, '20250323_stock_act_idiosyn_v1.dta'), write_index=False, version=119)


In [4]:
latest_reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250323_stock_act_idiosyn_v4.dta'))
previous_reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250220_stock_act_data_v2.dta')).loc[:, [const.GVKEY, const.YEAR, 'GLS_mech_annual', 'OJM_mech_annual', 'CAT_mech_annual', 'PEG_mech_annual', 'CCC_annual']]

reg_df: DataFrame = latest_reg_df.merge(previous_reg_df, on=[const.GVKEY, const.YEAR], how='left')

hm2005_df = pd.read_pickle(os.path.join(const.TEMP_PATH, 'hm_2005_price_delay.pkl')).loc[:, ['PERMNO', 'year', 'price_delay']]

reg_df2 = reg_df.merge(hm2005_df.rename(columns={'PERMNO': 'LPERMNO', 'year': const.YEAR}), on=['LPERMNO', const.YEAR], how='left').merge(hm2005_df.rename(columns={'PERMNO': 'LPERMCO', 'year': const.YEAR}), on=['LPERMCO', const.YEAR], how='left', suffixes=('', '_permco'))
reg_df2.loc[:, 'price_delay'] = reg_df2['price_delay'].fillna(reg_df2['price_delay_permco'])
reg_df2.drop(['price_delay_permco'], axis=1, inplace=True)

reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250416_stock_act_data_v1.dta'), write_index=False, version=119)

# Merge Harry's data


In [3]:
harry_df: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 't7_data.dta'))
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20250428_stock_act_data_v1.dta'))

harry_df[const.YEAR] = pd.to_numeric(harry_df[const.YEAR])
reg_df2: DataFrame = reg_df.merge(harry_df.loc[:, 'govContractTerms RegPolicyRisk fiscal_year gvkey'.split(' ')], on=[const.GVKEY, const.YEAR], how='left')
reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250501_stock_act_data_v1.dta'), write_index=False, version=119)

In [4]:
harry_df.dtypes

gvkey                 int32
fiscal_year           int64
RegPolicyRisk       float32
govContractTerms    float32
dtype: object

# Construct 2002-2010 data

In [5]:
# ============ 1. 设定路径 ============

major_path = os.path.join(const.DATA_PATH, 'fromZGY', "20022010MajorCustomer.dta")
mf_path = os.path.join(const.DATA_PATH, 'fromZGY', "20022010MF.dta")
compustat_zip_path = os.path.join(const.COMPUSTAT_PATH, "2000_2023_ctat_all_data.zip")

# ============ 2. 加载主要数据并合并 ============

df_major = pd.read_stata(major_path)
df_mf = pd.read_stata(mf_path)

df_major['tic'] = df_major['tic'].astype(str)
df_mf['tic'] = df_mf['tic'].astype(str)
df_major['fiscal_year'] = df_major['fiscal_year'].astype(int)
df_mf['fiscal_year'] = df_mf['fiscal_year'].astype(int)

df_merged = pd.merge(df_major, df_mf, on=["tic", "fiscal_year"], how="outer")

In [9]:
df_merged.shape

(66868, 5)

In [25]:
# ============ 3. 处理 Compustat 并构建控制变量 ============
df_comp = pd.read_csv(compustat_zip_path)

df_comp['tic'] = df_comp['tic'].astype(str)

# 转换 datadate 为 datetime 类型
df_comp['datadate_parsed'] = pd.to_datetime(df_comp['datadate'], errors='coerce')

# 找到 fyear 缺失的位置
missing_mask = df_comp['fyear'].isna()

# 只对缺失的位置执行 np.where 逻辑
df_comp.loc[missing_mask, 'fyear'] = np.where(
    df_comp.loc[missing_mask, 'datadate_parsed'].dt.month < 7,
    df_comp.loc[missing_mask, 'datadate_parsed'].dt.year - 1,
    df_comp.loc[missing_mask, 'datadate_parsed'].dt.year
)
df_comp = df_comp.dropna(subset=['tic', 'fyear'])
df_comp['fyear'] = df_comp['fyear'].astype(int)

df_comp['Size'] = np.log(df_comp['csho'] * df_comp['prcc_f'])
df_comp['LEV'] = df_comp['dltt'] / df_comp['at']
df_comp['BM'] = df_comp['ceq'] / (df_comp['csho'] * df_comp['prcc_f'])
df_comp['ROA'] = df_comp['ib'] / df_comp['at']

  df_comp = pd.read_csv(compustat_zip_path)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [37]:
controls = df_comp[['tic', 'fyear', 'Size', 'LEV', 'BM', 'ROA', 'gvkey', 'sic']].dropna()

df_full = pd.merge(df_merged, controls, left_on=["tic", "fiscal_year"], right_on=["tic", "fyear"], how="inner").replace([np.inf, -np.inf], np.nan)

In [38]:
df_full.describe()

Unnamed: 0,fiscal_year,gov_counts,gov_indicator,GuidanceForecast,fyear,Size,LEV,BM,ROA,gvkey,sic
count,35811.0,32438.0,32438.0,10875.0,35811.0,35808.0,35809.0,35808.0,35809.0,35811.0,35811.0
mean,2005.965095,0.169123,0.103582,11.582253,2005.965095,5.527158,0.195302,1.1337,-1.037056,66151.168328,4642.516182
std,2.569809,0.63565,0.304723,11.098491,2.569809,2.532658,0.602216,1788.646067,137.118749,63435.988134,2062.62708
min,2002.0,0.0,0.0,1.0,2002.0,-9.734589,0.0,-90207.0,-25884.807692,1004.0,100.0
25%,2004.0,0.0,0.0,3.0,2004.0,3.79802,0.0,0.226073,-0.078674,11657.0,3290.0
50%,2006.0,0.0,0.0,8.0,2006.0,5.674747,0.094851,0.449162,0.024114,30614.0,3841.0
75%,2008.0,0.0,0.0,16.0,2008.0,7.254319,0.276569,0.763476,0.070551,124319.0,6510.0
max,2010.0,15.0,1.0,105.0,2010.0,12.864634,60.3,210998.0,70.895735,287882.0,9997.0


In [39]:
for key in ['gov_indicator', 'GuidanceForecast']:
    df_full[key] = df_full[key].fillna(0)

In [45]:
# ============ 5. 构造 post 和 winsorize ============

df_full['post'] = (df_full['fiscal_year'] >= 2006).astype(int)


def winsorize(s, lower=0.01, upper=0.99):
    return s.clip(lower=s.quantile(lower), upper=s.quantile(upper))

for col in ['Size', 'LEV', 'BM', 'ROA', 'GuidanceForecast']:
    if col in df_full.columns:
        df_full[col] = winsorize(df_full[col])

df_full['log_frequency'] = np.log1p(df_full['GuidanceForecast'])
df_full = df_full.dropna(subset=['Size', 'LEV', 'BM', 'ROA'], how='any')
df_full = df_full[(df_full['Size'] >= 0) | (df_full['BM'] >= 0)].copy()

In [46]:
df_full.describe()

Unnamed: 0,fiscal_year,gov_counts,gov_indicator,GuidanceForecast,fyear,Size,LEV,BM,ROA,gvkey,sic,post,log_frequency
count,32386.0,29142.0,32386.0,32386.0,32386.0,32386.0,32386.0,32386.0,32386.0,32386.0,32386.0,32386.0,32386.0
mean,2005.992311,0.17202,0.094207,3.698697,2005.992311,5.84663,0.15131,0.652062,-0.033599,65828.275736,4611.45075,0.557093,0.708222
std,2.560523,0.643063,0.292122,7.781468,2.560523,2.273907,0.173272,0.604958,0.241133,63884.695195,2040.556884,0.496737,1.131273
min,2002.0,0.0,0.0,0.0,2002.0,0.829934,0.0,0.024937,-1.334799,1004.0,100.0,0.0,0.0
25%,2004.0,0.0,0.0,0.0,2004.0,4.230908,0.0,0.284749,-0.038428,11257.0,3310.0,0.0,0.0
50%,2006.0,0.0,0.0,0.0,2006.0,5.901309,0.089895,0.491712,0.02956,30277.0,3841.0,1.0,0.0
75%,2008.0,0.0,0.0,3.0,2008.0,7.386958,0.257858,0.799225,0.073842,124434.0,6331.0,1.0,1.386294
max,2010.0,15.0,1.0,39.0,2010.0,11.093075,0.677035,3.759745,0.319571,287882.0,9997.0,1.0,3.688879


In [47]:
df_full.to_stata(os.path.join(const.RESULT_PATH, '20250516_2002_2010_regression_data.dta',), write_index=False, version=119)

# Merge regulatory intensity and firm exposure to government

In [2]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20250501_stock_act_data_v1.dta'))
gov_expo_df: DataFrame = pd.read_csv(os.path.join(const.DATABASE_PATH, r'textual analysis measure\Regulatory Costs\Government Exposure',
                                                  'AGH_Data_20211217.csv'))
reg_intensity_df: DataFrame = pd.read_stata(os.path.join(const.DATABASE_PATH, r'textual analysis measure\Regulatory Costs\regulatory intensity files',
                                                         'regulatory intensity (year-firm).dta'))

In [19]:
def median_split(df, variable, year_variable='fiscal_year', split_type='overall'):
    """
    Performs median split on a specified variable.

    Parameters:
    - df: pandas DataFrame.
    - variable: str, the column name to split on.
    - year_variable: str or None, the column name indicating year, required if split_type='yearly'.
    - split_type: 'overall' or 'yearly'.

    Returns:
    - df: DataFrame with an added column: variable + '_high' (1 if above/equal median, 0 if below).
    """

    invalid_index = df[df[variable].isnull()].index

    if split_type == 'overall':
        new_col = variable + '_highall'
        median_value = df[variable].median()
        df[new_col] = (df[variable] >= median_value).astype(int)

    elif split_type == 'yearly':
        new_col = variable + '_highann'
        if year_variable is None:
            raise ValueError("year_variable must be provided for yearly split.")
        df[new_col] = 0
        for year, group in df.groupby(year_variable):
            median_value = group[variable].median()
            df.loc[group.index, new_col] = (group[variable] >= median_value).astype(int)
    else:
        raise ValueError("split_type must be 'overall' or 'yearly'.")

    df.loc[invalid_index, new_col] = np.nan

    return df


In [8]:
reg_intensity_df.keys()

Index(['gvkey', 'year', 'RegIn_Regulations', 'RegIn_Responses', 'RegIn_Time',
       'RegIn_Dollar'],
      dtype='object')

In [12]:
gov_expo_df['gov_exposure'] = gov_expo_df['total'] / gov_expo_df['length']
gov_expo_df_valid = gov_expo_df[[const.GVKEY, 'fyear', 'gov_exposure']].rename(columns={'fyear': const.YEAR}).drop_duplicates(
    subset=[const.GVKEY, const.YEAR], keep='first')
reg_intensity_df.rename(columns={'year': const.YEAR}, inplace=True)

In [20]:
reg_df2: DataFrame = reg_df.merge(gov_expo_df_valid, on=[const.GVKEY, const.YEAR], how='left').merge(
    reg_intensity_df, on=[const.GVKEY, const.YEAR], how='left')

In [14]:
reg_df2.shape[0], reg_df.shape[0]

(12074, 12074)

In [16]:
reg_df2.keys()

Index(['gvkey', 'fiscal_year', 'mf_indicator', 'frequency', 'log_frequency',
       'MajorGovCustomer', 'post', 'log_market_value', 'lev', 'ROA',
       ...
       'FCSTERROR', 'lnANALYSTS', 'ANALYSTS', 'govContractTerms',
       'RegPolicyRisk', 'gov_exposure', 'RegIn_Regulations', 'RegIn_Responses',
       'RegIn_Time', 'RegIn_Dollar'],
      dtype='object', length=206)

In [15]:
reg_df2.describe()

Unnamed: 0,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,ROA,...,FCSTERROR,lnANALYSTS,ANALYSTS,govContractTerms,RegPolicyRisk,gov_exposure,RegIn_Regulations,RegIn_Responses,RegIn_Time,RegIn_Dollar
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,6570.0,12074.0,12074.0,12074.0,12074.0,9793.0,8746.0,8746.0,8746.0,8746.0
mean,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,-0.054435,...,0.118434,1.08779,4.824748,0.469262,0.057435,0.01228,108.249641,114.954506,111.474731,101.809639
min,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,-33.150838,...,0.0,0.0,0.0,0.0,0.0,0.0,19.786411,17.507309,16.652386,10.901248
25%,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,-0.039075,...,0.00092,0.0,0.0,0.0,0.0,0.003481,103.744537,104.891708,103.406654,91.487915
50%,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,0.027111,...,0.003289,0.693147,1.0,0.0,0.0,0.006875,107.10463,117.095093,113.665619,101.736969
75%,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,0.066365,...,0.01274,2.079442,7.0,0.0,0.0,0.015896,113.111603,126.162086,120.207878,111.740311
max,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,1.247209,...,5.981308,4.007333,54.0,56.662514,11.129248,0.127124,138.604034,158.453918,153.121399,154.008682
std,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,0.496132,...,0.676267,1.146382,7.22498,2.0078,0.302885,0.013952,9.366395,15.798062,13.74218,16.487452


In [23]:
for key in ['gov_exposure', 'RegIn_Regulations', 'RegIn_Responses', 'RegIn_Time', 'RegIn_Dollar']:
    for split_type in ['overall', 'yearly']:
        reg_df2 = median_split(reg_df2, key, split_type=split_type)

In [24]:
reg_df2.describe()

Unnamed: 0,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,ROA,...,gov_exposure_highall,gov_exposure_highann,RegIn_Regulations_highall,RegIn_Regulations_highann,RegIn_Responses_highall,RegIn_Responses_highann,RegIn_Time_highall,RegIn_Time_highann,RegIn_Dollar_highall,RegIn_Dollar_highann
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,9793.0,9793.0,8746.0,8746.0,8746.0,8746.0,8746.0,8746.0,8746.0,8746.0
mean,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,-0.054435,...,0.500051,0.500255,0.5,0.500343,0.5,0.500343,0.5,0.500343,0.5,0.500343
min,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,-33.150838,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,-0.039075,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,0.027111,...,1.0,1.0,0.5,1.0,0.5,1.0,0.5,1.0,0.5,1.0
75%,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,0.066365,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,1.247209,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,0.496132,...,0.500026,0.500025,0.500029,0.500028,0.500029,0.500028,0.500029,0.500028,0.500029,0.500028


In [25]:
reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250602_stock_act_reg_data_v1.dta'), write_index=False, version=119)

In [29]:
def multi_year_average_split(df, variable, year_variable='fiscal_year', target_years=(2008, 2009, 2010, 2011)):
    """
    Creates a split (high/low) based on the average value over specified years.

    Parameters:
    - df: pandas DataFrame.
    - variable: str, the column to average (e.g., 'GuidanceForecast').
    - year_variable: str, the year column name (e.g., 'fiscal_year').
    - target_years: list of int, the years over which to calculate the average.

    Returns:
    - df: DataFrame with a new column: variable + '_multi_year_high' (1 = above/equal median, 0 = below).
    """
    # Filter target years and calculate mean per firm
    avg_df = (df[df[year_variable].isin(target_years)]
              .groupby('gvkey')[variable]
              .mean()
              .reset_index()
              .rename(columns={variable: f'{variable}_my_avg'}))

    # Merge back to main DataFrame
    df = df.merge(avg_df, on='gvkey', how='left')

    # Compute median across firms (ignoring missing)
    median_value = df[f'{variable}_my_avg'].fillna(0).median()

    # Assign high/low based on median split
    df[f'{variable}_my_high'] = (df[f'{variable}_my_avg'] >= median_value).astype(int)

    return df


In [30]:
reg_df3 = reg_df2.copy()
for key in ['gov_exposure', 'RegIn_Regulations', 'RegIn_Responses', 'RegIn_Time', 'RegIn_Dollar']:
    reg_df3 = multi_year_average_split(reg_df3, key)

In [31]:
reg_df3.to_stata(os.path.join(const.RESULT_PATH, '20250603_stock_act_reg_data_v1.dta'), write_index=False, version=119)
