In [28]:
import os

import pandas as pd
from pandas import DataFrame
import numpy as np
from scipy.stats.mstats import winsorize

from Constants import Constants as const

In [10]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20240825_stock_act_reg_data.dta'))
new_year_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'fromZGY', 'new_shock_year_data_compustat.csv'),
                                     usecols=[const.GVKEY, 'year', 'post', 'freq']).rename(
    columns={'post': 'post60min', 'freq': 'freq60min', 'year': 'fiscal_year'})
# new_quarterly_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'fromZGY', 'new_year_quarter_data.csv'))

In [14]:
reg_df_60: DataFrame = reg_df.merge(new_year_df, on=[const.GVKEY, const.YEAR], how='left').drop(['busdesc'], axis=1)
event_index = reg_df_60.loc[reg_df_60[const.YEAR].apply(lambda x: 2008 < x < 2015)].index
reg_df_60.loc[event_index, 'freq60min'] = reg_df_60['freq60min'].fillna(0)
reg_df_60.loc[event_index, 'ln_freq60min'] = reg_df_60['freq60min'].apply(lambda x: np.log(x + 1))
reg_df_60.loc[event_index, 'freq60min'] = (reg_df_60.loc[event_index, const.YEAR] > 2011).astype(int)
reg_df_60.to_stata(os.path.join(const.RESULT_PATH, '20240918_stock_act_reg_data.dta'),
                   write_index=False)

In [20]:
# merge quarterly data
new_quarterly_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'fromZGY', 'new_year_quarter_data_gvkey.csv'))
majgovcustomer_gvkey = reg_df.loc[reg_df['MajorGovCustomer'] == 1, 'gvkey'].unique()
new_quarterly_df.loc[:, 'MajorGovCustomer'] = 0
new_quarterly_df.loc[new_quarterly_df[const.GVKEY].isin(majgovcustomer_gvkey), 'MajorGovCustomer'] = 1

new_quarterly_df['yearquarter'] = new_quarterly_df.apply(lambda x: '{}q{}'.format(int(x['Year']), int(x['Quarter'])), axis=1)
new_quarterly_df.loc[:, 'post'] = (new_quarterly_df['Year'] > 2011).astype(int)
new_quarterly_df['ln_freq'] = new_quarterly_df['freq'].apply(lambda x: np.log(x + 1))

int_terms = list()
for year in range(2008, 2015):
    for quarter in range(1, 5):
        new_quarterly_df.loc[:, f'd{year}q{quarter}'] = (new_quarterly_df['yearquarter'] == f'{year}q{quarter}').astype(int)
        int_terms.append(f'1.d{year}q{quarter}#1.MajorGovCustomer')

new_quarterly_df.to_stata(os.path.join(const.RESULT_PATH, '20240918_stock_act_reg_data_quarterly.dta'), write_index=False)

print(' '.join(int_terms))

1.d2008q1#1.MajorGovCustomer 1.d2008q2#1.MajorGovCustomer 1.d2008q3#1.MajorGovCustomer 1.d2008q4#1.MajorGovCustomer 1.d2009q1#1.MajorGovCustomer 1.d2009q2#1.MajorGovCustomer 1.d2009q3#1.MajorGovCustomer 1.d2009q4#1.MajorGovCustomer 1.d2010q1#1.MajorGovCustomer 1.d2010q2#1.MajorGovCustomer 1.d2010q3#1.MajorGovCustomer 1.d2010q4#1.MajorGovCustomer 1.d2011q1#1.MajorGovCustomer 1.d2011q2#1.MajorGovCustomer 1.d2011q3#1.MajorGovCustomer 1.d2011q4#1.MajorGovCustomer 1.d2012q1#1.MajorGovCustomer 1.d2012q2#1.MajorGovCustomer 1.d2012q3#1.MajorGovCustomer 1.d2012q4#1.MajorGovCustomer 1.d2013q1#1.MajorGovCustomer 1.d2013q2#1.MajorGovCustomer 1.d2013q3#1.MajorGovCustomer 1.d2013q4#1.MajorGovCustomer 1.d2014q1#1.MajorGovCustomer 1.d2014q2#1.MajorGovCustomer 1.d2014q3#1.MajorGovCustomer 1.d2014q4#1.MajorGovCustomer


In [24]:
inter_terms = '1.d2009q1#1.MajorGovCustomer 1.d2009q2#1.MajorGovCustomer 1.d2009q3#1.MajorGovCustomer 1.d2009q4#1.MajorGovCustomer 1.d2010q1#1.MajorGovCustomer 1.d2010q2#1.MajorGovCustomer 1.d2010q3#1.MajorGovCustomer 1.d2010q4#1.MajorGovCustomer 1.d2011q1#1.MajorGovCustomer 1.d2011q2#1.MajorGovCustomer 1.d2011q3#1.MajorGovCustomer 1.d2011q4#1.MajorGovCustomer 1.d2012q1#1.MajorGovCustomer 1.d2012q2#1.MajorGovCustomer 1.d2012q3#1.MajorGovCustomer 1.d2012q4#1.MajorGovCustomer 1.d2013q1#1.MajorGovCustomer 1.d2013q2#1.MajorGovCustomer 1.d2013q3#1.MajorGovCustomer 1.d2013q4#1.MajorGovCustomer 1.d2014q1#1.MajorGovCustomer 1.d2014q2#1.MajorGovCustomer 1.d2014q3#1.MajorGovCustomer 1.d2014q4#1.MajorGovCustomer'.split(' ')
xlabel = list()
for i, term in enumerate(inter_terms):
    year = term.split('q')[0][-4:]
    quarter = term.split('q')[1][0]
    xlabel.append(f'{i+1} "{year}q{quarter}"')
    
print(' '.join(xlabel))

1 "2009q1" 2 "2009q2" 3 "2009q3" 4 "2009q4" 5 "2010q1" 6 "2010q2" 7 "2010q3" 8 "2010q4" 9 "2011q1" 10 "2011q2" 11 "2011q3" 12 "2011q4" 13 "2012q1" 14 "2012q2" 15 "2012q3" 16 "2012q4" 17 "2013q1" 18 "2013q2" 19 "2013q3" 20 "2013q4" 21 "2014q1" 22 "2014q2" 23 "2014q3" 24 "2014q4"


# Merge Analysts FCSTERROR DISPERSION market information data.

In [34]:
# load regression data
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20240825_stock_act_reg_data_v2.dta'))

# load sue123 data
sue123_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20241006_sue123_data_annual.pkl'))

# load analysts fcsterror dispersion data
adf_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20241006_analysts_dispersion_fcsterror.pkl'))

# load synchrony data
synchrony_df: DataFrame = pd.read_pickle(os.path.join(const.TEMP_PATH, '20241006_synchrony_weekly.pkl'))

In [35]:
# merge with sue 123 data
sue123_df['gvkey'] = sue123_df['gvkey'].astype(int)
sue123_df['fyearq'] = sue123_df['fyearq'].astype(int)
sue123_df.rename(columns={'fyearq': const.YEAR}, inplace=True)

reg_df2: DataFrame = reg_df.merge(sue123_df, on=[const.GVKEY, const.YEAR], how='left')

In [23]:
adf_df[adf_df[['ticker', 'fyear']].duplicated()]

Unnamed: 0,ticker,fpedats,anndats_act,meanest,numest,stdev,actual,prcc_f,fyear,ANALYSTS,lnANALYSTS,DISPERSION,FCSTERROR,fiscal_year
9,EXPD,2013-12-31,2014-02-25,1.96,1.0,,1.68,44.25,2013.0,1.0,0.693147,,0.006328,2013.0
10,JJSF,2010-03-31,2010-04-22,0.46,3.0,0.03,0.48,43.19,2009.0,3.0,1.386294,0.000695,0.000463,2009.0
11,NSM,2009-05-31,2009-06-11,-0.42,7.0,0.02,-0.28,,2008.0,7.0,2.079442,,,2008.0
25,NSM,2010-08-31,2010-09-09,0.21,14.0,0.03,0.36,,2009.0,14.0,2.708050,,,2009.0
27,WGL,2016-06-30,2016-08-03,0.11,4.0,0.19,0.33,57.67,2015.0,4.0,1.609438,0.003295,0.003815,2015.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155663,FOSL,2006-09-30,2006-11-14,0.34,9.0,0.02,0.32,21.51,2005.0,9.0,2.302585,0.000930,0.000930,2005.0
155664,LF,2010-03-31,2010-05-03,-0.29,5.0,0.04,-0.37,3.91,2009.0,5.0,1.791759,0.010230,0.020460,2009.0
155665,OXPS,2006-12-31,2007-01-31,0.95,4.0,0.06,1.15,22.69,2006.0,4.0,1.609438,0.002644,0.008814,2006.0
155666,STGN,2006-12-31,2007-03-06,0.40,2.0,0.03,0.25,7.44,2006.0,2.0,1.098612,0.004032,0.020161,2006.0


In [36]:
# merge adf data
# adf_df.rename(columns={'tic': 'ticker'}, inplace=True)
adf_df.loc[:, const.YEAR] = adf_df['fyear']
adf_valid: DataFrame = adf_df.loc[:, ['ticker', const.YEAR, 'ANALYSTS', 'lnANALYSTS', 'DISPERSION', 'FCSTERROR']]

reg_df3: DataFrame = reg_df2.merge(adf_valid, on=['ticker', const.YEAR], how='left')
reg_df3.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,QFM_f_annual,predicted_costcap,predicted_hurdle,annual_sue1,annual_sue2,annual_sue3,ANALYSTS,lnANALYSTS,DISPERSION,FCSTERROR
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,11630.0,8505.0,8505.0,6726.0,6726.0,6032.0,3948.0,3948.0,3778.0,3828.0
mean,6036.5,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,...,2.679648,0.092023,0.117034,0.00031,0.000311,-0.03255,11.545593,2.289246,552.9143,40.643852
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,-0.879652,0.04955,0.060928,-0.580058,-0.580058,-6.885936,1.0,0.693147,0.0,0.0
25%,3018.25,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,...,0.207093,0.083509,0.105448,0.0,0.0,-0.364511,5.0,1.791759,0.001275257,0.003183
50%,6036.5,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,...,0.434483,0.092779,0.118525,0.0,0.0,0.0,9.0,2.302585,0.003047777,0.008994
75%,9054.75,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,...,0.895383,0.101097,0.129915,0.0,0.0,0.3653,16.0,2.833213,0.009602861,0.029951
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,4931.937993,0.124678,0.167867,1.412761,1.412761,7.976155,56.0,4.043051,1899558.0,134482.758621
std,3485.60791,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,...,59.815111,0.012435,0.017691,0.023666,0.023675,0.970669,8.677086,0.719222,31028.71,2178.351333


In [37]:
# Define the columns to winsorize
columns_to_winsorize = ['DISPERSION', 'FCSTERROR']

# Winsorize each column in the dataset reg_df3 at the 1st and 99th percentiles
for column in columns_to_winsorize:
    non_na_data = reg_df3[column].dropna()
    winsorized_data = winsorize(non_na_data, limits=[0.01, 0.01])
    reg_df3.loc[non_na_data.index, column] = winsorized_data

# Display the updated dataframe
reg_df3.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,QFM_f_annual,predicted_costcap,predicted_hurdle,annual_sue1,annual_sue2,annual_sue3,ANALYSTS,lnANALYSTS,DISPERSION,FCSTERROR
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,11630.0,8505.0,8505.0,6726.0,6726.0,6032.0,3948.0,3948.0,3778.0,3828.0
mean,6036.5,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,...,2.679648,0.092023,0.117034,0.00031,0.000311,-0.03255,11.545593,2.289246,0.239984,0.357989
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,-0.879652,0.04955,0.060928,-0.580058,-0.580058,-6.885936,1.0,0.693147,0.0001,0.000244
25%,3018.25,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,...,0.207093,0.083509,0.105448,0.0,0.0,-0.364511,5.0,1.791759,0.001275,0.003183
50%,6036.5,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,...,0.434483,0.092779,0.118525,0.0,0.0,0.0,9.0,2.302585,0.003048,0.008994
75%,9054.75,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,...,0.895383,0.101097,0.129915,0.0,0.0,0.3653,16.0,2.833213,0.009603,0.029951
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,4931.937993,0.124678,0.167867,1.412761,1.412761,7.976155,56.0,4.043051,15.540258,20.199805
std,3485.60791,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,...,59.815111,0.012435,0.017691,0.023666,0.023675,0.970669,8.677086,0.719222,1.672322,2.191517


In [42]:
# merge sync data
synchrony_df.rename(columns={'TICKER': 'ticker'}, inplace=True)
synchrony_df.loc[:, const.YEAR] = synchrony_df['year']
synchrony_df_valid: DataFrame = synchrony_df.loc[:, ['ticker', const.YEAR, 'SYNCHRONICITY', 'SYNCHRONICITY_MKT', 'SYNCHRONICITY_IND', 'PERMNO']]

reg_df4: DataFrame = reg_df3.merge(synchrony_df_valid, on=['ticker', const.YEAR], how='left')
reg_df4.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,annual_sue2,annual_sue3,ANALYSTS,lnANALYSTS,DISPERSION,FCSTERROR,SYNCHRONICITY,SYNCHRONICITY_MKT,SYNCHRONICITY_IND,PERMNO
count,12097.0,12097.0,12097.0,12097.0,12097.0,12097.0,12097.0,12097.0,12097.0,12097.0,...,6737.0,6043.0,3959.0,3959.0,3789.0,3839.0,6514.0,6516.0,6516.0,6536.0
mean,6034.526412,81207.029842,2011.54683,0.363644,4.820286,0.869019,0.153922,0.508969,6.518905,0.181628,...,0.000311,-0.031686,11.544329,2.289662,0.239291,0.35697,-0.645522,-1.019009,-2.501537,69670.389994
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,-0.580058,-6.885936,1.0,0.693147,0.0001,0.000244,-4.990253,-7.275702,-10.594919,10104.0
25%,3022.0,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.01263,0.000957,...,0.0,-0.360182,5.0,1.791759,0.001266,0.003165,-1.263905,-1.686802,-3.129783,54704.0
50%,6027.0,62549.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.522533,0.14859,...,0.0,0.0,9.0,2.302585,0.003041,0.008948,-0.610676,-0.915708,-2.379155,82597.0
75%,9050.0,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.980268,0.264009,...,0.0,0.369015,16.0,2.833213,0.009537,0.029913,0.007481,-0.224288,-1.768891,89617.0
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,1.412761,7.976155,56.0,4.043051,15.540258,20.199805,3.52255,3.879664,0.488883,93436.0
std,3483.503621,73648.719452,2.279853,0.481068,8.549417,1.245601,0.360889,0.49994,2.16305,0.273647,...,0.023656,0.97004,8.666932,0.718506,1.669942,2.188456,0.928811,1.107196,1.080441,27371.276395


In [45]:
reg_df4['PERMNO'] = reg_df4.groupby('gvkey')['PERMNO'].bfill()
reg_df4['PERMNO'] = reg_df4.groupby('gvkey')['PERMNO'].ffill()

In [43]:
import zipfile

# Define the path to the zip file
zip_file_path = os.path.join(const.DATA_PATH, 'cpie_data.zip')
csv_file_name = 'cpie_daily.csv'

# Open the zip file and read the CSV file
with zipfile.ZipFile(zip_file_path, 'r') as z:
    # Extract the CSV file from the zip and load it into a pandas DataFrame
    with z.open(csv_file_name) as f:
        df = pd.read_csv(f)

# Display the first few rows of the DataFrame to ensure it loaded correctly
print(df.head())


   permno  year      date  cpie_pin   cpie_dy  cpie_gpin  cpie_owr  cpie_mech  \
0   10057  1993  19930104  0.005020  0.607892   0.010702  0.848057          0   
1   10057  1993  19930105  0.000045  0.003326   0.070915  0.875569          0   
2   10057  1993  19930106  0.000708  0.211896   0.013788  0.911562          0   
3   10057  1993  19930107  0.025740  0.131155   0.432916  0.815852          0   
4   10057  1993  19930108  0.096240  0.945716   0.002560  0.724871          1   

      ret_o     ret_d       y_e  n_buys  n_sells  turn  
0 -0.006629 -0.013827 -0.352941       4        7    11  
1  0.011277  0.012019  0.930233       2        0     2  
2 -0.016238 -0.014314 -0.375000       3        5     8  
3  0.000910  0.014728  0.259259       8        4    12  
4  0.000973 -0.006758 -0.569444       4       10    14  


In [44]:
df.keys()

Index(['permno', 'year', 'date', 'cpie_pin', 'cpie_dy', 'cpie_gpin',
       'cpie_owr', 'cpie_mech', 'ret_o', 'ret_d', 'y_e', 'n_buys', 'n_sells',
       'turn'],
      dtype='object')

In [46]:
annual_gpin_owr = df.groupby(['permno', 'year']).agg({
    'cpie_pin': 'mean',
    'cpie_dy': 'mean',
    'cpie_gpin': 'mean',
}).reset_index(drop=False)
annual_gpin_owr.rename(columns={'permno': 'PERMNO', 'year': const.YEAR}, inplace=True)

In [47]:
reg_df5: DataFrame = reg_df4.merge(annual_gpin_owr, on=['PERMNO', const.YEAR], how='left')
reg_df5.to_stata(os.path.join(const.RESULT_PATH, '20241006_stock_act_reg_data.dta'), write_index=False, version=117)