In [2]:
import os

import pandas as pd
from pandas import DataFrame
import numpy as np
from scipy.stats.mstats import winsorize

from Constants import Constants as const

In [2]:
pc3_df: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'firm_year_num_political_donar_3years.dta')).rename(
    columns={'recipient_ext_id': 'numDonation3year'}).drop(['index'], axis=1)
pc4_df: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'firm_year_num_political_donar_4year.dta')).rename(
    columns={'recipient_ext_id': 'numDonation4year'}).drop(['index'], axis=1)

In [7]:
pc3_avg = pc3_df.groupby([const.GVKEY])['numDonation3year'].mean()
pc4_avg = pc4_df.groupby([const.GVKEY])['numDonation4year'].mean()

In [9]:
pc3_avg_df: DataFrame = pc3_avg.reset_index(drop=False)
pc4_avg_df: DataFrame = pc4_avg.reset_index(drop=False)

In [10]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241014_stock_act_reg_data_v2.dta'))
gvkey_series = reg_df[const.GVKEY].unique()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241014_stock_act_reg_data_v2.dta'))


In [13]:
# Update the function to handle numpy.ndarray as input
def ensure_gvkeys_with_ndarray(df, gvkey_array, num_col_name):
    # Convert the numpy array to a set for quick lookup
    gvkey_set = set(gvkey_array)
    # Identify missing gvkeys
    existing_gvkeys = set(df['gvkey'])
    missing_gvkeys = gvkey_set - existing_gvkeys
    # Create a DataFrame with missing gvkeys
    missing_rows = pd.DataFrame({
        'gvkey': list(missing_gvkeys),
        num_col_name: [None] * len(missing_gvkeys)
    })
    # Concatenate the original DataFrame with missing rows and sort by gvkey
    updated_df = pd.concat([df, missing_rows], ignore_index=True)
    updated_df = updated_df.sort_values('gvkey').reset_index(drop=True)
    return updated_df

In [14]:
# Update pc3_avg_df and pc4_avg_df
pc3_avg_df2 = ensure_gvkeys_with_ndarray(pc3_avg_df, gvkey_series, 'numDonation3year')
pc4_avg_df2 = ensure_gvkeys_with_ndarray(pc4_avg_df, gvkey_series, 'numDonation4year')


  updated_df = pd.concat([df, missing_rows], ignore_index=True)


In [16]:
pc_avg_df: DataFrame = pc3_avg_df2.merge(pc4_avg_df2, on=[const.GVKEY])
pc_avg_df.shape

(1837, 3)

In [17]:
for key in ['numDonation3year', 'numDonation4year']:
    pc_avg_df[key] = pc_avg_df[key].fillna(0)

In [20]:
pc_avg_df['highDonation3Year'] = (pc_avg_df['numDonation3year'] > pc_avg_df['numDonation3year'].median()).astype(int)
pc_avg_df['highDonation4Year'] = (pc_avg_df['numDonation4year'] > pc_avg_df['numDonation4year'].median()).astype(int)

In [22]:
reg_df2: DataFrame = reg_df.merge(pc_avg_df, on=[const.GVKEY], how='left')
reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20241212_stock_act_reg_data.dta'), write_index=False, version=119)

# Append Government Contract data

In [2]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241212_stock_act_reg_data.dta'))
gov_contract_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'fromZGY', 'gov_characters_data.csv')).drop(['tic'], axis=1)

In [3]:
gov_contract_df.rename(columns=lambda x: '{}_num'.format(x) if x.endswith('gov') else x, inplace=True)

In [4]:
gov_contract_df

Unnamed: 0,fiscal_year,fed_gov_num,gov_num,state_gov_num,loc_gov_num,gvkey
0,2008.0,0.0,0.0,0.0,0.0,151832
1,2009.0,0.0,0.0,0.0,0.0,151832
2,2010.0,0.0,0.0,0.0,0.0,151832
3,2011.0,0.0,0.0,0.0,0.0,151832
4,2012.0,0.0,0.0,0.0,0.0,151832
...,...,...,...,...,...,...
85811,2011.0,0.0,0.0,0.0,0.0,30165
85812,2012.0,0.0,0.0,0.0,0.0,30165
85813,2013.0,0.0,0.0,0.0,0.0,30165
85814,2014.0,0.0,0.0,0.0,0.0,30165


In [10]:
reg_df_gov_num = reg_df.merge(gov_contract_df, on=[const.GVKEY, const.YEAR], how='left')
for key in gov_contract_df.keys():
    if key not in {const.GVKEY, const.YEAR}:
        reg_df_gov_num[key].fillna(0, inplace=True)
        reg_df_gov_num[key.replace('num', 'dummy')] = reg_df_gov_num[key].apply(lambda x: int(x > 0))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reg_df_gov_num[key].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reg_df_gov_num[key].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [11]:
reg_df_gov_num.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,highDonation3Year,highDonation4Year,fed_gov_num,gov_num,state_gov_num,loc_gov_num,fed_gov_dummy,gov_dummy,state_gov_dummy,loc_gov_dummy
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0
mean,6036.5,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,...,0.439043,0.465629,0.263541,0.295511,0.023025,0.008945,0.152725,0.160593,0.016316,0.006957
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3018.25,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6036.5,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9054.75,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,1.0,1.0,19.0,19.0,10.0,3.0,1.0,1.0,1.0,1.0
std,3485.60791,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,...,0.496291,0.498838,0.862415,0.934783,0.219107,0.114772,0.359737,0.36717,0.126693,0.083122


In [12]:
reg_df_gov_num.to_stata(os.path.join(const.RESULT_PATH, '20241223_stock_act_reg_data.dta'), write_index=False, version=119)

# append some annual data

In [50]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241223_stock_act_reg_data.dta'))
gov_contract: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'gov_contract.dta')).rename(
    columns={'year': const.YEAR, 'if_gov_contract': 'hasGovContractPanel'}).drop(
    ['index', 'if_cpdata', 'if_costplus', 'if_cas', 'if_noncomm'], axis=1).dropna(how='any').drop_duplicates(
    subset=[const.GVKEY, const.YEAR], keep='first')
govsales_percentage: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'govsales_percentage.dta')).rename(
    columns={'tic': const.TICKER, 'annual_per_sale': 'GovSalePercent'}).drop(
    ['index', 'gov_annual_sales', 'all_annual_sales'], axis=1).dropna(how='any')



In [51]:
dollar_amount = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'fec_2008_2015_firmlevel_dollarnum.dta')).rename(
    columns={'year': const.YEAR, 'amount': 'contrAmt'}).drop(['index'], axis=1).dropna(how='any')
people_num = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'fec_2008_2015_firmlevel_peoplenum.dta')).rename(
    columns={'year': const.YEAR, 'recipient_ext_id': 'numPolitician'}).drop(['index'], axis=1).dropna(how='any')

In [52]:
gov_contract[const.YEAR] = gov_contract[const.YEAR].astype(int)
govsales_percentage[const.YEAR] = govsales_percentage[const.YEAR].astype(int)
dollar_amount[const.YEAR] = dollar_amount[const.YEAR].astype(int)
people_num[const.YEAR] = people_num[const.YEAR].astype(int)
gov_contract[const.GVKEY] = gov_contract[const.GVKEY].astype(int)
dollar_amount[const.GVKEY] = dollar_amount[const.GVKEY].astype(int)
people_num[const.GVKEY] = people_num[const.GVKEY].astype(int)

In [63]:
reg_df2: DataFrame = reg_df.merge(govsales_percentage, on=[const.TICKER, const.YEAR], how='left').merge(
    gov_contract, on=[const.GVKEY, const.YEAR], how='left').merge(
    dollar_amount, on=[const.GVKEY, const.YEAR], how='left').merge(
    people_num, on=[const.GVKEY, const.YEAR], how='left')
reg_df2.loc[:, 'hasGovContractPanel'] = reg_df2['hasGovContractPanel'].fillna(0)


In [47]:
reg_df2[['hasGovContract', 'hasGovContractPanel']].describe()

Unnamed: 0,hasGovContract,hasGovContractPanel
count,12074.0,12074.0
mean,0.281762,0.430098
std,0.449877,0.49511
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,1.0
max,1.0,1.0


In [64]:
reg_df2['hasGovPanelMajor'] = reg_df2.apply(lambda x: int((x['hasGovContractPanel'] == 1) and (x['MajorGovCustomer'] == 0)), axis=1)
reg_df2['hasGovPanelFed'] = reg_df2.apply(lambda x: int((x['hasGovContractPanel'] == 1) and (x['fed_gov_dummy'] == 0)), axis=1)

In [55]:
reg_df2.loc[:, ['GovSalePercent', 'contrAmt', 'numPolitician']].describe()

Unnamed: 0,GovSalePercent,contrAmt,numPolitician
count,1362.0,4984.0,4984.0
mean,0.421257,30176.18,33.989767
std,0.341576,104663.4,114.45102
min,0.001727,-6618.0,1.0
25%,0.104907,1250.0,2.0
50%,0.327493,5300.0,6.0
75%,0.694477,22643.0,22.0
max,1.0,3602357.0,2581.0


In [56]:
def create_high_low_dummy(df: DataFrame, check_key: str):
    df_median = df[check_key].median()
    df[f'High{check_key}'] = 0
    df[f'Low{check_key}'] = 0
    df.loc[df[check_key] > df_median, f'High{check_key}'] = 1
    df.loc[df[check_key] < df_median, f'Low{check_key}'] = 1

    return df


In [65]:
reg_df3 = reg_df2.copy()
for key in ['GovSalePercent', 'contrAmt', 'numPolitician']:
    reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)


  reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)
  reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)
  reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)


In [71]:
reg_df3['GovSalePercent'] = reg_df3['GovSalePercent'].fillna(0)
reg_df3['contrAmt'] = reg_df3['contrAmt'].fillna(0)
reg_df3['numPolitician'] = reg_df3['numPolitician'].fillna(0)

In [72]:
reg_df3.to_stata(os.path.join(const.RESULT_PATH, '20241231_stock_act_data.dta'), write_index=False, version=117)

# Prepare regression data for totalP and disclosure

In [45]:
main_reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250704_stock_act_reg_data_v1.dta'))
placebo_reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250516_2002_2010_regression_data.dta'))

const.YEAR

'fiscal_year'

In [46]:
placebo_reg_df = placebo_reg_df[placebo_reg_df['gov_counts'].notnull()].copy()
usekey_list = 'log_frequency gov_indicator sic fiscal_year gvkey Size LEV BM ROA GuidanceForecast'.split(' ')
placebo_reg_df_useful = placebo_reg_df[usekey_list].rename(columns={'gov_indicator': 'MajorGovCustomer', 'log_frequency': 'logGuidanceForecast'})

In [47]:
main_reg_useful_df = main_reg_df[['frequency', 'log_frequency_w', 'MajorGovCustomer', 'log_market_value_w', 'lev_w', 'BM_w', 'ROA_w', 'sic', 'fiscal_year', 'gvkey']].rename(
    columns={'log_frequency_w': 'logGuidanceForecast', 'frequency': 'GuidanceForecast', 'log_market_value_w': "Size", 'lev_w': 'LEV', 'BM_w': 'BM', 'ROA_w': 'ROA'})
merged_df = pd.concat([placebo_reg_df_useful, main_reg_useful_df], axis=0, ignore_index=True).drop_duplicates(subset=[const.GVKEY, const.YEAR], keep='last')

In [48]:
pc_df = pd.read_stata(os.path.join(const.DATA_PATH, '20190217_FEC_general_firm_year_interaction_newly_without_duplicates_interactions3.dta'))
pc_valid_df = pc_df[['CSTAT_gvkey', 'year', 'FEC_LoseP', 'FEC_WonP', 'FEC_TotalP']].rename(
    columns={'CSTAT_gvkey': const.GVKEY, 'year': const.YEAR, 'FEC_LoseP': 'LoseP', 'FEC_WonP': 'WinP', 'FEC_TotalP': 'TotalP'})
pc_valid_df.loc[:, 'NumP'] = pc_valid_df['WinP'] + pc_valid_df['LoseP']
pc_valid_df['WinRatio'] = pc_valid_df['WinP'] / pc_valid_df['NumP']
pc_valid_df['LoseRatio'] = pc_valid_df['LoseP'] / pc_valid_df['NumP']

In [51]:
tmp_pc_df = pc_valid_df.copy()
pc_dis_df = merged_df.merge(tmp_pc_df, on=[const.GVKEY, const.YEAR], how='right')
pc_dis_df['sic'] = pc_dis_df['sic'].astype(str)
pc_dis_df.to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data.dta'), write_index=False)

In [15]:
tmp_pc_df = pc_valid_df.copy()
tmp_pc_df.loc[:, const.YEAR] -= 1
pc_dis_df = merged_df.merge(tmp_pc_df, on=[const.GVKEY, const.YEAR], how='inner')
pc_dis_df['sic'] = pc_dis_df['sic'].astype(int)
pc_dis_df.to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data_v2.dta'), write_index=False)

In [20]:
def get_election_cycle(fiscal_year):
    if fiscal_year % 2 == 0:
        return fiscal_year - 2
    else:
        return fiscal_year - 1

tmp_pc_df = pc_valid_df.rename(columns={const.YEAR: 'ElectionCycle'})
merged_df.loc[:, 'ElectionCycle'] = merged_df[const.YEAR].apply(get_election_cycle)

pc_dis_df = merged_df.merge(tmp_pc_df, on=[const.GVKEY, 'ElectionCycle'], how='inner')
pc_dis_df['sic'] = pc_dis_df['sic'].astype(int)
pc_dis_df.to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data_v3.dta'), write_index=False)


In [21]:
tmp_pc_df = pc_valid_df.rename(columns={const.YEAR: 'ElectionCycle'})
tmp_dis_df = merged_df.groupby([const.GVKEY, 'ElectionCycle'])[['Size', 'LEV', 'BM', 'ROA', 'GuidanceForecast']].mean().reset_index(drop=False)
pc_dis_df = tmp_dis_df.merge(tmp_pc_df, on=[const.GVKEY, 'ElectionCycle'], how='inner')
pc_dis_df['logGuidanceForecast'] = pc_dis_df['GuidanceForecast'].apply(np.log1p)
pc_dis_df.to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data_v4.dta'), write_index=False)

## merge with special election data

In [43]:
spc_df = pd.read_stata(os.path.join(const.DATA_PATH, '20180829_FEC_federal_special_firm_year_candidate_merged_data.dta'))
useful_col = 'year FEC_margin FEC_is_win FEC_is_close_election CSTAT_gvkey'.split(' ')
spc_df = spc_df[useful_col].rename(columns={'year': const.YEAR, 'CSTAT_gvkey': const.GVKEY})

In [53]:
spc_dis_df = merged_df.merge(spc_df, on=[const.GVKEY, const.YEAR], how='right')
spc_dis_df.describe()
spc_dis_df['sic'] = spc_dis_df['sic'].astype(str)
spc_dis_df.drop_duplicates(subset=[const.GVKEY, const.YEAR], keep='last').to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_spc_reg_data.dta'), write_index=False)

# Construct Regression data for the following variables
## GPIN

In [15]:
crsp_comp_link = pd.read_pickle(os.path.join(const.TEMP_PATH, 'crsp_comp.pkl'))
crsp_comp_link['permco'] = crsp_comp_link['permco'].astype(int)
crsp_comp_link[const.GVKEY] = crsp_comp_link[const.GVKEY].astype(int)

In [3]:
cpin_daily_full = pd.read_csv(os.path.join(const.RESULT_PATH, '1993_2019_cpie_daily.csv'))
annual_gpin = cpin_daily_full.groupby(['permno', 'year']).agg({
    'cpie_gpin': 'mean',
}).reset_index(drop=False)
annual_gpin.rename(columns={'year': const.YEAR}, inplace=True)

In [16]:
annual_gpin_gvkey = pd.merge(annual_gpin, crsp_comp_link,
                             left_on='permno', right_on='permco', how='left')

## IDIOSYN

In [5]:
synchrony_monthly_df = pd.read_pickle(os.path.join(const.TEMP_PATH, '20250712_synchrony_monthly.pkl'))
synchrony_monthly_df['IDIOSYN'] = synchrony_monthly_df['SYNCHRONICITY'] * -1

In [18]:
synchrony_gvkey_df = pd.merge(synchrony_monthly_df.drop(['SYNCHRONICITY', 'SYNCHRONICITY_MKT', 'SYNCHRONICITY_IND'], axis=1),
                                crsp_comp_link,
                             left_on='PERMNO', right_on='permco', how='left')

In [8]:
idiosyn_df = pd.read_pickle(os.path.join(const.TEMP_PATH, '20250717_idiosyn_monthly.pkl'))
idiosyn_df.describe()

Unnamed: 0,PERMNO,year,IDIOSYN
count,444782.0,444782.0,385109.0
mean,52339.816402,1994.478781,-1.445707
std,29679.84376,20.945936,1.985146
min,10000.0,1926.0,-36.736801
25%,20722.0,1982.0,-2.151028
50%,54317.0,1997.0,-1.271565
75%,80830.0,2011.0,-0.478086
max,93436.0,2024.0,8.074132


## DELAY

In [4]:
price_delay_df = pd.read_pickle(os.path.join(const.TEMP_PATH, 'hm2005_all_price_delay.pkl'))

In [5]:
price_delay_df.keys()

Index(['PERMNO', 'year', 'price_delay', 'r2_full', 'r2_restricted', 'n_obs'], dtype='object')

In [6]:
price_delay_df.describe()

Unnamed: 0,PERMNO,year,price_delay,r2_full,r2_restricted,n_obs
count,435361.0,435361.0,433884.0,433884.0,433884.0,435361.0
mean,52247.041577,1994.373591,0.474898,0.269389,0.1803564,45.99847
std,29631.207052,21.023619,0.327104,0.201966,0.20887,7.169638
min,10000.0,1926.0,0.0,0.00074,1.197764e-11,10.0
25%,20748.0,1982.0,0.171768,0.119707,0.02459033,48.0
50%,54114.0,1997.0,0.430954,0.213365,0.1018642,48.0
75%,80723.0,2011.0,0.784484,0.363214,0.2621346,48.0
max,93436.0,2024.0,1.0,1.0,1.0,49.0


In [28]:
delay_gvkey_df = pd.merge(price_delay_df, crsp_comp_link,
                          left_on='PERMNO', right_on='permco', how='left').dropna(subset=[const.GVKEY]).rename(columns={'year': const.YEAR}).drop(['PERMNO', 'r2_full',  'r2_restricted', 'n_obs', 'permco'], axis=1)

## ANALYSTS, FCSTERROR and DISPERSION

In [5]:
afd_df = pd.read_pickle(os.path.join(const.TEMP_PATH, '1976_2023_analysts_fd_data.pkl'))
afd_df.describe()

Unnamed: 0,fiscal_year,eps_mean,eps_sd,eps_actual,numest,eps_mean_last,eps_sd_last,eps_actual_last,numest_last,permno,...,mthprevprc,mthret,vwretd,date,Price_lag,DISPERSION,DISPERSION_last,FCSTERROR,FCSTERROR_last,ANALYSTS
count,172208.0,172159.0,147356.0,167065.0,172208.0,172159.0,147356.0,167065.0,172208.0,172208.0,...,172131.0,172127.0,172208.0,172208,172208.0,147356.0,147356.0,167036.0,167036.0,0.0
mean,2001.524035,20929.03,191892.9,-1553316.0,6.669149,-12709.12,149432.1,-1534378.0,6.642345,59601.25234,...,50.25009,0.015151,0.017939,2002-06-30 22:38:55.835733504,47.237687,86731.6,55748.89,60380.95,58659.43,
min,1977.0,-975000000.0,0.0,-35645870000.0,1.0,-988235300.0,0.0,-37632000000.0,1.0,10001.0,...,0.015625,-1.0,-0.225361,1977-12-30 00:00:00,0.02,0.0,0.0,0.0,0.0,
25%,1993.0,0.1958333,0.02916667,0.06833333,1.916667,0.16,0.02,0.0602,2.0,30728.0,...,7.8125,-0.048321,0.003676,1993-12-31 00:00:00,8.375,0.001230753,0.00073903,0.002003949,0.001181536,
50%,2001.0,0.7983333,0.06583333,0.6958333,4.166667,0.78,0.05,0.7,4.0,75055.0,...,17.67,0.01087,0.018294,2001-12-31 00:00:00,17.66,0.003423006,0.002305919,0.00758825,0.005151602,
75%,2011.0,1.74,0.1641964,1.645,9.0,1.74,0.13,1.67,9.0,84001.0,...,32.5,0.072959,0.031839,2011-12-30 00:00:00,31.6425,0.01074177,0.008212652,0.03143124,0.02495423,
max,2021.0,4800000000.0,4806799000.0,4080000000.0,54.666667,4800000000.0,5623709000.0,4080000000.0,57.0,93436.0,...,416876.0,15.774194,0.129674,2021-12-31 00:00:00,347815.0,2247453000.0,1739205000.0,2482667000.0,2900800000.0,
std,11.561869,16461400.0,23129750.0,173130100.0,6.704304,14381310.0,21607610.0,174521000.0,6.810433,28512.242719,...,2305.437187,0.173908,0.035051,,2083.659628,10385270.0,7201449.0,8490775.0,9372193.0,


## Guaidance RangeForecast

In [6]:
guidance_df = pd.read_pickle(os.path.join(const.TEMP_PATH, '20250718_ibes_firm_year_guidance.pkl'))
guidance_df.describe()

Unnamed: 0,fiscal_year,guidancecount,rangeforecast
count,60140.0,60140.0,60140.0
mean,2009.203475,9.603991,0.640832
std,6.836092,9.613754,0.384548
min,1992.0,1.0,0.0
25%,2004.0,2.0,0.333333
50%,2009.0,6.0,0.793103
75%,2015.0,14.0,1.0
max,2021.0,131.0,1.0


## Merge all five dataset

In [15]:
print(annual_gpin.keys())
print(idiosyn_df.keys())
print(afd_df.keys())
print(price_delay_df.keys())
print(guidance_df.keys())

Index(['permno', 'fiscal_year', 'cpie_gpin'], dtype='object')
Index(['PERMNO', 'year', 'IDIOSYN'], dtype='object')
Index(['permno', 'fiscal_year', 'numest', 'numest_last', 'DISPERSION',
       'DISPERSION_last', 'FCSTERROR', 'FCSTERROR_last'],
      dtype='object')
Index(['PERMNO', 'year', 'price_delay', 'r2_full', 'r2_restricted', 'n_obs'], dtype='object')
Index(['permno', 'fiscal_year', 'guidancecount', 'rangeforecast'], dtype='object')


In [9]:
id_df = idiosyn_df.rename(columns={'year': const.YEAR, 'PERMNO': 'permno'}).dropna(how='any')
afd_df = afd_df[['permno', const.YEAR, 'numest', 'numest_last', 'DISPERSION', 'DISPERSION_last', 'FCSTERROR', 'FCSTERROR_last']].dropna(subset=['permno', const.YEAR], how='any')
pd_df = price_delay_df[['PERMNO', 'year', 'price_delay']].rename(columns={'year': const.YEAR, 'PERMNO': 'permno'}).dropna(how='any')
dep_df = annual_gpin.merge(id_df, on=['permno', const.YEAR], how='outer').merge(
    afd_df, on=['permno', const.YEAR], how='outer').merge(
    pd_df, on=['permno', const.YEAR], how='outer').merge(
    guidance_df, on=['permno', const.YEAR], how='outer')

In [10]:
dep_df.describe()

Unnamed: 0,fiscal_year,cpie_gpin,IDIOSYN,numest,numest_last,DISPERSION,DISPERSION_last,FCSTERROR,FCSTERROR_last,price_delay,guidancecount,rangeforecast
count,436027.0,36593.0,385109.0,172208.0,172208.0,147356.0,147356.0,167036.0,167036.0,433884.0,60140.0,60140.0
mean,1994.438402,0.3864901,-1.445707,6.669149,6.642345,86731.6,55748.89,60380.95,58659.43,0.474898,9.603991,0.640832
std,21.005916,0.182265,1.985146,6.704304,6.810433,10385270.0,7201449.0,8490775.0,9372193.0,0.327104,9.613754,0.384548
min,1926.0,7.634524000000001e-272,-36.736801,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,1982.0,0.2285479,-2.151028,1.916667,2.0,0.001230753,0.00073903,0.002003949,0.001181536,0.171768,2.0,0.333333
50%,1997.0,0.4126209,-1.271565,4.166667,4.0,0.003423006,0.002305919,0.00758825,0.005151602,0.430954,6.0,0.793103
75%,2011.0,0.5244818,-0.478086,9.0,9.0,0.01074177,0.008212652,0.03143124,0.02495423,0.784484,14.0,1.0
max,2024.0,1.0,8.074132,54.666667,57.0,2247453000.0,1739205000.0,2482667000.0,2900800000.0,1.0,131.0,1.0


In [11]:
dep_df.to_pickle(os.path.join(const.TEMP_PATH, '20250718_temp_dependent_variables.pkl'))

## Construct Regression data

In [12]:
dep_df = pd.read_pickle(os.path.join(const.TEMP_PATH, '20250718_temp_dependent_variables.pkl'))
ccm_path = r'D:\Users\wangy\Documents\data\ccm'
ccm_df = pd.read_sas(os.path.join(os.path.join(ccm_path, 'ccm_lookup.sas7bdat')), encoding='utf-8')

In [13]:
print('dep_df keys: ', list(dep_df.keys()))
print('ccm_df keys: ', list(ccm_df.keys()))

dep_df keys:  ['permno', 'fiscal_year', 'cpie_gpin', 'IDIOSYN', 'numest', 'numest_last', 'DISPERSION', 'DISPERSION_last', 'FCSTERROR', 'FCSTERROR_last', 'price_delay', 'guidancecount', 'rangeforecast']
ccm_df keys:  ['GVKEY', 'LPERMNO', 'LPERMCO', 'LINKDT', 'LINKENDDT', 'conm', 'tic', 'cusip', 'cik', 'sic', 'naics', 'gsubind', 'gind', 'year1', 'year2']


In [6]:
# 标准化列名
dep_df.columns = dep_df.columns.str.lower()
ccm_df.columns = ccm_df.columns.str.lower()

# 保留需要的字段
ccm_sub = ccm_df[['gvkey', 'lpermno', 'year1', 'year2']].copy()

# 为提高 merge 效率，先 merge on permno，再做年份筛选
merged = pd.merge(dep_df, ccm_sub, left_on='permno', right_on='lpermno', how='left')

# 年份条件筛选：fiscal_year ∈ [year1, year2]
matched = merged[
    (merged['fiscal_year'] >= merged['year1']) &
    (merged['fiscal_year'] <= merged['year2'])
].copy()

# 如果多个匹配，选择最近的 year1（可选）
matched = matched.sort_values(by=['permno', 'fiscal_year', 'year1'], ascending=[True, True, False])
matched = matched.drop_duplicates(subset=['permno', 'fiscal_year'], keep='first')

# 删除辅助列（可选）
matched = matched.drop(columns=['lpermno', 'year1', 'year2'])
matched[const.GVKEY] = matched[const.GVKEY].astype(int)


In [2]:
reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data_v2.dta'))
spc_reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_spc_reg_data_v2.dta'))

In [7]:
# merge to next year
tmp_dep_df = matched.copy()
tmp_dep_df['fiscal_year'] = tmp_dep_df['fiscal_year'] + 1

reg_df2 = reg_df.merge(tmp_dep_df, on=[const.GVKEY, const.YEAR], how='left').drop_duplicates(subset=[const.GVKEY, const.YEAR], keep='first')
reg_df2['coverage'] = reg_df2['coverage'].fillna(0)

reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250713_staock_act_pc_reg_data.dta'), write_index=False)

spc_reg_df2 = spc_reg_df.merge(tmp_dep_df, on=[const.GVKEY, const.YEAR], how='left').drop_duplicates(subset=[const.GVKEY, const.YEAR], keep='first')
spc_reg_df2['coverage'] = spc_reg_df2['coverage'].fillna(0)

spc_reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250713_staock_act_spc_reg_data.dta'), write_index=False)

In [15]:
ctrl_df = pd.read_pickle(os.path.join(const.TEMP_PATH, '20250718_ctat_controls.pkl'))
# 清理极端值（可选）
for var in ['lev', 'bm', 'roa', 'size']:
    ctrl_df[var] = ctrl_df[var].replace([np.inf, -np.inf], np.nan)
ctrl_df.describe()

Unnamed: 0,datadate,fyear,at,dltt,ceq,csho,prcc_f,ib,mkvalt,dlc,size,lev,bm,roa
count,557558,557558.0,491792.0,489197.0,475336.0,550220.0,461084.0,489218.0,460293.0,484236.0,460196.0,491293.0,391611.0,488206.0
mean,1998-12-31 11:40:59.638638848,1998.038767,6037.858243,1130.930462,1017.306112,138.664963,26.206488,108.175955,2519.428044,542.675002,4.689986,,,
min,1950-06-30 00:00:00,1950.0,0.0,-0.023,-139965.0,0.0,1e-06,-99289.0,0.0,-3753.453,-12.144413,,,
25%,1986-12-31 00:00:00,1986.0,17.43975,0.105,5.423,2.813,3.499996,-1.234,19.4675,0.061,2.969639,,,
50%,2000-08-31 00:00:00,2000.0,111.415,7.798,40.721,10.425,12.1865,1.588,96.12,1.775,4.566277,,,
75%,2013-05-31 00:00:00,2012.0,826.6605,131.982,259.50025,39.46,25.999978,19.81275,553.7411,18.1,6.317074,,,
max,2025-06-30 00:00:00,2025.0,4349731.0,4216909.0,649368.0,25974962.446,141600.0,104821.0,3522211.138,614237.411,15.0746,,,
std,,17.21838,68113.67766,24821.395229,6814.797664,35103.410682,508.372195,1133.313807,21754.476467,9100.107825,2.481943,,,


In [18]:
ctrl_df[ctrl_df['lev'].isnull()]

Unnamed: 0,gvkey,datadate,fyear,at,dltt,ceq,csho,prcc_f,ib,mkvalt,dlc,size,lev,bm,roa
2684,001119,1982-12-31,1982,,,,17.13,17.375,,297.63375,,5.695864,,,
2685,001119,1983-12-31,1983,,,,17.13,17.875,,306.19875,,5.724234,,,
2686,001119,1984-12-31,1984,,,,20.32,16.625,,337.82,,5.822513,,,
2687,001119,1985-12-31,1985,,,,21.313,19.375,,412.939375,,6.023301,,,
2688,001119,1986-12-31,1986,,,,22.313,19.125,,426.736125,,6.056166,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85456,330942,2024-12-31,2024,,,,39.9,105.92,,4226.208,,8.34906,,,
85496,333359,2022-12-31,2022,,,,8.53,91.9844,,784.626932,,6.665208,,,
85497,333359,2024-12-31,2024,,,,10.555,92.0937,,972.049003,,6.879406,,,
85668,354003,2023-12-31,2023,,,,0.8,51.9389,,41.55112,,3.726924,,,


In [19]:
ctrl_df.keys()

Index(['gvkey', 'datadate', 'fyear', 'at', 'dltt', 'ceq', 'csho', 'prcc_f',
       'ib', 'mkvalt', 'dlc', 'size', 'lev', 'bm', 'roa'],
      dtype='object')