In [1]:
import os

import pandas as pd
from pandas import DataFrame
import numpy as np
from scipy.stats.mstats import winsorize

from Constants import Constants as const

In [2]:
pc3_df: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'firm_year_num_political_donar_3years.dta')).rename(
    columns={'recipient_ext_id': 'numDonation3year'}).drop(['index'], axis=1)
pc4_df: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'firm_year_num_political_donar_4year.dta')).rename(
    columns={'recipient_ext_id': 'numDonation4year'}).drop(['index'], axis=1)

In [7]:
pc3_avg = pc3_df.groupby([const.GVKEY])['numDonation3year'].mean()
pc4_avg = pc4_df.groupby([const.GVKEY])['numDonation4year'].mean()

In [9]:
pc3_avg_df: DataFrame = pc3_avg.reset_index(drop=False)
pc4_avg_df: DataFrame = pc4_avg.reset_index(drop=False)

In [10]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241014_stock_act_reg_data_v2.dta'))
gvkey_series = reg_df[const.GVKEY].unique()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241014_stock_act_reg_data_v2.dta'))


In [13]:
# Update the function to handle numpy.ndarray as input
def ensure_gvkeys_with_ndarray(df, gvkey_array, num_col_name):
    # Convert the numpy array to a set for quick lookup
    gvkey_set = set(gvkey_array)
    # Identify missing gvkeys
    existing_gvkeys = set(df['gvkey'])
    missing_gvkeys = gvkey_set - existing_gvkeys
    # Create a DataFrame with missing gvkeys
    missing_rows = pd.DataFrame({
        'gvkey': list(missing_gvkeys),
        num_col_name: [None] * len(missing_gvkeys)
    })
    # Concatenate the original DataFrame with missing rows and sort by gvkey
    updated_df = pd.concat([df, missing_rows], ignore_index=True)
    updated_df = updated_df.sort_values('gvkey').reset_index(drop=True)
    return updated_df

In [14]:
# Update pc3_avg_df and pc4_avg_df
pc3_avg_df2 = ensure_gvkeys_with_ndarray(pc3_avg_df, gvkey_series, 'numDonation3year')
pc4_avg_df2 = ensure_gvkeys_with_ndarray(pc4_avg_df, gvkey_series, 'numDonation4year')


  updated_df = pd.concat([df, missing_rows], ignore_index=True)


In [16]:
pc_avg_df: DataFrame = pc3_avg_df2.merge(pc4_avg_df2, on=[const.GVKEY])
pc_avg_df.shape

(1837, 3)

In [17]:
for key in ['numDonation3year', 'numDonation4year']:
    pc_avg_df[key] = pc_avg_df[key].fillna(0)

In [20]:
pc_avg_df['highDonation3Year'] = (pc_avg_df['numDonation3year'] > pc_avg_df['numDonation3year'].median()).astype(int)
pc_avg_df['highDonation4Year'] = (pc_avg_df['numDonation4year'] > pc_avg_df['numDonation4year'].median()).astype(int)

In [22]:
reg_df2: DataFrame = reg_df.merge(pc_avg_df, on=[const.GVKEY], how='left')
reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20241212_stock_act_reg_data.dta'), write_index=False, version=119)

# Append Government Contract data

In [2]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241212_stock_act_reg_data.dta'))
gov_contract_df: DataFrame = pd.read_csv(os.path.join(const.DATA_PATH, 'fromZGY', 'gov_characters_data.csv')).drop(['tic'], axis=1)

In [3]:
gov_contract_df.rename(columns=lambda x: '{}_num'.format(x) if x.endswith('gov') else x, inplace=True)

In [4]:
gov_contract_df

Unnamed: 0,fiscal_year,fed_gov_num,gov_num,state_gov_num,loc_gov_num,gvkey
0,2008.0,0.0,0.0,0.0,0.0,151832
1,2009.0,0.0,0.0,0.0,0.0,151832
2,2010.0,0.0,0.0,0.0,0.0,151832
3,2011.0,0.0,0.0,0.0,0.0,151832
4,2012.0,0.0,0.0,0.0,0.0,151832
...,...,...,...,...,...,...
85811,2011.0,0.0,0.0,0.0,0.0,30165
85812,2012.0,0.0,0.0,0.0,0.0,30165
85813,2013.0,0.0,0.0,0.0,0.0,30165
85814,2014.0,0.0,0.0,0.0,0.0,30165


In [10]:
reg_df_gov_num = reg_df.merge(gov_contract_df, on=[const.GVKEY, const.YEAR], how='left')
for key in gov_contract_df.keys():
    if key not in {const.GVKEY, const.YEAR}:
        reg_df_gov_num[key].fillna(0, inplace=True)
        reg_df_gov_num[key.replace('num', 'dummy')] = reg_df_gov_num[key].apply(lambda x: int(x > 0))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reg_df_gov_num[key].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reg_df_gov_num[key].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [11]:
reg_df_gov_num.describe()

Unnamed: 0,index,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,...,highDonation3Year,highDonation4Year,fed_gov_num,gov_num,state_gov_num,loc_gov_num,fed_gov_dummy,gov_dummy,state_gov_dummy,loc_gov_dummy
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0
mean,6036.5,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,...,0.439043,0.465629,0.263541,0.295511,0.023025,0.008945,0.152725,0.160593,0.016316,0.006957
min,0.0,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3018.25,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6036.5,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9054.75,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12073.0,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,...,1.0,1.0,19.0,19.0,10.0,3.0,1.0,1.0,1.0,1.0
std,3485.60791,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,...,0.496291,0.498838,0.862415,0.934783,0.219107,0.114772,0.359737,0.36717,0.126693,0.083122


In [12]:
reg_df_gov_num.to_stata(os.path.join(const.RESULT_PATH, '20241223_stock_act_reg_data.dta'), write_index=False, version=119)

# append some annual data

In [50]:
reg_df: DataFrame = pd.read_stata(os.path.join(const.RESULT_PATH, '20241223_stock_act_reg_data.dta'))
gov_contract: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'gov_contract.dta')).rename(
    columns={'year': const.YEAR, 'if_gov_contract': 'hasGovContractPanel'}).drop(
    ['index', 'if_cpdata', 'if_costplus', 'if_cas', 'if_noncomm'], axis=1).dropna(how='any').drop_duplicates(
    subset=[const.GVKEY, const.YEAR], keep='first')
govsales_percentage: DataFrame = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'govsales_percentage.dta')).rename(
    columns={'tic': const.TICKER, 'annual_per_sale': 'GovSalePercent'}).drop(
    ['index', 'gov_annual_sales', 'all_annual_sales'], axis=1).dropna(how='any')



In [51]:
dollar_amount = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'fec_2008_2015_firmlevel_dollarnum.dta')).rename(
    columns={'year': const.YEAR, 'amount': 'contrAmt'}).drop(['index'], axis=1).dropna(how='any')
people_num = pd.read_stata(os.path.join(const.DATA_PATH, 'fromZGY', 'fec_2008_2015_firmlevel_peoplenum.dta')).rename(
    columns={'year': const.YEAR, 'recipient_ext_id': 'numPolitician'}).drop(['index'], axis=1).dropna(how='any')

In [52]:
gov_contract[const.YEAR] = gov_contract[const.YEAR].astype(int)
govsales_percentage[const.YEAR] = govsales_percentage[const.YEAR].astype(int)
dollar_amount[const.YEAR] = dollar_amount[const.YEAR].astype(int)
people_num[const.YEAR] = people_num[const.YEAR].astype(int)
gov_contract[const.GVKEY] = gov_contract[const.GVKEY].astype(int)
dollar_amount[const.GVKEY] = dollar_amount[const.GVKEY].astype(int)
people_num[const.GVKEY] = people_num[const.GVKEY].astype(int)

In [63]:
reg_df2: DataFrame = reg_df.merge(govsales_percentage, on=[const.TICKER, const.YEAR], how='left').merge(
    gov_contract, on=[const.GVKEY, const.YEAR], how='left').merge(
    dollar_amount, on=[const.GVKEY, const.YEAR], how='left').merge(
    people_num, on=[const.GVKEY, const.YEAR], how='left')
reg_df2.loc[:, 'hasGovContractPanel'] = reg_df2['hasGovContractPanel'].fillna(0)


In [47]:
reg_df2[['hasGovContract', 'hasGovContractPanel']].describe()

Unnamed: 0,hasGovContract,hasGovContractPanel
count,12074.0,12074.0
mean,0.281762,0.430098
std,0.449877,0.49511
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,1.0
max,1.0,1.0


In [64]:
reg_df2['hasGovPanelMajor'] = reg_df2.apply(lambda x: int((x['hasGovContractPanel'] == 1) and (x['MajorGovCustomer'] == 0)), axis=1)
reg_df2['hasGovPanelFed'] = reg_df2.apply(lambda x: int((x['hasGovContractPanel'] == 1) and (x['fed_gov_dummy'] == 0)), axis=1)

In [55]:
reg_df2.loc[:, ['GovSalePercent', 'contrAmt', 'numPolitician']].describe()

Unnamed: 0,GovSalePercent,contrAmt,numPolitician
count,1362.0,4984.0,4984.0
mean,0.421257,30176.18,33.989767
std,0.341576,104663.4,114.45102
min,0.001727,-6618.0,1.0
25%,0.104907,1250.0,2.0
50%,0.327493,5300.0,6.0
75%,0.694477,22643.0,22.0
max,1.0,3602357.0,2581.0


In [56]:
def create_high_low_dummy(df: DataFrame, check_key: str):
    df_median = df[check_key].median()
    df[f'High{check_key}'] = 0
    df[f'Low{check_key}'] = 0
    df.loc[df[check_key] > df_median, f'High{check_key}'] = 1
    df.loc[df[check_key] < df_median, f'Low{check_key}'] = 1

    return df


In [65]:
reg_df3 = reg_df2.copy()
for key in ['GovSalePercent', 'contrAmt', 'numPolitician']:
    reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)


  reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)
  reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)
  reg_df3: DataFrame = reg_df3.groupby(const.YEAR).apply(create_high_low_dummy, check_key=key).reset_index(drop=True)


In [71]:
reg_df3['GovSalePercent'] = reg_df3['GovSalePercent'].fillna(0)
reg_df3['contrAmt'] = reg_df3['contrAmt'].fillna(0)
reg_df3['numPolitician'] = reg_df3['numPolitician'].fillna(0)

In [72]:
reg_df3.to_stata(os.path.join(const.RESULT_PATH, '20241231_stock_act_data.dta'), write_index=False, version=117)

# Prepare regression data for totalP and disclosure

In [45]:
main_reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250704_stock_act_reg_data_v1.dta'))
placebo_reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250516_2002_2010_regression_data.dta'))

const.YEAR

'fiscal_year'

In [46]:
placebo_reg_df = placebo_reg_df[placebo_reg_df['gov_counts'].notnull()].copy()
usekey_list = 'log_frequency gov_indicator sic fiscal_year gvkey Size LEV BM ROA GuidanceForecast'.split(' ')
placebo_reg_df_useful = placebo_reg_df[usekey_list].rename(columns={'gov_indicator': 'MajorGovCustomer', 'log_frequency': 'logGuidanceForecast'})

In [47]:
main_reg_useful_df = main_reg_df[['frequency', 'log_frequency_w', 'MajorGovCustomer', 'log_market_value_w', 'lev_w', 'BM_w', 'ROA_w', 'sic', 'fiscal_year', 'gvkey']].rename(
    columns={'log_frequency_w': 'logGuidanceForecast', 'frequency': 'GuidanceForecast', 'log_market_value_w': "Size", 'lev_w': 'LEV', 'BM_w': 'BM', 'ROA_w': 'ROA'})
merged_df = pd.concat([placebo_reg_df_useful, main_reg_useful_df], axis=0, ignore_index=True).drop_duplicates(subset=[const.GVKEY, const.YEAR], keep='last')

In [48]:
pc_df = pd.read_stata(os.path.join(const.DATA_PATH, '20190217_FEC_general_firm_year_interaction_newly_without_duplicates_interactions3.dta'))
pc_valid_df = pc_df[['CSTAT_gvkey', 'year', 'FEC_LoseP', 'FEC_WonP', 'FEC_TotalP']].rename(
    columns={'CSTAT_gvkey': const.GVKEY, 'year': const.YEAR, 'FEC_LoseP': 'LoseP', 'FEC_WonP': 'WinP', 'FEC_TotalP': 'TotalP'})
pc_valid_df.loc[:, 'NumP'] = pc_valid_df['WinP'] + pc_valid_df['LoseP']
pc_valid_df['WinRatio'] = pc_valid_df['WinP'] / pc_valid_df['NumP']
pc_valid_df['LoseRatio'] = pc_valid_df['LoseP'] / pc_valid_df['NumP']

In [51]:
tmp_pc_df = pc_valid_df.copy()
pc_dis_df = merged_df.merge(tmp_pc_df, on=[const.GVKEY, const.YEAR], how='right')
pc_dis_df['sic'] = pc_dis_df['sic'].astype(str)
pc_dis_df.to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data.dta'), write_index=False)

In [15]:
tmp_pc_df = pc_valid_df.copy()
tmp_pc_df.loc[:, const.YEAR] -= 1
pc_dis_df = merged_df.merge(tmp_pc_df, on=[const.GVKEY, const.YEAR], how='inner')
pc_dis_df['sic'] = pc_dis_df['sic'].astype(int)
pc_dis_df.to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data_v2.dta'), write_index=False)

In [20]:
def get_election_cycle(fiscal_year):
    if fiscal_year % 2 == 0:
        return fiscal_year - 2
    else:
        return fiscal_year - 1

tmp_pc_df = pc_valid_df.rename(columns={const.YEAR: 'ElectionCycle'})
merged_df.loc[:, 'ElectionCycle'] = merged_df[const.YEAR].apply(get_election_cycle)

pc_dis_df = merged_df.merge(tmp_pc_df, on=[const.GVKEY, 'ElectionCycle'], how='inner')
pc_dis_df['sic'] = pc_dis_df['sic'].astype(int)
pc_dis_df.to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data_v3.dta'), write_index=False)


In [21]:
tmp_pc_df = pc_valid_df.rename(columns={const.YEAR: 'ElectionCycle'})
tmp_dis_df = merged_df.groupby([const.GVKEY, 'ElectionCycle'])[['Size', 'LEV', 'BM', 'ROA', 'GuidanceForecast']].mean().reset_index(drop=False)
pc_dis_df = tmp_dis_df.merge(tmp_pc_df, on=[const.GVKEY, 'ElectionCycle'], how='inner')
pc_dis_df['logGuidanceForecast'] = pc_dis_df['GuidanceForecast'].apply(np.log1p)
pc_dis_df.to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data_v4.dta'), write_index=False)

## merge with special election data

In [43]:
spc_df = pd.read_stata(os.path.join(const.DATA_PATH, '20180829_FEC_federal_special_firm_year_candidate_merged_data.dta'))
useful_col = 'year FEC_margin FEC_is_win FEC_is_close_election CSTAT_gvkey'.split(' ')
spc_df = spc_df[useful_col].rename(columns={'year': const.YEAR, 'CSTAT_gvkey': const.GVKEY})

In [53]:
spc_dis_df = merged_df.merge(spc_df, on=[const.GVKEY, const.YEAR], how='right')
spc_dis_df.describe()
spc_dis_df['sic'] = spc_dis_df['sic'].astype(str)
spc_dis_df.drop_duplicates(subset=[const.GVKEY, const.YEAR], keep='last').to_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_spc_reg_data.dta'), write_index=False)

# Construct Regression data for the following variables
## GPIN

In [15]:
crsp_comp_link = pd.read_pickle(os.path.join(const.TEMP_PATH, 'crsp_comp.pkl'))
crsp_comp_link['permco'] = crsp_comp_link['permco'].astype(int)
crsp_comp_link[const.GVKEY] = crsp_comp_link[const.GVKEY].astype(int)

In [13]:
cpin_daily_full = pd.read_csv(os.path.join(const.RESULT_PATH, '1993_2019_cpie_daily.csv'))
annual_gpin = cpin_daily_full.groupby(['permno', 'year']).agg({
    'cpie_gpin': 'mean',
}).reset_index(drop=False)
annual_gpin.rename(columns={'year': const.YEAR}, inplace=True)

In [16]:
annual_gpin_gvkey = pd.merge(annual_gpin, crsp_comp_link,
                             left_on='permno', right_on='permco', how='left')

## IDIOSYN

In [5]:
synchrony_monthly_df = pd.read_pickle(os.path.join(const.TEMP_PATH, '20250712_synchrony_monthly.pkl'))
synchrony_monthly_df['IDIOSYN'] = synchrony_monthly_df['SYNCHRONICITY'] * -1

In [18]:
synchrony_gvkey_df = pd.merge(synchrony_monthly_df.drop(['SYNCHRONICITY', 'SYNCHRONICITY_MKT', 'SYNCHRONICITY_IND'], axis=1),
                                crsp_comp_link,
                             left_on='PERMNO', right_on='permco', how='left')

## DELAY

In [6]:
price_delay_df = pd.read_pickle(os.path.join(const.TEMP_PATH, 'hm_2005_price_delay.pkl'))

In [26]:
price_delay_df.keys()

Index(['PERMNO', 'year', 'price_delay', 'r2_full', 'r2_restricted', 'n_obs'], dtype='object')

In [28]:
delay_gvkey_df = pd.merge(price_delay_df, crsp_comp_link,
                          left_on='PERMNO', right_on='permco', how='left').dropna(subset=[const.GVKEY]).rename(columns={'year': const.YEAR}).drop(['PERMNO', 'r2_full',  'r2_restricted', 'n_obs', 'permco'], axis=1)

## ANALYSTS

In [19]:
analyst_coverage = pd.read_csv(os.path.join(const.TEMP_PATH, '2001_2016_annual_analyst_coverage.csv'), usecols=['gvkey', 'permno', 'coverage', const.YEAR])

## FCSTERROR and DISPERSION

In [10]:
fcsterror_dispersion_df = pd.read_csv(os.path.join(const.TEMP_PATH, '2001_2016_annual_dispersion_fcsterror.csv'),
                                      usecols=['permno', 'year', 'DISPERSION', 'DISPERSION_last',
       'FCSTERROR', 'FCSTERROR_last'])

In [20]:
fd_gvkey_df = pd.merge(fcsterror_dispersion_df, crsp_comp_link,
                             left_on='permno', right_on='permco', how='left')

## Merge all five dataset

In [62]:
print(annual_gpin.keys())
print(synchrony_monthly_df.keys())
print(analyst_coverage.keys())
print(price_delay_df.keys())
print(fcsterror_dispersion_df.keys())

Index(['permno', 'fiscal_year', 'cpie_gpin'], dtype='object')
Index(['PERMNO', 'year', 'SYNCHRONICITY', 'SYNCHRONICITY_MKT',
       'SYNCHRONICITY_IND', 'IDIOSYN'],
      dtype='object')
Index(['fiscal_year', 'coverage', 'permno', 'gvkey'], dtype='object')
Index(['PERMNO', 'year', 'price_delay', 'r2_full', 'r2_restricted', 'n_obs'], dtype='object')
Index(['year', 'permno', 'DISPERSION', 'DISPERSION_last', 'FCSTERROR',
       'FCSTERROR_last'],
      dtype='object')


In [64]:
idiosyn_df = synchrony_monthly_df[['PERMNO', 'year', 'IDIOSYN']].rename(columns={'year': const.YEAR, 'PERMNO': 'permno'})
ac_df = analyst_coverage.drop(['gvkey'], axis=1)
pd_df = price_delay_df[['PERMNO', 'year', 'price_delay']].rename(columns={'year': const.YEAR, 'PERMNO': 'permno'})
fd_df = fcsterror_dispersion_df.rename(columns={'year': const.YEAR})
dep_df = annual_gpin.merge(idiosyn_df, on=['permno', const.YEAR], how='outer').merge(
    ac_df, on=['permno', const.YEAR], how='outer').merge(
    pd_df, on=['permno', const.YEAR], how='outer').merge(
    fd_df, on=['permno', const.YEAR], how='outer')

In [66]:
dep_df.to_pickle(os.path.join(const.TEMP_PATH, '20250712_temp_dependent_variables.pkl'))

## Construct Regression data

In [3]:
dep_df = pd.read_pickle(os.path.join(const.TEMP_PATH, '20250712_temp_dependent_variables.pkl'))
ccm_path = r'D:\Users\wangy\Documents\data\ccm'
ccm_df = pd.read_sas(os.path.join(os.path.join(ccm_path, 'ccm_lookup.sas7bdat')), encoding='utf-8')

In [4]:
print('dep_df keys: ', list(dep_df.keys()))
print('ccm_df keys: ', list(ccm_df.keys()))

dep_df keys:  ['permno', 'fiscal_year', 'cpie_gpin', 'IDIOSYN', 'coverage', 'price_delay', 'DISPERSION', 'DISPERSION_last', 'FCSTERROR', 'FCSTERROR_last']
ccm_df keys:  ['GVKEY', 'LPERMNO', 'LPERMCO', 'LINKDT', 'LINKENDDT', 'conm', 'tic', 'cusip', 'cik', 'sic', 'naics', 'gsubind', 'gind', 'year1', 'year2']


In [5]:
# 标准化列名
dep_df.columns = dep_df.columns.str.lower()
ccm_df.columns = ccm_df.columns.str.lower()

# 保留需要的字段
ccm_sub = ccm_df[['gvkey', 'lpermno', 'year1', 'year2']].copy()

# 为提高 merge 效率，先 merge on permno，再做年份筛选
merged = pd.merge(dep_df, ccm_sub, left_on='permno', right_on='lpermno', how='left')

# 年份条件筛选：fiscal_year ∈ [year1, year2]
matched = merged[
    (merged['fiscal_year'] >= merged['year1']) &
    (merged['fiscal_year'] <= merged['year2'])
].copy()

# 如果多个匹配，选择最近的 year1（可选）
matched = matched.sort_values(by=['permno', 'fiscal_year', 'year1'], ascending=[True, True, False])
matched = matched.drop_duplicates(subset=['permno', 'fiscal_year'], keep='first')

# 删除辅助列（可选）
matched = matched.drop(columns=['lpermno', 'year1', 'year2'])

In [13]:
matched[const.GVKEY] = matched[const.GVKEY].astype(int)

In [16]:
reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_pc_reg_data_v2.dta'))
spc_reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250711_stock_act_spc_reg_data_v2.dta'))

In [17]:
# merge to next year
tmp_dep_df = matched.copy()
tmp_dep_df['fiscal_year'] = tmp_dep_df['fiscal_year'] + 1

reg_df2 = reg_df.merge(tmp_dep_df, on=[const.GVKEY, const.YEAR], how='left')

reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250713_staock_act_pc_reg_data.dta'), write_index=False)

spc_reg_df2 = spc_reg_df.merge(tmp_dep_df, on=[const.GVKEY, const.YEAR], how='left')

spc_reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250713_staock_act_spc_reg_data.dta'), write_index=False)

In [58]:
# merge to next year
tmp_fd_df = fd_gvkey_df.rename(columns={'year': const.YEAR}).drop(['permno', 'permco'], axis=1).dropna(subset=[const.GVKEY])
tmp_analyst_df = analyst_coverage.drop(['permno'], axis=1).dropna(subset=[const.GVKEY])
tmp_gpin_df = annual_gpin_gvkey.drop(['permno', 'permco'], axis=1).dropna(subset=[const.GVKEY])
tmp_idio_df = synchrony_gvkey_df.rename(columns={'year': const.YEAR}).drop(['PERMNO', 'permco'], axis=1).dropna(subset=[const.GVKEY])
tmp_delay_df = delay_gvkey_df.copy()

reg_df2 = reg_df.merge(tmp_fd_df, on=[const.GVKEY, const.YEAR], how='left').merge(tmp_analyst_df, on=[const.GVKEY, const.YEAR], how='left').merge(tmp_gpin_df, on=[const.GVKEY, const.YEAR], how='left').merge(tmp_idio_df, on=[const.GVKEY, const.YEAR], how='left').merge(tmp_delay_df, on=[const.GVKEY, const.YEAR], how='left')

reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250712_staock_act_pc_reg_data.dta'), write_index=False)

spc_reg_df2 = spc_reg_df.merge(tmp_fd_df, on=[const.GVKEY, const.YEAR], how='left').merge(tmp_analyst_df, on=[const.GVKEY, const.YEAR], how='left').merge(tmp_gpin_df, on=[const.GVKEY, const.YEAR], how='left').merge(tmp_idio_df, on=[const.GVKEY, const.YEAR], how='left').merge(tmp_delay_df, on=[const.GVKEY, const.YEAR], how='left')

spc_reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250712_staock_act_spc_reg_data.dta'), write_index=False)

In [59]:
reg_df.loc[reg_df[const.GVKEY].isin(set(tmp_gpin_df[const.GVKEY])), const.GVKEY].unique()

array([ 23000.,   1177., 116004.,   1678.,   8542.,  60797.,  27925.,
       109919.,  63268.,  61355., 118121., 177376.,  63501.,   4194.,
       114524., 170617.,  31683.,   4839.,  30536.,  61077.,  19428.,
        22260.,  27928.,   6178.,  17239.,  61064., 164109.,  62784.,
        66435.,  31460.,  28758.,  63180.,  63527.,  64552.,  10426.,
        64807.,  29616.])

In [60]:
reg_df.loc[reg_df[const.GVKEY].isin(set(tmp_gpin_df[const.GVKEY])), const.GVKEY].shape

(180,)

In [34]:
tmp_fd_df[const.YEAR].describe()

count    3793.000000
mean     2011.491695
std         4.552379
min      2003.000000
25%      2007.000000
50%      2013.000000
75%      2016.000000
max      2017.000000
Name: fiscal_year, dtype: float64