In [1]:
import os

import pandas as pd
from pandas import DataFrame
import numpy as np


from Constants import  Constants as const

In [2]:
# Load the SEC Analytics Stata file and list the distinct form types it holds.
sec_analytics_path = os.path.join(const.DATA_PATH, 'SECAnalytics.dta')
sec_df = pd.read_stata(sec_analytics_path)
sec_df.loc[:, 'form'].unique()

array(['8-K', '8-K/A', '8-K12B', '8-K12G3', '8-K15D5', '18-K/A',
       '8-K12B/A', '8-K12G3/A'], dtype=object)

In [3]:
# Form types kept as 8-K filings. '18-K/A', which also appears in the data,
# is deliberately absent from this list.
valid_8k_forms = ['8-K', '8-K/A', '8-K12B', '8-K12B/A', '8-K12G3', '8-K12G3/A', '8-K15D5']

# Boolean mask first, then an explicit copy so later column assignments do
# not write into a view of sec_df.
is_valid_8k = sec_df['form'].isin(valid_8k_forms)
df_8k = sec_df.loc[is_valid_8k].copy()

In [4]:
# Make sure the 'rdate' column is datetime-typed.
parsed_rdate = pd.to_datetime(df_8k['rdate'])
df_8k['rdate'] = parsed_rdate

In [5]:
# Derive the two merge keys: the calendar year of 'rdate' and a numeric gvkey.
# NOTE(review): the year column is taken from the calendar year of 'rdate';
# confirm this is the intended match for the year key used downstream.
rdate_year = df_8k['rdate'].dt.year
gvkey_numeric = pd.to_numeric(df_8k['gvkey'])
df_8k[const.YEAR] = rdate_year
df_8k[const.GVKEY] = gvkey_numeric

In [10]:
# === Variable 1: number of unique 8-K filings per firm per year ===
# Keep one row per (firm, year, filing file name), then count rows per firm-year.
filing_id_cols = [const.GVKEY, const.YEAR, 'wrdsfname']
df_8k_filings = df_8k.drop_duplicates(subset=filing_id_cols, keep='first')
num_8k_filings = (
    df_8k_filings
    .groupby([const.GVKEY, const.YEAR])['wrdsfname']
    .size()
    .reset_index(name='num_8k_filings')
)

In [16]:
# Peek at the first 'nitem' value. Use positional access: the index is
# inherited from the filtered/deduplicated frame, so a row labelled 0 is not
# guaranteed to exist and a label lookup could raise KeyError.
df_8k_filings['nitem'].iloc[0]

'2.01'

In [17]:
# === Variable 2: total number of 8-K items per firm per year ===
# Sum 'nitemno' over the deduplicated filings; min_count=1 leaves a group as
# NaN (rather than 0) when it has no non-missing 'nitemno' value.
firm_year_keys = [const.GVKEY, const.YEAR]
num_8k_items = (
    df_8k_filings
    .groupby(firm_year_keys)['nitemno']
    .sum(min_count=1)
    .reset_index(name='num_8k_items')
)

# === Variable 3: number of earnings-related items ===
# Flag rows of the full (non-deduplicated) 8-K frame whose 'nitem' code is in
# the set below, then count the flagged rows per firm-year.
earnings_items = {'2.02', '7.01', '8.01', '1.01', '1.02', '2.01', '2.04','2.05', '2.06'}
df_8k['is_earn'] = df_8k['nitem'].isin(earnings_items)
earn_rows = df_8k[df_8k['is_earn']]
num_earn_items = (
    earn_rows
    .groupby([const.GVKEY, const.YEAR])
    .size()
    .reset_index(name='num_earn_items')
)


In [21]:
# === Combine all results ===
# Outer merges keep every firm-year that appears in any of the three tables.
summary_keys = [const.GVKEY, const.YEAR]
df_summary = pd.merge(num_8k_filings, num_8k_items, on=summary_keys, how='outer')
df_summary = pd.merge(df_summary, num_earn_items, on=summary_keys, how='outer')

# Firm-years without any flagged item have no row in num_earn_items: count is 0.
df_summary['num_earn_items'] = df_summary['num_earn_items'].fillna(0)
# Non-earnings items are the remainder of the item total.
df_summary['num_nonearn_items'] = df_summary['num_8k_items'].sub(df_summary['num_earn_items'])

In [23]:
# Show the column labels of the firm-year summary table.
df_summary.columns

Index(['gvkey', 'fiscal_year', 'num_8k_filings', 'num_8k_items',
       'num_earn_items', 'num_nonearn_items'],
      dtype='object')

In [22]:
# Load the regression panel and attach the firm-year 8-K counts to it.
reg_df = pd.read_stata(os.path.join(const.RESULT_PATH, '20250604_stock_act_reg_data_v1.dta'))

# A left merge keeps every regression observation; firm-years with no match
# in df_summary get NaN in the count columns.
# validate='many_to_one' makes pandas raise MergeError if df_summary ever
# contains duplicate (gvkey, year) keys, instead of silently duplicating
# regression rows.
reg_df2 = reg_df.merge(
    df_summary,
    on=[const.GVKEY, const.YEAR],
    how='left',
    validate='many_to_one',
)
reg_df2.describe()

Unnamed: 0,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,ROA,...,RegIn_Time_my_high,RegIn_Dollar_my_avg,RegIn_Dollar_my_high,latitude,longitude,distance_to_dc_km,num_8k_filings,num_8k_items,num_earn_items,num_nonearn_items
count,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,12074.0,...,12074.0,8786.0,12074.0,12074.0,12074.0,12074.0,10545.0,10545.0,10545.0,10545.0
mean,81220.205317,2011.54696,0.363508,4.817459,0.868412,0.153553,0.508945,6.51741,0.181687,-0.054435,...,0.500083,96.426826,0.500083,37.982031,-87.575054,1779.385976,13.11816,26.81394,11.951257,14.862684
min,1004.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.786519,0.0,-33.150838,...,0.0,14.131963,0.0,-34.605469,-157.883326,1.101796,1.0,1.0,0.0,0.0
25%,12142.0,2010.0,0.0,0.0,0.0,0.0,0.0,5.012064,0.00093,-0.039075,...,0.0,91.059692,0.0,33.975391,-98.393418,493.13121,8.0,16.0,6.0,9.0
50%,62592.0,2012.0,0.0,0.0,0.0,0.0,1.0,6.520895,0.148584,0.027111,...,1.0,94.937225,1.0,39.287801,-86.137865,1195.1859,11.0,23.0,10.0,13.0
75%,156153.0,2014.0,1.0,7.0,2.079442,0.0,1.0,7.976261,0.264176,0.066365,...,1.0,101.60939,1.0,41.673379,-75.550073,2419.348096,16.0,33.0,15.0,18.0
max,296753.0,2015.0,1.0,36.0,3.610918,1.0,1.0,11.561005,17.82545,1.247209,...,1.0,139.373062,1.0,61.186678,152.963505,15710.188305,162.0,321.0,157.0,164.0
std,73644.923742,2.279738,0.481029,8.551352,1.2453,0.360535,0.499941,2.163776,0.273871,0.496132,...,0.500021,10.007222,0.500021,6.698589,29.216612,1781.528276,7.763607,16.834492,8.706231,8.848418


In [24]:
# Missing counts (e.g. regression firm-years with no match in the 8-K summary)
# are set to 0 before the log transform.
# NOTE(review): this treats "no matching record" as "zero filings/items" —
# confirm that is the intended interpretation.
for key in ['num_8k_filings', 'num_8k_items', 'num_earn_items', 'num_nonearn_items']:
    reg_df2[key] = reg_df2[key].fillna(0)
    # ln(1 + x), computed on the whole column at once instead of through a
    # per-element Python lambda.
    reg_df2['ln_' + key] = np.log1p(reg_df2[key])

# Write the augmented regression data set to a new Stata file.
reg_df2.to_stata(os.path.join(const.RESULT_PATH, '20250704_stock_act_reg_data_v1.dta'), write_index=False, version=119)