In [33]:
import os

import pandas as pd
import numpy as np
from pandas import DataFrame
from scipy import stats

from Constants import Constants as const

In [2]:
# Step 1: Load the data and create event time variable
file_path = os.path.join(const.RESULT_PATH, '20250220_stock_act_data_v2.dta')
df = pd.read_stata(file_path)

In [3]:
id_df = df[[const.GVKEY, const.YEAR, 'MajorGovCustomer']].copy()
id_df.to_csv(os.path.join(const.RESULT_PATH, '20250225_stock_act_firm_list.csv'), index=False)

# Construct Abnormal Trading Volume

First, Calculate the mean and standard deviation of trading volume using the data with in the [-252, -21] event window.
Then, using the trading volume around the event date minus the average trading volume, divided by the standard deviation of trading volume
Last, Calculate abnormal trading volume using the two event window [-2, 2] and [-1, 1]

In [11]:
#------------------------------------------------------------
# 1. Load the data
#------------------------------------------------------------
# Event data
events = pd.read_stata(os.path.join(const.RESULT_PATH, 'FinalCARData_20250306_lxt.dta'))
# CRSP data
crsp = pd.read_csv(os.path.join(const.DATABASE_PATH, 'crsp', '2007_2015_CRSP_vol_data.zip'), dtype={'NCUSIP': str})


In [10]:
crsp['NCUSIP'].dtypes

dtype('O')

In [12]:
# Convert dates to datetime
events['rdq'] = pd.to_datetime(events['rdq'])
crsp['date'] = pd.to_datetime(crsp['date'])

# Preprocess CRSP data: group by NCUSIP for faster lookup
crsp_grouped = crsp.groupby('NCUSIP')

def calculate_abnormal_volume(row):
    """Calculate abnormal volume for a single event row"""
    cusip = row['cusip_8']
    event_date = row['rdq']

    # Get firm data from CRSP
    try:
        firm_data = crsp_grouped.get_group(cusip).sort_values('date').reset_index(drop=True)
    except KeyError:
        return pd.Series([np.nan, np.nan])

    # Find event date position
    event_mask = (firm_data['date'] == event_date)
    if not event_mask.any():
        return pd.Series([np.nan, np.nan])

    event_idx = firm_data[event_mask].index[0]

    # Estimation window [-252, -21]
    start_est = max(0, event_idx - 252)
    end_est = max(0, event_idx - 21)

    if start_est >= end_est:
        return pd.Series([np.nan, np.nan])

    estimation_vol = firm_data.loc[start_est:end_est, 'VOL']
    mean_vol = estimation_vol.mean()
    std_vol = estimation_vol.std()

    if std_vol == 0:
        return pd.Series([np.nan, np.nan])

    # Calculate abnormal volumes
    def calc_window_abnormal(start, end):
        window_start = max(0, event_idx + start)
        window_end = min(len(firm_data)-1, event_idx + end)
        window_vol = firm_data.loc[window_start:window_end, 'VOL']
        return ((window_vol - mean_vol) / std_vol).sum()

    cav_2_2 = calc_window_abnormal(-2, 2)
    cav_1_1 = calc_window_abnormal(-1, 1)

    return pd.Series([cav_2_2, cav_1_1])

# Apply the calculation to each row
events[['CAV_2_2', 'CAV_1_1']] = events.apply(calculate_abnormal_volume, axis=1)

In [15]:
events.describe()

Unnamed: 0,fiscal_year,majorgovcustomer,rdq,CAPMCAR22,CAPMCAR11,CAPMCAR13,CAPMCAR15,CAPMCAR05,CAPMCAR55,FF3CAR22,...,mkvaltq,prccq,total_debt,size,roa,bm,lev,post,CAV_2_2,CAV_1_1
count,31954.0,31954.0,31954,31954.0,31954.0,31954.0,31954.0,31954.0,31954.0,31954.0,...,31866.0,31951.0,29950.0,31866.0,31857.0,31815.0,29950.0,31954.0,30859.0,30859.0
mean,2011.71919,0.164173,2012-06-01 06:39:43.476247040,0.001849,0.002528,0.001271,0.001266,0.000717,0.001381,0.001944,...,6760.489074,31.89119,1575.073242,6.803043,0.002896,1.397894,0.193197,0.538149,3.530836,2.942663
min,2008.0,0.0,2008-02-12 00:00:00,-0.879886,-0.750561,-0.843614,-1.107498,-1.040059,-1.281628,-0.933216,...,1.2611,0.0701,0.0,2.293272,-0.207989,0.120152,0.0,0.0,-7.461892,-4.730442
25%,2010.0,0.0,2010-07-21 00:00:00,-0.039554,-0.039003,-0.04662,-0.05028,-0.049351,-0.055744,-0.039331,...,225.16115,8.5,0.75025,5.416817,-0.000252,0.558467,0.003971,0.0,-0.540035,-0.156169
50%,2012.0,0.0,2012-07-25 00:00:00,-0.00035,0.000158,-0.000416,-0.000689,-0.001032,-0.000676,-0.000253,...,886.9696,19.83,109.861,6.787811,0.009551,0.997759,0.153694,1.0,1.680589,1.534981
75%,2014.0,0.0,2014-05-07 00:00:00,0.041397,0.042193,0.046423,0.050171,0.048048,0.055246,0.041203,...,3441.72935,39.11,846.596252,8.143729,0.020222,1.69023,0.315476,1.0,5.305096,4.263735
max,2015.0,1.0,2016-08-08 00:00:00,1.653363,1.847709,1.679236,1.944226,2.049613,1.939718,1.667979,...,717000.2515,4197.95,27464.0,11.638428,0.087504,8.648678,0.775204,1.0,1505.069737,973.714104
std,2.261275,0.370439,,0.093874,0.091006,0.101217,0.110006,0.107234,0.120843,0.094214,...,23775.584679,82.912287,4230.963867,2.045294,0.041155,1.393723,0.190146,0.498556,12.023182,8.599978


In [20]:
for key in ['CAPMCAR22', 'CAPMCAR11', 'CAPMCAR13', 'CAPMCAR15', 'CAPMCAR05',
       'CAPMCAR55', 'FF3CAR22', 'FF3CAR11', 'FF3CAR13', 'FF3CAR15', 'FF3CAR05',
       'FF3CAR55']:
    events[f'abs_{key}'] = events[key].abs()

events['qtr'] = events['fqtr'].apply(lambda x: int(x.split('Q')[1]))
events.to_stata(os.path.join(const.RESULT_PATH, '20250307_stock_act_eap_test.dta'), write_index=False)

# Remerge ICC data

In [2]:
# Load the datasets
stock_data = pd.read_stata(os.path.join(const.RESULT_PATH, '20250307_stock_act_data_v1.dta'))
link_data = pd.read_csv(os.path.join(const.DATABASE_PATH, 'crsp', 'crsp_compustat_link.zip'), compression='zip')

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  stock_data = pd.read_stata(os.path.join(const.RESULT_PATH, '20250307_stock_act_data_v1.dta'))


In [14]:
# Convert date columns with "E" handling
link_data['LINKENDDT'] = link_data['LINKENDDT'].replace('E', '20991231')  # Replace 'E' with future date
link_data['LINKDT'] = pd.to_datetime(link_data['LINKDT'], format='%Y%m%d', errors='coerce')
link_data['LINKENDDT'] = pd.to_datetime(link_data['LINKENDDT'], format='%Y%m%d', errors='coerce')

In [4]:
link_data.drop(['tic', 'cusip', 'cik', 'LINKPRIM', 'LIID', 'LINKTYPE'], axis=1, inplace=True)

In [5]:
link_data['LINKENDDT'] = link_data['LINKENDDT'].fillna(pd.to_datetime('2099-12-31'))

In [6]:
# Create fiscal_year_end (assuming fiscal year ends on Dec 31)
stock_data['fiscal_year_end'] = pd.to_datetime(
    stock_data['fiscal_year'].astype(str) + '1231',
    format='%Y%m%d',
    errors='coerce'
)

In [54]:
# Merge datasets on gvkey
merged = pd.merge(stock_data, link_data, on='gvkey', how='left')

In [63]:
# Split into two groups: matched and unmatched
mask_matched = merged['LINKDT'].notna()  # Rows with link data
matched = merged[mask_matched].copy()

In [64]:
# Process matched data
valid_mask = (
    (matched['fiscal_year_end'] >= matched['LINKDT']) &
    (matched['fiscal_year_end'] <= matched['LINKENDDT'])
)
valid_matched = matched[valid_mask]

In [66]:
unmatched = matched[~matched['index'].isin(valid_matched['index'])].copy()

In [28]:
# Deduplicate matched data (keep most recent link)
matched = matched.sort_values(['gvkey', 'fiscal_year', 'LINKDT'], ascending=[True, True, False])
matched = matched.drop_duplicates(['gvkey', 'fiscal_year'], keep='first')

In [67]:
# Recombine matched and unmatched data
final_merged = pd.concat([matched, unmatched], axis=0)

# Deduplicate matched data (keep most recent link)
final_merged = final_merged.sort_values(['gvkey', 'fiscal_year', 'LINKDT'], ascending=[True, True, False])
final_merged = final_merged.drop_duplicates(['gvkey', 'fiscal_year'], keep='first')

# Cleanup columns (optional)
final_merged = final_merged.drop(columns=['LINKDT', 'LINKENDDT', 'fiscal_year_end', 'index'])

In [68]:
final_merged.shape

(12074, 178)

In [44]:
file_name = 'erp_public_annual_240107'
lee_df: DataFrame = pd.read_csv(
        os.path.join(const.DATABASE_PATH, 'Cost of Capital', f'{file_name}.zip'))
lee_df['yearmonth'] = pd.to_datetime(lee_df['yearmonth'], format='%Y%m')
lee_df[const.YEAR] = lee_df['yearmonth'].dt.year

lee_gvkey_df: DataFrame = lee_df.drop(['permno'], axis=1)

lee_annual_df_gvkey = lee_gvkey_df.drop(['yearmonth'], axis=1).groupby([const.GVKEY, const.YEAR]).mean().reset_index(
    drop=False)

lee_permno_df: DataFrame = lee_df.drop(['gvkey'], axis=1)

lee_annual_df_permno = lee_permno_df.drop(['yearmonth'], axis=1).groupby(['permno', const.YEAR]).mean().reset_index(
    drop=False)

In [46]:
lee_annual_df_gvkey.head()

Unnamed: 0,gvkey,fiscal_year,CCC,ICCA,FIC,FBM,GLS_mech,OJM_mech,CAT_mech,PEG_mech,GLS_an,OJM_an,CAT_an,PEG_an,JLR,LPV,CER,FF6,QFM
0,1000,1971,0.138569,,,,0.135219,0.130235,0.061702,0.227121,,,,,,,,,
1,1000,1972,0.012362,,,0.156587,0.093629,0.069,-0.286032,0.172853,,,,,0.163472,0.164458,0.160909,0.363578,-0.050405
2,1000,1973,0.178617,,,0.394692,0.145194,0.259831,0.179648,0.129796,,,,,0.15952,0.205446,0.182483,0.677393,0.111991
3,1000,1974,0.254683,,,-0.335743,0.164369,0.371771,0.330095,0.152499,,,,,0.143503,0.201762,0.172632,-0.48193,-0.189555
4,1000,1975,0.27455,,,0.922904,0.181502,0.271667,0.372486,0.272544,,,,,0.146125,0.135327,0.140726,1.007678,0.83813


In [76]:
reg_df: DataFrame = final_merged.merge(lee_annual_df_gvkey, on=[const.GVKEY, const.YEAR], how='left').merge(
    lee_annual_df_permno, left_on=['LPERMNO', const.YEAR], right_on=['permno', const.YEAR], how='left', suffixes=('', '_permno')).drop(
    ['permno'], axis=1).merge(
    lee_annual_df_permno, left_on=['LPERMCO', const.YEAR], right_on=['permno', const.YEAR], how='left', suffixes=('', '_permco')
)

In [77]:
reg_df.shape

(12074, 230)

In [79]:
drop_keys = list()

for key in lee_annual_df_gvkey.keys():
    if key in {const.GVKEY, const.YEAR}:
        continue

    reg_df.loc[:, f'{key}'] = reg_df.loc[:, f'{key}'].fillna(reg_df.loc[:, f'{key}_permno']).fillna(
        reg_df.loc[:, f'{key}_permco'])
    drop_keys.append(f'{key}_permno')
    drop_keys.append(f'{key}_permco')

reg_df.drop(drop_keys, axis=1, inplace=True)

In [80]:
reg_df[[i for i in lee_annual_df_gvkey.keys() if i not in {const.GVKEY, const.YEAR}]].describe()

Unnamed: 0,CCC,ICCA,FIC,FBM,GLS_mech,OJM_mech,CAT_mech,PEG_mech,GLS_an,OJM_an,CAT_an,PEG_an,JLR,LPV,CER,FF6,QFM
count,8550.0,6016.0,1194.0,11517.0,8492.0,7155.0,8467.0,8543.0,6016.0,5953.0,6012.0,6016.0,11182.0,10958.0,11198.0,11517.0,11517.0
mean,0.059158,0.104154,0.038322,4.172387,0.096837,0.082764,-0.074231,0.130589,0.096918,0.128373,0.084776,0.110108,0.081406,0.115071,0.097533,5.896023,2.44875
std,0.113958,0.043104,0.076372,109.636933,0.069774,0.132642,0.19782,0.151203,0.03426,0.070667,0.046474,0.064241,0.051448,0.038292,0.043039,173.571609,58.567607
min,-0.301189,0.010139,-0.175428,-0.894024,-0.486484,-0.944422,-0.764966,0.0,-0.085789,-0.030945,-0.011442,0.0,-0.226653,-0.028575,-0.164294,-0.904504,-0.883545
25%,-0.009009,0.078564,-0.013902,0.178178,0.062833,0.031721,-0.191808,0.016581,0.076986,0.087738,0.062009,0.075722,0.051308,0.092938,0.072758,0.174696,0.158006
50%,0.033337,0.094436,0.033459,0.41477,0.083015,0.050049,-0.086795,0.086853,0.094095,0.10836,0.078039,0.096739,0.077922,0.110155,0.092738,0.422477,0.391153
75%,0.091487,0.117586,0.085393,0.888823,0.11015,0.076941,0.021548,0.17894,0.112365,0.147725,0.097221,0.129362,0.110229,0.131678,0.119286,0.946237,0.819179
max,0.867143,0.523528,0.329677,7989.308746,0.983913,0.997,0.990605,0.990756,0.534496,0.935309,0.673604,0.789173,0.285082,0.321112,0.303097,11518.744239,4957.995528


In [82]:
reg_df.to_stata(os.path.join(const.RESULT_PATH, '20250307_stock_act_data_v2.dta'), write_index=False, version=119)