In [1]:
import os

import pandas as pd
import numpy as np
from scipy import stats

from Constants import Constants as const

In [2]:
# Step 1: Read the STOCK Act Stata file from the results directory
file_path = os.path.join(
    const.RESULT_PATH,
    '20250220_stock_act_data_v2.dta',
)
df = pd.read_stata(file_path)

In [3]:
# Keep only the firm/year identifiers plus the MajorGovCustomer flag and
# write them out as a standalone CSV (row index omitted).
id_df = df.loc[:, [const.GVKEY, const.YEAR, 'MajorGovCustomer']].copy()
id_df.to_csv(
    os.path.join(const.RESULT_PATH, '20250225_stock_act_firm_list.csv'),
    index=False,
)

# Construct Abnormal Trading Volume

First, calculate the mean and standard deviation of trading volume using the data within the [-252, -21] estimation window (relative to the event date).
Then, for each day around the event date, subtract the mean trading volume from that day's trading volume and divide by the standard deviation of trading volume.
Last, sum these standardized abnormal volumes over the two event windows [-2, 2] and [-1, 1] to obtain the cumulative abnormal trading volume.

In [11]:
# ============================================================
# 1. Load the data
# ============================================================
# Event file (Stata format) from the results directory
events = pd.read_stata(
    os.path.join(const.RESULT_PATH, 'FinalCARData_20250306_lxt.dta')
)
# CRSP volume file (zipped CSV); NCUSIP is forced to str at parse time
crsp = pd.read_csv(
    os.path.join(const.DATABASE_PATH, 'crsp', '2007_2015_CRSP_vol_data.zip'),
    dtype={'NCUSIP': str},
)


In [10]:
# Check the dtype pandas assigned to the NCUSIP column after loading
crsp['NCUSIP'].dtype

dtype('O')

In [12]:
# Parse the date columns in place so event dates and CRSP dates can be
# compared as timestamps.
for frame, date_col in ((events, 'rdq'), (crsp, 'date')):
    frame[date_col] = pd.to_datetime(frame[date_col])

# Group CRSP once by NCUSIP; each event row later fetches its firm's group
crsp_grouped = crsp.groupby(by='NCUSIP')

def calculate_abnormal_volume(row: pd.Series) -> pd.Series:
    """Compute cumulative standardized abnormal trading volume for one event row.

    The firm's rows are fetched from the module-level ``crsp_grouped``
    (CRSP grouped by NCUSIP) using ``row['cusip_8']`` and sorted by ``date``.
    The event row is the first row whose ``date`` equals ``row['rdq']``.
    ``VOL`` in each event window is standardized with the mean and sample
    standard deviation of ``VOL`` over row offsets [-252, -21] relative to
    the event row, and the standardized values are summed.

    Both windows are clipped at the boundaries of the firm's data, so a firm
    with a short history gets a shorter estimation (or event) window.

    Parameters
    ----------
    row : pd.Series
        One event record providing ``'cusip_8'`` and ``'rdq'``.

    Returns
    -------
    pd.Series
        Two values, positionally ``[CAV over [-2, 2], CAV over [-1, 1]]``.
        Both are NaN when the cusip has no CRSP rows, ``rdq`` is not among
        the firm's dates, the estimation window is empty, or the
        estimation-window standard deviation is zero or undefined (e.g.
        ``VOL`` is missing throughout the estimation window). A window whose
        ``VOL`` values are all missing also yields NaN.
    """
    missing = pd.Series([np.nan, np.nan])

    cusip = row['cusip_8']
    event_date = row['rdq']

    # Get firm data from CRSP; reset the index so labels equal row positions
    try:
        firm_data = crsp_grouped.get_group(cusip).sort_values('date').reset_index(drop=True)
    except KeyError:
        return missing

    # Find event date position (first match if the date appears more than once)
    event_mask = (firm_data['date'] == event_date)
    if not event_mask.any():
        return missing

    event_idx = firm_data[event_mask].index[0]

    # Estimation window [-252, -21]; .loc slicing is inclusive on both ends
    start_est = max(0, event_idx - 252)
    end_est = max(0, event_idx - 21)

    if start_est >= end_est:
        return missing

    estimation_vol = firm_data.loc[start_est:end_est, 'VOL']
    mean_vol = estimation_vol.mean()
    std_vol = estimation_vol.std()

    # A NaN std (fewer than two non-missing volumes) must be rejected too:
    # `std_vol == 0` is False for NaN, and the all-NaN standardized window
    # would then sum to 0.0 instead of being reported as missing.
    if not np.isfinite(std_vol) or std_vol == 0:
        return missing

    # Calculate abnormal volumes
    def calc_window_abnormal(start, end):
        window_start = max(0, event_idx + start)
        window_end = min(len(firm_data) - 1, event_idx + end)
        window_vol = firm_data.loc[window_start:window_end, 'VOL']
        # min_count=1 -> NaN (not 0.0) when every volume in the window is missing
        return ((window_vol - mean_vol) / std_vol).sum(min_count=1)

    cav_2_2 = calc_window_abnormal(-2, 2)
    cav_1_1 = calc_window_abnormal(-1, 1)

    return pd.Series([cav_2_2, cav_1_1])

# Run the per-event calculation row by row; the two returned values are
# stored, in order, as CAV_2_2 and CAV_1_1.
events[['CAV_2_2', 'CAV_1_1']] = events.apply(
    calculate_abnormal_volume,
    axis='columns',
)

In [15]:
# Summary statistics of the events frame, which now includes CAV_2_2 and CAV_1_1
events.describe()

Unnamed: 0,fiscal_year,majorgovcustomer,rdq,CAPMCAR22,CAPMCAR11,CAPMCAR13,CAPMCAR15,CAPMCAR05,CAPMCAR55,FF3CAR22,...,mkvaltq,prccq,total_debt,size,roa,bm,lev,post,CAV_2_2,CAV_1_1
count,31954.0,31954.0,31954,31954.0,31954.0,31954.0,31954.0,31954.0,31954.0,31954.0,...,31866.0,31951.0,29950.0,31866.0,31857.0,31815.0,29950.0,31954.0,30859.0,30859.0
mean,2011.71919,0.164173,2012-06-01 06:39:43.476247040,0.001849,0.002528,0.001271,0.001266,0.000717,0.001381,0.001944,...,6760.489074,31.89119,1575.073242,6.803043,0.002896,1.397894,0.193197,0.538149,3.530836,2.942663
min,2008.0,0.0,2008-02-12 00:00:00,-0.879886,-0.750561,-0.843614,-1.107498,-1.040059,-1.281628,-0.933216,...,1.2611,0.0701,0.0,2.293272,-0.207989,0.120152,0.0,0.0,-7.461892,-4.730442
25%,2010.0,0.0,2010-07-21 00:00:00,-0.039554,-0.039003,-0.04662,-0.05028,-0.049351,-0.055744,-0.039331,...,225.16115,8.5,0.75025,5.416817,-0.000252,0.558467,0.003971,0.0,-0.540035,-0.156169
50%,2012.0,0.0,2012-07-25 00:00:00,-0.00035,0.000158,-0.000416,-0.000689,-0.001032,-0.000676,-0.000253,...,886.9696,19.83,109.861,6.787811,0.009551,0.997759,0.153694,1.0,1.680589,1.534981
75%,2014.0,0.0,2014-05-07 00:00:00,0.041397,0.042193,0.046423,0.050171,0.048048,0.055246,0.041203,...,3441.72935,39.11,846.596252,8.143729,0.020222,1.69023,0.315476,1.0,5.305096,4.263735
max,2015.0,1.0,2016-08-08 00:00:00,1.653363,1.847709,1.679236,1.944226,2.049613,1.939718,1.667979,...,717000.2515,4197.95,27464.0,11.638428,0.087504,8.648678,0.775204,1.0,1505.069737,973.714104
std,2.261275,0.370439,,0.093874,0.091006,0.101217,0.110006,0.107234,0.120843,0.094214,...,23775.584679,82.912287,4230.963867,2.045294,0.041155,1.393723,0.190146,0.498556,12.023182,8.599978


In [20]:
# Add an `abs_<name>` column holding the absolute value of each CAR measure
car_columns = [
    'CAPMCAR22', 'CAPMCAR11', 'CAPMCAR13', 'CAPMCAR15', 'CAPMCAR05', 'CAPMCAR55',
    'FF3CAR22', 'FF3CAR11', 'FF3CAR13', 'FF3CAR15', 'FF3CAR05', 'FF3CAR55',
]
for key in car_columns:
    events[f'abs_{key}'] = events[key].abs()

# Split each fqtr label on 'Q' and store the part after it as an integer
events['qtr'] = [int(label.split('Q')[1]) for label in events['fqtr']]

# Write the augmented events frame to Stata without the row index
events.to_stata(
    os.path.join(const.RESULT_PATH, '20250307_stock_act_eap_test.dta'),
    write_index=False,
)

['2008', '2']