In [1]:
import os

import pandas as pd
from pandas import DataFrame
import numpy as np
from scipy import stats

from Constants import Constants as const

In [2]:
# Step 1: Load the Stata analysis file into `df`
file_path = os.path.join(const.RESULT_PATH, '20250420_stock_act_data_v2.dta')
# read_stata builds a new DataFrame from the file, so an extra .copy() would
# only duplicate the data in memory.
df = pd.read_stata(file_path)

In [3]:
# Variables summarised in the treatment-vs-control tables (order = row order of the output)
columns = [
    # log_frequency_* variables
    "log_frequency_w",
    "log_frequency_ann_w",
    "log_frequency_qtr_w",
    "log_frequency_eps_w",
    "log_frequency_noneps_w",
    "log_frequency_day_w",
    # second group of the original list
    "IDIOSYN",
    "social_info_w",
    "width_w",
    "cpie_gpin_nyse",
    "numest_last",
    "FCSTERROR_last",
    "DISPERSION_last",
    # third group of the original list
    "log_market_value_w",
    "lev_w",
    "BM_w",
    "ROA_w",
    "LOSS",
    "EarnVol_w",
    "ret_w",
    "turnover_w",
    "StkVol_w",
    "price_delay",
]


In [4]:
# Split the sample on the `post` indicator: rows with post == 0 go to the
# "before 2012" frame, rows with post == 1 to the "after 2012" frame.
before_2012_df = df.loc[df['post'].eq(0)].copy()
after_2012_df = df.loc[df['post'].eq(1)].copy()

In [5]:
# Function to perform comparison between treatment and control groups
def perform_comparison(df, columns, group_col='MajorGovCustomer'):
    """Compare treatment and control observations on each variable in ``columns``.

    Rows with ``df[group_col] == 1`` form the treatment group and rows with
    ``df[group_col] == 0`` form the control group; rows holding any other
    value (including missing) belong to neither group.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``group_col`` and every name in ``columns``.
    columns : iterable of str
        Variables to summarise; they become the index of the result, in order.
    group_col : str, default 'MajorGovCustomer'
        Name of the 0/1 treatment-indicator column.

    Returns
    -------
    pandas.DataFrame
        One row per variable with, for each group, the mean, median and
        non-missing count; the difference in means (treatment minus control);
        the Welch (unequal-variance) t-statistic formatted as ``"(t)"``; and
        significance stars (``***`` p < 0.01, ``**`` p < 0.05, ``*`` p < 0.1,
        empty string otherwise or when the test is undefined).

    Raises
    ------
    KeyError
        If ``group_col`` or a requested variable is not a column of ``df``.
    """
    # Materialise once: the names are iterated below and reused as the index,
    # which would silently break for one-shot iterables such as generators.
    columns = list(columns)

    treatment_group = df[df[group_col] == 1]
    control_group = df[df[group_col] == 0]

    # Star thresholds, checked from the strictest level downwards.
    star_levels = ((0.01, '***'), (0.05, '**'), (0.1, '*'))

    results = []

    for col in columns:
        # Drop missing values once so every statistic uses the same sample.
        treat_values = treatment_group[col].dropna()
        control_values = control_group[col].dropna()

        treat_mean = treat_values.mean()
        treat_median = treat_values.median()
        treat_num = treat_values.count()

        control_mean = control_values.mean()
        control_median = control_values.median()
        control_num = control_values.count()

        diff_mean = treat_mean - control_mean

        if treat_num < 2 or control_num < 2:
            # Welch's test needs a variance estimate from each group; with
            # fewer than two observations the statistic is undefined.
            t_stat, p_value = float('nan'), float('nan')
        else:
            t_stat, p_value = stats.ttest_ind(treat_values, control_values, equal_var=False)

        # A NaN p-value fails every comparison and therefore gets no stars.
        significance = ''
        for threshold, stars in star_levels:
            if p_value < threshold:
                significance = stars
                break

        # Round to three decimal places; the t-statistic is shown in parentheses.
        results.append([
            round(treat_mean, 3), round(treat_median, 3), treat_num,
            round(control_mean, 3), round(control_median, 3), control_num,
            round(diff_mean, 3), f"({round(t_stat, 3)})", significance,
        ])

    # Create a DataFrame to display the results
    results_df = pd.DataFrame(results, columns=[
        "Treatment Mean", "Treatment Median", "Treatment N",
        "Control Mean", "Control Median", "Control N",
        "Difference in Mean", "t-stat", "Significance"
    ], index=columns)
    return results_df

In [6]:
# Treatment-vs-control table for the before-2012 subsample
results_before_2012 = perform_comparison(before_2012_df, columns)
# Heading and table are emitted on consecutive lines by a single print call.
print("Results for data before 2012:", results_before_2012, sep="\n")


Results for data before 2012:
                        Treatment Mean  Treatment Median  Treatment N  \
log_frequency_w                  0.752             0.000          933   
log_frequency_ann_w              0.684             0.000          933   
log_frequency_qtr_w              0.225             0.000          933   
log_frequency_eps_w              0.351             0.000          933   
log_frequency_noneps_w           0.656             0.000          933   
log_frequency_day_w              0.545             0.000          933   
IDIOSYN                         -1.603            -1.527          880   
social_info_w                   11.170            12.023          933   
width_w                          0.689             0.800          209   
cpie_gpin_nyse                   0.426             0.474          477   
numest_last                      4.815             2.000          933   
FCSTERROR_last                   0.141             0.004          561   
DISPERSION_last      

In [8]:
# Show only the IDIOSYN, cpie_gpin_nyse and price_delay rows of the before-2012 table
results_before_2012.loc[['IDIOSYN', 'cpie_gpin_nyse', 'price_delay'], :]

Unnamed: 0,Treatment Mean,Treatment Median,Treatment N,Control Mean,Control Median,Control N,Difference in Mean,t-stat,Significance
IDIOSYN,-1.603,-1.527,880,-1.639,-1.505,4754,0.036,(0.664),
cpie_gpin_nyse,0.426,0.474,477,0.424,0.474,2590,0.003,(0.524),
price_delay,0.249,0.158,873,0.268,0.167,4809,-0.019,(-2.063),**


In [7]:
# Write the before-2012 comparison table to an Excel workbook
results_before_2012.to_excel(
    excel_writer=os.path.join(const.REGRESSION_RESULT_PATH, '20250416', 'T2PA TvC before 2012.xlsx')
)

In [9]:
# Treatment-vs-control table for the 2012-onwards subsample
results_after_2012 = perform_comparison(after_2012_df, columns)
# Heading and table are emitted on consecutive lines by a single print call.
print("Results for data from 2012 onwards:", results_after_2012, sep="\n")

Results for data from 2012 onwards:
                        Treatment Mean  Treatment Median  Treatment N  \
log_frequency_w                  0.750             0.000          921   
log_frequency_ann_w              0.676             0.000          921   
log_frequency_qtr_w              0.229             0.000          921   
log_frequency_eps_w              0.363             0.000          921   
log_frequency_noneps_w           0.651             0.000          921   
log_frequency_day_w              0.545             0.000          921   
IDIOSYN                         -1.286            -1.189          884   
social_info_w                   11.640            12.616          921   
width_w                          0.732             0.800          139   
cpie_gpin_nyse                   0.454             0.493          532   
numest_last                      5.284             3.000          921   
FCSTERROR_last                   0.148             0.003          582   
DISPERSION_last

In [11]:
# Show only the IDIOSYN, cpie_gpin_nyse and price_delay rows of the 2012-onwards table
results_after_2012.loc[['IDIOSYN', 'cpie_gpin_nyse', 'price_delay'], :]


Unnamed: 0,Treatment Mean,Treatment Median,Treatment N,Control Mean,Control Median,Control N,Difference in Mean,t-stat,Significance
IDIOSYN,-1.286,-1.189,884,-1.208,-1.167,5091,-0.078,(-1.76),*
cpie_gpin_nyse,0.454,0.493,532,0.464,0.497,2867,-0.01,(-2.019),**
price_delay,0.403,0.326,871,0.403,0.327,5114,0.001,(0.074),


In [9]:
# Write the 2012-onwards comparison table to an Excel workbook
results_after_2012.to_excel(
    excel_writer=os.path.join(const.REGRESSION_RESULT_PATH, '20250416', 'T2PB TvC after 2012.xlsx')
)


# Get stepwise observation counts

In [2]:
# Load the needed Compustat columns; the identifier is read as a string and
# `fyear` is renamed to the project's year column name.
ctat_df = pd.read_csv(
    os.path.join(const.COMPUSTAT_PATH, '1950_2022_ctat_all_data.zip'),
    usecols=['fyear', const.GVKEY, 'bkvlps', 'prcc_f', 'csho', 'mkvalt'],
    dtype={const.GVKEY: str},
).rename(columns={'fyear': const.YEAR})
# Keep years strictly between 2007 and 2016. Vectorised comparisons replace the
# per-row lambda; rows with a missing year fail both tests and are dropped, as before.
ctat_df = ctat_df.loc[ctat_df[const.YEAR].gt(2007) & ctat_df[const.YEAR].lt(2016)]
ctat_df.shape

(99932, 6)

In [3]:
# Shape after dropping duplicate (identifier, year) pairs; the identifier is
# referenced through const.GVKEY, matching how the column was loaded.
ctat_df.drop_duplicates(subset=[const.GVKEY, const.YEAR]).shape

(90226, 6)

# Count the cost-of-capital observations

In [2]:
# Load the file from the 'Cost of Capital' folder and discard the `permno` column.
lee_df: DataFrame = pd.read_csv(
    os.path.join(const.DATABASE_PATH, 'Cost of Capital', 'erp_public_annual_240107.zip')
).drop(columns=['permno'])

In [5]:
# Summary statistics, transposed so that each selected column is one row
lee_df[['CCC', 'GLS_mech', 'OJM_mech', 'CAT_mech', 'PEG_mech']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CCC,2088311.0,0.090159,0.124445,-0.538879,0.007939,0.0659,0.145795,0.999968
GLS_mech,2068379.0,0.112307,0.072132,-0.683782,0.070185,0.099579,0.139144,0.99978
OJM_mech,1768009.0,0.111107,0.137101,-0.990699,0.039521,0.071381,0.130939,0.99998
CAT_mech,2054667.0,-0.00618,0.230304,-0.920344,-0.156598,-0.019888,0.129285,0.999998
PEG_mech,2080521.0,0.140521,0.14246,0.0,0.046016,0.106245,0.190371,0.999995


In [9]:
# Rows where CCC is present but GLS_mech is missing
lee_df.loc[
    lee_df['CCC'].notna() & lee_df['GLS_mech'].isna(),
    ['CCC', 'GLS_mech', 'OJM_mech', 'CAT_mech', 'PEG_mech'],
]

Unnamed: 0,CCC,GLS_mech,OJM_mech,CAT_mech,PEG_mech
641,0.587013,,,,0.587013
642,0.502280,,,,0.502280
643,0.467239,,,,0.467239
644,0.451498,,,,0.451498
645,0.399652,,,,0.399652
...,...,...,...,...,...
3481805,0.161550,,,,0.161550
3481806,0.189709,,,,0.189709
3481807,0.191881,,,,0.191881
3481808,0.198166,,,,0.198166


# Check missing car variables

In [8]:
# Load the Stata file inspected in the cells that follow
data_df: DataFrame = pd.read_stata(
    os.path.join(const.RESULT_PATH, '20250323_stock_act_idiosyn_v3.dta')
)

In [27]:
# Preview the first five rows
data_df.head(n=5)

Unnamed: 0,gvkey,fiscal_year,mf_indicator,frequency,log_frequency,MajorGovCustomer,post,log_market_value,lev,ROA,...,kurtosis,sigma_d,skewness_d,kurtosis_d,SYNCHRONICITY,SYNCHRONICITY_MKT,SYNCHRONICITY_IND,IDIOSYN,IDIOSYN_IND,IDIOSYN_MKT
0,1004,2008,1.0,9.0,2.302585,1.0,0.0,6.350178,0.261162,0.029731,...,-0.449207,0.047688,0.391786,1.3415,0.473718,-0.68415,1.157868,-0.473718,-1.157868,0.68415
1,1004,2009,1.0,10.0,2.397895,1.0,0.0,6.657799,0.201838,0.042929,...,-0.956878,0.034468,1.173109,4.692126,0.49482,0.04684,0.44798,-0.49482,-0.44798,-0.04684
2,1004,2010,1.0,20.0,3.044522,1.0,0.0,6.957327,0.360588,0.030844,...,-0.948359,0.0278,-0.853567,5.917755,3.330836,1.618001,1.712835,-3.330836,-1.712835,-1.618001
3,1004,2011,1.0,4.0,1.609438,1.0,0.0,6.186804,0.331602,0.025738,...,0.284618,0.028483,-0.044077,1.864609,2.868695,0.900684,1.96801,-2.868695,-1.96801,-0.900684
4,1004,2012,1.0,16.0,2.833213,1.0,1.0,6.673302,0.288247,0.033144,...,-0.940611,0.02646,0.90739,5.861971,1.129614,0.697743,0.431871,-1.129614,-0.431871,-0.697743


In [34]:
# Identifier columns for rows where both SYNCHRONICITY and sigma are missing
data_df.loc[
    data_df['SYNCHRONICITY'].isna() & data_df['sigma'].isna(),
    ['LPERMNO', 'gvkey', 'cusip', 'ticker', const.YEAR, 'SYNCHRONICITY', 'sigma'],
]

Unnamed: 0,LPERMNO,gvkey,cusip,ticker,fiscal_year,SYNCHRONICITY,sigma
706,81770,3439,125896100,CMS,2008,,
707,81770,3439,125896100,CMS,2009,,
708,81770,3439,125896100,CMS,2010,,
709,81770,3439,125896100,CMS,2011,,
710,81770,3439,125896100,CMS,2012,,
...,...,...,...,...,...,...,...
11632,88315,201395,,,2012,,
11633,88315,201395,,,2013,,
11634,88315,201395,,,2014,,
11635,88315,201395,,,2015,,
