In [3]:
import os

import pandas as pd
import numpy as np
from scipy import stats

from Constants import Constants as const

In [5]:
# Step 1: Read the Stata regression dataset into a DataFrame.
# Note: pandas warns on this file that some strings are not valid utf-8 and
# are decoded with the latin-1 fallback, so string columns should be checked.
file_path = os.path.join(
    const.RESULT_PATH,
    '20241014_stock_act_reg_data_v2.dta',
)
df = pd.read_stata(file_path).copy()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df = pd.read_stata(file_path).copy()


In [13]:
# Variables to compare between the two groups, in the order they should
# appear as rows of the result tables.
columns = [
    # log_frequency_* variables
    "log_frequency_w",
    "log_frequency_ann_w",
    "log_frequency_qtr_w",
    "log_frequency_eps_w",
    "log_frequency_noneps_w",
    "log_frequency_day_w",
    # *_info_w and width_w
    "env_info_w",
    "social_info_w",
    "width_w",
    # cpie_* variables
    "cpie_gpin",
    "cpie_owr",
    # SYNCHRONICITY_* variables
    "SYNCHRONICITY_MONTH",
    "SYNCHRONICITY_MKT_MONTH",
    "SYNCHRONICITY_IND_MONTH",
    # *_last variables
    "numest_last",
    "FCSTERROR_last",
    "DISPERSION_last",
    # remaining variables
    "log_market_value_w",
    "lev_w",
    "BM_w",
    "ROA_w",
    "LOSS",
    "EarnVol_w",
    "ret_w",
    "turnover_w",
    "StkVol_w",
]


In [14]:
# Partition the sample on fiscal_year: one frame for years strictly before
# 2012, one for 2012 and later.
before_2012_df = df.loc[df['fiscal_year'].lt(2012)]
after_2012_df = df.loc[df['fiscal_year'].ge(2012)]

In [17]:
# Helpers for the treatment-vs-control comparison tables
def _significance_stars(p_value):
    """Return '***', '**', '*' or '' for p < 0.01, p < 0.05, p < 0.1, otherwise."""
    if p_value < 0.01:
        return '***'
    if p_value < 0.05:
        return '**'
    if p_value < 0.1:
        return '*'
    # Also reached when p_value is NaN, since every comparison above is False.
    return ''


def perform_comparison(df, columns, group_col='MajorGovCustomer'):
    """Compare the treatment and control groups on each variable in ``columns``.

    Rows where ``df[group_col] == 1`` form the treatment group and rows where
    it equals 0 form the control group; rows holding any other value are
    left out of both groups.  For every variable the function reports the
    mean, median and non-missing count per group, the difference in means
    (treatment minus control), and a two-sample t-test with unequal
    variances (``equal_var=False``) run on the non-missing observations.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``group_col`` and every name listed in ``columns``.
    columns : list of str
        Variables to compare; they become the index of the returned frame.
    group_col : str, default 'MajorGovCustomer'
        Name of the 0/1 indicator column that defines the two groups.

    Returns
    -------
    pandas.DataFrame
        One row per variable with the columns "Treatment Mean",
        "Treatment Median", "Treatment N", "Control Mean", "Control Median",
        "Control N", "Difference in Mean", "t-stat" and "Significance".
        Means, medians and the difference are rounded to three decimal
        places.  "t-stat" is a string of the form "(1.234)", always with
        three decimals.  "Significance" holds the star code for the t-test
        p-value.
    """
    treatment_group = df[df[group_col] == 1]
    control_group = df[df[group_col] == 0]

    results = []
    for col in columns:
        treat_values = treatment_group[col]
        control_values = control_group[col]

        treat_mean = treat_values.mean()
        control_mean = control_values.mean()

        # ttest_ind does not skip NaN by default, so drop them explicitly.
        t_stat, p_value = stats.ttest_ind(
            treat_values.dropna(), control_values.dropna(), equal_var=False
        )

        results.append([
            round(treat_mean, 3),
            round(treat_values.median(), 3),
            treat_values.count(),
            round(control_mean, 3),
            round(control_values.median(), 3),
            control_values.count(),
            # The difference is taken on the unrounded means, then rounded.
            round(treat_mean - control_mean, 3),
            # Fixed three decimals so every cell has the same precision
            # (plain round() would render e.g. "(2.5)" or "(0.0)").
            f"({t_stat:.3f})",
            _significance_stars(p_value),
        ])

    # Create a DataFrame to display the results
    results_df = pd.DataFrame(results, columns=[
        "Treatment Mean", "Treatment Median", "Treatment N",
        "Control Mean", "Control Median", "Control N",
        "Difference in Mean", "t-stat", "Significance"
    ], index=columns)
    return results_df

In [18]:
# Treatment-vs-control comparison on the pre-2012 sample, echoed as plain text
results_before_2012 = perform_comparison(before_2012_df, columns)
print("Results for data before 2012:", results_before_2012, sep="\n")


Results for data before 2012:
                         Treatment Mean  Treatment Median  Treatment N  \
log_frequency_w                   0.752             0.000          933   
log_frequency_ann_w               0.684             0.000          933   
log_frequency_qtr_w               0.225             0.000          933   
log_frequency_eps_w               0.351             0.000          933   
log_frequency_noneps_w            0.656             0.000          933   
log_frequency_day_w               0.545             0.000          933   
env_info_w                        3.815             0.000          933   
social_info_w                    11.170            12.023          933   
width_w                           0.689             0.800          209   
cpie_gpin                         0.426             0.474          517   
cpie_owr                          0.427             0.477          517   
SYNCHRONICITY_MONTH               1.583             1.520          612   
SYNCHRON

In [20]:
# Save the pre-2012 comparison table as an Excel workbook
results_before_2012.to_excel(
    os.path.join(
        const.REGRESSION_RESULT_PATH,
        '20241018',
        'TvC before 2012.xlsx',
    )
)

In [19]:
# Treatment-vs-control comparison on the 2012-onwards sample, echoed as plain text
results_after_2012 = perform_comparison(after_2012_df, columns)
print("Results for data from 2012 onwards:", results_after_2012, sep="\n")

Results for data from 2012 onwards:
                         Treatment Mean  Treatment Median  Treatment N  \
log_frequency_w                   0.750             0.000          921   
log_frequency_ann_w               0.676             0.000          921   
log_frequency_qtr_w               0.229             0.000          921   
log_frequency_eps_w               0.363             0.000          921   
log_frequency_noneps_w            0.651             0.000          921   
log_frequency_day_w               0.545             0.000          921   
env_info_w                        3.180             0.000          921   
social_info_w                    11.640            12.616          921   
width_w                           0.732             0.800          139   
cpie_gpin                         0.453             0.486          657   
cpie_owr                          0.417             0.431          657   
SYNCHRONICITY_MONTH               1.313             1.234          624   
SY

In [21]:
# Save the 2012-onwards comparison table as an Excel workbook
results_after_2012.to_excel(
    os.path.join(
        const.REGRESSION_RESULT_PATH,
        '20241018',
        'TvC after 2012.xlsx',
    )
)
