Sample code to generate Table 6


In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from linearmodels import PanelOLS
# for latex plot format
from matplotlib import rc
rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
rc('text', usetex=True)
pd.options.display.max_columns = None

In [2]:
# load main file
ea = pd.read_hdf("../Proprietary Data (cannot be shared)/BSI_Summaries_EA_merged.h5")


# Regression: Hacked vs non-hacked


In [3]:
def getReg2(
    df,
    model,
    control_var=False,
    control_var_names=False,
    firm_fe=False,
    year_fe=False,
    date_fe=False,
    firm_year_fe=False,
    industry_fe=False,
    firm_date_fe=False,
    cluster_entity_time=False,
    cluster_time=False,
    exclude_other=False,
    firm_pm_fe=False,
    match_gr=False,
):

    if exclude_other:
        df = df[df["SOURCE_NAME"] != "OTHER"]

    if control_var:
        control = "+".join(control_var_names)
        model = model + "+" + control
    if year_fe:
        if firm_pm_fe:
            df = df.set_index(["PERMNO_pm", "YYYYQ_pm"])
        else:
            df = df.set_index(["PERMNO", "YYYYQ"])
        model = model + "+TimeEffects"
    elif firm_fe:
        if firm_pm_fe:
            df = df.set_index(["PERMNO_pm", "YYYYQ_pm"])
        else:
            df = df.set_index(["PERMNO", "YYYYQ"])
        model = model + "+EntityEffects"
    elif firm_year_fe:
        if firm_pm_fe:
            df = df.set_index(["PERMNO_pm", "YYYYQ_pm"])
        else:
            df = df.set_index(["PERMNO", "YYYYQ"])
        model = (
            model + "+EntityEffects+TimeEffects"
        )  # minus one is to remove the constant
    elif firm_date_fe:
        if firm_pm_fe:
            df = df.set_index(["PERMNO_pm", "EA_Date"])
        else:
            df = df.set_index(["PERMNO", "EA_Date"])
        model = (
            model + "+EntityEffects+TimeEffects"
        )  # minus one is to remove the constant
    elif match_gr:
        df = df.set_index(["match_gr", "YYYYQ"])
    else:
        if firm_pm_fe:
            df = df.set_index(["PERMNO_pm", "YYYYQ_pm"])
        else:
            df = df.set_index(["PERMNO", "YYYYQ"])
    # clustering
    if cluster_entity_time:
        reg_ = PanelOLS.from_formula(model, data=df).fit(
            cov_type="clustered", cluster_entity=True, cluster_time=True
        )
    elif match_gr:
        reg_ = PanelOLS.from_formula(model, data=df).fit(
            cov_type="clustered", cluster_entity=True
        )
    else:
        reg_ = PanelOLS.from_formula(model, data=df).fit(
            cov_type="clustered", cluster_time=True
        )
    return reg_


In [4]:
def getTable(reg_, var_names, regressor_order, controls=False, format_="{:.3f}"):
    all_reg_out = pd.DataFrame()
    for i in reg_.keys():
        res = reg_[i]
        t = pd.concat([res.params, res.std_errors, res.tstats], axis=1)
        star_params = []
        std_err = []
        for p, tstat, std in zip(t["parameter"], t["tstat"], t["std_error"]):
            if np.abs(tstat) >= 1.645 and np.abs(tstat) < 1.96:
                star_params.append(format_.format(p) + "*")
            elif np.abs(tstat) >= 1.96 and np.abs(tstat) < 2.576:
                star_params.append(format_.format(p) + "**")
            elif np.abs(tstat) >= 2.576:
                star_params.append(format_.format(p) + "***")
            else:
                star_params.append(format_.format(p))
            std_err.append("(" + "{:.2f}".format(std) + ")")
        t["parameter"] = star_params
        t["std_error"] = std_err
        t = t[["parameter", "std_error"]]

        # remove the control variables from the index
        if controls:
            if "lnMCAP" in t.index:
                control_var = True
            else:
                control_var = False
        else:
            control_var = False

        # order the regressors and at the same time keep the ones you want to show
        t = t.loc[regressor_order]

        var_names_index = t.index  # get the parameter names

        output = pd.DataFrame()  # prepare formatting the output table
        for ind in t.index:
            t2 = t.loc[[ind]].T
            t2.index = [ind, ""]
            t2.columns = ["Reg"]
            output = pd.concat([output, t2])
        output.loc["$N$"] = "{0:,d}".format(res.nobs)
        output.loc["$R^2$"] = "{:.3f}".format(res.rsquared)

        # specify if we add control variables:
        if control_var:
            output.loc["Controls"] = "Y"
        else:
            output.loc["Controls"] = "N"
        # add the row specifying fixed effects:
        fe_names = list(
            res.estimated_effects.index.names
        )  # get the names of the fixed-effects
        if res.included_effects == ["Time"]:
            fe_names = fe_names[1]
        if res.included_effects == ["Entity"]:
            fe_names = fe_names[0]
        if "YYYYQ" in fe_names:
            output.loc["Year-Quarter F.E."] = "Y"
        else:
            output.loc["Year-Quarter F.E."] = "N"
        if "PERMNO" in fe_names:
            output.loc["Firm F.E."] = "Y"
        else:
            output.loc["Firm F.E."] = "N"
        if "EA_Date" in fe_names:
            output.loc["Date F.E."] = "Y"
        else:
            output.loc["Date F.E."] = "N"
        # rename the index variables
        for names in var_names_index:
            output.rename(
                index={names: var_names[names]}, inplace=True
            )  # change the names for latex
        all_reg_out = pd.concat([all_reg_out, output], axis=1)
    # rename the column header
    ncol = len(all_reg_out.columns) + 1
    all_reg_out.columns = ["(" + str(n) + ")" for n in range(1, ncol)]
    return all_reg_out


In [5]:
var_names = {
    "Intercept": "Intercept",
    "hacked": "$\\mathbf{1}_{[\\text{Hacked}]}$",
    "pm": "$\\mathbf{1}_{[\\text{PM}]}$",
    "hacked_pm": "$\\mathbf{1}_{[\\text{Hacked}]} \\times \\mathbf{1}_{[\\text{PM}]}$",
}


## Table 5


## Panel A


In [6]:
ea_ = ea.copy()

reg_all = {}
for i in [
    "TURN_pm",
    "lnVolume_pm",
    "OIvol_LR_pm_abs",
    "lnOptVolPM",
    "EffectiveSpread_Percent_DW_pm",
    "PercentRealizedSpread_LR_DW_pm",
    "QuotedSpread_Percent_pm",
    "PercentPriceImpact_LR_DW_pm",
]:  #  'RS_PI_Percent_DW_pm',

    if (i != "lnVolume_pm") and (i != "lnOptVolPM") and (i != "lnOptVolPM_OTM"):
        ea_[i] = ea_[i] / ea_[i].std()
    reg_model = i + "~hacked"

    ea_[i].fillna(0, inplace=True)
    reg_all[i] = getReg2(
        ea_,
        reg_model,
        control_var=True,
        control_var_names=[
            "lnMCAP",
            "invPRC",
            "lnnumest",
            "ln_Story_Count_Relevant",
            "DCBS",
            "IO",
        ],
        firm_year_fe=True,
        cluster_entity_time=True,
    )


In [7]:
tab_pm = getTable(
    reg_all, var_names, regressor_order=["hacked"], controls=True, format_="{:.4f}"
)
tab_pm.drop("Date F.E.", inplace=True)
n_col = len(tab_pm.columns) + 1
tab_pm.columns = [
    ["Order flow measures", "", "", "", "Spread measures", "", "", ""],
    [
        "Share turn",
        "Log(share vol)",
        "\\big|OI\\big|",
        "Log(option vol)",
        "Effective spread",
        "Realized spread",
        "Price impact",
        "Quoted spread",
    ],
    ["(" + str(i) + ")" for i in range(1, n_col)],
]
tab_pm


Unnamed: 0_level_0,Order flow measures,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Spread measures,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0
Unnamed: 0_level_1,Share turn,Log(share vol),\big|OI\big|,Log(option vol),Effective spread,Realized spread,Price impact,Quoted spread
Unnamed: 0_level_2,(1),(2),(3),(4),(5),(6),(7),(8)
$\mathbf{1}_{[\text{Hacked}]}$,0.0490***,0.0350**,0.0080,0.0721**,0.0312***,0.0445***,0.0153,-0.0140
,(0.02),(0.01),(0.01),(0.03),(0.01),(0.02),(0.01),(0.02)
$N$,43687,43687,43687,43687,43687,43687,43687,43687
$R^2$,0.021,0.038,0.013,0.054,0.153,0.034,0.128,0.022
Controls,Y,Y,Y,Y,Y,Y,Y,Y
Year-Quarter F.E.,Y,Y,Y,Y,Y,Y,Y,Y
Firm F.E.,Y,Y,Y,Y,Y,Y,Y,Y


## Panel B


In [8]:
ea_ = ea.copy()

reg_all = {}
for i in [
    "TURN_am",
    "lnVolume_am",
    "OIvol_LR_am_abs",
    "lnOptVolAM",
    "EffectiveSpread_Percent_DW_am",
    "PercentRealizedSpread_LR_DW_am",
    "PercentPriceImpact_LR_DW_am",
    "QuotedSpread_Percent_am",
]:  #  'RS_PI_Percent_DW_am',

    if (i != "lnVolume_am") and (i != "lnOptVolAM"):
        ea_[i] = ea_[i] / ea_[i].std()
    reg_model = i + "~hacked"

    ea_[i].fillna(0, inplace=True)
    reg_all[i] = getReg2(
        ea_,
        reg_model,
        control_var=True,
        control_var_names=[
            "lnMCAP",
            "invPRC",
            "lnnumest",
            "ln_Story_Count_Relevant",
            "DCBS",
            "IO",
        ],
        firm_year_fe=True,
        cluster_entity_time=True,
    )


In [9]:
tab_am = getTable(
    reg_all, var_names, regressor_order=["hacked"], controls=True, format_="{:.4f}"
)
tab_am.drop("Date F.E.", inplace=True)
n_col = len(tab_am.columns) + 1
tab_am.columns = [
    ["Order flow measures", "", "", "", "Spread measures", "", "", ""],
    [
        "Share turn",
        "Log(share vol)",
        "\\big|OI\\big|",
        "Log(option vol)",
        "Effective spread",
        "Realized spread",
        "Price impact",
        "Quoted spread",
    ],
    ["(" + str(i) + ")" for i in range(1, n_col)],
]
tab_am


Unnamed: 0_level_0,Order flow measures,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Spread measures,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0
Unnamed: 0_level_1,Share turn,Log(share vol),\big|OI\big|,Log(option vol),Effective spread,Realized spread,Price impact,Quoted spread
Unnamed: 0_level_2,(1),(2),(3),(4),(5),(6),(7),(8)
$\mathbf{1}_{[\text{Hacked}]}$,-0.0092,-0.0176,-0.0099,0.0331,0.0063,0.0127,-0.0137,0.0178*
,(0.01),(0.02),(0.01),(0.03),(0.01),(0.01),(0.01),(0.01)
$N$,43687,43687,43687,43687,43687,43687,43687,43687
$R^2$,0.017,0.033,0.011,0.046,0.100,0.030,0.024,0.101
Controls,Y,Y,Y,Y,Y,Y,Y,Y
Year-Quarter F.E.,Y,Y,Y,Y,Y,Y,Y,Y
Firm F.E.,Y,Y,Y,Y,Y,Y,Y,Y


## Panel C


In [10]:
ea_ = ea.copy()
ea_.rename(
    columns={
        "OptVolAM": "OptVol_am",
        "OptVolPM": "OptVol_pm",
        "OIvol_LR_am_abs": "OIvol_LR_abs_am",
        "OIvol_LR_pm_abs": "OIvol_LR_abs_pm",
    },
    inplace=True,
)

reg_all = {}
pval = []
params = []
for i in [
    "TURN",
    "lnVolume",
    "OIvol_LR_abs",
    "OptVol",
    "EffectiveSpread_Percent_DW",
    "PercentRealizedSpread_LR_DW",
    "PercentPriceImpact_LR_DW",
    "QuotedSpread_Percent",
]:

    # pm dataset
    pm = ea_[
        [
            "PERMNO",
            "YYYYQ",
            i + "_pm",
            "lnMCAP",
            "hacked",
            "IBES_Timestamp",
            "invPRC",
            "lnnumest",
            "ln_Story_Count_Relevant",
            "DCBS",
            "IO",
        ]
    ].rename(columns={i + "_pm": i})
    pm["pm"] = 1

    if (i != "lnVolume") and (i != "OptVol"):  # standardized
        pm[i] = pm[i] / pm[i].std()

    # am dataset
    am = ea_[
        [
            "PERMNO",
            "YYYYQ",
            i + "_am",
            "lnMCAP",
            "hacked",
            "IBES_Timestamp",
            "invPRC",
            "lnnumest",
            "ln_Story_Count_Relevant",
            "DCBS",
            "IO",
        ]
    ].rename(columns={i + "_am": i})
    am["pm"] = 0

    if (i != "lnVolume") and (i != "OptVol"):  # standardized
        am[i] = am[i] / am[i].std()

    tmp = pd.concat([pm, am])
    tmp["hacked_pm"] = tmp["hacked"] * tmp["pm"]
    tmp["PERMNO_pm"] = tmp["PERMNO"].astype(str) + "_" + tmp["pm"].astype(str)
    tmp["YYYYQ_pm"] = tmp["YYYYQ"].astype(str) + tmp["pm"].astype(str)
    tmp["YYYYQ_pm"] = tmp["YYYYQ_pm"].astype(int)
    tmp["lnMCAP_pm"] = tmp["lnMCAP"] * tmp["pm"]
    tmp["invPRC_pm"] = tmp["invPRC"] * tmp["pm"]
    tmp["IO_pm"] = tmp["IO"] * tmp["pm"]
    tmp["lnnumest_pm"] = tmp["lnnumest"] * tmp["pm"]
    tmp["ln_Story_Count_Relevant_pm"] = tmp["ln_Story_Count_Relevant"] * tmp["pm"]
    tmp["DCBS_pm"] = tmp["DCBS"] * tmp["pm"]

    if i == "OptVol":
        ea_2 = tmp.copy()
        ea_2["lnOptVol"] = np.log(1 + ea_2["OptVol"])
        ea_2[i].fillna(0, inplace=True)
        reg_model = "lnOptVol~hacked+hacked_pm"
        reg_all[i] = getReg2(
            ea_2,
            reg_model,
            control_var=True,
            control_var_names=[
                "lnMCAP",
                "invPRC",
                "IO",
                "lnnumest",
                "DCBS",
                "ln_Story_Count_Relevant",
                "lnMCAP_pm",
                "invPRC_pm",
                "IO_pm",
                "lnnumest_pm",
                "DCBS_pm",
                "ln_Story_Count_Relevant_pm",
            ],
            firm_year_fe=True,
            cluster_entity_time=True,
            firm_pm_fe=True,
        )
    else:
        tmp[i].fillna(0, inplace=True)
        reg_model = i + "~hacked+hacked_pm"
        reg_all[i] = getReg2(
            tmp,
            reg_model,
            control_var=True,
            control_var_names=[
                "lnMCAP",
                "invPRC",
                "IO",
                "lnnumest",
                "DCBS",
                "ln_Story_Count_Relevant",
                "lnMCAP_pm",
                "invPRC_pm",
                "IO_pm",
                "lnnumest_pm",
                "DCBS_pm",
                "ln_Story_Count_Relevant_pm",
            ],
            firm_year_fe=True,
            cluster_entity_time=True,
            firm_pm_fe=True,
        )

    params.append("{:.4f}".format(reg_all[i].params["hacked_pm"]))
    pval.append("{:.4f}".format(reg_all[i].pvalues["hacked_pm"]))


In [11]:
diff = pd.DataFrame(
    data={"Difference       ": params, "P-value": pval}
).T  # \;\;\;\;\;\;\;\;\;\;\; Difference
n_col = len(diff.columns) + 1

diff.columns = [
    ["Order flow measures", "", "", "", "Spread measures", "", "", ""],
    [
        "Turnover",
        "Log(volume)",
        "\\big|OI\\big|",
        "Log(option vol)",
        "Effective spread",
        "Realized spread",
        "Price impact",
        "Quoted spread",
    ],
    ["(" + str(i) + ")" for i in range(1, n_col)],
]


In [12]:
diff


Unnamed: 0_level_0,Order flow measures,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Spread measures,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0
Unnamed: 0_level_1,Turnover,Log(volume),\big|OI\big|,Log(option vol),Effective spread,Realized spread,Price impact,Quoted spread
Unnamed: 0_level_2,(1),(2),(3),(4),(5),(6),(7),(8)
Difference,0.0582,0.0526,0.0179,0.039,0.0249,0.0318,-0.0003,-0.0026
P-value,0.0049,0.0123,0.3537,0.3533,0.1201,0.1594,0.9915,0.8545
