In [1]:
import pandas as pd
import numpy as np

import statsmodels.formula.api as smf

In [2]:
# Load the raw Stata file into a DataFrame. The path is relative, so this cell
# only works when the kernel's working directory contains the `data/` folder.
raw_data = pd.read_stata("data/CK1994.dta")

In [37]:
def compute_full_time_equivalent_employees(raw_data):
    """Compute full-time-equivalent (FTE) employment for every row.

    FTE is ``empft + 0.5 * emppt + nmgrs``. A missing component is treated as
    zero, but a row in which all three components are missing yields NaN.

    The computation is positional, so it works for any index on ``raw_data``
    (the previous loop used index labels as array positions and broke for
    anything other than a default 0..n-1 index).

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain the numeric columns ``empft``, ``emppt`` and ``nmgrs``.

    Returns
    -------
    numpy.ndarray
        One-dimensional float array with one entry per row of ``raw_data``,
        in row order.
    """
    components = raw_data[["empft", "emppt", "nmgrs"]].to_numpy(dtype=float)
    is_missing = np.isnan(components)

    # Missing components contribute nothing to the total.
    filled = np.where(is_missing, 0.0, components)
    full_time_equivalent_employees = (
        filled[:, 0] + 0.5 * filled[:, 1] + filled[:, 2]
    )

    # A row without any information is unknown, not zero.
    full_time_equivalent_employees[is_missing.all(axis=1)] = np.nan

    return full_time_equivalent_employees

In [63]:
def clean_data(data):
    """Build the analysis sample from the raw data.

    Steps, in order:

    1. Derive the analysis columns (integer ``state``/``time`` codes, their
       readable names, the ``state * time`` interaction, full-time-equivalent
       employment, ``store_id`` and ``hours_open``).
    2. Sort by ``store_id`` and ``time`` and reset to a fresh integer index.
    3. Drop every store that has a missing employment value in any row.
    4. Keep only rows with positive employment and positive opening hours.
    5. Drop stores that are left with fewer than two rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data with the columns ``state``, ``time``, ``store``,
        ``hoursopen`` and the employment columns required by
        ``compute_full_time_equivalent_employees``.

    Returns
    -------
    pandas.DataFrame
        The filtered sample.
    """
    state = data["state"].astype(int)
    period = data["time"].astype(int)

    cleaned = pd.DataFrame({"state": state, "time": period})
    cleaned["state_name"] = state.map({0: "Pennsylvania", 1: "New Jersey"})
    cleaned["time_name"] = period.map({0: "Before Increase", 1: "After Increase"})
    cleaned["treatment_dummy"] = state * period
    cleaned["full_time_equivalent_employees"] = compute_full_time_equivalent_employees(data)
    cleaned["store_id"] = data["store"].astype(int)
    cleaned["hours_open"] = data["hoursopen"]

    # Order rows by store and period; the round trip through the index also
    # moves both key columns to the front.
    cleaned = cleaned.set_index(["store_id", "time"]).sort_index().reset_index()

    # A store with any missing employment figure is removed entirely.
    employment_missing = cleaned["full_time_equivalent_employees"].isnull()
    stores_with_missing_data = cleaned.loc[employment_missing, "store_id"].unique()
    cleaned = cleaned[~cleaned["store_id"].isin(stores_with_missing_data)]

    # Keep only rows with strictly positive employment and opening hours.
    has_positive_values = (cleaned["full_time_equivalent_employees"] > 0) & (
        cleaned["hours_open"] > 0
    )
    cleaned = cleaned[has_positive_values]

    # Stores left with fewer than two rows are dropped.
    rows_per_store = cleaned.groupby("store_id")["time"].count()
    stores_with_too_few_rows = rows_per_store.index[rows_per_store < 2]
    return cleaned[~cleaned["store_id"].isin(stores_with_too_few_rows)]

In [64]:
def recreate_table_1(data):
    """Tabulate mean employment by period and state, with differences.

    Rows are "Before Increase", "After Increase" and their difference
    (after minus before). Columns are one per ``state_name`` plus a
    "Difference" column holding first state column minus second state column.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``time_name``, ``state_name`` and
        ``full_time_equivalent_employees``.

    Returns
    -------
    pandas.DataFrame
        The table, including the difference-of-differences in the
        bottom-right cell.
    """
    group_means = (
        data.groupby(["time_name", "state_name"])[["full_time_equivalent_employees"]]
        .mean()
        .unstack()
    )
    table = group_means.loc[["Before Increase", "After Increase"]]

    # Change over time within each state: second row minus first row.
    period_means = table.to_numpy()
    table.loc["Difference"] = period_means[1, :] - period_means[0, :]

    # Gap between states in every row, including the new "Difference" row.
    values_by_state = table.to_numpy()
    table["Difference"] = -(values_by_state[:, 1] - values_by_state[:, 0])
    return table


In [65]:
# Apply `clean_data` to the raw frame; the table and regression cells below use this `data`.
data = clean_data(raw_data)

In [66]:
# Mean full-time-equivalent employment by period and state, plus row and column differences.
recreate_table_1(data)

Unnamed: 0_level_0,full_time_equivalent_employees,full_time_equivalent_employees,Difference
state_name,New Jersey,Pennsylvania,Unnamed: 3_level_1
time_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Before Increase,20.325156,23.285714,-2.960558
After Increase,21.155469,21.230519,-0.075051
Difference,0.830313,-2.055195,2.885507


In [67]:
# Pooled OLS of employment on the state dummy, the period dummy and their
# interaction (`treatment_dummy` is built as state * time in `clean_data`).
# `missing="raise"` makes statsmodels fail instead of silently dropping rows
# that contain missing values.
base_model = smf.ols(
    formula="full_time_equivalent_employees ~ 1 + state + time + treatment_dummy",
    data=data,
    missing="raise",
).fit()

# Standard errors clustered at the store level (each store appears in more than one row).
base_model_se = base_model.get_robustcov_results(cov_type="cluster", groups=data["store_id"]).bse

In [68]:
# Point estimates next to their store-clustered standard errors, rounded to two decimals.
pd.DataFrame({"coef": base_model.params, "se": base_model_se}).round(2)

Unnamed: 0,coef,se
Intercept,23.29,1.38
state,-2.96,1.47
time,-2.06,1.25
treatment_dummy,2.89,1.34


In [69]:
def apply_within_transformation(data, id_col, columns_to_demean):
    """Subtract group means from selected columns (within transformation).

    Parameters
    ----------
    data : pandas.DataFrame
        Input data; it is copied and not modified.
    id_col : str
        Column whose values define the groups.
    columns_to_demean : iterable of str
        Columns that are replaced by their deviation from the group mean.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which every listed column has had its
        within-group mean removed; all other columns are unchanged.
    """

    def _deviation_from_mean(group_values):
        return group_values - group_values.mean()

    demeaned = data.copy()
    for name in columns_to_demean:
        grouped_column = demeaned.groupby(id_col)[name]
        demeaned[name] = grouped_column.transform(_deviation_from_mean)
    return demeaned

In [70]:
# Remove store-level means from the outcome and the regressors used in the
# fixed-effects regressions below; `store_id` and the other columns are left as they are.
demeaned_data = apply_within_transformation(data, id_col="store_id", columns_to_demean=("full_time_equivalent_employees", "treatment_dummy", "time", "hours_open"))

In [71]:
# OLS on the store-demeaned data without an intercept (`- 1`).
# `state` is not part of this formula; presumably it is constant within a
# store and would be wiped out by the demeaning — verify.
one_way_fixed_effect_model = smf.ols(
    formula="full_time_equivalent_employees ~ time + treatment_dummy - 1",
    data=demeaned_data,
    missing="raise",
).fit()

# Cluster by store. The groups come from `data` because `demeaned_data` is a
# copy of it with only the demeaned columns replaced, so `store_id` is identical.
one_way_fixed_effect_model_se = one_way_fixed_effect_model.get_robustcov_results(cov_type="cluster", groups=data["store_id"]).bse

In [72]:
# Point estimates next to their store-clustered standard errors, rounded to two decimals.
pd.DataFrame({"coef": one_way_fixed_effect_model.params, "se": one_way_fixed_effect_model_se}).round(2)

Unnamed: 0,coef,se
time,-2.06,1.25
treatment_dummy,2.89,1.34


In [73]:
# Same no-intercept regression on the store-demeaned data, with the
# (demeaned) `hours_open` added as a control variable.
one_way_fixed_effect_model_with_controls = smf.ols(
    formula="full_time_equivalent_employees ~ time + treatment_dummy + hours_open - 1",
    data=demeaned_data,
    missing="raise",
).fit()

# Standard errors clustered at the store level; `store_id` is taken from `data`.
one_way_fixed_effect_model_with_controls_se = one_way_fixed_effect_model_with_controls.get_robustcov_results(cov_type="cluster", groups=data["store_id"]).bse

In [74]:
# Point estimates next to their store-clustered standard errors, rounded to two decimals.
pd.DataFrame({"coef": one_way_fixed_effect_model_with_controls.params, "se": one_way_fixed_effect_model_with_controls_se}).round(2)

Unnamed: 0,coef,se
time,-2.17,1.22
treatment_dummy,3.0,1.32
hours_open,1.07,0.34


In [78]:
import jax.numpy as jnp
import jax

In [263]:
def simulate_data(n_samples, params, seed):
    """Draw a sample from a linear model with standard normal noise.

    The regressor is uniform on [0, 1) and the outcome is
    ``intercept + slope * x + e`` with ``e`` standard normal.

    Parameters
    ----------
    n_samples : int
        Number of observations to draw.
    params : dict
        Mapping with the entries ``"intercept"`` and ``"slope"``.
    seed : int
        Seed for the JAX random key; the same seed gives the same sample.

    Returns
    -------
    dict
        ``{"x": ..., "y": ...}`` with two arrays of shape ``(n_samples,)``.
    """
    # Independent sub-keys: the first drives the regressor, the second the noise.
    regressor_key, noise_key = jax.random.split(jax.random.PRNGKey(seed), 2)
    shape = (n_samples,)

    x = jax.random.uniform(regressor_key, shape)
    noise = jax.random.normal(noise_key, shape)

    y = params["intercept"] + params["slope"] * x + noise

    return {"x": x, "y": y}

In [264]:
def ols(x, y):
    """Least-squares fit of ``y`` on a constant and ``x``.

    Parameters
    ----------
    x : array
        Regressor values.
    y : array
        Outcome values.

    Returns
    -------
    array
        The least-squares solution; the first entry belongs to the column of
        ones (intercept), the second to ``x`` (slope).
    """
    design_matrix = jnp.stack([jnp.ones_like(x), x], axis=1)
    # lstsq returns several items; only the solution is needed.
    coefficients, *_ = jnp.linalg.lstsq(design_matrix, y, rcond=None)
    return coefficients

In [265]:
import estimagic as em
from functools import partial

In [266]:
def _log_likelihood(params, data):
    y_conditional_mean = params["intercept"] + params["slope"] * data["x"]
    return jax.scipy.stats.norm.logpdf(data["y"], loc=y_conditional_mean, scale=1).sum()

In [267]:
# Parameter values used to generate the simulated sample below.
params = {"intercept": 1.0, "slope": 2.0}


In [268]:
# Simulate 1,000 observations with a fixed seed so the results are reproducible.
# NOTE(review): this rebinds `data`, which the regression cells above use for
# the cleaned DataFrame. Re-running those cells after this one will fail;
# consider a distinct name (and update the `partial(...)` call that consumes it).
data = simulate_data(1_000, params=params, seed=12345)

In [269]:
# Bind the simulated sample so the criterion is a function of the parameters
# only, then JIT-compile it.
loglike = jax.jit(partial(_log_likelihood, data=data))

# Gradient of the criterion with respect to the parameter dict, also JIT-compiled.
loglike_grad = jax.jit(jax.grad(loglike))

In [270]:
# Starting values for the optimizer; they differ from the values used to simulate the data.
start_params = {"intercept": 1.5, "slope": 1.5}

In [271]:
# Criterion value at the starting parameters.
loglike(start_params)

Array(-1447.5132, dtype=float32)

In [272]:
# Gradient at the starting parameters; it has the same dict structure as the parameters.
loglike_grad(start_params)

{'intercept': Array(-281.86456, dtype=float32, weak_type=True),
 'slope': Array(-101.01553, dtype=float32, weak_type=True)}

In [273]:
# Maximize the log-likelihood with L-BFGS-B, passing the JAX gradient as the
# derivative of the criterion.
res = em.maximize(
    criterion=loglike,
    params=start_params,
    algorithm="scipy_lbfgsb",
    derivative=loglike_grad,
)

In [274]:
# Show the optimizer's result summary.
res

  table = report[columns].applymap(_format_float).astype(str)


Maximize with 2 free parameters terminated successfully after 7 criterion evaluations, 7 derivative evaluations and 4 iterations.

The value of criterion improved from -1447.51318359375 to -1397.094482421875.

The scipy_lbfgsb algorithm reported: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH

Independent of the convergence criteria used by scipy_lbfgsb, the strength of convergence can be assessed by the following criteria:

                             one_step    five_steps
relative_criterion_change          0***  0.03609   
relative_params_change     2.417e-07*     0.6241   
absolute_criterion_change          0***    50.42   
absolute_params_change      4.41e-07*     0.7518   

(***: change <= 1e-10, **: change <= 1e-8, *: change <= 1e-5. Change refers to a change between accepted steps. The first column only considers the last step. The second column considers the last five steps.)

In [275]:
# Estimated parameters, for comparison with the `params` used in the simulation.
res.params

{'intercept': 0.9562211893581772, 'slope': 2.019075223036825}