# Problem Set 4

Solutions to Computational Problems

## Panel Data

---

## Problem 2

In [1]:
from functools import partial
from typing import Final, TypedDict

import estimagic as em
import jax
import jax.numpy as jnp
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.regression.linear_model import RegressionResultsWrapper

### Task 1 (Data Cleaning)

In [2]:
# Load the raw CK1994 Stata file (path is relative to the working directory) and
# preview its first rows.
raw_data = pd.read_stata("data/CK1994.dta")
raw_data.head()

Unnamed: 0,store,chain,co_owned,state,southj,centralj,northj,pa1,pa2,shore,...,firstinc,meals,open,hoursopen,pricesoda,pricefry,priceentree,nregisters,nregisters11,time
0,46.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,2.0,6.5,16.5,1.03,1.03,0.52,3.0,3.0,0.0
1,49.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,2.0,10.0,13.0,1.01,0.9,2.35,4.0,3.0,0.0
2,506.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.37,2.0,11.0,10.0,0.95,0.74,2.33,3.0,3.0,0.0
3,56.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.1,2.0,10.0,12.0,0.87,0.82,1.79,2.0,2.0,0.0
4,61.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.15,3.0,10.0,12.0,0.87,0.77,1.65,2.0,2.0,0.0


In [3]:
def clean_data(raw_data: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw CK1994 data into the analysis sample.

    Args:
        raw_data: A DataFrame containing the raw CK1994 data.

    Returns:
        A DataFrame containing the cleaned CK1994 data with the columns `store`,
        `state`, `time`, `state_name`, `time_name`, `hours_open`,
        `treatment_dummy` and `outcome` (full-time equivalent employees per
        store). Invalid stores are removed, and rows are sorted by store and time
        with a fresh integer index.

    """
    id_columns = ["store", "state", "time"]
    state_names = {0: "Pennsylvania", 1: "New Jersey"}
    period_names = {0: "Before Increase", 1: "After Increase"}

    # The ID variables are cast to integers before anything is derived from them.
    identifiers = pd.DataFrame(
        {column: raw_data[column].astype(int) for column in id_columns},
    )

    assembled = identifiers.assign(
        state_name=identifiers["state"].map(state_names),
        time_name=identifiers["time"].map(period_names),
        hours_open=raw_data["hoursopen"].astype(float),
        # Equals one only for New Jersey stores observed after the increase.
        treatment_dummy=identifiers["state"] * identifiers["time"],
        outcome=create_full_time_equivalent_employees(raw_data),
    )

    valid = filter_invalid_stores(assembled)
    return valid.sort_values(["store", "time"]).reset_index(drop=True)


def create_full_time_equivalent_employees(raw_data: pd.DataFrame) -> pd.Series:
    """Compute full-time equivalent employment for every row.

    The measure is full-time employees plus half of the part-time employees plus
    managers. A missing component counts as zero unless all three components are
    missing.

    Args:
        raw_data: A DataFrame containing the raw CK1994 data. Must provide the
            columns `empft`, `emppt` and `nmgrs`.

    Returns:
        A Series containing the full-time equivalent employees per row. Rows where
        all three inputs are NaN are NaN.

    """
    components = raw_data[["empft", "emppt", "nmgrs"]]
    filled = components.fillna(0)

    full_time_equivalent = (
        filled["empft"] + 0.5 * filled["emppt"] + filled["nmgrs"]
    )

    # Rows without any information must not show up as zero employment.
    nothing_reported = components.isna().all(axis=1)
    return full_time_equivalent.mask(nothing_reported)


def filter_invalid_stores(clean: pd.DataFrame) -> pd.DataFrame:
    """Drop stores that cannot be used in the analysis.

    Filtering happens in three steps:

    1. Every row of a store with at least one missing outcome is removed.
    2. Rows whose outcome or hours open is not strictly positive (including NaN)
       are removed.
    3. Stores left with fewer than two rows are removed.

    Args:
        clean: A DataFrame with the columns `store`, `time`, `outcome` and
            `hours_open`. Can contain NaN-values.

    Returns:
        A DataFrame with only the rows of valid stores; the original index labels
        are kept.

    """
    # Step 1: a single missing outcome disqualifies the whole store.
    has_missing_outcome = clean["outcome"].isna()
    incomplete_stores = clean.loc[has_missing_outcome, "store"].unique()
    remaining = clean[~clean["store"].isin(incomplete_stores)]

    # Step 2: comparisons with NaN are False, so such rows are dropped as well.
    is_positive = (remaining["outcome"] > 0) & (remaining["hours_open"] > 0)
    remaining = remaining[is_positive]

    # Step 3: count the surviving observations per store.
    observations_per_store = remaining.groupby("store")["time"].count()
    stores_to_keep = observations_per_store[observations_per_store >= 2].index
    return remaining[remaining["store"].isin(stores_to_keep)]

In [4]:
# Build the analysis sample from the raw data and preview its first rows.
data = clean_data(raw_data)
data.head()

Unnamed: 0,store,state,time,state_name,time_name,hours_open,treatment_dummy,outcome
0,1,1,0,New Jersey,Before Increase,16.0,0,35.0
1,1,1,1,New Jersey,After Increase,16.0,1,44.0
2,2,1,0,New Jersey,Before Increase,14.0,0,16.0
3,2,1,1,New Jersey,After Increase,15.0,1,15.5
4,3,1,0,New Jersey,Before Increase,10.0,0,15.5


### Task 2

#### (a) Recreate Table 1

In [5]:
def recreate_table_1(data: pd.DataFrame) -> pd.DataFrame:
    """Tabulate the mean outcome by period and state, with differences.

    Args:
        data: A DataFrame with the columns `outcome`, `time_name` and
            `state_name`.

    Returns:
        A DataFrame whose rows are "Before Increase", "After Increase" and
        "Difference" (after minus before) and whose columns are "New Jersey",
        "Pennsylvania" and "Difference" (New Jersey minus Pennsylvania). The
        bottom-right cell is the difference of the two differences.

    """
    periods = ["Before Increase", "After Increase"]
    states = ["New Jersey", "Pennsylvania"]

    means = data.pivot_table(
        values="outcome",
        index="time_name",
        columns="state_name",
        aggfunc="mean",
    ).reindex(index=periods, columns=states)

    # The row of time differences is added first so that the column of state
    # differences computed afterwards also covers it.
    before, after = periods
    means.loc["Difference"] = means.loc[after] - means.loc[before]

    new_jersey, pennsylvania = states
    means["Difference"] = means[new_jersey] - means[pennsylvania]

    return means

In [6]:
# Mean outcome by period and state, together with the row and column differences.
recreate_table_1(data)

state_name,New Jersey,Pennsylvania,Difference
time_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Before Increase,20.325155,23.285715,-2.96056
After Increase,21.155468,21.23052,-0.075052
Difference,0.830313,-2.055195,2.885508


### Task 2 (Effect Estimation)

#### Helper Functions and Constants

In [7]:
def visualize_estimation_results(result: RegressionResultsWrapper) -> pd.DataFrame:
    """Summarize a fitted regression as a small coefficient table.

    Args:
        result: A fitted regression result exposing `params` and `bse`.

    Returns:
        A DataFrame with the columns "coef" (taken from `result.params`) and "se"
        (taken from `result.bse`), rounded to two decimals.

    """
    summary = pd.DataFrame({"coef": result.params, "se": result.bse})
    return summary.round(2)


# Keyword arguments for the `.fit()` calls of the panel regressions: cluster-robust
# covariance with clusters defined by store.
# NOTE(review): the groups are taken from `data`, so this presumably only suits frames
# with the same rows in the same order as `data` -- confirm before reusing elsewhere.
fit_kwargs: Final = {"cov_type": "cluster", "cov_kwds": {"groups": data["store"]}}


def apply_within_transformation(
    data: pd.DataFrame,
    id_col: str,
    columns_to_demean: list[str],
) -> pd.DataFrame:
    """Subtract group means from the selected columns.

    Args:
        data: The DataFrame to transform. It is not modified.
        id_col: Name of the column that identifies the groups.
        columns_to_demean: Names of the columns from which the group-specific mean
            is subtracted.

    Returns:
        A copy of `data` in which each listed column is replaced by its deviation
        from the mean within its `id_col` group. All other columns are unchanged.

    """

    def _deviation_from_mean(values: pd.Series) -> pd.Series:
        return values - values.mean()

    demeaned = data.copy()
    for name in columns_to_demean:
        by_group = demeaned.groupby(id_col)[name]
        demeaned[name] = by_group.transform(_deviation_from_mean)
    return demeaned

#### (b)

In [8]:
# Pooled OLS with state, time and their interaction. `treatment_dummy` is the product
# of `state` and `time`, so its coefficient is the difference-in-differences term.
# `missing="raise"` makes the model raise on missing values instead of dropping rows.
base_model = smf.ols(
    formula="outcome ~ 1 + state + time + treatment_dummy",
    data=data,
    missing="raise",
)

# Fit with the store-clustered covariance settings defined in `fit_kwargs`.
base_result = base_model.fit(**fit_kwargs)

visualize_estimation_results(base_result)

Unnamed: 0,coef,se
Intercept,23.29,1.38
state,-2.96,1.47
time,-2.06,1.25
treatment_dummy,2.89,1.34


#### (c)

In [9]:
# Within transformation by store: subtract each store's own mean from the outcome and
# from every regressor used in the fixed-effects regressions.
demeaned_data = apply_within_transformation(
    data,
    id_col="store",
    columns_to_demean=["outcome", "treatment_dummy", "time", "hours_open"],
)

In [10]:
# Regression on the store-demeaned data. The `- 1` removes the intercept from the
# formula.
fixed_effect_model = smf.ols(
    formula="outcome ~ time + treatment_dummy - 1",
    data=demeaned_data,
    missing="raise",
)

# Same store-clustered covariance settings as for the pooled regression.
fixed_effect_model_result = fixed_effect_model.fit(**fit_kwargs)

visualize_estimation_results(fixed_effect_model_result)

Unnamed: 0,coef,se
time,-2.06,1.25
treatment_dummy,2.89,1.34


In [11]:
# Same regression on the store-demeaned data, with the (demeaned) opening hours added
# as a control variable.
fixed_effect_model_with_controls = smf.ols(
    formula="outcome ~ time + treatment_dummy + hours_open - 1",
    data=demeaned_data,
    missing="raise",
)

fixed_effect_model_with_controls_result = fixed_effect_model_with_controls.fit(
    **fit_kwargs,
)

visualize_estimation_results(fixed_effect_model_with_controls_result)

Unnamed: 0,coef,se
time,-2.17,1.22
treatment_dummy,3.0,1.32
hours_open,1.07,0.34


## Limited Dependent Variables

---

## Problem 3

In [12]:
class Params(TypedDict):
    """Parameters of the linear model ``y = intercept + slope * x + e``."""

    intercept: float
    slope: float


class Data(TypedDict):
    """A simulated sample: regressor values `x` and outcome values `y`."""

    x: jnp.ndarray
    y: jnp.ndarray


def simulate_data(n_samples: int, params: Params, seed: int) -> Data:
    """Draw a sample from a simple linear regression model.

    The regressor is drawn with `jax.random.uniform` and the error term with
    `jax.random.normal`, each from its own sub-key of the seeded key. The outcome
    is ``intercept + slope * x + e``.

    Args:
        n_samples: The number of samples to simulate.
        params: A dictionary containing the parameters of the model. Has keys
            "intercept" and "slope".
        seed: The seed for the random number generator.

    Returns:
        A dictionary containing the simulated data. Has keys "x" and "y".

    """
    # Independent sub-keys for the regressor and the error term.
    regressor_key, error_key = jax.random.split(jax.random.PRNGKey(seed), 2)

    shape = (n_samples,)
    regressor = jax.random.uniform(regressor_key, shape)
    error = jax.random.normal(error_key, shape)

    outcome = params["intercept"] + params["slope"] * regressor + error

    return Data(x=regressor, y=outcome)

In [18]:
# Parameter values used to generate the simulated sample.
true_params = {"intercept": 1.0, "slope": 2.0}


# 100 draws with a fixed seed, so the same sample is produced on every run.
simulated_data = simulate_data(n_samples=100, params=true_params, seed=12345)

### OLS

In [14]:
# Regress y on x by OLS. No cluster settings are passed to `.fit()` here.
ols_result = smf.ols(
    formula="y ~ x",
    data=pd.DataFrame(simulated_data),
).fit()


# Rename the coefficients to the keys used in `true_params` and round to 7 decimals.
# NOTE: the variable name is misspelled ("paramer"); it is left unchanged because the
# comparison cell refers to it by this name.
ols_paramer_estimates = (
    ols_result.params.rename({"Intercept": "intercept", "x": "slope"})
    .round(7)
    .to_dict()
)

### Maximum Likelihood

In [15]:
def log_likelihood(params: Params, data: Data) -> float:
    """Evaluate the Gaussian log-likelihood of the linear model.

    The conditional mean of `y` is ``intercept + slope * x`` and the standard
    deviation of the error term is fixed at one.

    Args:
        params: A dictionary with the keys "intercept" and "slope".
        data: A dictionary with the keys "x" and "y".

    Returns:
        The sum of the pointwise normal log-densities of `y` around its
        conditional mean.

    """
    fitted_mean = params["intercept"] + params["slope"] * data["x"]
    pointwise = jax.scipy.stats.norm.logpdf(data["y"], loc=fitted_mean, scale=1)
    return pointwise.sum()


# Bind the simulated sample so that the objective depends on the parameters only.
partialled_log_likelihood = partial(log_likelihood, data=simulated_data)


# JIT-compile the objective and its gradient to speed up repeated evaluations.
# `jax.grad` differentiates with respect to the first positional argument, which is
# `params` here because `data` is already bound.
loglike = jax.jit(partialled_log_likelihood)
grad_loglike = jax.jit(jax.grad(loglike))

In [16]:
# Starting values for the optimizer.
start_params = {"intercept": 0.0, "slope": 0.0}


# Maximize the log-likelihood with the "scipy_lbfgsb" algorithm, passing the JAX
# gradient as the derivative.
res = em.maximize(
    criterion=loglike,
    params=start_params,
    algorithm="scipy_lbfgsb",
    derivative=grad_loglike,
)


# Round the estimates to 7 decimals, matching the rounding of the OLS estimates.
ml_parameter_estimates = {k: np.round(v, 7) for k, v in res.params.items()}

### Comparison

In [17]:
# Print the true parameters next to the ML and the OLS estimates.
print(
    "True Parameters:",
    true_params,
    "\n\nML Results:",
    ml_parameter_estimates,
    "\n\nOLS Results:",
    ols_paramer_estimates,
)

True Parameters: {'intercept': 1.0, 'slope': 2.0} 

ML Results: {'intercept': 0.7241056, 'slope': 2.241366} 

OLS Results: {'intercept': 0.724119, 'slope': 2.2413323}
