# Problem Set 3

Solutions to Computational Problems

## Problem 1

In [1]:
from functools import partial

import estimagic as em
import jax
import jax.numpy as jnp
import numpy as np
import pandas as pd
import statsmodels.api as sm
from jax.scipy import stats
from joblib import Parallel, delayed
from scipy.stats import norm
from sklearn.linear_model import LinearRegression

jax.config.update("jax_enable_x64", val=True)

### Functions

In [2]:
def simulate_tobit(n_samples: int, rng: np.random.Generator):
    """Simulate data from a Tobit model.

    Args:
        n_samples (int): Number of samples.
        rng (np.random.Generator): Random number generator.

    Returns:
        - np.ndarray: Features, has shape (n_samples, 1)
        - np.ndarray: Outcomes, has shape (n_samples, )

    """
    x = rng.standard_t(df=6, size=n_samples)
    u = rng.standard_normal(size=n_samples)
    y_unobserved = 3 + 2 * x + u
    y = np.clip(y_unobserved, a_min=2, a_max=None)
    return x.reshape(-1, 1), y

In [3]:
def ols_approach(x, y):
    """Run simple ordinary least squares regression.

    Args:
        x (np.ndarray): Features, has shape (n_samples, 1)
        y (np.ndarray): Outcomes, has shape (n_samples, )

    Returns:
        np.ndarray: Coefficients. First entry is the intercept.

    """
    model = LinearRegression(fit_intercept=True)
    model = model.fit(x, y)
    return np.array([model.intercept_, *model.coef_])

In [4]:
def probit_approach(x, y):
    """Run two step procedure.

    Args:
        x (np.ndarray): Features, has shape (n_samples, 1)
        y (np.ndarray): Outcomes, has shape (n_samples, )

    Returns:
        np.ndarray: Coefficients. First entry is the intercept.

    """
    mask = y > 2

    # First stage
    # ----------------------------------------------------------------------------------
    y_binary = mask.astype(int)
    probit_model = sm.Probit(y_binary, sm.add_constant(x)).fit(disp=False)

    # Second stage
    # ----------------------------------------------------------------------------------
    x_subset = sm.add_constant(x[mask])
    y_subset = y[mask]

    index = x_subset @ probit_model.params
    mills_ratio = norm.pdf(index) / norm.cdf(index)

    features = np.column_stack([x_subset, mills_ratio])

    linear_model = LinearRegression(fit_intercept=False)
    return linear_model.fit(features, y_subset).coef_[:2]

In [5]:
def large_x_approach(x, y, percentile):
    """Run OLS only on data points where x-value is among largest percentile.

    Args:
        x (np.ndarray): Features, has shape (n_samples, 1)
        y (np.ndarray): Outcomes, has shape (n_samples, )
        percentile (int): Percentage for the percentiles to compute. Values
            must be between 0 and 100 inclusive.

    Returns:
        np.ndarray: Coefficients. First entry is the intercept.

    """
    threshold = np.percentile(x, percentile)
    mask = x.flatten() > threshold
    return ols_approach(x=x[mask], y=y[mask])

In [6]:
def _simulation(
    n_samples: int,
    rng: np.random.Generator,
):
    """Run single simulation.

    Args:
        n_samples (int): Number of samples.
        rng (np.random.Generator): Random number generator.

    Returns:
        pd.DataFrame: Results of the simulation. Has rows "ols", "probit", and "large_x"
            and columns "intercept" and "slope".

    """
    x, y = simulate_tobit(n_samples, rng=rng)

    result = {
        "ols": ols_approach(x, y),
        "probit": probit_approach(x, y),
        "large_x": large_x_approach(x, y, percentile=80),
    }

    return pd.DataFrame(result, index=["intercept", "slope"]).T

### Computation

In [7]:
rng = np.random.default_rng(54321)

simulation = partial(_simulation, n_samples=500, rng=rng)

In [8]:
n_sims = 10_000

raw_result = Parallel(n_jobs=5)(delayed(simulation)() for _ in range(n_sims))

In [9]:
result = pd.concat(
    raw_result,
    axis=0,
    keys=range(n_sims),
    names=["simulation", "approach"],
)
result = result.groupby("approach").mean()

In [10]:
result  # noqa: B018

Unnamed: 0_level_0,intercept,slope
approach,Unnamed: 1_level_1,Unnamed: 2_level_1
large_x,3.165834,1.928257
ols,3.565667,1.310595
probit,3.024676,1.989339


## Problem 5

### Functions

In [11]:
def simulate_probit(n_samples: int, key: jnp.ndarray):
    """Simulate data from a Probit model.

    Args:
        n_samples (int): Number of samples.
        key (jnp.ndarray): A PRNG key.

    Returns:
        - np.ndarray: Features, has shape (n_samples, 1)
        - np.ndarray: Outcomes, has shape (n_samples, )

    """
    x_key, y_key = jax.random.split(key, num=2)

    x = jax.random.normal(x_key, shape=(n_samples,))
    probs = stats.norm.cdf(2 + 3 * x)
    y = jax.random.bernoulli(y_key, probs).astype(int)

    return x, y

In [12]:
def probit_lstsq_loss(params: dict, x: jnp.ndarray, y: jnp.ndarray):
    """Return probit least-squares loss.

    Args:
        params (dict): Params dictionary including 'intercept' and 'slope'.
        x (jnp.ndarray): Input features.
        y: (jnp.ndarray): Binary outcome.

    Returns:
        jnp.ndarray: Squared error loss.

    """
    index = params["intercept"] + x * params["slope"]
    residuals = y - stats.norm.cdf(index)
    return jnp.sum(residuals**2)

In [14]:
def get_loss_and_grad(x: jnp.ndarray, y: jnp.ndarray):
    """Return loss and gradient function.

    Args:
        x (jnp.ndarray): Input features.
        y: (jnp.ndarray): Binary outcome.

    Returns:
        - Callable: Loss function.
        - Callable: Gradient function.

    """
    loss = jax.jit(partial(probit_lstsq_loss, x=x, y=y))
    grad = jax.jit(jax.grad(loss))
    return loss, grad

In [15]:
def simulation(
    n_samples: int,
    n_sims: int,
    start_params: dict,
    seed: int,
):
    """Run a Monte Carlo simulation.

    Args:
        n_samples (int): Number of samples.
        n_sims (int): Number of simulations.
        start_params (dict): Params dictionary including 'intercept' and 'slope'.
        seed (int): Random seed.

    Returns:
        pd.DataFrame: Results of the simulation. Has columns 'intercept' and 'slope'.
            Rows correspond to simulations.

    """
    key = jax.random.PRNGKey(seed)

    result = []

    for _ in range(n_sims):
        key, subkey = jax.random.split(key)

        x, y = simulate_probit(n_samples, key=subkey)

        loss, grad = get_loss_and_grad(x=x, y=y)

        res = em.minimize(
            criterion=loss,
            derivative=grad,
            params=start_params,
            algorithm="scipy_lbfgsb",
        )

        result.append(res.params)

    return pd.DataFrame(result)

### Computation

In [16]:
results = []

for n_samples in [100, 1_000]:
    raw = simulation(
        n_samples=n_samples,
        n_sims=100,
        start_params={"intercept": 0.0, "slope": 0.0},
        seed=54321,
    )
    results.append(raw.mean())

result = pd.DataFrame(results, index=pd.Index([100, 1_000], name="n_samples"))
result  # noqa: B018

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


Unnamed: 0_level_0,intercept,slope
n_samples,Unnamed: 1_level_1,Unnamed: 2_level_1
100,16.856449,26.612203
1000,2.003203,3.016902
