# Problem Set 3

Solutions to Computational Problems

## Problem 1

In [1]:
from functools import partial

import numpy as np
import pandas as pd
import statsmodels.api as sm
from joblib import Parallel, delayed
from scipy.stats import norm
from sklearn.linear_model import LinearRegression

## Functions

In [2]:
def simulate_data(n_samples: int, rng: np.random.Generator):
    """Draw one sample from a Tobit model that is censored from below at 2.

    The regressor follows a Student-t distribution with 6 degrees of freedom and the
    error term is standard normal. The latent outcome is ``3 + 2 * x + u``; every
    latent value below 2 is reported as 2.

    Args:
        n_samples (int): Number of observations to draw.
        rng (np.random.Generator): Source of randomness. The regressor is drawn
            before the error term.

    Returns:
        - np.ndarray: Features, has shape (n_samples, 1)
        - np.ndarray: Censored outcomes, has shape (n_samples, )

    """
    regressor = rng.standard_t(df=6, size=n_samples)
    noise = rng.standard_normal(size=n_samples)

    latent_outcome = 3 + 2 * regressor + noise
    # Censoring from below: the observed outcome never drops under 2.
    observed_outcome = np.maximum(latent_outcome, 2)

    features = regressor.reshape(-1, 1)
    return features, observed_outcome

In [3]:
def ols_approach(x, y):
    """Fit a least squares regression of ``y`` on ``x`` with an intercept.

    Args:
        x (np.ndarray): Features, has shape (n_samples, 1)
        y (np.ndarray): Outcomes, has shape (n_samples, )

    Returns:
        np.ndarray: Estimated coefficients; the intercept comes first, followed by
            the slope coefficient(s).

    """
    regression = LinearRegression(fit_intercept=True)
    regression.fit(x, y)

    # Collect the intercept first, then append the slope coefficient(s).
    coefficients = [regression.intercept_]
    coefficients.extend(regression.coef_)
    return np.array(coefficients)

In [4]:
def probit_approach(x, y):
    """Run two step procedure (probit first stage, corrected OLS second stage).

    The first stage fits a probit model for the event ``y > 2`` on a constant and
    ``x``. The second stage keeps only the observations with ``y > 2`` and regresses
    ``y`` on a constant, ``x``, and the inverse Mills ratio evaluated at the fitted
    probit index. The coefficient on the inverse Mills ratio is discarded.

    Args:
        x (np.ndarray): Features, has shape (n_samples, 1)
        y (np.ndarray): Outcomes, has shape (n_samples, )

    Returns:
        np.ndarray: Coefficients. First entry is the intercept.

    """
    mask = y > 2

    # Build the design matrix once and subset it for the second stage.
    # ``has_constant="add"`` forces the intercept column: the default ("skip") would
    # silently omit it whenever the regressor column itself has no variation, and the
    # product with the probit parameters below would then fail on a shape mismatch.
    design = sm.add_constant(x, has_constant="add")

    # First stage
    # ----------------------------------------------------------------------------------
    y_binary = mask.astype(int)
    probit_model = sm.Probit(y_binary, design).fit(disp=False)

    # Second stage
    # ----------------------------------------------------------------------------------
    x_subset = design[mask]
    y_subset = y[mask]

    _pred = x_subset @ probit_model.params
    # Inverse Mills ratio pdf / cdf, computed in log space: the plain ratio turns into
    # 0 / 0 = nan once both terms underflow for a strongly negative index.
    mills_ratio = np.exp(norm.logpdf(_pred) - norm.logcdf(_pred))

    features = np.column_stack([x_subset, mills_ratio])

    # The design already contains the constant, hence no separate intercept here.
    linear_model = LinearRegression(fit_intercept=False)
    return linear_model.fit(features, y_subset).coef_[:2]

In [9]:
def large_x_approach(x, y, percentile):
    """Run OLS on the observations whose x-value lies above a percentile of x.

    Args:
        x (np.ndarray): Features, has shape (n_samples, 1)
        y (np.ndarray): Outcomes, has shape (n_samples, )
        percentile (int): Percentile of ``x`` used as cut-off. Values must be
            between 0 and 100 inclusive. Only observations strictly above the
            cut-off are kept.

    Returns:
        np.ndarray: Coefficients. First entry is the intercept.

    """
    cutoff = np.percentile(x, percentile)
    # Row selector: True where the (flattened) feature exceeds the cut-off.
    keep = x.ravel() > cutoff
    x_large, y_large = x[keep], y[keep]
    return ols_approach(x=x_large, y=y_large)

In [10]:
def _simulation(
    n_samples: int,
    rng: np.random.Generator,
):
    """Draw one data set and estimate the coefficients with every approach.

    Args:
        n_samples (int): Number of samples.
        rng (np.random.Generator): Random number generator.

    Returns:
        pd.DataFrame: One row per approach ("ols", "probit", "large_x") and the
            columns "intercept" and "slope". The large-x approach uses the 80th
            percentile as cut-off.

    """
    features, outcomes = simulate_data(n_samples, rng=rng)

    estimates = {
        "ols": ols_approach(features, outcomes),
        "probit": probit_approach(features, outcomes),
        "large_x": large_x_approach(features, outcomes, percentile=80),
    }

    # Each dictionary value becomes one row of the table.
    return pd.DataFrame.from_dict(
        estimates,
        orient="index",
        columns=["intercept", "slope"],
    )

## Computation

In [11]:
# Seeded generator, so that the random draws start from a fixed state.
rng = np.random.default_rng(seed=54321)

# Single-run simulation with the sample size and the generator already bound.
simulation = partial(_simulation, rng=rng, n_samples=500)

In [25]:
n_sims = 10_000

# Every replication gets its own statistically independent generator. Sending the
# single shared ``rng`` to all tasks is unsafe with process-based workers: the
# generator is pickled again for each dispatched batch while its state in this
# process never advances, so different batches would redraw identical samples and
# the result would depend on how joblib happens to batch the tasks.
# ``Generator.spawn`` requires NumPy >= 1.25.
child_rngs = rng.spawn(n_sims)

# The ``rng`` keyword passed here overrides the generator bound in ``simulation``.
raw_result = Parallel(n_jobs=5)(
    delayed(simulation)(rng=child_rng) for child_rng in child_rngs
)

In [26]:
# Stack the per-simulation tables under a ("simulation", "approach") row index and
# average the estimates of each approach over all simulations.
result = (
    pd.concat(
        raw_result,
        keys=range(n_sims),
        names=["simulation", "approach"],
        axis=0,
    )
    .groupby(level="approach")
    .mean()
)

In [27]:
# Display the averaged estimates: one row per approach, with the columns
# "intercept" and "slope".
result  # noqa: B018

approach,intercept,slope
large_x,3.015311,1.99422
ols,3.581019,1.268736
probit,2.996936,2.001656
