# LAB 1 - Linear Regression
### INFO 251: Applied Machine Learning

* Author: Simón Ramírez Amaya
* Date: Jan 21, 2026

# Helper Functions

#### Execute these cells to load the helper functions into memory. No need to write any code here.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from matplotlib.ticker import FuncFormatter

def plot_joint_and_conditional(
    mu_x=14.0,
    mu_y=60000.0,
    sigma_x=2.5,
    sigma_y=15000.0,
    rho=0.7,
    break_1=12.0,
    break_2=16.0,
    jump_1=8000.0,
    jump_2=12000.0,
    grid_size=300,
    n_sigma=4,
    figsize=(14, 6),
    dpi=150
):
    """
    Draw the joint density of schooling and income next to its conditional
    mean function (CMF), and return the CMF evaluated on the x-grid.

    The CMF is linear in schooling with slope
        dE[Y|X]/dX = rho * (sigma_y / sigma_x)
    plus a level shift of jump_1 for x >= break_1 and of jump_2 for
    x >= break_2.

    Parameters
    ----------
    mu_x, mu_y : float
        Centre of the schooling axis and baseline level of the CMF.
    sigma_x, sigma_y : float
        Standard deviation of schooling and of income around the CMF.
    rho : float
        Correlation parameter that sets the CMF slope.
    break_1, break_2 : float
        Schooling values at which the CMF jumps.
    jump_1, jump_2 : float
        Size of the jump applied from each break point onwards.
    grid_size : int
        Number of grid points along each axis.
    n_sigma : float
        Half-width of the plotted window, in standard deviations.
    figsize, dpi
        Passed through to plt.subplots.

    Returns
    -------
    conditional_mean_y : ndarray of shape (grid_size,)
        CMF evaluated on the evenly spaced schooling grid spanning
        mu_x -/+ n_sigma * sigma_x.
    """
    # Plot window: n_sigma standard deviations either side of each mean.
    x_lo, x_hi = mu_x - n_sigma * sigma_x, mu_x + n_sigma * sigma_x
    y_lo, y_hi = mu_y - n_sigma * sigma_y, mu_y + n_sigma * sigma_y

    # 1D axes and the 2D mesh used for the density surface.
    x = np.linspace(x_lo, x_hi, grid_size)
    y = np.linspace(y_lo, y_hi, grid_size)
    X, Y = np.meshgrid(x, y)

    # Linear trend plus the two step changes.
    slope = rho * (sigma_y / sigma_x)
    conditional_mean_y = (
        mu_y + slope * (x - mu_x)
        + jump_1 * (x >= break_1)
        + jump_2 * (x >= break_2)
    )

    # Joint density f(x, y) = f(y | x) * f(x). The CMF row and the marginal
    # both vary along the column (x) direction of the mesh, so broadcasting
    # lines them up with Y.
    marginal_x = norm.pdf(x, loc=mu_x, scale=sigma_x)
    conditional_y = norm.pdf(
        Y, loc=conditional_mean_y[np.newaxis, :], scale=sigma_y
    )
    density = conditional_y * marginal_x

    # One tick formatter shared by both panels: 60000 -> "60k".
    k_formatter = FuncFormatter(lambda val, pos: f"{int(val / 1000):d}k")

    fig, axes = plt.subplots(1, 2, figsize=figsize, dpi=dpi)
    ax_joint, ax_cmf = axes

    # Left panel: filled contours of the joint density.
    filled = ax_joint.contourf(X, Y, density, levels=30, cmap="viridis")
    fig.colorbar(filled, ax=ax_joint).set_label(
        "Probability Density", fontsize=11
    )

    # Right panel: the CMF is drawn piece by piece so that no line is
    # drawn across a jump.
    pieces = (
        x < break_1,
        (x >= break_1) & (x < break_2),
        x >= break_2,
    )
    for piece in pieces:
        ax_cmf.plot(
            x[piece],
            conditional_mean_y[piece],
            linewidth=2.5,
            color="tab:blue",
        )
    for break_point in (break_1, break_2):
        ax_cmf.axvline(break_point, linestyle="--", linewidth=1, color="black")

    # Panel-specific text.
    ax_joint.set_ylabel("Annual Income", fontsize=12)
    ax_joint.set_title("Joint Distribution of Schooling and Income", fontsize=13)
    ax_cmf.set_ylabel("Expected Annual Income", fontsize=12)
    ax_cmf.set_title("Conditional Mean Function", fontsize=13)

    # Settings common to both panels.
    for ax in axes:
        ax.set_xlim(x_lo, x_hi)
        ax.set_ylim(y_lo, y_hi)
        ax.yaxis.set_major_formatter(k_formatter)
        ax.set_xlabel("Years of Completed Schooling", fontsize=12)
        ax.tick_params(labelsize=10)
        for side in ("top", "right"):
            ax.spines[side].set_visible(False)

    plt.tight_layout()
    plt.show()

    return conditional_mean_y

In [None]:
def sample_schooling_income(
    n,
    mu_x=14.0,
    mu_y=60000.0,
    sigma_x=2.5,
    sigma_y=15000.0,
    rho=0.7,
    break_1=12.0,
    break_2=16.0,
    jump_1=0,
    jump_2=0,
    random_state=None
):
    """
    Simulate n (schooling, income) pairs.

    Schooling is drawn from a normal distribution. Income is then drawn
    from a normal distribution centred on the conditional mean
        mu_y + rho * (sigma_y / sigma_x) * (X - mu_x)
             + jump_1 * 1[X >= break_1] + jump_2 * 1[X >= break_2].

    Parameters
    ----------
    n : int
        Number of observations to draw.
    mu_x, mu_y : float
        Mean of schooling and baseline level of income.
    sigma_x, sigma_y : float
        Standard deviation of schooling and of the income noise.
    rho : float
        Correlation parameter that sets the slope of the conditional mean.
    break_1, break_2 : float
        Schooling thresholds at which the conditional mean shifts.
    jump_1, jump_2 : float
        Size of the shift applied at or above each threshold.
    random_state : int or None
        Seed passed to np.random.default_rng for reproducibility.

    Returns
    -------
    X : ndarray of shape (n,)
        Simulated schooling values.
    Y : ndarray of shape (n,)
        Simulated income values.
    """
    generator = np.random.default_rng(random_state)

    # Schooling comes first; income is drawn conditional on it.
    schooling = generator.normal(loc=mu_x, scale=sigma_x, size=n)

    # Conditional mean: linear trend plus a step at each threshold.
    slope = rho * (sigma_y / sigma_x)
    expected_income = (
        mu_y
        + slope * (schooling - mu_x)
        + jump_1 * (schooling >= break_1)
        + jump_2 * (schooling >= break_2)
    )

    income = generator.normal(loc=expected_income, scale=sigma_y, size=n)

    return schooling, income

# Exercise 1

#### Execute these cells to plot the hypothetical joint distributions that we'll be using throughout the lab. Examine the plots.

### Joint Distribution A

In [None]:
# Distribution A: plot the joint PDF and the conditional mean function (CMF).
# rho = 0 and both jumps are 0, so the CMF is flat at mu_y.
# The CMF evaluated on the plotting grid is kept in cmf_A.
cmf_A = plot_joint_and_conditional(
    mu_x=14.0, sigma_x=2.5,
    mu_y=60000.0, sigma_y=15000.0,
    rho=0.0,
    break_1=12.0, jump_1=0,
    break_2=16.0, jump_2=0,
    grid_size=300,
    n_sigma=2.75,
)

### Joint Distribution B

In [None]:
# Distribution B: plot the joint PDF and the conditional mean function (CMF).
# rho = 0.7 with both jumps at 0, so the CMF is a straight line with slope
# rho * sigma_y / sigma_x. The CMF on the plotting grid is kept in cmf_B.
cmf_B = plot_joint_and_conditional(
    mu_x=14.0, sigma_x=2.5,
    mu_y=60000.0, sigma_y=15000.0,
    rho=0.7,
    break_1=12.0, jump_1=0,
    break_2=16.0, jump_2=0,
    grid_size=300,
    n_sigma=2.75,
)

### Joint Distribution C

In [None]:
# Distribution C: plot the joint PDF and the conditional mean function (CMF).
# Same slope as with rho = 0.7, plus an 8000 jump at 12 years of schooling
# and a 12000 jump at 16 years. The CMF on the plotting grid is kept in cmf_C.
cmf_C = plot_joint_and_conditional(
    mu_x=14.0, sigma_x=2.5,
    mu_y=60000.0, sigma_y=15000.0,
    rho=0.7,
    break_1=12.0, jump_1=8000.0,
    break_2=16.0, jump_2=12000.0,
    grid_size=300,
    n_sigma=2.75,
)

# Exercise 2

#### Execute this cell to draw a sample from each distribution. Then move on to the next cell and fill in the blanks in the binscatter plot function.

In [None]:
# Draw n = 10000 observations from each of the three distributions.
# The same seed (random_state=100) is used for all three draws.

# A: rho = 0, no jumps
X_sample_A, Y_sample_A = sample_schooling_income(
    n=10000,
    mu_x=14.0, sigma_x=2.5,
    mu_y=60000.0, sigma_y=15000.0,
    rho=0.0,
    break_1=12.0, jump_1=0,
    break_2=16.0, jump_2=0,
    random_state=100,
)

# B: rho = 0.7, no jumps
X_sample_B, Y_sample_B = sample_schooling_income(
    n=10000,
    mu_x=14.0, sigma_x=2.5,
    mu_y=60000.0, sigma_y=15000.0,
    rho=0.7,
    break_1=12.0, jump_1=0,
    break_2=16.0, jump_2=0,
    random_state=100,
)

# C: rho = 0.7, jumps of 8000 at 12 years and 12000 at 16 years
X_sample_C, Y_sample_C = sample_schooling_income(
    n=10000,
    mu_x=14.0, sigma_x=2.5,
    mu_y=60000.0, sigma_y=15000.0,
    rho=0.7,
    break_1=12.0, jump_1=8000.0,
    break_2=16.0, jump_2=12000.0,
    random_state=100,
)

In [None]:
def binned_scatter(
    x,
    y,
    title,
    bin_start=8.0,
    bin_end=20.0,
    bin_width=0.5
):
    """
    Compute and plot a binned scatterplot of y against x.

    Bins are:
    - Uniformly spaced
    - Left-closed, right-open: [a, b)
    - Of width bin_width
    - Starting at bin_start and ending at bin_end

    The plot shows:
    - Bin midpoints on the x-axis
    - Mean of y within each bin on the y-axis
    - Y-axis range fixed from 20k to 100k

    Observations outside [bin_start, bin_end) are ignored. A bin that
    contains no observations gets a NaN mean.

    Parameters
    ----------
    x : array-like
        Explanatory variable (e.g., years of schooling).
    y : array-like
        Outcome variable (e.g., income). Must have the same shape as x.
    title : str
        Plot title.
    bin_start, bin_end : float
        Left edge of the first bin and right edge of the last bin.
    bin_width : float
        Width of each bin. The number of bins is
        (bin_end - bin_start) / bin_width rounded to the nearest integer.

    Raises
    ------
    ValueError
        If x and y have different shapes, or if the bin specification does
        not yield at least one bin of positive width.
    """

    x = np.asarray(x)
    y = np.asarray(y)

    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")
    if bin_width <= 0 or bin_end <= bin_start:
        raise ValueError("need bin_width > 0 and bin_end > bin_start")

    # Bin edges and midpoints. Edges are built from an integer count rather
    # than np.arange(start, stop, step) so that floating-point rounding
    # cannot add or drop the last edge.
    n_bins = int(round((bin_end - bin_start) / bin_width))
    if n_bins < 1:
        raise ValueError("bin_width is too large for the requested range")
    edges = bin_start + bin_width * np.arange(n_bins + 1)
    midpoints = edges[:-1] + bin_width / 2

    # Sample mean of y within each [left, right) bin; NaN marks empty bins.
    bin_means = np.full(n_bins, np.nan)
    for i, (left, right) in enumerate(zip(edges[:-1], edges[1:])):
        in_bin = (x >= left) & (x < right)
        if in_bin.any():
            bin_means[i] = y[in_bin].mean()

    # Plot
    fig, ax = plt.subplots(figsize=(6, 4), dpi=150)

    ax.scatter(
        midpoints,
        bin_means,
        s=35,
        color="tab:blue"
    )

    # Formatter for thousands ("k")
    def thousands_formatter(val, pos):
        return f"{int(val / 1000):d}k"

    ax.set_xlabel("Years of Completed Schooling", fontsize=11)
    ax.set_ylabel("Mean Annual Income", fontsize=11)
    ax.set_title(title, fontsize=12)
    ax.set_ylim(20000, 100000)
    ax.yaxis.set_major_formatter(FuncFormatter(thousands_formatter))
    ax.tick_params(labelsize=10)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Binned scatterplot of income against schooling for sample A.
binned_scatter(
    x=X_sample_A,
    y=Y_sample_A,
    title="Binscatter - Distribution A",
)

In [None]:
# Binned scatterplot of income against schooling for sample B.
binned_scatter(
    x=X_sample_B,
    y=Y_sample_B,
    title="Binscatter - Distribution B",
)

In [None]:
# Binned scatterplot of income against schooling for sample C.
binned_scatter(
    x=X_sample_C,
    y=Y_sample_C,
    title="Binscatter - Distribution C",
)

# Exercise 4

#### Fill in the blanks in the functions and then use them to construct plots comparing the CMF and the regression line.

In [None]:
import numpy as np

def slr_estimate(x, y):
    """
    Compute simple linear regression coefficients for y ~ x.

    Parameters
    ----------
    x : array-like, shape (n,)
        Sample values for explanatory variable (e.g., years of schooling)
    y : array-like, shape (n,)
        Sample values for outcome variable (e.g., income)

    Returns
    -------
    beta0 : float
        Intercept estimate
    beta1 : float
        Slope estimate

    Raises
    ------
    ValueError
        If x and y have different shapes, or if x has no variation (the
        slope is then undefined; this also covers empty input).

    Notes
    -----
    - This function implements the closed-form solution directly:
          beta1 = sum((x - x_bar) * (y - y_bar)) / sum((x - x_bar)^2)
          beta0 = y_bar - beta1 * x_bar
    - Uses only NumPy array arithmetic; no regression library is involved.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")

    # Deviations from the sample means.
    x_dev = x - np.sum(x) / max(x.size, 1)
    y_dev = y - np.sum(y) / max(y.size, 1)

    # The slope divides by the sum of squared deviations of x, so x must
    # take at least two distinct values.
    sxx = np.sum(x_dev ** 2)
    if sxx == 0:
        raise ValueError("x must contain at least two distinct values")

    # Slope: sample covariance of (x, y) over sample variance of x
    # (the 1/n factors cancel).
    beta1 = np.sum(x_dev * y_dev) / sxx

    # Intercept: the fitted line passes through (x_bar, y_bar).
    beta0 = np.mean(y) - beta1 * np.mean(x)

    return float(beta0), float(beta1)


def plot_cmf_vs_slr(cmf, beta0, beta1, x_min=7.125, x_max=20.875):
    """
    Plot the true conditional mean function (CMF) and the fitted simple regression line.

    Parameters
    ----------
    cmf : array-like
        True conditional mean values corresponding to the x-axis (schooling).
    beta0 : float
        Fitted intercept.
    beta1 : float
        Fitted slope.
    x_min, x_max : float
        End points of the evenly spaced schooling grid on which cmf was
        evaluated. The defaults equal 14 -/+ 2.75 * 2.5, i.e. the grid
        produced by plot_joint_and_conditional with mu_x=14.0, sigma_x=2.5
        and n_sigma=2.75. Pass other values if cmf was computed on a
        different grid.
    """
    cmf = np.asarray(cmf)

    # x-axis grid: one point per CMF value. It must match the grid used to
    # compute cmf, otherwise the CMF is drawn against the wrong x values.
    x = np.linspace(x_min, x_max, len(cmf))

    # Fitted regression line on the same grid.
    y_hat = beta0 + beta1 * x

    # Plot
    fig, ax = plt.subplots(figsize=(6, 4), dpi=150)

    # True CMF
    ax.plot(
        x,
        cmf,
        linewidth=2.5,
        color="tab:blue",
        label="True CMF"
    )

    # Fitted OLS line
    ax.plot(
        x,
        y_hat,
        linewidth=2.5,
        color="tab:orange",
        linestyle="--",
        label="OLS Fit"
    )

    # Formatter for thousands
    def thousands_formatter(val, pos):
        return f"{int(val/1000):d}k"

    # Labels and title
    ax.set_xlabel("Years of Completed Schooling", fontsize=11)
    ax.set_ylabel("Annual Income", fontsize=11)
    ax.set_ylim(20000, 100000)
    ax.set_title("True CMF vs Fitted OLS Regression", fontsize=12)

    # Axis formatting
    ax.tick_params(labelsize=10)
    ax.yaxis.set_major_formatter(FuncFormatter(thousands_formatter))

    # Clean spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    # Legend
    ax.legend(fontsize=10)

    plt.tight_layout()
    plt.show()

In [None]:
# Sample A: fit the simple regression, print (intercept, slope), and
# overlay the fitted line on the true CMF.
beta0_hat_A, beta1_hat_A = slr_estimate(x=X_sample_A, y=Y_sample_A)
print(beta0_hat_A, beta1_hat_A)
plot_cmf_vs_slr(cmf=cmf_A, beta0=beta0_hat_A, beta1=beta1_hat_A)

In [None]:
# Sample B: fit the simple regression, print (intercept, slope), and
# overlay the fitted line on the true CMF.
beta0_hat_B, beta1_hat_B = slr_estimate(x=X_sample_B, y=Y_sample_B)
print(beta0_hat_B, beta1_hat_B)
plot_cmf_vs_slr(cmf=cmf_B, beta0=beta0_hat_B, beta1=beta1_hat_B)

In [None]:
# Sample C: fit the simple regression, print (intercept, slope), and
# overlay the fitted line on the true CMF.
beta0_hat_C, beta1_hat_C = slr_estimate(x=X_sample_C, y=Y_sample_C)
print(beta0_hat_C, beta1_hat_C)
plot_cmf_vs_slr(cmf=cmf_C, beta0=beta0_hat_C, beta1=beta1_hat_C)

# Exercise 6

#### Fill in the blanks in the `ols_estimate` function and use it to revisit your estimates from Exercise 4.

In [None]:
def ols_estimate(regressors, y):
    """
    Compute OLS estimates using the matrix formula:
        beta_hat = (X'X)^{-1} X'y

    Parameters
    ----------
    regressors : list of numpy.ndarray
        List of 1D arrays, each of shape (n,),
        containing the sample observations for each regressor.
        The constant is NOT included.
    y : numpy.ndarray
        1D array of shape (n,) containing the outcome variable.

    Returns
    -------
    beta_hat : numpy.ndarray
        1D array of length (k + 1,) containing the OLS estimates.
        beta_hat[0] is the intercept; remaining elements correspond
        to regressors in the order given.

    Raises
    ------
    ValueError
        If any regressor does not have shape (n,).
    numpy.linalg.LinAlgError
        If X'X is singular (e.g. perfectly collinear regressors).
    """

    # Convert outcome to numpy array
    y = np.asarray(y, dtype=float)

    # Sample size and number of regressors (excluding the constant).
    n = y.shape[0]
    k = len(regressors)

    # Design matrix: column 0 is the constant, column j holds regressor j.
    X = np.ones((n, k + 1))
    for j, regressor in enumerate(regressors, start=1):
        column = np.asarray(regressor, dtype=float)
        if column.shape != (n,):
            raise ValueError(
                f"regressor {j - 1} has shape {column.shape}, expected ({n},)"
            )
        X[:, j] = column

    # Solve the normal equations (X'X) beta = X'y. This yields the same
    # beta_hat as (X'X)^{-1} X'y without forming the inverse explicitly.
    XtX = X.T @ X
    Xty = X.T @ y
    beta_hat = np.linalg.solve(XtX, Xty)

    return beta_hat

In [None]:
# Distribution A, with years of schooling as the only regressor.
# ols_estimate expects a list of 1D arrays (the constant is not included).
y = Y_sample_A
regressors = [X_sample_A]

# OLS estimates from the matrix formula
beta_hat = ols_estimate(regressors=regressors, y=y)

print("Intercept (beta0):", beta_hat[0])
print("Slope (beta1):", beta_hat[1])

In [None]:
# Distribution B, with years of schooling as the only regressor.
# ols_estimate expects a list of 1D arrays (the constant is not included).
y = Y_sample_B
regressors = [X_sample_B]

# OLS estimates from the matrix formula
beta_hat = ols_estimate(regressors=regressors, y=y)

print("Intercept (beta0):", beta_hat[0])
print("Slope (beta1):", beta_hat[1])

In [None]:
# Distribution C, with years of schooling as the only regressor.
# ols_estimate expects a list of 1D arrays (the constant is not included).
y = Y_sample_C
regressors = [X_sample_C]

# OLS estimates from the matrix formula
beta_hat = ols_estimate(regressors=regressors, y=y)

print("Intercept (beta0):", beta_hat[0])
print("Slope (beta1):", beta_hat[1])