# Homework 7

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Simulated data, 1000 standard-normal draws per term:
# W feeds into both X and Y, while Z feeds into Y only.
W = np.random.normal(loc=0, scale=1, size=(1000,))
X = W + np.random.normal(loc=0, scale=1, size=(1000,))
Z = np.random.normal(loc=0, scale=1, size=(1000,))
Y = X + Z + W + np.random.normal(loc=0, scale=1, size=(1000,))

In [7]:

# Set the seed for reproducibility.
np.random.seed(42)

n = 1_000_000  # large sample so the sample correlation is stable

# Data-generating process. The order of the four draws is kept fixed so the
# seeded random stream produces the same W, X, Z and Y on every run.
W = np.random.normal(0, 1, n)
X = W + np.random.normal(0, 1, n)
Z = np.random.normal(0, 1, n)
# Keep Y's own noise term as a variable instead of recovering it afterwards
# as Y - (X + Z + W), which only reintroduces floating-point rounding.
eps_Y = np.random.normal(0, 1, n)
Y = X + Z + W + eps_Y

# Error term of the regression of Y on X alone: everything in Y other than
# the X term, i.e. u = W + Z + eps_Y.
u = W + Z + eps_Y

# X and u share W, so they are correlated. With unit variances,
# Cov(X, u) = 1, Var(X) = 2 and Var(u) = 3, giving 1 / sqrt(6) ~= 0.408.
corr = np.corrcoef(X, u)[0, 1]
print(corr)

0.40924807899898724


In [8]:
# Load the homework data, using the first CSV column as the row index.
df = pd.read_csv(filepath_or_buffer='homework_7.1.csv', index_col=0)

In [9]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,X,W,Z,Y
0,1.137055,1.221768,0.327829,1.944532
1,-0.112905,0.465835,0.59965,0.655514
2,2.077755,1.795414,-0.063393,5.934411
3,0.456373,-0.512159,1.177413,-0.188064
4,-1.012402,0.080002,-0.275697,-0.533775


In [10]:
import pandas as pd
import statsmodels.formula.api as smf

def run_regression_on_subset(df_subset):
    """
    Estimate the slope on X from an OLS fit of Y on X and Z.

    The model 'Y ~ X + Z' is fitted on `df_subset` and the fitted
    coefficient for 'X' is returned. If fitting the model or looking up the
    coefficient raises ValueError or KeyError, a message is printed and
    None is returned instead, so callers must be prepared for None.
    """
    try:
        # Build the model, fit it and read off the X slope in one expression.
        return smf.ols('Y ~ X + Z', data=df_subset).fit().params['X']
    except (ValueError, KeyError) as e:
        # Report the failure and signal it to the caller with None.
        print(f"Could not run regression on subset: {e}")
        return None

# --- Main Analysis ---

# 1. Load your data
# Make sure 'homework_7.1.csv' is in the same directory
try:
    # Load with index_col=0, matching the earlier load cell, so both cells bind
    # `df` to the same frame (the first CSV column becomes the row index
    # rather than an extra data column).
    df = pd.read_csv('homework_7.1.csv', index_col=0)

    # 2. Define the "windows" for W
    # A row counts as "W ~ target" when W lies strictly within +/- window of it.
    window = 0.1

    def _rows_near(target):
        """Return the rows of df whose W is strictly within `window` of `target`."""
        return df.loc[(df['W'] > target - window) & (df['W'] < target + window)]

    def _format_coef(coef):
        """Format a coefficient to 4 decimals, tolerating a failed regression.

        run_regression_on_subset returns None when the fit fails; applying
        the ':.4f' format spec to None raises TypeError, which would abort
        the whole report instead of flagging just the affected row.
        """
        return "n/a (regression failed)" if coef is None else f"{coef:.4f}"

    # 3. Create the data subsets
    df_neg1 = _rows_near(-1)
    df_zero = _rows_near(0)
    df_pos1 = _rows_near(1)

    print(f"Data points for W ~ -1: {len(df_neg1)}")
    print(f"Data points for W ~  0: {len(df_zero)}")
    print(f"Data points for W ~  1: {len(df_pos1)}")
    print("-" * 30)

    # 4. Run regression for each subset and get the coefficient for X
    coef_w_neg1 = run_regression_on_subset(df_neg1)
    coef_w_zero = run_regression_on_subset(df_zero)
    coef_w_pos1 = run_regression_on_subset(df_pos1)

    # 5. Print the results (a failed fit is reported per row, not as a crash)
    print(f"Coefficient of X when W is around -1: {_format_coef(coef_w_neg1)}")
    print(f"Coefficient of X when W is around  0: {_format_coef(coef_w_zero)}")
    print(f"Coefficient of X when W is around  1: {_format_coef(coef_w_pos1)}")
    print("-" * 30)

except FileNotFoundError:
    print("Error: 'homework_7.1.csv' not found.")
    print("Please make sure the file is in the correct directory.")
except Exception as e:
    # Kept as a last-resort report so the cell still finishes on unexpected errors.
    print(f"An error occurred: {e}")

Data points for W ~ -1: 488
Data points for W ~  0: 780
Data points for W ~  1: 455
------------------------------
Coefficient of X when W is around -1: 0.8580
Coefficient of X when W is around  0: 1.3832
Coefficient of X when W is around  1: 1.9581
------------------------------


In [11]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

def make_error(corr_const, num):
    """
    Generate `num` values of a serially correlated error series.

    The series follows e_t = c * e_{t-1} + (1 - c) * u_t with c = corr_const,
    where the starting value and every u_t are standard-normal draws from
    NumPy's global random state. The starting value itself is not part of
    the returned array.
    """
    def _series():
        # One draw for the starting state, then one fresh draw per step.
        state = np.random.normal(0, 1)
        for _ in range(num):
            state = corr_const * state + (1 - corr_const) * np.random.normal(0, 1)
            yield state

    return np.fromiter(_series(), dtype=float)

def run_simulation(corr_const, num_trials=1000, num_obs=500, seed=None):
    """
    Run a Monte Carlo experiment on the OLS slope of Y ~ X.

    Each trial draws Z ~ N(0, 1), builds X = Z + e_X and Y = X + e_Y (so the
    true slope on X is 1), where e_X and e_Y are two separate
    make_error(corr_const, num_obs) series. It then fits 'Y ~ X' by OLS and
    records the estimated X coefficient and its reported standard error.

    Parameters
    ----------
    corr_const : float
        Correlation constant passed to make_error for both error series.
    num_trials : int, default 1000
        Number of simulated datasets.
    num_obs : int, default 500
        Number of observations per dataset.
    seed : int or None, default None
        If given, NumPy's global random state is seeded with it before the
        first draw, making the result reproducible. None leaves the global
        state untouched.

    Returns
    -------
    tuple
        (sd_of_estimates, mean_reported_se, ratio): the standard deviation of
        the slope estimates across trials, the mean of the OLS-reported
        standard errors, and the first divided by the second.
    """
    if seed is not None:
        # All draws here (and in make_error) come from np.random's global
        # state, so that is the generator that has to be seeded.
        np.random.seed(seed)

    beta_estimates = []
    se_estimates = []

    for _ in range(num_trials):
        # 1. Generate the data (DGP): exogenous Z plus serially correlated errors.
        Z = np.random.normal(0, 1, num_obs)

        # Separate error series for X and Y; e_X is drawn before e_Y.
        e_X = make_error(corr_const, num_obs)
        e_Y = make_error(corr_const, num_obs)

        # The true coefficient on X is 1.
        X = Z + e_X
        Y = X + e_Y

        sample = pd.DataFrame({'X': X, 'Y': Y})

        # 2. Run OLS regression (the formula API adds an intercept by default).
        results = smf.ols('Y ~ X', data=sample).fit()

        # 3. Store the estimate and its reported standard error.
        beta_estimates.append(results.params['X'])
        se_estimates.append(results.bse['X'])

    # (i) Monte Carlo standard deviation of the slope estimates across trials
    # (np.std default, i.e. ddof=0). This is an empirical estimate of the
    # sampling SD, not an exact "true" value.
    true_std_dev = np.std(beta_estimates)

    # (ii) Mean of the standard errors reported by OLS.
    mean_reported_se = np.mean(se_estimates)

    # A ratio above 1 means OLS reports smaller standard errors, on average,
    # than the spread actually observed across trials.
    ratio = true_std_dev / mean_reported_se

    return true_std_dev, mean_reported_se, ratio

# --- Run the experiment for all correlation levels ---
corr_levels = [0.2, 0.5, 0.8]

# Seed NumPy's global random state so the table is reproducible when the
# notebook is re-run from a fresh kernel.
np.random.seed(42)

# The third column is 16 wide because its label '(ii) Reported SE' is 16
# characters long; a narrower width would push the header out of line with
# the data rows. The rule is sized from the header so the two always match.
header = f"{'Corr Const':<12} | {'(i) True SD':<12} | {'(ii) Reported SE':<16} | {'Ratio (i)/(ii)':<15}"
print(header)
print("-" * len(header))

for c in corr_levels:
    (i, ii, ratio) = run_simulation(c)
    print(f"{c:<12.1f} | {i:<12.4f} | {ii:<16.4f} | {ratio:<15.4f}")

Corr Const   | (i) True SD  | (ii) Reported SE | Ratio (i)/(ii) 
---------------------------------------------------------
0.2          | 0.0299       | 0.0283          | 1.0545         
0.5          | 0.0244       | 0.0224          | 1.0895         
0.8          | 0.0169       | 0.0142          | 1.1888         
