# Week 8 Homework


In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.spatial.distance import cdist
from scipy.spatial.distance import mahalanobis
from sklearn.linear_model import LogisticRegression
from statsmodels.formula.api import ols

In [2]:
# Load both homework datasets; the first CSV column is used as the row index.
df1, df2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('homework_8.1.csv', 'homework_8.2.csv')
)

In [6]:
# --- Inverse probability weighting (IPW) estimate of the effect of X on Y ---

# Step 1: fit a logistic model of treatment on the covariate and store
# each row's predicted probability of being treated, P(X=1 | Z).
# NOTE(review): with scikit-learn's default settings this fit is L2-regularised
# (C=1.0), which shrinks the fitted propensities -- confirm that is intended.
covariates = df1[['Z']]
logit = LogisticRegression(solver='liblinear')
logit.fit(covariates, df1['X'])
df1['propensity'] = logit.predict_proba(covariates)[:, 1]

# Step 2: weight every row by the inverse of the probability of the arm it
# actually received: p for treated rows, (1 - p) for everyone else.
prob_of_observed_arm = np.where(df1['X'] == 1,
                                df1['propensity'],
                                1 - df1['propensity'])
df1['weight'] = 1 / prob_of_observed_arm

# Step 3: weighted mean outcome per arm. np.average divides by the sum of the
# weights, so each arm's mean is normalised by its own total weight.
treated_rows = df1[df1['X'] == 1]
control_rows = df1[df1['X'] == 0]
treated_mean = np.average(treated_rows['Y'], weights=treated_rows['weight'])
control_mean = np.average(control_rows['Y'], weights=control_rows['weight'])

# Step 4: the effect estimate is the gap between the two weighted means.
ate = treated_mean - control_mean
print(f'Estimated ATE using IPW: {ate}')

Estimated ATE using IPW: 2.2743275711428588


In [8]:
# Display the fitted propensity scores for the first three rows of df1.
first_three_scores = df1['propensity'].iloc[:3].to_numpy()
print("Propensity scores of the first three items:")
print(first_three_scores)


Propensity scores of the first three items:
[0.84011371 0.58464597 0.71108245]


In [11]:
# --- Nearest-neighbour matching on (Z1, Z2) with Mahalanobis distance ---

# Split into treated and control groups. The index is reset so that row labels
# coincide with the 0..n-1 positions used by the distance matrix below.
treated = df2[df2['X'] == 1].reset_index(drop=True)
control = df2[df2['X'] == 0].reset_index(drop=True)

# The Mahalanobis metric uses the inverse covariance of the covariates,
# estimated on the full sample (treated and control together).
Z = df2[['Z1', 'Z2']].values
cov = np.cov(Z.T)
inv_cov = np.linalg.inv(cov)

# All treated-to-control distances in one vectorized call instead of a
# Python-level row-by-row apply: entry [i, j] is the distance between
# treated unit i and control unit j.
pairwise_dists = cdist(
    treated[['Z1', 'Z2']].to_numpy(dtype=float),
    control[['Z1', 'Z2']].to_numpy(dtype=float),
    metric='mahalanobis',
    VI=inv_cov,
)

# Closest control for each treated unit. Controls may be reused, and ties go
# to the first control, the same tie-break as Series.idxmin.
nearest_control = pairwise_dists.argmin(axis=1)

# One row per treated unit: its outcome, the matched control's outcome, and
# the difference between the two.
matches_df = pd.DataFrame({
    'treated_Y': treated['Y'].to_numpy(),
    'control_Y': control['Y'].to_numpy()[nearest_control],
})
matches_df['diff'] = matches_df['treated_Y'] - matches_df['control_Y']

# Only treated units are matched (controls are never matched to treated
# units), so this average of treated-minus-control differences is an effect
# among the treated.
ate = matches_df['diff'].mean()
print(f'Estimated ATE using Nearest Neighbor Matching: {ate}')

Estimated ATE using Nearest Neighbor Matching: 3.4376789979126094


In [13]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis

# --- Which treated unit has the worst nearest-control match? ---

# Load the dataset the same way as the loading cell at the top of the
# notebook (index_col=0), so df2 does not pick up a stray unnamed index column.
df2 = pd.read_csv("homework_8.2.csv", index_col=0)

# Split into treated and control groups. The index is reset so that row labels
# coincide with the 0..n-1 positions used by the distance matrix below.
treated = df2[df2['X'] == 1].reset_index(drop=True)
control = df2[df2['X'] == 0].reset_index(drop=True)

# The Mahalanobis metric uses the inverse covariance of (Z1, Z2),
# estimated on the full sample (treated and control together).
Z = df2[['Z1', 'Z2']].values
cov = np.cov(Z.T)
inv_cov = np.linalg.inv(cov)

# All treated-to-control distances in one vectorized call:
# entry [i, j] is the distance between treated unit i and control unit j.
pairwise_dists = cdist(
    treated[['Z1', 'Z2']].to_numpy(dtype=float),
    control[['Z1', 'Z2']].to_numpy(dtype=float),
    metric='mahalanobis',
    VI=inv_cov,
)

# For each treated unit: the position of its nearest control and the distance
# to it, read from the same matrix so the minimum is computed only once.
nearest_control = pairwise_dists.argmin(axis=1)
treated_positions = np.arange(len(treated))
min_distance = pairwise_dists[treated_positions, nearest_control]

results_df = pd.DataFrame({
    'treated_index': treated_positions,
    'treated_Z1': treated['Z1'].to_numpy(),
    'treated_Z2': treated['Z2'].to_numpy(),
    'control_index': nearest_control,
    'control_Z1': control['Z1'].to_numpy()[nearest_control],
    'control_Z2': control['Z2'].to_numpy()[nearest_control],
    'min_distance': min_distance,
})

# Find the treated unit with the *largest* minimum distance (least common support)
worst_match = results_df.loc[results_df['min_distance'].idxmax()]

print("Treated unit with least common support:")
print(f"  Z1 = {worst_match['treated_Z1']}, Z2 = {worst_match['treated_Z2']}")
print("Nearest control unit:")
print(f"  Z1 = {worst_match['control_Z1']}, Z2 = {worst_match['control_Z2']}")
print(f"Mahalanobis distance = {worst_match['min_distance']}")

Treated unit with least common support:
  Z1 = 2.69622405256358, Z2 = 0.5381554886023228
Nearest control unit:
  Z1 = 1.5199948607657727, Z2 = -1.2822079376259403
Mahalanobis distance = 1.3830045328325056
