## OLS, RF, and IV

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from linearmodels import AbsorbingLS
from linearmodels import PanelOLS
import itertools
from linearmodels.iv import IV2SLS
import warnings
from linearmodels.iv.absorbing import AbsorbingEffectWarning
import psutil
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

In [2]:
# Get the virtual memory details
memory_info = psutil.virtual_memory()
total_memory = memory_info.total
available_memory = memory_info.available
used_memory = memory_info.used

print(f"Total Memory: {total_memory / (1024 ** 3):.2f} GB")
print(f"Available Memory: {available_memory / (1024 ** 3):.2f} GB")
print(f"Used Memory: {used_memory / (1024 ** 3):.2f} GB")

Total Memory: 7.45 GB
Available Memory: 0.88 GB
Used Memory: 6.57 GB


In [3]:
df = pd.read_csv("./Final Data/analysis_input.csv")
df['principal_city_binary'] = df['principal_city'] = np.where(df['principal_city'] == 999999, np.nan, np.where(df['principal_city'] == 0, 0, 1))
df['urban_rural_binary'] = np.where(df['urbanrural'].isin(['M', 'U']), 1, np.where(df['urbanrural'] == 'R', 0, np.nan))
# Merger Sample
df = df.dropna(subset=['overlap'])
df_final = df.copy()

In [7]:
# Step 8: Define the function to run regressions
def run_regression(df, outcome_var, control_vars_prefixes):
    # Ensure the dataframe is indexed properly with MultiIndex for fixed effects
    df_regression = df.copy()
    #df_regression.set_index(['indivID', 'group_timeID'], inplace=True)

    # Prepare dependent and independent variables
    
    control_vars = []
    for prefix in control_vars_prefixes:
        control_vars += [col for col in df_regression.columns if col.startswith(prefix)]
    
    # Combining edum* and control variables
    df2 = df_regression[[outcome_var] + control_vars + ['indivID', 'group_timeID','clustID']].dropna()
    X = df2[control_vars]
    y = df2[outcome_var]
    
    # Running the PanelOLS regression with fixed effects and clustering by clustID
    #model = PanelOLS(y, X, entity_effects=True, drop_absorbed=True, check_rank=False).fit(cov_type='clustered', clusters=df_regression['clustID'])

    # Absorbing fixed effects (indivID and group_timeID) and clustering by clustID
    model = AbsorbingLS(y, X,  drop_absorbed=True).fit(cov_type='clustered', clusters=df2['clustID'])

    
    return model

In [8]:
# Step 10: IV Estimation
def run_iv_regression(df, dep_var, endog_var, instrument_var, control_vars):
    # Prepare dependent and independent variables
    df_regression = df.copy()
    df_regression.set_index(['indivID', 'group_timeID'], inplace=True)
    
    # Combining edum* and control variables
    df2 = df_regression[[dep_var, endog_var, instrument_var] + control_vars + ['clustID']].dropna()

    y = df2[dep_var]  # Dependent variable
    endog = df2[endog_var]  # Endogenous variable
    #print(endog)
    instrument = df2[instrument_var]  # Instrument
    #print(df2[exog_vars])
    exog = df2[control_vars]#.drop(labels = ['poptot1999', 'poptot2000', 'poptot2001', 'poptot2002', 'poptot2003', 'poptot2004', 'poptot2005', 'poptot2006', 'poptot2007', 'poptot2008', 'poptot2009', 'popden1999', 'popden2000', 'popden2001', 'popden2002', 'popden2003', 'popden2004', 'popden2005', 'popden2006', 'popden2007', 'popden2008', 'popden2009', 'pminority1999', 'pminority2000', 'pminority2001', 'pminority2002', 'pminority2003', 'pminority2004', 'pminority2005', 'pminority2006', 'pminority2007', 'pminority2008', 'pminority2009', 'pcollege1999', 'pcollege2000', 'pcollege2001', 'pcollege200'2, pcollege2003, pcollege2004, pcollege2005, pcollege2006, pcollege2007, pcollege2008, pcollege2009, medincome1999, medincome2000, medincome2001, medincome2002, medincome2003, medincome2004, medincome2005, medincome2006, medincome2007, medincome2008, medincome2009, total_branches_county1999, total_branches_county2000, total_branches_county2001, total_branches_county2002, total_branches_county2003, total_branches_county2004, total_branches_county2005, total_branches_county2006, total_branches_county2007, total_branches_county2008, total_branches_county2009, avg_branch_growth_county1999, avg_branch_growth_county2000, avg_branch_growth_county2001, avg_branch_growth_county2002, avg_branch_growth_county2003, avg_branch_growth_county2004, avg_branch_growth_county2005, avg_branch_growth_county2006, avg_branch_growth_county2007, avg_branch_growth_county2008, avg_branch_growth_county2009]
    # IV regression using IV2SLS
    model = IV2SLS(y, exog, endog, instrument)
    result = model.fit(cov_type='clustered', clusters=df2['clustID'])
    
    return result

In [9]:
ols_prefixes = ['POST_close','poptot', 'popden', 'pminority', 'pcollege', 'medincome', 'total_branches_county', 'avg_branch_growth_county']
rf_prefixes = ['POST_expose','poptot', 'popden', 'pminority', 'pcollege', 'medincome', 'total_branches_county', 'avg_branch_growth_county']
iv_prefixes = ['poptot', 'popden', 'pminority', 'pcollege', 'medincome', 'total_branches_county', 'avg_branch_growth_county']

In [10]:
# OLS for NumSBL_Rev1 
ols_result_1 = run_regression(df_final, 'sbl_total', ols_prefixes)
ols_result_2 = run_regression(df_final, 'vol_sbus', ols_prefixes)
ols_result_3 = run_regression(df_final, 'sbl_total_num', ols_prefixes)
ols_result_4 = run_regression(df_final, 'num_sbus', ols_prefixes)

In [11]:
ols_result_1

0,1,2,3
Dep. Variable:,sbl_total,R-squared:,0.0106
Estimator:,Absorbing LS,Adj. R-squared:,0.0104
No. Observations:,38048,F-statistic:,94.263
Date:,"Sun, Oct 20 2024",P-value (F-stat):,0.0000
Time:,03:09:20,Distribution:,chi2(9)
Cov. Estimator:,clustered,R-squared (No Effects):,0.0106
,,Variables Absorbed:,0.0000

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
POST_close,2235.4,1606.1,1.3918,0.1640,-912.44,5383.2
poptot,0.7555,0.2671,2.8287,0.0047,0.2320,1.2789
poptot25,0.0023,0.0029,0.7839,0.4331,-0.0034,0.0080
popden,0.3640,0.2701,1.3480,0.1777,-0.1653,0.8934
pminority,-60.877,31.634,-1.9245,0.0543,-122.88,1.1233
pcollege,6144.3,3213.9,1.9118,0.0559,-154.87,1.244e+04
medincome,0.0239,0.0175,1.3598,0.1739,-0.0105,0.0583
total_branches_county,-34.897,22.671,-1.5393,0.1237,-79.332,9.5382
avg_branch_growth_county,-574.83,945.93,-0.6077,0.5434,-2428.8,1279.1


In [17]:
ols_result_3

0,1,2,3
Dep. Variable:,sbl_total_num,R-squared:,0.0387
Estimator:,Absorbing LS,Adj. R-squared:,0.0385
No. Observations:,38048,F-statistic:,185.06
Date:,"Sun, Oct 20 2024",P-value (F-stat):,0.0000
Time:,03:09:20,Distribution:,chi2(9)
Cov. Estimator:,clustered,R-squared (No Effects):,0.0387
,,Variables Absorbed:,0.0000

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
POST_close,18.923,12.165,1.5555,0.1198,-4.9206,42.767
poptot,0.0228,0.0043,5.2648,0.0000,0.0143,0.0312
poptot25,0.0001,3.739e-05,2.7274,0.0064,2.869e-05,0.0002
popden,0.0031,0.0026,1.2090,0.2267,-0.0019,0.0082
pminority,-1.0696,0.4167,-2.5669,0.0103,-1.8862,-0.2529
pcollege,53.796,42.307,1.2716,0.2035,-29.125,136.72
medincome,0.0005,0.0002,2.8800,0.0040,0.0002,0.0008
total_branches_county,-0.4349,0.4553,-0.9550,0.3396,-1.3273,0.4576
avg_branch_growth_county,-14.644,14.876,-0.9844,0.3249,-43.800,14.512


In [12]:
# RF for NumSBL_Rev1 
rf_result_1 = run_regression(df_final, 'sbl_total', rf_prefixes)
rf_result_2 = run_regression(df_final, 'vol_sbus', rf_prefixes)
rf_result_3 = run_regression(df_final, 'sbl_total_num', rf_prefixes)
rf_result_4 = run_regression(df_final, 'num_sbus', rf_prefixes)

In [13]:
rf_result_1

0,1,2,3
Dep. Variable:,sbl_total,R-squared:,0.0116
Estimator:,Absorbing LS,Adj. R-squared:,0.0114
No. Observations:,38048,F-statistic:,89.999
Date:,"Sun, Oct 20 2024",P-value (F-stat):,0.0000
Time:,03:09:21,Distribution:,chi2(9)
Cov. Estimator:,clustered,R-squared (No Effects):,0.0116
,,Variables Absorbed:,0.0000

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
POST_expose,3300.7,2464.6,1.3393,0.1805,-1529.8,8131.3
poptot,0.7553,0.2632,2.8700,0.0041,0.2395,1.2711
poptot25,0.0023,0.0029,0.7898,0.4296,-0.0034,0.0079
popden,0.3539,0.2634,1.3434,0.1791,-0.1624,0.8703
pminority,-63.122,30.650,-2.0594,0.0395,-123.19,-3.0488
pcollege,6084.3,3232.8,1.8821,0.0598,-251.79,1.242e+04
medincome,0.0240,0.0175,1.3720,0.1701,-0.0103,0.0582
total_branches_county,-33.746,22.409,-1.5059,0.1321,-77.668,10.175
avg_branch_growth_county,-483.83,964.28,-0.5017,0.6158,-2373.8,1406.1


In [18]:
rf_result_3

0,1,2,3
Dep. Variable:,sbl_total_num,R-squared:,0.0399
Estimator:,Absorbing LS,Adj. R-squared:,0.0397
No. Observations:,38048,F-statistic:,170.99
Date:,"Sun, Oct 20 2024",P-value (F-stat):,0.0000
Time:,03:09:21,Distribution:,chi2(9)
Cov. Estimator:,clustered,R-squared (No Effects):,0.0399
,,Variables Absorbed:,0.0000

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
POST_expose,29.821,18.189,1.6395,0.1011,-5.8286,65.470
poptot,0.0227,0.0043,5.3018,0.0000,0.0143,0.0312
poptot25,0.0001,3.697e-05,2.7514,0.0059,2.926e-05,0.0002
popden,0.0030,0.0025,1.2014,0.2296,-0.0019,0.0080
pminority,-1.0882,0.4103,-2.6519,0.0080,-1.8924,-0.2839
pcollege,53.180,42.598,1.2484,0.2119,-30.311,136.67
medincome,0.0005,0.0002,2.8965,0.0038,0.0002,0.0008
total_branches_county,-0.4230,0.4549,-0.9298,0.3525,-1.3146,0.4686
avg_branch_growth_county,-13.834,15.104,-0.9159,0.3597,-43.437,15.769


In [14]:
# IV for NumSBL_Rev1 
iv_result_1 = run_iv_regression(df_final, 'sbl_total', 'POST_close', 'POST_expose', control_vars)
iv_result_2 = run_iv_regression(df_final, 'vol_sbus', 'POST_close', 'POST_expose', control_vars)
iv_result_3 = run_iv_regression(df_final, 'sbl_total_num', 'POST_close', 'POST_expose', control_vars)
iv_result_4 = run_iv_regression(df_final, 'num_sbus', 'POST_close', 'POST_expose', control_vars)


In [23]:
iv_result_1

0,1,2,3
Dep. Variable:,sbl_total,R-squared:,0.0072
Estimator:,IV-2SLS,Adj. R-squared:,0.0070
No. Observations:,38048,F-statistic:,68.938
Date:,"Sun, Oct 20 2024",P-value (F-stat),0.0000
Time:,03:09:22,Distribution:,chi2(8)
Cov. Estimator:,clustered,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
poptot,0.7356,0.2595,2.8348,0.0046,0.2270,1.2442
popden,0.3569,0.2662,1.3405,0.1801,-0.1649,0.8787
pminority,-57.148,32.967,-1.7335,0.0830,-121.76,7.4654
pcollege,5980.3,3227.8,1.8528,0.0639,-346.03,1.231e+04
medincome,0.0246,0.0170,1.4455,0.1483,-0.0087,0.0579
total_branches_county,-31.699,21.854,-1.4505,0.1469,-74.532,11.135
avg_branch_growth_county,-601.25,949.13,-0.6335,0.5264,-2461.5,1259.0
POST_close,5029.7,3808.1,1.3208,0.1866,-2434.1,1.249e+04


In [19]:
iv_result_3

0,1,2,3
Dep. Variable:,sbl_total_num,R-squared:,0.0350
Estimator:,IV-2SLS,Adj. R-squared:,0.0348
No. Observations:,38048,F-statistic:,147.50
Date:,"Sun, Oct 20 2024",P-value (F-stat),0.0000
Time:,03:09:23,Distribution:,chi2(8)
Cov. Estimator:,clustered,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
poptot,0.0226,0.0043,5.2862,0.0000,0.0142,0.0310
popden,0.0030,0.0025,1.1972,0.2312,-0.0019,0.0080
pminority,-1.0335,0.4188,-2.4674,0.0136,-1.8544,-0.2126
pcollege,52.211,42.478,1.2292,0.2190,-31.043,135.47
medincome,0.0005,0.0002,2.9819,0.0029,0.0002,0.0008
total_branches_county,-0.4056,0.4514,-0.8985,0.3689,-1.2902,0.4791
avg_branch_growth_county,-14.892,14.779,-1.0076,0.3136,-43.858,14.075
POST_close,45.447,28.243,1.6092,0.1076,-9.9078,100.80


In [None]:
iv_result_1.params['POST_close'] * 5

25148.483370482347

In [None]:
iv_result_1.std_errors['POST_close'] * 5

19040.5749172693

In [27]:
iv_result_3.params['POST_close'] * 5

227.2344857788498

In [28]:
iv_result_3.std_errors['POST_close'] * 5

141.21366907707977

In [21]:
# Step 2: Filter the dataset to keep only rows where event_year == -1 and overlap == 1
df_filtered = df[(df['event_year'] == -1) & (df['overlap'] == 1)].copy()

# Step 3: Compute the mean of each variable in the filtered data and store as scalars
scalars = {}
for var in ['sbl_total', 'vol_sbus', 'sbl_total_num', 'num_sbus']:
    scalars[var] = df_filtered[var].mean()


for var, mean_value in scalars.items():
    print(f"Mean of {var}: {mean_value}")

Mean of sbl_total: 14142.099378881987
Mean of vol_sbus: 3272.88198757764
Mean of sbl_total_num: 214.22360248447205
Mean of num_sbus: 89.13354037267081
