## OLS, RF, and IV

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from linearmodels import AbsorbingLS
from linearmodels import PanelOLS
import itertools
from linearmodels.iv import IV2SLS
import warnings
from linearmodels.iv.absorbing import AbsorbingEffectWarning
import psutil
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

In [2]:
# Get the virtual memory details
memory_info = psutil.virtual_memory()
total_memory = memory_info.total
available_memory = memory_info.available
used_memory = memory_info.used

print(f"Total Memory: {total_memory / (1024 ** 3):.2f} GB")
print(f"Available Memory: {available_memory / (1024 ** 3):.2f} GB")
print(f"Used Memory: {used_memory / (1024 ** 3):.2f} GB")

Total Memory: 7.45 GB
Available Memory: 0.95 GB
Used Memory: 6.50 GB


In [3]:
df = pd.read_csv("./Final Data/analysis_input.csv")
df['principal_city_binary'] = df['principal_city'] = np.where(df['principal_city'] == 999999, np.nan, np.where(df['principal_city'] == 0, 0, 1))
df['urban_rural_binary'] = np.where(df['urbanrural'].isin(['M', 'U']), 1, np.where(df['urbanrural'] == 'R', 0, np.nan))
# Merger Sample
df = df.dropna(subset=['overlap'])
control_vars = ['popden', 'pminority', 'pcollege', 'medincome', 'total_branches_county', 'avg_branch_growth_county']

### Target-Only Control Tracts

In [4]:
exposed_df = df[(df['overlap'] == 1)] ## overlapped tracts

In [5]:
non_target_control_df = df[(df['overlap'] == 0) & (df['target_only'] == 0)] ## Non-target control tracts

In [6]:
target_control_df = df[(df['overlap'] == 0) & (df['target_only'] == 1)] ## target control tracts

In [7]:
df = pd.concat([exposed_df,target_control_df])

In [8]:
df

Unnamed: 0,year,state,county,tract,pop_group,income_group,num_100k,vol_100k,num_250k,vol_250k,...,branch_closures_tract,avg_branch_growth_tract,total_branches_county,branch_closures_county,avg_branch_growth_county,total_branches,branch_closures,avg_branch_growth_merger,principal_city_binary,urban_rural_binary
7222,2010,36,119,9400.0,L,8.0,321.0,3905.0,11.0,2173.0,...,,,283,12.0,-0.077869,1,,,1.0,1.0
7223,2011,36,119,9400.0,L,8.0,378.0,5119.0,9.0,1752.0,...,0.0,,302,0.0,-0.077869,2,0.0,,1.0,1.0
7224,2011,36,119,9400.0,L,8.0,378.0,5119.0,9.0,1752.0,...,0.0,,302,0.0,-0.077869,2,0.0,,1.0,1.0
7225,2012,36,119,9400.0,L,8.0,396.0,4864.0,11.0,2150.0,...,0.0,0.384615,297,5.0,0.049470,2,0.0,1.0,,
7226,2012,36,119,9400.0,L,8.0,396.0,4864.0,11.0,2150.0,...,0.0,0.384615,297,5.0,0.049470,2,0.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119351,2018,39,133,601000.0,S,102.0,32.0,315.0,0.0,0.0,...,0.0,2.000000,28,1.0,-0.034483,1,0.0,,0.0,1.0
119376,2017,39,155,933002.0,S,103.0,73.0,1049.0,4.0,741.0,...,0.0,0.500000,42,0.0,0.200000,1,,,1.0,1.0
119377,2018,39,155,933002.0,S,103.0,57.0,802.0,4.0,723.0,...,0.0,0.500000,39,3.0,0.083333,1,0.0,,1.0,1.0
119378,2019,39,155,933002.0,S,103.0,57.0,758.0,2.0,446.0,...,0.0,0.000000,39,0.0,-0.071429,1,0.0,0.0,1.0,1.0


In [12]:
# Step 8: Define the function to run regressions
def run_regression(df, outcome_var, control_vars_prefixes):
    # Ensure the dataframe is indexed properly with MultiIndex for fixed effects
    df_regression = df.copy()
    #df_regression.set_index(['indivID', 'group_timeID'], inplace=True)

    # Prepare dependent and independent variables
    
    control_vars = []
    for prefix in control_vars_prefixes:
        control_vars += [col for col in df_regression.columns if col.startswith(prefix)]
    
    # Combining edum* and control variables
    df2 = df_regression[[outcome_var] + control_vars + ['indivID', 'group_timeID','clustID']].dropna()
    X = df2[control_vars]
    y = df2[outcome_var]
    
    # Running the PanelOLS regression with fixed effects and clustering by clustID
    #model = PanelOLS(y, X, entity_effects=True, drop_absorbed=True, check_rank=False).fit(cov_type='clustered', clusters=df_regression['clustID'])

    # Absorbing fixed effects (indivID and group_timeID) and clustering by clustID
    model = AbsorbingLS(y, X,  drop_absorbed=True).fit(cov_type='clustered', clusters=df2['clustID'])

    
    return model

In [13]:
# Step 10: IV Estimation
def run_iv_regression(df, dep_var, endog_var, instrument_var, control_vars):
    # Prepare dependent and independent variables
    df_regression = df.copy()
    df_regression.set_index(['indivID', 'group_timeID'], inplace=True)
    
    # Combining edum* and control variables
    df2 = df_regression[[dep_var, endog_var, instrument_var] + control_vars + ['clustID']].dropna()

    y = df2[dep_var]  # Dependent variable
    endog = df2[endog_var]  # Endogenous variable
    #print(endog)
    instrument = df2[instrument_var]  # Instrument
    #print(df2[exog_vars])
    exog = df2[control_vars]#.drop(labels = ['poptot1999', 'poptot2000', 'poptot2001', 'poptot2002', 'poptot2003', 'poptot2004', 'poptot2005', 'poptot2006', 'poptot2007', 'poptot2008', 'poptot2009', 'popden1999', 'popden2000', 'popden2001', 'popden2002', 'popden2003', 'popden2004', 'popden2005', 'popden2006', 'popden2007', 'popden2008', 'popden2009', 'pminority1999', 'pminority2000', 'pminority2001', 'pminority2002', 'pminority2003', 'pminority2004', 'pminority2005', 'pminority2006', 'pminority2007', 'pminority2008', 'pminority2009', 'pcollege1999', 'pcollege2000', 'pcollege2001', 'pcollege200'2, pcollege2003, pcollege2004, pcollege2005, pcollege2006, pcollege2007, pcollege2008, pcollege2009, medincome1999, medincome2000, medincome2001, medincome2002, medincome2003, medincome2004, medincome2005, medincome2006, medincome2007, medincome2008, medincome2009, total_branches_county1999, total_branches_county2000, total_branches_county2001, total_branches_county2002, total_branches_county2003, total_branches_county2004, total_branches_county2005, total_branches_county2006, total_branches_county2007, total_branches_county2008, total_branches_county2009, avg_branch_growth_county1999, avg_branch_growth_county2000, avg_branch_growth_county2001, avg_branch_growth_county2002, avg_branch_growth_county2003, avg_branch_growth_county2004, avg_branch_growth_county2005, avg_branch_growth_county2006, avg_branch_growth_county2007, avg_branch_growth_county2008, avg_branch_growth_county2009]
    # IV regression using IV2SLS
    model = IV2SLS(y, exog, endog, instrument)
    result = model.fit(cov_type='clustered', clusters=df2['clustID'])
    
    return result

In [14]:
ols_prefixes = ['POST_close','poptot', 'popden', 'pminority', 'pcollege', 'medincome', 'total_branches_county', 'avg_branch_growth_county']
rf_prefixes = ['POST_expose','poptot', 'popden', 'pminority', 'pcollege', 'medincome', 'total_branches_county', 'avg_branch_growth_county']
iv_prefixes = ['poptot', 'popden', 'pminority', 'pcollege', 'medincome', 'total_branches_county', 'avg_branch_growth_county']

In [15]:
# OLS for NumSBL_Rev1 
ols_result_1 = run_regression(df, 'sbl_total', ols_prefixes)
ols_result_2 = run_regression(df, 'vol_sbus', ols_prefixes)
ols_result_3 = run_regression(df, 'sbl_total_num', ols_prefixes)
ols_result_4 = run_regression(df, 'num_sbus', ols_prefixes)

In [16]:
ols_result_1

0,1,2,3
Dep. Variable:,sbl_total,R-squared:,0.0278
Estimator:,Absorbing LS,Adj. R-squared:,0.0267
No. Observations:,8432,F-statistic:,465.43
Date:,"Sun, Oct 20 2024",P-value (F-stat):,0.0000
Time:,03:28:21,Distribution:,chi2(9)
Cov. Estimator:,clustered,R-squared (No Effects):,0.0278
,,Variables Absorbed:,0.0000

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
POST_close,4147.4,3374.3,1.2291,0.2190,-2466.0,1.076e+04
poptot,0.2563,0.4152,0.6172,0.5371,-0.5576,1.0702
poptot25,-0.0015,0.0028,-0.5212,0.6022,-0.0070,0.0041
popden,1.5316,0.2964,5.1669,0.0000,0.9506,2.1125
pminority,17.434,84.682,0.2059,0.8369,-148.54,183.41
pcollege,1.034e+04,4821.0,2.1440,0.0320,887.38,1.979e+04
medincome,-0.0353,0.0407,-0.8670,0.3860,-0.1150,0.0444
total_branches_county,-141.17,12.618,-11.188,0.0000,-165.90,-116.44
avg_branch_growth_county,973.13,1506.3,0.6460,0.5182,-1979.1,3925.4


In [17]:
# RF for NumSBL_Rev1 
rf_result_1 = run_regression(df, 'sbl_total', rf_prefixes)
rf_result_2 = run_regression(df, 'vol_sbus', rf_prefixes)
rf_result_3 = run_regression(df, 'sbl_total_num', rf_prefixes)
rf_result_4 = run_regression(df, 'num_sbus', rf_prefixes)

In [18]:
rf_result_1

0,1,2,3
Dep. Variable:,sbl_total,R-squared:,0.0258
Estimator:,Absorbing LS,Adj. R-squared:,0.0247
No. Observations:,8432,F-statistic:,499.40
Date:,"Sun, Oct 20 2024",P-value (F-stat):,0.0000
Time:,03:28:21,Distribution:,chi2(9)
Cov. Estimator:,clustered,R-squared (No Effects):,0.0257
,,Variables Absorbed:,0.0000

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
POST_expose,4059.0,3014.5,1.3465,0.1781,-1849.4,9967.4
poptot,0.2764,0.4300,0.6429,0.5203,-0.5663,1.1191
poptot25,-0.0017,0.0029,-0.5635,0.5731,-0.0074,0.0041
popden,1.5051,0.2834,5.3114,0.0000,0.9497,2.0605
pminority,4.4559,75.787,0.0588,0.9531,-144.08,153.00
pcollege,9469.1,4258.0,2.2239,0.0262,1123.7,1.781e+04
medincome,-0.0316,0.0386,-0.8206,0.4119,-0.1072,0.0439
total_branches_county,-140.30,13.030,-10.767,0.0000,-165.84,-114.76
avg_branch_growth_county,1635.0,1337.1,1.2228,0.2214,-985.73,4255.7


In [19]:
rf_result_3

0,1,2,3
Dep. Variable:,sbl_total_num,R-squared:,0.0915
Estimator:,Absorbing LS,Adj. R-squared:,0.0905
No. Observations:,8432,F-statistic:,641.57
Date:,"Sun, Oct 20 2024",P-value (F-stat):,0.0000
Time:,03:28:21,Distribution:,chi2(9)
Cov. Estimator:,clustered,R-squared (No Effects):,0.0915
,,Variables Absorbed:,0.0000

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
POST_expose,38.023,22.486,1.6910,0.0908,-6.0488,82.094
poptot,0.0080,0.0059,1.3461,0.1783,-0.0036,0.0196
poptot25,6.274e-05,6.297e-05,0.9963,0.3191,-6.068e-05,0.0002
popden,0.0197,0.0026,7.6384,0.0000,0.0146,0.0247
pminority,-0.2671,0.6629,-0.4029,0.6870,-1.5663,1.0321
pcollege,94.507,49.234,1.9195,0.0549,-1.9902,191.01
medincome,-3.817e-06,0.0003,-0.0126,0.9899,-0.0006,0.0006
total_branches_county,-1.3697,0.1666,-8.2197,0.0000,-1.6964,-1.0431
avg_branch_growth_county,22.948,16.535,1.3878,0.1652,-9.4606,55.356


In [20]:
# IV for NumSBL_Rev1 
iv_result_1 = run_iv_regression(df, 'sbl_total', 'POST_close', 'POST_expose', control_vars)
iv_result_2 = run_iv_regression(df, 'vol_sbus', 'POST_close', 'POST_expose', control_vars)
iv_result_3 = run_iv_regression(df, 'sbl_total_num', 'POST_close', 'POST_expose', control_vars)
iv_result_4 = run_iv_regression(df, 'num_sbus', 'POST_close', 'POST_expose', control_vars)


In [21]:
iv_result_1

0,1,2,3
Dep. Variable:,sbl_total,R-squared:,0.0234
Estimator:,IV-2SLS,Adj. R-squared:,0.0225
No. Observations:,8432,F-statistic:,376.17
Date:,"Sun, Oct 20 2024",P-value (F-stat),0.0000
Time:,03:28:21,Distribution:,chi2(8)
Cov. Estimator:,clustered,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
poptot,0.1532,0.3702,0.4138,0.6790,-0.5724,0.8788
popden,1.5342,0.2992,5.1284,0.0000,0.9479,2.1206
pminority,24.642,88.572,0.2782,0.7808,-148.96,198.24
pcollege,1.091e+04,5361.8,2.0346,0.0419,400.30,2.142e+04
medincome,-0.0404,0.0413,-0.9784,0.3279,-0.1214,0.0406
total_branches_county,-126.85,13.387,-9.4760,0.0000,-153.09,-100.62
avg_branch_growth_county,165.69,1936.1,0.0856,0.9318,-3629.0,3960.4
POST_close,7630.7,5557.8,1.3730,0.1698,-3262.3,1.852e+04


In [22]:
iv_result_3

0,1,2,3
Dep. Variable:,sbl_total_num,R-squared:,0.0823
Estimator:,IV-2SLS,Adj. R-squared:,0.0814
No. Observations:,8432,F-statistic:,544.84
Date:,"Sun, Oct 20 2024",P-value (F-stat),0.0000
Time:,03:28:22,Distribution:,chi2(8)
Cov. Estimator:,clustered,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
poptot,0.0069,0.0055,1.2584,0.2082,-0.0038,0.0176
popden,0.0200,0.0027,7.2717,0.0000,0.0146,0.0254
pminority,-0.0772,0.7840,-0.0985,0.9215,-1.6138,1.4594
pcollege,107.94,54.593,1.9771,0.0480,0.9366,214.94
medincome,-8.574e-05,0.0003,-0.2669,0.7896,-0.0007,0.0005
total_branches_county,-1.2452,0.1633,-7.6258,0.0000,-1.5653,-0.9252
avg_branch_growth_county,9.2863,19.212,0.4834,0.6288,-28.368,46.941
POST_close,71.511,41.742,1.7132,0.0867,-10.302,153.32


In [23]:
# Step 2: Filter the dataset to keep only rows where event_year == -1 and overlap == 1
df_filtered = df[(df['event_year'] == -1) & (df['overlap'] == 1)].copy()

# Step 3: Compute the mean of each variable in the filtered data and store as scalars
scalars = {}
for var in ['sbl_total', 'vol_sbus']:
    scalars[var] = df_filtered[var].mean()


for var, mean_value in scalars.items():
    print(f"Mean of {var}: {mean_value}")

Mean of sbl_total: 14142.099378881987
Mean of vol_sbus: 3272.88198757764
