# Import packages

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels.panel import PanelOLS

# Import dataset

In [None]:
# Read the processed panel from disk and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
csv_path = "C:/Users/trung/CCDS/data/processed/df.csv"
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,name,year,poverty-rate,median-household-income,rent-burden,pct-white,households-threatened,eviction-filing-rate,total-crime
0,Baltimore City,2000,22.92,30078,26.9,30.96,58207.75,124.17,66791
1,Baltimore City,2001,22.92,30078,26.9,30.96,52564.67,111.08,64453
2,Baltimore City,2002,22.92,30078,26.9,30.96,57183.58,121.05,56550
3,Baltimore City,2003,22.92,30078,26.9,30.96,57039.41,116.82,49263
4,Baltimore City,2004,22.92,30078,26.9,30.96,60221.48,109.42,48314


# Data processing

In [22]:
# Remove the households-threatened column.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
# errors="ignore" keeps the cell re-runnable: on a second run the column is
# already gone and df is left unchanged instead of raising KeyError.
df = df.drop(columns="households-threatened", errors="ignore")

In [23]:
# Flag jurisdictions that appear on the metropolitan list with a 0/1 indicator column
metro_jurisdictions = ["Frederick County", "Montgomery County", "Prince George's County"]

is_metro = df['name'].isin(metro_jurisdictions)  # boolean membership mask, one entry per row
df['metropolitan'] = is_metro.astype(int)

In [24]:
# Natural-log versions of the income and crime columns
df['log_income'] = df['median-household-income'].pipe(np.log)

# Shift crime by one before taking the log so that a value of zero stays finite
df['log_crime'] = df['total-crime'].add(1).pipe(np.log)

In [25]:
# Interaction term: poverty rate multiplied by the 0/1 metropolitan flag
df['poverty_metro'] = df['poverty-rate'].mul(df['metropolitan'])

In [28]:
# Switch the hyphenated source column names to snake_case so every column
# follows the same naming scheme as the derived ones.
column_renames = {
    "poverty-rate": "poverty_rate",
    "median-household-income": "median_household_income",
    "rent-burden": "rent_burden",
    "pct-white": "pct_white",
    "eviction-filing-rate": "eviction_filing_rate",
    "total-crime": "total_crime",
}
df.rename(columns=column_renames, inplace=True)

In [29]:
# Display the frame as it stands after the drop, derived-column and rename cells
df

Unnamed: 0,name,year,poverty_rate,median_household_income,rent_burden,pct_white,eviction_filing_rate,total_crime,metropolitan,log_income,log_crime,poverty_metro
0,Baltimore City,2000,22.92,30078,26.9,30.96,124.17,66791,0,10.311549,11.109339,0.0
1,Baltimore City,2001,22.92,30078,26.9,30.96,111.08,64453,0,10.311549,11.073707,0.0
2,Baltimore City,2002,22.92,30078,26.9,30.96,121.05,56550,0,10.311549,10.942898,0.0
3,Baltimore City,2003,22.92,30078,26.9,30.96,116.82,49263,0,10.311549,10.804949,0.0
4,Baltimore City,2004,22.92,30078,26.9,30.96,109.42,48314,0,10.311549,10.785497,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
451,Worcester County,2014,7.65,56773,31.5,79.97,17.29,2427,0,10.946816,7.794823,0.0
452,Worcester County,2015,7.65,56773,31.5,79.97,10.03,2008,0,10.946816,7.605392,0.0
453,Worcester County,2016,6.41,61145,30.7,80.08,15.95,1987,0,11.021003,7.594884,0.0
454,Worcester County,2017,6.41,61145,30.7,80.08,14.51,1950,0,11.021003,7.576097,0.0


# Baseline model: Pooled OLS

In [None]:
# Pooled OLS of the eviction filing rate on jurisdiction characteristics.
# Column names must match the frame after the rename cell: the regressor is
# 'poverty_rate' (not 'poverty-rate'), the outcome is 'eviction_filing_rate',
# and the jurisdiction label is stored in 'name' (there is no 'county' column).
ols_vars = ['poverty_rate', 'log_income', 'rent_burden', 'log_crime', 'pct_white']
X0 = sm.add_constant(df[ols_vars])  # prepend an intercept column
y = df['eviction_filing_rate']

# Cluster-robust SEs by jurisdiction; factorize turns each distinct name into
# an integer code so the grouping variable is numeric.
county_codes = pd.factorize(df['name'])[0]
model0 = sm.OLS(y, X0)
res0 = model0.fit(cov_type='cluster', cov_kwds={'groups': county_codes})

print("Model 0: Pooled OLS")
print(res0.summary())