# 01 — Data Cleaning & EDA (Housing & Homelessness)

In [None]:

import pandas as pd, numpy as np, matplotlib.pyplot as plt
# Load the raw inputs from the local data/ folder.
# Only the rent and occupancy tables have a column parsed as datetime on read.
rents = pd.read_csv(
    'data/rents.csv',
    parse_dates=['month'],
)
shelters = pd.read_csv(
    'data/shelters.csv',
)
occ = pd.read_csv(
    'data/shelter_occupancy.csv',
    parse_dates=['date'],
)
households = pd.read_csv(
    'data/households.csv',
)

# Fraction of missing values per household column (mean of the NaN mask).
missing_share = households.isna().mean()
print('Missingness — households:\n', missing_share)

# Impute missing household fields from county-year peers.
#
# Each column is filled with `col.fillna(<group statistic broadcast per row>)`
# rather than `groupby(...).transform(lambda s: s.fillna(...))`. The lambda
# form returns NaN for every row whose `county` or `year` is missing (such
# rows belong to no group in recent pandas), which silently erased values
# that were actually observed. `fillna` only touches cells that are missing,
# and the built-in 'median'/'mean' transforms avoid a Python call per group.
# Groups with no observed values still yield NaN and stay unfilled.

# Income: county-year median.
households['annual_income_usd'] = households['annual_income_usd'].fillna(
    households.groupby(['county', 'year'])['annual_income_usd'].transform('median')
)

# Children: county-year median, rounded to a whole number.
households['num_children'] = households['num_children'].fillna(
    households.groupby(['county', 'year'])['num_children'].transform('median').round()
)

# Rent burden: clamp to [0, 1] first; the group mean used for filling is
# therefore computed on the clamped values.
households['rent_burden_ratio'] = households['rent_burden_ratio'].clip(0, 1)
households['rent_burden_ratio'] = households['rent_burden_ratio'].fillna(
    households.groupby(['county', 'year'])['rent_burden_ratio'].transform('mean')
)

# Derived column: annual income times rent-burden ratio, divided by 12.
# NOTE(review): presumably the ratio is the share of income spent on rent,
# making this an estimated monthly housing cost in USD — confirm.
annual_housing_spend = households['annual_income_usd'].mul(households['rent_burden_ratio'])
households['monthly_housing_cost_est'] = annual_housing_spend.div(12.0)

# County-year summary table: one row per (county, year) with group means.
# NOTE(review): `eviction_filed_next_6mo` is presumably a 0/1 flag, so its
# mean reads as a filing rate — confirm against the data dictionary.
risk_metrics = {
    'eviction_rate': ('eviction_filed_next_6mo', 'mean'),
    'mean_income': ('annual_income_usd', 'mean'),
    'mean_rent_burden': ('rent_burden_ratio', 'mean'),
}
by_county_year = households.groupby(['county', 'year'])
risk = by_county_year.agg(**risk_metrics).reset_index()

# Line chart of average rent over time, one line per county.
fig = plt.figure()
for county_name, county_rents in rents.groupby('county'):
    # Sort within the county so the line is drawn in chronological order.
    ordered = county_rents.sort_values('month')
    plt.plot(ordered['month'], ordered['avg_rent_usd'], label=county_name)
plt.legend()
plt.title('Average Rent by County Over Time')
plt.xlabel('Month')
plt.ylabel('Avg Rent (USD)')
plt.tight_layout()
plt.show()

# Line chart of the county-year eviction rate, one marked line per county.
fig = plt.figure()
for county_name, county_risk in risk.groupby('county'):
    # Sort within the county so points connect in year order.
    ordered = county_risk.sort_values('year')
    plt.plot(ordered['year'], ordered['eviction_rate'], marker='o', label=county_name)
plt.legend()
plt.title('Eviction Rate by County-Year')
plt.xlabel('Year')
plt.ylabel('Rate')
plt.tight_layout()
plt.show()

# Attach each shelter's county and capacity to its occupancy records.
# With how='left', occupancy rows whose shelter_id is absent from `shelters`
# are kept with NaN county/capacity; the groupby below then leaves them out.
occ_full = occ.merge(shelters[['shelter_id', 'county', 'capacity']], on='shelter_id', how='left')

# Utilization = occupied beds / capacity. Capacities that are zero or negative
# are masked to NaN first: dividing by 0 would yield inf, and a single inf
# makes the county mean inf and breaks the chart's y-axis. NaN results are
# skipped by the mean instead.
usable_capacity = occ_full['capacity'].where(occ_full['capacity'] > 0)
occ_full['utilization'] = occ_full['occupied_beds'] / usable_capacity

# Mean utilization across shelters for every (county, date) pair.
# NOTE(review): nothing here resamples to weeks — the "weekly" name assumes
# the occupancy file already has one record per shelter per week; confirm.
weekly_util = occ_full.groupby(['county', 'date'])['utilization'].mean().reset_index()

# Line chart of mean shelter utilization over time, one line per county.
fig = plt.figure()
for county_name, county_util in weekly_util.groupby('county'):
    # Sort within the county so the line is drawn in date order.
    ordered = county_util.sort_values('date')
    plt.plot(ordered['date'], ordered['utilization'], label=county_name)
plt.legend()
plt.title('Weekly Shelter Utilization by County')
plt.xlabel('Week')
plt.ylabel('Utilization')
plt.tight_layout()
plt.show()

# Export the cleaned tables to a single workbook, one sheet per table.
# Sheets are written in dict insertion order; row indexes are omitted.
sheets = {
    'households_clean': households,
    'county_year_risk': risk,
    'rents': rents,
}
with pd.ExcelWriter('data/cleaned_outputs.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)
print('Wrote data/cleaned_outputs.xlsx')
