# PREDICTING HOMELESSNESS IN AMERICA


In [62]:
import pandas as pd
import numpy as np
import re
import dbf
import string

## Map County and Location Data to HUD Codes

- HUD Data was mannually compiled using a 2016 list of Continuum of Care areas and state-wide county names.
- Lat and lon data was downloaded from: https://www.weather.gov/gis/Counties
- State names were downloaded from Kaggle: https://www.kaggle.com/stansilas/us-state-county-name-codes/downloads/us-state-county-name-codes.zip/2

In [63]:
#Merge FIPS, COC Codes, and Lat & Lon to map across US county data 

#### HUD COC Codes & FIPS Data

In [64]:
coc_df = pd.read_csv('full_county_list.csv') #mannually compiled list with HUD codes

In [65]:
coc_df.shape

(3144, 4)

In [66]:
coc_df.head()

Unnamed: 0,FIPS,county,state,number
0,1001.0,Autauga,AL,AL-507
1,1003.0,Baldwin,AL,AL-501
2,1005.0,Barbour,AL,AL-507
3,1007.0,Bibb,AL,AL-507
4,1009.0,Blount,AL,AL-503


In [67]:
coc_df.drop(coc_df.index[383:387], inplace=True) #dropping empty rows

In [68]:
coc_df.county = coc_df.county.apply(lambda x: str(x).lower()) #lowercase strings

In [69]:
coc_df.FIPS.isna().value_counts() #check for missing FIPS codes

False    3133
True        7
Name: FIPS, dtype: int64

In [70]:
coc_df[coc_df.FIPS.isna()== True] #all Los Angeles greater area locations

Unnamed: 0,FIPS,county,state,number
197,,glendale,CA,CA-612
205,,long beach,CA,CA-606
218,,pasadena,CA,CA-607
391,,,,
392,,,,
393,,,,
394,,,,


In [71]:
coc_df.FIPS = coc_df.FIPS.fillna(0) #fill missing fields with zero

In [72]:
coc_df.FIPS = coc_df.FIPS.apply(lambda x: int(x)) #change from float to int

#### Weather.gov Data

In [73]:
#import a weather.gov DBF file of lat and lon; convert to CSV download
tabl = dbf.Table('c_02ap19/c_02ap19.dbf')
tabl.open()
dbf.export(tabl, header=False)

3332

In [74]:
lat_df = pd.read_csv('c_02ap19/c_02ap19.csv') #use pandas to manipulate weather.gov data

In [75]:
lat_df.head() 

Unnamed: 0,ME,CAR,Washington,23029,E,se,-67.6361,45.0363
0,GA,CHS,McIntosh,13191,E,se,-81.2646,31.533
1,GA,CHS,Liberty,13179,E,se,-81.2102,31.7093
2,AS,PPG,Swains Island,60040,S,,-171.0459,-11.0843
3,AS,PPG,Manu'a,60020,S,,-169.506,-14.2219
4,AS,PPG,Western,60050,S,,-170.7691,-14.3241


In [76]:
#need to move the column names down and reset the index
lat_df = lat_df.T.reset_index().T

In [77]:
lat_df = lat_df.reset_index()

In [78]:
#drop unnecessary columns
lat_df.drop(['index',1,4,5],axis=1, inplace=True)

In [79]:
lat_df.rename({0:'st', 2:'county', 3:'FIPS', 6:'LON', 7:'LAT'}, axis=1, inplace=True)
lat_df.county = lat_df.county.apply(lambda x: str(x).lower())

In [80]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [81]:
#function to remove punctuation from county names
def punct(x):
    exclude = set(string.punctuation)
    x = ''.join(ch for ch in x if ch not in exclude)
    return x

In [82]:
lat_df.county = lat_df.county.apply(lambda x: punct(x))

In [83]:
lat_df.head()

Unnamed: 0,st,county,FIPS,LON,LAT
0,ME,washington,23029,-67.6361,45.0363
1,GA,mcintosh,13191,-81.2646,31.533
2,GA,liberty,13179,-81.2102,31.7093
3,AS,swains island,60040,-171.046,-11.0843
4,AS,manua,60020,-169.506,-14.2219


In [84]:
lat_df.shape

(3332, 5)

In [85]:
#merge the HUD & FIPS codes with the location data
codes_df = pd.merge(lat_df, coc_df, how='outer', on= ('FIPS'))

In [86]:
codes_df.head()

Unnamed: 0,st,county_x,FIPS,LON,LAT,county_y,state,number
0,ME,washington,23029,-67.6361,45.0363,,,
1,GA,mcintosh,13191,-81.2646,31.533,mcintosh,GA,GA-501
2,GA,mcintosh,13191,-81.4149,31.5007,mcintosh,GA,GA-501
3,GA,liberty,13179,-81.2102,31.7093,liberty,GA,GA-501
4,GA,liberty,13179,-81.4998,31.8308,liberty,GA,GA-501


In [87]:
codes_df.state.isna().value_counts()#the location data included US territories that can be dropped

False    3237
True      114
Name: state, dtype: int64

In [88]:
codes_df.number.isna().value_counts() #HUD codes from US territories and small/new counties may not be present

False    3237
True      114
Name: number, dtype: int64

In [89]:
codes_df.dropna(inplace=True) #dropping missing data

In [90]:
codes_df.drop(['county_y','state'],axis=1, inplace=True)#dropping duplicate columns

In [91]:
codes_df.rename({'county_x': 'county'}, axis=1, inplace=True)

In [92]:
codes_df.county = codes_df.county.apply(lambda x: punct(x).strip())

#### Full State Names and Abbreviations

In [93]:
st_df = pd.read_csv('states.csv')

In [94]:
#merge the state names with the location and HUD data
codes_df = pd.merge(codes_df, st_df, how='outer', left_on=('st'), right_on=('Abbreviation'))

In [95]:
codes_df.drop('Abbreviation', axis=1, inplace=True) #drop duplicate column

In [96]:
codes_df.rename({'State':'state'}, axis=1, inplace=True)

In [97]:
codes_df.state = codes_df.state.apply(lambda x: str(x).lower()) #strings to lowercase

In [98]:
#codes_df.to_csv('codes.csv')

### FEAUTURE DATA TO MERGE:
- Princeton Eviction Data
- SSI Totals
- Unemployment Rates
- HUD Homeless Counts

## Princeton Eviction Lab Data

Downloaded from: https://data-downloads.evictionlab.org/

In [99]:
ev_df = pd.read_csv('UScounties.csv')

In [100]:
ev_df.head(2)

Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,pct-nh-pi,pct-multiple,pct-other,eviction-filings,evictions,eviction-rate,eviction-filing-rate,low-flag,imputed,subbed
0,1001,2000,Autauga County,Alabama,43671.0,10.92,3074.0,19.21,537.0,42013.0,...,0.03,0.86,0.1,61.0,40.0,1.3,1.98,1,0,0
1,1001,2001,Autauga County,Alabama,43671.0,10.92,3264.0,19.21,537.0,42013.0,...,0.03,0.86,0.1,89.0,37.0,1.13,2.73,0,0,0


In [101]:
ev_df.shape

(53436, 27)

In [102]:
ev_df.rename({'name':'county','parent-location': 'state'}, axis=1, inplace=True)#rename columns for merge

In [103]:
ev_df.county = ev_df.county.apply(lambda x: str(x).lower().strip())#strings to lowercase
ev_df.county = ev_df.county.apply(lambda x: punct(x))

In [104]:
ev_df.state = ev_df.state.apply(lambda x: str(x).lower())

In [105]:
#function remove the last word 'county' from the county names for future merges
def remove_county(value):
    value = value.split(" ")[:-1]
    return ' '.join(value)

In [106]:
#loop the function through the array
ev_df.county = ev_df.county.apply(lambda x: remove_county(x))

In [107]:
ev_df.drop({'low-flag','imputed','subbed'}, axis=1, inplace=True) #drop unnecessary columns

In [108]:
#merge HUD codes with the eviction data

In [109]:
evicts = pd.merge(ev_df, codes_df, how='outer', left_on = ('GEOID'), right_on=('FIPS'))

In [110]:
evicts.shape

(55170, 31)

In [111]:
evicts.head()

Unnamed: 0,GEOID,year,county_x,state_x,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,evictions,eviction-rate,eviction-filing-rate,st,county_y,FIPS,LON,LAT,number,state_y
0,1001,2000,autauga,alabama,43671.0,10.92,3074.0,19.21,537.0,42013.0,...,40.0,1.3,1.98,AL,autauga,1001,-86.6428,32.5349,AL-507,alabama
1,1001,2001,autauga,alabama,43671.0,10.92,3264.0,19.21,537.0,42013.0,...,37.0,1.13,2.73,AL,autauga,1001,-86.6428,32.5349,AL-507,alabama
2,1001,2002,autauga,alabama,43671.0,10.92,3454.0,19.21,537.0,42013.0,...,20.0,0.58,2.98,AL,autauga,1001,-86.6428,32.5349,AL-507,alabama
3,1001,2003,autauga,alabama,43671.0,10.92,3644.0,19.21,537.0,42013.0,...,12.0,0.33,2.94,AL,autauga,1001,-86.6428,32.5349,AL-507,alabama
4,1001,2004,autauga,alabama,43671.0,10.92,3834.0,19.21,537.0,42013.0,...,18.0,0.47,2.56,AL,autauga,1001,-86.6428,32.5349,AL-507,alabama


In [112]:
evicts.drop(columns=['county_y', 'state_y'], axis=1, inplace=True)

In [113]:
evicts.rename({'county_x':'county', 'state_x':'state'}, axis=1, inplace=True)

In [114]:
evicts.number.isna().value_counts()

False    54791
True       379
Name: number, dtype: int64

In [115]:
evicts.number.dropna(inplace=True)

In [116]:
evicts = evicts[evicts.year >= 2007]

## Social Security Data

In [117]:
ssi_df = pd.read_csv('ssi_df.csv')

In [118]:
ssi_df.head()

Unnamed: 0.1,Unnamed: 0,county,total_ssi,aged_ssi,disbl_ssi,18_und_ssi,18_64_ssi,65_ovr_ssi,state,year
0,0,autauga,1467,68,1399,236,1027,204,alabama,2016
1,1,baldwin,3521,193,3328,607,2384,530,alabama,2016
2,2,barbour,1469,104,1365,233,910,326,alabama,2016
3,3,bibb,966,29,937,95,714,157,alabama,2016
4,4,blount,1282,62,1220,121,945,216,alabama,2016


In [119]:
ssi_df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [120]:
ssi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31417 entries, 0 to 31416
Data columns (total 9 columns):
county        31291 non-null object
total_ssi     31312 non-null object
aged_ssi      31291 non-null object
disbl_ssi     31291 non-null object
18_und_ssi    31291 non-null object
18_64_ssi     31291 non-null object
65_ovr_ssi    31291 non-null object
state         31417 non-null object
year          31417 non-null int64
dtypes: int64(1), object(8)
memory usage: 2.2+ MB


In [121]:
ssi_df.county = ssi_df.county.apply(lambda x: punct(str(x)).strip())

In [122]:
ssi_df.state = ssi_df.state.apply(lambda x: str(x).strip())

In [123]:
ssi_df['18_64_ssi'].isna().value_counts()

False    31291
True       126
Name: 18_64_ssi, dtype: int64

In [124]:
ssi_df['18_und_ssi'].isna().value_counts()

False    31291
True       126
Name: 18_und_ssi, dtype: int64

In [125]:
#ssi_df[ssi_df['18_und_ssi'].isna() == True]
ssi_df = ssi_df.dropna()

In [126]:
ssi_df.replace('b',0, inplace=True)
ssi_df.replace('a',0, inplace=True)
ssi_df.replace('(X)',0, inplace=True)

In [127]:
ssi_df['18_64_ssi'] = ssi_df['18_64_ssi'].astype(int)
ssi_df['18_und_ssi'] = ssi_df['18_und_ssi'].astype(int)
ssi_df['65_ovr_ssi'] = ssi_df['65_ovr_ssi'].astype(int)
ssi_df['aged_ssi'] = ssi_df['aged_ssi'].astype(int)
ssi_df['disbl_ssi'] = ssi_df['disbl_ssi'].astype(int)
ssi_df['total_ssi'] = ssi_df['total_ssi'].astype(int)

In [128]:
#merge FIPS codes with data
co_df = codes_df[['state','county','FIPS']]
co_df.state = co_df.state.astype(str)
co_df.county = co_df.county.astype(str)
co_df.FIPS = co_df.FIPS.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [129]:
si_df = pd.merge(co_df, ssi_df, how='inner', on=('state','county'))

In [130]:
si_df.shape

(31504, 10)

In [131]:
#merge eviction data with social security data using FIPS codes and years

In [132]:
hl_df = pd.merge(evicts, si_df, how ='inner', on=('FIPS','year'))

In [133]:
hl_df.shape

(33864, 37)

In [134]:
hl_df.columns

Index(['GEOID', 'year', 'county_x', 'state_x', 'population', 'poverty-rate',
       'renter-occupied-households', 'pct-renter-occupied',
       'median-gross-rent', 'median-household-income', 'median-property-value',
       'rent-burden', 'pct-white', 'pct-af-am', 'pct-hispanic', 'pct-am-ind',
       'pct-asian', 'pct-nh-pi', 'pct-multiple', 'pct-other',
       'eviction-filings', 'evictions', 'eviction-rate',
       'eviction-filing-rate', 'st', 'FIPS', 'LON', 'LAT', 'number', 'state_y',
       'county_y', 'total_ssi', 'aged_ssi', 'disbl_ssi', '18_und_ssi',
       '18_64_ssi', '65_ovr_ssi'],
      dtype='object')

In [135]:
hl_df.drop(['state_x','county_x'],axis=1,inplace=True)

In [136]:
hl_df.rename({'county_y':'county', 'state_y':'state'}, axis=1, inplace=True)

## Unemployment Rates

In [137]:
un_df = pd.read_csv('unemployment_data.csv')

In [138]:
un_df.head()

Unnamed: 0.1,Unnamed: 0,state,county,year,unemploy_rate
0,0,alabama,autauga,2016,5.21
1,1,alabama,baldwin,2016,5.35
2,2,alabama,barbour,2016,8.53
3,3,alabama,bibb,2016,6.51
4,4,alabama,blount,2016,5.43


In [139]:
un_df.drop('Unnamed: 0',axis=1, inplace=True)

In [140]:
un_df.county = un_df.county.apply(lambda x: punct(x))

In [141]:
hl_df = pd.merge(hl_df, un_df, how='inner', on=('county','state','year'))

In [142]:
#hl_df.to_csv('for_graph.csv') #this data set can be used to create visualizations later

### In order to map homeless counts to the data set, the COC areas must be converged.

In [143]:
hl_df.columns

Index(['GEOID', 'year', 'population', 'poverty-rate',
       'renter-occupied-households', 'pct-renter-occupied',
       'median-gross-rent', 'median-household-income', 'median-property-value',
       'rent-burden', 'pct-white', 'pct-af-am', 'pct-hispanic', 'pct-am-ind',
       'pct-asian', 'pct-nh-pi', 'pct-multiple', 'pct-other',
       'eviction-filings', 'evictions', 'eviction-rate',
       'eviction-filing-rate', 'st', 'FIPS', 'LON', 'LAT', 'number', 'state',
       'county', 'total_ssi', 'aged_ssi', 'disbl_ssi', '18_und_ssi',
       '18_64_ssi', '65_ovr_ssi', 'unemploy_rate'],
      dtype='object')

In [85]:
total = hl_df[['state','number','year','population', 'renter-occupied-households','eviction-filings','evictions',
'total_ssi', 'aged_ssi', 'disbl_ssi', '18_und_ssi', '18_64_ssi','65_ovr_ssi']]

In [86]:
totals = total.groupby(['state','number','year']).sum() #arrays containing whole number totals will be added together

In [87]:
totals.reset_index(inplace=True)

In [88]:
mean = hl_df[['state','number','year','poverty-rate','median-gross-rent','median-household-income','median-property-value','rent-burden','eviction-rate','eviction-filing-rate','unemploy_rate','pct-renter-occupied',
              'pct-white', 'pct-af-am', 'pct-hispanic', 'pct-am-ind',
       'pct-asian', 'pct-nh-pi', 'pct-multiple', 'pct-other']]

In [89]:
means = mean.groupby(['state','number','year']).mean() #arrays containing percentages and measures of central tendency will be averaged

In [90]:
means.reset_index(inplace=True)

In [91]:
df = pd.merge(totals, means, how='inner', on=('number','year'))

In [92]:
df.shape

(3315, 31)

In [93]:
df.drop('state_y', axis=1, inplace=True)

In [94]:
df.rename({'state_x': 'state'}, axis=1, inplace=True)

In [95]:
df.head()

Unnamed: 0,state,number,year,population,renter-occupied-households,eviction-filings,evictions,total_ssi,aged_ssi,disbl_ssi,...,unemploy_rate,pct-renter-occupied,pct-white,pct-af-am,pct-hispanic,pct-am-ind,pct-asian,pct-nh-pi,pct-multiple,pct-other
0,alabama,AL-500,2007,845208.0,104742.0,3203.0,656.0,25480,1839,23641,...,3.23,26.345,68.865,25.145,3.43,0.195,1.435,0.03,0.77,0.12
1,alabama,AL-500,2008,845208.0,105612.0,2172.0,976.0,25762,1722,24040,...,4.555,26.345,68.865,25.145,3.43,0.195,1.435,0.03,0.77,0.12
2,alabama,AL-500,2009,845208.0,106482.0,1785.0,945.0,26244,1615,24629,...,9.27,26.345,68.865,25.145,3.43,0.195,1.435,0.03,0.77,0.12
3,alabama,AL-500,2010,853551.0,107353.0,2486.0,1605.0,27098,1525,25573,...,8.72,27.615,65.915,26.2,4.9,0.225,1.64,0.025,0.99,0.105
4,alabama,AL-500,2011,862556.0,108846.0,3708.0,2257.0,27532,1459,26073,...,7.765,28.96,64.905,26.95,4.82,0.23,1.8,0.025,1.105,0.165


## HUD Homeless Counts

In [96]:
hud_df = pd.read_csv('homeless_counts.csv') 

In [97]:
hud_df.drop('Unnamed: 0', axis=1, inplace=True)

In [98]:
hud_df.head()

Unnamed: 0,num,homeless_count,yr
0,AK-500,1105,2016
1,AK-501,835,2016
2,AL-500,1228,2016
3,AL-501,623,2016
4,AL-502,337,2016


In [99]:
df2 = pd.merge(df, hud_df, how='inner', left_on=('number','year'), right_on=('num','yr'))

In [100]:
df2.shape

(3157, 33)

In [101]:
#df2.to_csv('clean_data_set.csv')