In [1]:
import pandas as pd
import numpy as np

## Geographic Relationships

In [2]:
# Build the geographic crosswalk from the QGIS intersection export and
# recreate the tract geoid from its zero-padded parts.
geo_source = './populationsim-master/example_wa/data/qgis_geo.csv'
geo = pd.read_csv(geo_source).drop(columns=['Unnamed: 5'])
# Every record is assigned to the single region '1'
geo['REGION'] = '1'
# Cast each code to text and left-pad it with zeros to a fixed width
for code_column, code_width in (('STATE', 2), ('COUNTY', 3), ('TRACT', 6)):
    geo[code_column] = geo[code_column].astype(str).str.zfill(code_width)
# TRACT becomes the concatenation STATE + COUNTY + TRACT
geo['TRACT'] = geo['STATE'] + geo['COUNTY'] + geo['TRACT']
geo.to_csv('./populationsim-master/example_wa/data/geo_cross_walk.csv', index=False)

## Seed Samples from PUMS

#### Persons

In [3]:
# Load the person-level PUMS file; later cells select seed columns from p_df
persons_source = './data/PUMS_2017_5YR_WA/psam_p53.csv'
p_df = pd.read_csv(persons_source)

In [4]:
# Person seed attributes, listed in the order they are written out
# (JWMNP and MAR are not selected)
person_columns = ['SERIALNO', 'SPORDER', 'PUMA', 'PWGTP', 'AGEP', 'PINCP', 'SEX', 'COW', 'SCHL']
# Keep only person records that have a value for every selected attribute
pums_data_p = p_df[person_columns].dropna()

pums_data_p.to_csv('./populationsim-master/example_wa/data/seed_persons.csv', index=False)

#### Households

In [5]:
# Load the household-level PUMS file; later cells select seed columns from h_df
households_source = './data/PUMS_2017_5YR_WA/psam_h53.csv'
h_df = pd.read_csv(households_source)

In [6]:
# Household seed attributes, listed in the order they are written out
household_columns = ['SERIALNO', 'PUMA', 'WGTP', 'HINCP', 'VEH', 'NP', 'TYPE']
# Discard household records that are missing a value in any selected column
pums_data_h = h_df[household_columns].dropna()

# Some PUMS households have no people in them; need to be filtered out of dataset.
# A household is kept only when its SERIALNO appears in the person seed
# (pums_data_p). This selects the same rows as left-joining persons onto
# households and discarding the households whose SPORDER came back NaN, but it
# avoids materialising the join and does not rely on positional indexing.
has_persons = pums_data_h['SERIALNO'].isin(pums_data_p['SERIALNO'])
pums_data_h = pums_data_h[has_persons]

pums_data_h.to_csv('./populationsim-master/example_wa/data/seed_households.csv', index=False)

## Control Totals from ACS

#### Tract

In [7]:
# DP03 table: map each selected estimate column to its control name.
# The dict order is the column order of the resulting frame.
dp03_controls = {
    'DP03_0051E': 'NUM_HH',
    'DP03_0052E': 'HHINC1',
    'DP03_0053E': 'HHINC2',
    'DP03_0054E': 'HHINC3',
    'DP03_0055E': 'HHINC4',
    'DP03_0056E': 'HHINC5',
    'DP03_0057E': 'HHINC6',
    'DP03_0058E': 'HHINC7',
    'DP03_0059E': 'HHINC8',
    'DP03_0060E': 'HHINC9',
    'DP03_0061E': 'HHINC10',
    'DP03_0047E': 'COW1',
    'DP03_0048E': 'COW2',
    'DP03_0049E': 'COW3',
    'DP03_0050E': 'COW4',
}
t3_df = pd.read_csv('./data/ACS_2019_WA/dp03.csv', low_memory=False)
# Keep GEO_ID plus the mapped columns, then apply the control names
t3_data = t3_df[['GEO_ID', *dp03_controls]].rename(columns=dp03_controls)

# DP05 table: map each selected estimate column to its age control name.
# The dict order is the column order of the resulting frame.
dp05_controls = {
    'DP05_0008E': 'AGE4',  # Start at 20yrs old; there must be people in the seed to meet the controls
    'DP05_0009E': 'AGE5',
    'DP05_0010E': 'AGE6',
    'DP05_0011E': 'AGE7',
    'DP05_0012E': 'AGE8',
    'DP05_0013E': 'AGE9',
    'DP05_0014E': 'AGE10',
    'DP05_0015E': 'AGE11',
    'DP05_0016E': 'AGE12',
    'DP05_0017E': 'AGE13',
}
t5_df = pd.read_csv('./data/ACS_2019_WA/dp05.csv', low_memory=False)
# Keep GEO_ID plus the mapped columns, then apply the control names
t5_data = t5_df[['GEO_ID', *dp05_controls]].rename(columns=dp05_controls)

In [8]:
# Combine the DP03 and DP05 extracts on GEO_ID and discard rows with missing values
tract_data = pd.merge(t3_data, t5_data, on='GEO_ID').dropna()
# The tract id is the last 11 characters of the extended geo id (leading zeros
# are kept); every tract is assigned to region '1'
tract_data = tract_data.assign(
    TRACT=tract_data['GEO_ID'].str[-11:],
    REGION='1',
)
# Skip the first row of the joined table
# NOTE(review): presumably the descriptive label row of the ACS export -- confirm
tract_data = tract_data.iloc[1:, :]
tract_data.to_csv('./populationsim-master/example_wa/data/control_totals_tract.csv', index=False)

#### Region

In [9]:
# Sum the values across all census tracts to get region marginals.
# The identifier columns are excluded by name rather than by position, so the
# selection does not depend on where they sit in tract_data.
control_values = tract_data.drop(columns=['GEO_ID', 'TRACT', 'REGION'])
# errors='coerce' turns any entry that cannot be parsed as a number into NaN,
# which sum() then skips. The previous errors='ignore' is deprecated in pandas
# and left such a column as strings, so sum() silently concatenated the text
# instead of adding the counts.
control_values = control_values.apply(pd.to_numeric, errors='coerce')
# One-row frame: a column per control, holding the total over all tracts
region_data = control_values.sum().to_frame().transpose()
region_data['REGION'] = '1'
region_data = region_data.dropna()
region_data.to_csv('./populationsim-master/example_wa/data/control_totals_meta.csv', index=False)