In [1]:
from functools import reduce
import pandas as pd
import numpy as np

## Geographic Relationships

In [2]:
# Build the geographic crosswalk from the QGIS intersection export:
# drop the stray 'Unnamed: 5' column and tag every row with region '1'.
geo = (
    pd.read_csv('./populationsim-master/example_wa/data/qgis_geo.csv')
    .drop(columns='Unnamed: 5')
    .assign(REGION='1')
)

# Convert each code to a string left-padded with zeros to a fixed width.
for code_col, width in (('STATE', 2), ('COUNTY', 3), ('TRACT', 6)):
    geo[code_col] = geo[code_col].astype(str).str.zfill(width)

# Recreate the geoid: TRACT becomes state + county + tract (2 + 3 + 6 = 11 chars).
geo['TRACT'] = geo['STATE'] + geo['COUNTY'] + geo['TRACT']

geo.to_csv('./populationsim-master/example_wa/data/geo_cross_walk.csv', index=False)

## Seed Samples from PUMS

#### Persons

In [3]:
# Raw person-level PUMS records (psam_p53.csv from the PUMS_2017_5YR_WA folder).
p_df = pd.read_csv(
    './data/PUMS_2017_5YR_WA/psam_p53.csv',
)

In [4]:
# Keep only the person attributes written to the seed file, and discard every
# record that has a missing value in any of those attributes.
pums_data_p = p_df.loc[
    :,
    ['SERIALNO', 'SPORDER', 'PUMA', 'PWGTP', 'PINCP', 'AGEP', 'SEX', 'SCHL', 'COW'],
].dropna(how='any')

pums_data_p.to_csv(
    './populationsim-master/example_wa/data/seed_persons.csv',
    index=False,
)

#### Households

In [5]:
# Raw household-level PUMS records (psam_h53.csv from the PUMS_2017_5YR_WA folder).
h_df = pd.read_csv(
    './data/PUMS_2017_5YR_WA/psam_h53.csv',
)

In [6]:
# Keep only the household attributes written to the seed file, and discard
# every record that has a missing value in any of those attributes.
pums_data_h = h_df[['SERIALNO', 'PUMA', 'WGTP', 'HINCP', 'NP', 'VEH']].dropna()

# Some PUMS households have no people in them; need to be filtered out of dataset.
# A left merge leaves SPORDER as NaN for any household with no matching row in
# pums_data_p (which has already had incomplete person records removed).
hh_persons = pd.merge(pums_data_h, pums_data_p, how='left', on='SERIALNO')

# Select the unmatched rows with a boolean mask rather than feeding index
# labels to a positional indexer; the mask stays correct whatever the index is.
no_person_match = hh_persons['SPORDER'].isna()
empty_households = hh_persons.loc[no_person_match, 'SERIALNO'].values
pums_data_h = pums_data_h[~pums_data_h['SERIALNO'].isin(empty_households)]

pums_data_h.to_csv('./populationsim-master/example_wa/data/seed_households.csv', index=False)

## Control Totals from ACS

#### Tract

In [7]:
# DP02 table: pick the seven estimate columns used as SCHL controls and
# rename them. The mapping keeps each source column next to its control name.
t2_df = pd.read_csv('./data/ACS_2019_WA/dp02.csv', low_memory=False)
dp02_to_control = {
    'DP02_0060E': 'SCHL1',
    'DP02_0061E': 'SCHL2',
    'DP02_0062E': 'SCHL3',
    'DP02_0063E': 'SCHL4',
    'DP02_0064E': 'SCHL5',
    'DP02_0065E': 'SCHL6',
    'DP02_0066E': 'SCHL7',
}
# GEO_ID is not in the mapping, so rename() leaves it untouched as the join key.
t2_data = t2_df[['GEO_ID'] + list(dp02_to_control)].rename(columns=dp02_to_control)

# DP03 table: pick the estimate columns used for the household count (NUM_HH),
# the HINCP controls and the COW controls, then rename them.
# The mapping keeps each source column next to its control name; its order
# fixes the output column order.
t3_df = pd.read_csv('./data/ACS_2019_WA/dp03.csv', low_memory=False)
dp03_to_control = {
    'DP03_0051E': 'NUM_HH',
    'DP03_0052E': 'HINCP1',
    'DP03_0053E': 'HINCP2',
    'DP03_0054E': 'HINCP3',
    'DP03_0055E': 'HINCP4',
    'DP03_0056E': 'HINCP5',
    'DP03_0057E': 'HINCP6',
    'DP03_0058E': 'HINCP7',
    'DP03_0059E': 'HINCP8',
    'DP03_0060E': 'HINCP9',
    'DP03_0061E': 'HINCP10',
    'DP03_0047E': 'COW1',
    'DP03_0048E': 'COW2',
    'DP03_0049E': 'COW3',
    'DP03_0050E': 'COW4',
}
# GEO_ID is not in the mapping, so rename() leaves it untouched as the join key.
t3_data = t3_df[['GEO_ID'] + list(dp03_to_control)].rename(columns=dp03_to_control)

# DP04 table: pick the four estimate columns used as VEH controls and
# rename them. The mapping keeps each source column next to its control name.
t4_df = pd.read_csv('./data/ACS_2019_WA/dp04.csv', low_memory=False)
dp04_to_control = {
    'DP04_0058E': 'VEH1',
    'DP04_0059E': 'VEH2',
    'DP04_0060E': 'VEH3',
    'DP04_0061E': 'VEH4',
}
# GEO_ID is not in the mapping, so rename() leaves it untouched as the join key.
t4_data = t4_df[['GEO_ID'] + list(dp04_to_control)].rename(columns=dp04_to_control)

# DP05 table: pick the ten estimate columns used as AGEP controls and
# rename them. The mapping keeps each source column next to its control name.
t5_df = pd.read_csv('./data/ACS_2019_WA/dp05.csv', low_memory=False)
dp05_to_control = {
    # Start at 20yrs old; there must be people in the seed to meet the controls
    # NOTE(review): confirm DP05_0008E is the 20-24 bracket in the 2019 DP05
    # layout; if it is the 15-19 bracket, the "start at 20" intent is not met.
    'DP05_0008E': 'AGEP1',
    'DP05_0009E': 'AGEP2',
    'DP05_0010E': 'AGEP3',
    'DP05_0011E': 'AGEP4',
    'DP05_0012E': 'AGEP5',
    'DP05_0013E': 'AGEP6',
    'DP05_0014E': 'AGEP7',
    'DP05_0015E': 'AGEP8',
    'DP05_0016E': 'AGEP9',
    'DP05_0017E': 'AGEP10',
}
# GEO_ID is not in the mapping, so rename() leaves it untouched as the join key.
t5_data = t5_df[['GEO_ID'] + list(dp05_to_control)].rename(columns=dp05_to_control)

# S2501 table: pick the four estimate columns used as NP controls and
# rename them. The mapping keeps each source column next to its control name.
t7_df = pd.read_csv('./data/ACS_2019_WA/s2501.csv', low_memory=False)
s2501_to_control = {
    'S2501_C01_002E': 'NP1',
    'S2501_C01_003E': 'NP2',
    'S2501_C01_004E': 'NP3',
    'S2501_C01_005E': 'NP4',
}
# GEO_ID is not in the mapping, so rename() leaves it untouched as the join key.
t7_data = t7_df[['GEO_ID'] + list(s2501_to_control)].rename(columns=s2501_to_control)

In [8]:
# Join data from different census tables.
# Each step is pandas' default inner join on GEO_ID, so only geographies
# present in every table survive.
tract_dataframes = [t2_data, t3_data, t4_data, t5_data, t7_data]
joined_tables = tract_dataframes[0]
for next_table in tract_dataframes[1:]:
    joined_tables = joined_tables.merge(next_table, on='GEO_ID')

tract_data = (
    joined_tables
    .dropna()
    .assign(
        # Get tract id from the extended geo id: its last 11 characters.
        TRACT=lambda frame: frame['GEO_ID'].str.slice(-11),
        # Assign region 1.
        REGION='1',
    )
    # Discard the first remaining row. Presumably this is the descriptive
    # label row of the Census export -- TODO confirm against the raw CSVs.
    .iloc[1:, :]
)
tract_data.to_csv('./populationsim-master/example_wa/data/control_totals_tract.csv', index=False)
tract_data

Unnamed: 0,GEO_ID,SCHL1,SCHL2,SCHL3,SCHL4,SCHL5,SCHL6,SCHL7,NUM_HH,HINCP1,...,AGEP7,AGEP8,AGEP9,AGEP10,NP1,NP2,NP3,NP4,TRACT,REGION
1,1400000US53001950100,53,121,592,409,190,255,155,1050,80,...,263,235,141,78,373,336,113,228,53001950100,1
2,1400000US53001950200,20,102,343,303,116,171,53,615,51,...,115,213,66,11,135,278,58,144,53001950200,1
3,1400000US53001950300,916,381,972,416,148,214,183,1655,71,...,199,424,155,72,147,385,261,862,53001950300,1
4,1400000US53001950400,475,94,417,318,57,186,116,974,65,...,113,168,81,70,261,239,126,348,53001950400,1
5,1400000US53001950500,1035,344,750,452,157,97,102,1679,223,...,260,247,177,0,226,501,327,625,53001950500,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1400000US53077940002,485,373,757,664,163,165,128,1316,146,...,179,386,216,51,224,349,216,527,53077940002,1
1455,1400000US53077940003,235,219,760,474,195,117,106,1017,60,...,188,270,168,16,230,264,162,361,53077940003,1
1456,1400000US53077940004,925,579,1074,431,79,102,54,1615,175,...,245,346,169,36,165,405,314,731,53077940004,1
1457,1400000US53077940005,721,420,664,555,120,152,75,1408,76,...,174,338,76,67,208,313,251,636,53077940005,1


#### Region

In [9]:
# Sum the values across all census tracts to get region marginals.
# Drop the identifier columns by name instead of by position, so the selection
# does not silently shift if tract_data's column order changes.
control_columns = tract_data.drop(columns=['GEO_ID', 'TRACT', 'REGION'])

# errors='ignore' is deprecated in pandas and would leave an unparseable column
# as strings, which .sum() then concatenates instead of adding. With 'coerce',
# unparseable cells become NaN and are skipped by the sum.
region_data = control_columns.apply(pd.to_numeric, errors='coerce')

# Collapse to a single row holding one total per control column.
region_data = region_data.sum().to_frame().T
region_data['REGION'] = '1'
region_data.dropna(inplace=True)
region_data.to_csv('./populationsim-master/example_wa/data/control_totals_meta.csv', index=False)
region_data

Unnamed: 0,SCHL1,SCHL2,SCHL3,SCHL4,SCHL5,SCHL6,SCHL7,NUM_HH,HINCP1,HINCP2,...,AGEP6,AGEP7,AGEP8,AGEP9,AGEP10,NP1,NP2,NP3,NP4,REGION
0,186275,256174,1122330,1189880,509353,1144545,693067,2848396,136214,91269,...,486024,464627,679924,309016,128733,759370,1014083,439707,635236,1
