In [1]:
import functools
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Geographic Relationships

In [2]:
# Clean up the file generated from QGIS spatial join
geo = pd.read_csv('./data/TIGER/msa_puma_tract_join.csv', usecols=['jmsa_GEOID','jpuma_GEOI','GEOID']).dropna()
geo['REGION'] = '1'
geo = geo[['REGION','jmsa_GEOID','GEOID']].astype(int)
geo.columns = ['REGION','MSA','TRACT']

# Get rid of Micropolitan Statistical Areas
msa_ids = pd.read_csv('./data/TIGER/msa_list.csv')
geo = geo[geo['MSA'].isin(msa_ids['GEOID'])]
full_msa_list = pd.unique(geo['MSA'])

# Limit to certain MSAs if desired, there are 925 total MSA + MicroSA, 384 MSA
msas_to_use = full_msa_list[200:249]
geo = geo[geo['MSA'].isin(msas_to_use)]

geo.to_csv('./populationsim-master/example_msa_survey/data/geo_cross_walk.csv', index=False)

# Get the list of msas, and tracts that are being synthesized
msa_list = list(pd.unique(geo['MSA']))
msa_list = [int(i) for i in msa_list]
tract_list = list(pd.unique(geo['TRACT']))
tract_list = [int(i) for i in tract_list]

## Seed Samples from Survey

#### Persons/Households

In [3]:
p_data = pd.read_csv('./data/scoot_socio.csv', dtype=str)
p_data.reset_index(inplace=True)

In [4]:
# Check for NA values
p_data.isna().sum()

index       0
user        0
age         0
gender      0
hispanic    0
race        0
usborn      0
edu         0
student     0
work        0
zipcode     0
hhsize      0
child       0
hhincome    0
idincome    0
disable     0
veh         0
bike        0
dtype: int64

In [5]:
# Fake household for each person
h_data = p_data[['index','user']].copy()
h_data['REGION'] = '1'
h_data['WGTP'] = 1.0

# Allocate all sample data to every MSA (seed geography)
# Every person is alone in a single household (id=index)
# Every person/household is repeated for every MSA
p_data_list = []
h_data_list = []
for msa in msa_list:
    p_data_new = p_data.copy()
    p_data_new['MSA'] = msa
    p_data_list.append(p_data_new)
    h_data_new = h_data.copy()
    h_data_new['MSA'] = msa
    h_data_list.append(h_data_new)
p_data = pd.concat(p_data_list)
p_data.reset_index(inplace=True)
p_data.drop(['level_0','index'], axis=1, inplace=True)
p_data.reset_index(inplace=True)
h_data = pd.concat(h_data_list)
h_data.reset_index(inplace=True)
h_data.drop(['level_0','index'], axis=1, inplace=True)
h_data.reset_index(inplace=True)

In [6]:
p_data

Unnamed: 0,index,user,age,gender,hispanic,race,usborn,edu,student,work,zipcode,hhsize,child,hhincome,idincome,disable,veh,bike,MSA
0,0,04RbJrWYLoUs721TfrKU,61,F,N,1,Y,4,N,2,29576,1,0,3,3,no,2,no,29620
1,1,05jO2rxIAAUb56QPb0se,42,M,N,4,N,6,N,1,54956,4,2,9,9,no,2,yes,29620
2,2,0a6oBOvnElapBG5P96Bv,58,F,N,4,P,5,N,3,94709,2,0,4,2,no,1,no,29620
3,3,0ALR8Wou755wsfL2CcTe,36,M,N,1,Y,4,N,7,78634,3,0,7,1,no,4,yes,29620
4,4,0bgDWIbxXUIBvjlUbPPN,33,M,N,1,Y,3,N,1,12084,1,0,6,6,no,1,yes,29620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86921,86921,Zz7HWhA1rLeaq0SZvtHQ,46,M,N,1,Y,5,N,1,92122,4,2,10,7,no,2,no,45060
86922,86922,ZZhIMbD8AGPTEy5FthwW,34,M,N,1,Y,5,N,1,89141,3,1,9,9,no,2,yes,45060
86923,86923,zzJNMD87EWq41ChYzGgD,33,F,N,1,Y,4,P,1,85746,4,2,7,3,no,3,yes,45060
86924,86924,ZzqLinqwV4Q9GGtl5iLq,40,M,Y,1,Y,5,F,1,68701,4,2,5,5,yes,4,yes,45060


In [7]:
h_data

Unnamed: 0,index,user,REGION,WGTP,MSA
0,0,04RbJrWYLoUs721TfrKU,1,1.0,29620
1,1,05jO2rxIAAUb56QPb0se,1,1.0,29620
2,2,0a6oBOvnElapBG5P96Bv,1,1.0,29620
3,3,0ALR8Wou755wsfL2CcTe,1,1.0,29620
4,4,0bgDWIbxXUIBvjlUbPPN,1,1.0,29620
...,...,...,...,...,...
86921,86921,Zz7HWhA1rLeaq0SZvtHQ,1,1.0,45060
86922,86922,ZZhIMbD8AGPTEy5FthwW,1,1.0,45060
86923,86923,zzJNMD87EWq41ChYzGgD,1,1.0,45060
86924,86924,ZzqLinqwV4Q9GGtl5iLq,1,1.0,45060


In [8]:
# Save seed data to file
h_data.to_csv('./populationsim-master/example_msa_survey/data/seed_households.csv', index=False)
p_data.to_csv('./populationsim-master/example_msa_survey/data/seed_persons.csv', index=False)

## Control Totals from ACS

#### MSA

In [9]:
S1501_cols = ['GEO_ID',
             'S1501_C01_007E',
             'S1501_C01_008E',
             'S1501_C01_009E',
             'S1501_C01_010E',
             'S1501_C01_011E',
             'S1501_C01_012E',
             'S1501_C01_013E']
S1501_colnames = ['GEO_ID',
                 'edu1-11',
                 'edu12-15',
                 'edu16-17',
                 'edu18-19',
                 'edu20',
                 'edu21',
                 'edu22-24']
S1501 = pd.read_csv('./data/ACS/MSA/S1501.csv', low_memory=False, usecols=S1501_cols).iloc[1:,:]
S1501.columns = S1501_colnames

# Combine columns to match bins used in survey
S1501['edu1-11'] = S1501['edu1-11'].astype(int) + S1501['edu12-15'].astype(int)
S1501.drop('edu12-15', axis=1, inplace=True)
S1501.rename(columns={'edu1-11':'edu1-15'}, inplace=True)

In [10]:
S0101_cols = ['GEO_ID',
             'S0101_C01_001E',
             'S0101_C01_002E',
             'S0101_C01_003E',
             'S0101_C01_004E',
             'S0101_C01_005E',
             'S0101_C01_006E',
             'S0101_C01_007E',
             'S0101_C01_008E',
             'S0101_C01_009E',
             'S0101_C01_010E',
             'S0101_C01_011E',
             'S0101_C01_012E',
             'S0101_C01_013E',
             'S0101_C01_014E',
             'S0101_C01_015E',
             'S0101_C01_016E',
             'S0101_C01_017E',
             'S0101_C01_018E',
             'S0101_C01_019E',
             'S0101_C01_033E',
             'S0101_C01_034E']
S0101_colnames = ['GEO_ID',
                 'TOTAL_P',
                 'age0-5',
                 'age5-9',
                 'age10-14',
                 'age15-19',
                 'age20-24',
                 'age25-29',
                 'age30-34',
                 'age35-39',
                 'age40-44',
                 'age45-49',
                 'age50-54',
                 'age55-59',
                 'age60-64',
                 'age65-69',
                 'age70-74',
                 'age75-79',
                 'age80-84',
                 'age80+',
                 'sexRatio',
                 'sexM']
S0101 = pd.read_csv('./data/ACS/MSA/S0101.csv', low_memory=False, usecols=S0101_cols).iloc[1:,:]
S0101.columns = S0101_colnames
S0101['sexRatio'] = pd.to_numeric(S0101['sexRatio'], errors='coerce').fillna(0)

# Gender counts are in percentage
S0101['sexRatio'] = S0101['TOTAL_P'].astype(float) / (S0101['sexRatio'].astype(float)/100 + 1.0)
S0101['sexM'] = S0101['TOTAL_P'].astype(float) - S0101['sexRatio'].astype(float)
S0101.rename(columns={'sexRatio':'sexF'}, inplace=True)
S0101['sexM'] = S0101['sexM'].astype(int)
S0101['sexF'] = S0101['sexF'].astype(int)

# Combine columns to get wider bins and match controls/PUMS
S0101['age15-19'] = (S0101['age0-5'].astype(int) +
                     S0101['age0-5'].astype(int) +
                     S0101['age5-9'].astype(int) +
                     S0101['age10-14'].astype(int) +
                     S0101['age15-19'].astype(int))
S0101.drop(['age0-5', 'age5-9', 'age10-14'], axis=1, inplace=True)
S0101.rename(columns={'age15-19':'age0-19'}, inplace=True)

S0101['age40-44'] = (S0101['age25-29'].astype(int) +
                     S0101['age30-34'].astype(int) +
                     S0101['age35-39'].astype(int) +
                     S0101['age40-44'].astype(int))
S0101.drop(['age25-29', 'age30-34', 'age35-39'], axis=1, inplace=True)
S0101.rename(columns={'age40-44':'age25-44'}, inplace=True)

S0101['age50-54'] = (S0101['age45-49'].astype(int) +
                     S0101['age50-54'].astype(int))
S0101.drop(['age45-49'], axis=1, inplace=True)
S0101.rename(columns={'age50-54':'age45-54'}, inplace=True)

S0101['age80+'] = (S0101['age60-64'].astype(int) + 
                   S0101['age65-69'].astype(int) + 
                   S0101['age70-74'].astype(int) + 
                   S0101['age75-79'].astype(int) + 
                   S0101['age80-84'].astype(int) + 
                   S0101['age80+'].astype(int))
S0101.drop(['age60-64', 'age65-69', 'age70-74', 'age75-79', 'age80-84'], axis=1, inplace=True)
S0101.rename(columns={'age80+':'age60+'}, inplace=True)

In [11]:
B19325_colnames = ['GEO_ID',
                 'idincome-9999',
                 'idincome10000-14999',
                 'idincome15000-24999',
                 'idincome25000-34999',
                 'idincome35000-49999',
                 'idincome50000-64999',
                 'idincome65000-74999',
                 'idincome75000+']
B19325 = pd.read_csv('./data/ACS/MSA/B19325.csv', low_memory=False).iloc[1:,:]

B19325['idincome-9999'] = (B19325['B19325_004E'].astype(int) +
                           B19325['B19325_006E'].astype(int) +
                           B19325['B19325_007E'].astype(int) +
                           B19325['B19325_008E'].astype(int) +
                           B19325['B19325_009E'].astype(int) +
                           B19325['B19325_027E'].astype(int) +
                           B19325['B19325_029E'].astype(int) +
                           B19325['B19325_030E'].astype(int) +
                           B19325['B19325_031E'].astype(int) +
                           B19325['B19325_032E'].astype(int) +
                          B19325['B19325_051E'].astype(int) +
                           B19325['B19325_053E'].astype(int) +
                           B19325['B19325_054E'].astype(int) +
                           B19325['B19325_055E'].astype(int) +
                           B19325['B19325_056E'].astype(int) +
                           B19325['B19325_074E'].astype(int) +
                           B19325['B19325_076E'].astype(int) +
                           B19325['B19325_077E'].astype(int) +
                           B19325['B19325_078E'].astype(int) +
                           B19325['B19325_079E'].astype(int))
B19325['idincome10000-14999'] = (B19325['B19325_010E'].astype(int) +
                                 B19325['B19325_011E'].astype(int) +
                                 B19325['B19325_033E'].astype(int) +
                                 B19325['B19325_034E'].astype(int) +
                                B19325['B19325_057E'].astype(int) +
                                 B19325['B19325_058E'].astype(int) +
                                 B19325['B19325_080E'].astype(int) +
                                 B19325['B19325_081E'].astype(int))
B19325['idincome15000-24999'] = (B19325['B19325_012E'].astype(int) +
                                 B19325['B19325_013E'].astype(int) +
                                 B19325['B19325_014E'].astype(int) +
                                 B19325['B19325_015E'].astype(int) +
                                 B19325['B19325_035E'].astype(int) +
                                 B19325['B19325_036E'].astype(int) +
                                 B19325['B19325_037E'].astype(int) +
                                 B19325['B19325_038E'].astype(int) +
                                B19325['B19325_059E'].astype(int) +
                                 B19325['B19325_060E'].astype(int) +
                                 B19325['B19325_061E'].astype(int) +
                                 B19325['B19325_062E'].astype(int) +
                                 B19325['B19325_082E'].astype(int) +
                                 B19325['B19325_083E'].astype(int) +
                                 B19325['B19325_084E'].astype(int) +
                                 B19325['B19325_085E'].astype(int))
B19325['idincome25000-34999'] = (B19325['B19325_016E'].astype(int) +
                                 B19325['B19325_017E'].astype(int) +
                                 B19325['B19325_039E'].astype(int) +
                                 B19325['B19325_040E'].astype(int) +
                                B19325['B19325_063E'].astype(int) +
                                 B19325['B19325_064E'].astype(int) +
                                 B19325['B19325_086E'].astype(int) +
                                 B19325['B19325_087E'].astype(int))
B19325['idincome35000-49999'] = (B19325['B19325_018E'].astype(int) +
                                 B19325['B19325_019E'].astype(int) +
                                 B19325['B19325_020E'].astype(int) +
                                 B19325['B19325_041E'].astype(int) +
                                 B19325['B19325_042E'].astype(int) +
                                 B19325['B19325_043E'].astype(int) +
                                B19325['B19325_065E'].astype(int) +
                                 B19325['B19325_066E'].astype(int) +
                                 B19325['B19325_067E'].astype(int) +
                                 B19325['B19325_088E'].astype(int) +
                                 B19325['B19325_089E'].astype(int) +
                                 B19325['B19325_090E'].astype(int))
B19325['idincome50000-64999'] = (B19325['B19325_021E'].astype(int) +
                                 B19325['B19325_022E'].astype(int) +
                                 B19325['B19325_044E'].astype(int) +
                                 B19325['B19325_045E'].astype(int) +
                                B19325['B19325_068E'].astype(int) +
                                 B19325['B19325_069E'].astype(int) +
                                 B19325['B19325_091E'].astype(int) +
                                 B19325['B19325_092E'].astype(int))
B19325['idincome65000-74999'] = (B19325['B19325_023E'].astype(int) +
                                 B19325['B19325_046E'].astype(int) +
                                B19325['B19325_070E'].astype(int) +
                                 B19325['B19325_093E'].astype(int))
B19325['idincome75000+'] = (B19325['B19325_024E'].astype(int) +
                            B19325['B19325_025E'].astype(int) +
                            B19325['B19325_047E'].astype(int) +
                            B19325['B19325_048E'].astype(int) +
                           B19325['B19325_071E'].astype(int) +
                            B19325['B19325_072E'].astype(int) +
                            B19325['B19325_094E'].astype(int) +
                            B19325['B19325_095E'].astype(int))

B19325.drop(labels=B19325.columns.difference(B19325_colnames), axis=1, inplace=True)

# Combine columns to match bins used in survey
B19325['idincome50000-64999'] = B19325['idincome50000-64999'].astype(int) + B19325['idincome65000-74999'].astype(int)
B19325.drop('idincome65000-74999', axis=1, inplace=True)
B19325.rename(columns={'idincome50000-64999':'idincome50000-74999'}, inplace=True)

In [12]:
# Join data from different census tables
acs_dataframes = [S1501, S0101, B19325]
acs_data = functools.reduce(lambda left,right: pd.merge(left,right,on='GEO_ID'), acs_dataframes)

# Get MSA id from the extended geo id, remove acs labels, save marginal counts for popsim
acs_data.dropna(inplace=True)
acs_data['MSA'] = acs_data['GEO_ID'].str.slice(-5,).astype(int)
acs_data = acs_data[acs_data['MSA'].isin(msa_list)]
acs_data.to_csv('./populationsim-master/example_msa_survey/data/control_totals_msa.csv', index=False)
acs_data

Unnamed: 0,GEO_ID,edu1-15,edu16-17,edu18-19,edu20,edu21,edu22-24,TOTAL_P,age0-19,age20-24,...,sexF,sexM,idincome-9999,idincome10000-14999,idincome15000-24999,idincome25000-34999,idincome35000-49999,idincome50000-74999,idincome75000+,MSA
10,310M500US10580,47134,159409,98708,75219,125071,103106,880736,249266,68764,...,449355,431380,175848,53790,90088,77650,93974,117190,130172,10580
14,310M500US10740,71210,154055,146130,53423,112215,89130,912108,278825,58778,...,463469,448638,219143,66834,103510,86682,88624,86760,91811,10740
19,310M500US10900,58216,199585,99539,54243,106961,65096,837610,244404,53553,...,426481,411128,172095,54108,96108,81677,89421,96856,103155,10900
31,310M500US11460,10823,33523,40451,16248,60380,67843,367000,107567,48200,...,185073,181926,81583,22226,35989,33407,34618,42012,60523,11460
48,310M500US12100,24010,59017,36703,12997,34305,17644,266105,78627,17643,...,137309,128795,57456,18573,32358,26883,25157,26755,32071,12100
69,310M500US12980,8159,33557,21950,8296,12525,6650,134212,42696,8682,...,68440,65771,28905,10464,18206,13762,14540,12973,9922,12980
70,310M500US13020,7120,25805,18781,8523,9689,4724,104104,28467,6120,...,52952,51151,22777,8360,13441,11703,11948,11138,7435,13020
88,310M500US13740,7261,37781,28227,11281,27044,11946,179071,56039,10303,...,90622,88448,32021,11763,20928,17849,21101,21099,19660,13740
89,310M500US13780,14991,52774,29023,20605,25322,20045,241874,69748,21946,...,122654,119219,58750,18460,29048,23178,25700,26233,21387,13780
124,310M500US15380,66205,228268,146051,97172,146777,112234,1130175,321039,73878,...,582564,547610,233011,78160,129226,109748,128147,136766,126946,15380


#### Tract

In [13]:
S1501_cols = ['GEO_ID',
             'S1501_C01_007E',
             'S1501_C01_008E',
             'S1501_C01_009E',
             'S1501_C01_010E',
             'S1501_C01_011E',
             'S1501_C01_012E',
             'S1501_C01_013E']
S1501_colnames = ['GEO_ID',
                 'edu1-11',
                 'edu12-15',
                 'edu16-17',
                 'edu18-19',
                 'edu20',
                 'edu21',
                 'edu22-24']
S1501 = pd.read_csv('./data/ACS/TRACT/S1501.csv', low_memory=False, usecols=S1501_cols).iloc[1:,:]
S1501.columns = S1501_colnames

# Combine columns to match bins used in survey
S1501['edu1-11'] = S1501['edu1-11'].astype(int) + S1501['edu12-15'].astype(int)
S1501.drop('edu12-15', axis=1, inplace=True)
S1501.rename(columns={'edu1-11':'edu1-15'}, inplace=True)

In [14]:
S0101_cols = ['GEO_ID',
             'S0101_C01_001E',
             'S0101_C01_002E',
             'S0101_C01_003E',
             'S0101_C01_004E',
             'S0101_C01_005E',
             'S0101_C01_006E',
             'S0101_C01_007E',
             'S0101_C01_008E',
             'S0101_C01_009E',
             'S0101_C01_010E',
             'S0101_C01_011E',
             'S0101_C01_012E',
             'S0101_C01_013E',
             'S0101_C01_014E',
             'S0101_C01_015E',
             'S0101_C01_016E',
             'S0101_C01_017E',
             'S0101_C01_018E',
             'S0101_C01_019E',
             'S0101_C01_033E',
             'S0101_C01_034E']
S0101_colnames = ['GEO_ID',
                 'TOTAL_P',
                 'age0-5',
                 'age5-9',
                 'age10-14',
                 'age15-19',
                 'age20-24',
                 'age25-29',
                 'age30-34',
                 'age35-39',
                 'age40-44',
                 'age45-49',
                 'age50-54',
                 'age55-59',
                 'age60-64',
                 'age65-69',
                 'age70-74',
                 'age75-79',
                 'age80-84',
                 'age80+',
                 'sexRatio',
                 'sexM']
S0101 = pd.read_csv('./data/ACS/TRACT/S0101.csv', low_memory=False, usecols=S0101_cols).iloc[1:,:]
S0101.columns = S0101_colnames
S0101['sexRatio'] = pd.to_numeric(S0101['sexRatio'], errors='coerce').fillna(0)

# Gender counts are in percentage
S0101['sexRatio'] = S0101['TOTAL_P'].astype(float) / (S0101['sexRatio'].astype(float)/100 + 1.0)
S0101['sexM'] = S0101['TOTAL_P'].astype(float) - S0101['sexRatio'].astype(float)
S0101.rename(columns={'sexRatio':'sexF'}, inplace=True)
S0101['sexM'] = S0101['sexM'].astype(int)
S0101['sexF'] = S0101['sexF'].astype(int)

# Combine columns to get wider bins
S0101['age15-19'] = (S0101['age0-5'].astype(int) + 
                              S0101['age0-5'].astype(int) + 
                              S0101['age5-9'].astype(int) + 
                              S0101['age10-14'].astype(int) + 
                              S0101['age15-19'].astype(int))
S0101.drop(['age0-5', 'age5-9', 'age10-14'], axis=1, inplace=True)
S0101.rename(columns={'age15-19':'age0-19'}, inplace=True)

S0101['age40-44'] = (S0101['age25-29'].astype(int) +
                     S0101['age30-34'].astype(int) +
                     S0101['age35-39'].astype(int) +
                     S0101['age40-44'].astype(int))
S0101.drop(['age25-29', 'age30-34', 'age35-39'], axis=1, inplace=True)
S0101.rename(columns={'age40-44':'age25-44'}, inplace=True)

S0101['age50-54'] = (S0101['age45-49'].astype(int) +
                     S0101['age50-54'].astype(int))
S0101.drop(['age45-49'], axis=1, inplace=True)
S0101.rename(columns={'age50-54':'age45-54'}, inplace=True)

S0101['age80+'] = (S0101['age60-64'].astype(int) + 
                            S0101['age65-69'].astype(int) + 
                            S0101['age70-74'].astype(int) + 
                            S0101['age75-79'].astype(int) + 
                            S0101['age80-84'].astype(int) + 
                            S0101['age80+'].astype(int))
S0101.drop(['age60-64', 'age65-69', 'age70-74', 'age75-79', 'age80-84'], axis=1, inplace=True)
S0101.rename(columns={'age80+':'age60+'}, inplace=True)

In [15]:
B19325_colnames = ['GEO_ID',
                 'idincome-9999',
                 'idincome10000-14999',
                 'idincome15000-24999',
                 'idincome25000-34999',
                 'idincome35000-49999',
                 'idincome50000-64999',
                 'idincome65000-74999',
                 'idincome75000+']
B19325 = pd.read_csv('./data/ACS/TRACT/B19325.csv', low_memory=False).iloc[1:,:]

B19325['idincome-9999'] = (B19325['B19325_004E'].astype(int) +
                           B19325['B19325_006E'].astype(int) +
                           B19325['B19325_007E'].astype(int) +
                           B19325['B19325_008E'].astype(int) +
                           B19325['B19325_009E'].astype(int) +
                           B19325['B19325_027E'].astype(int) +
                           B19325['B19325_029E'].astype(int) +
                           B19325['B19325_030E'].astype(int) +
                           B19325['B19325_031E'].astype(int) +
                           B19325['B19325_032E'].astype(int) +
                          B19325['B19325_051E'].astype(int) +
                           B19325['B19325_053E'].astype(int) +
                           B19325['B19325_054E'].astype(int) +
                           B19325['B19325_055E'].astype(int) +
                           B19325['B19325_056E'].astype(int) +
                           B19325['B19325_074E'].astype(int) +
                           B19325['B19325_076E'].astype(int) +
                           B19325['B19325_077E'].astype(int) +
                           B19325['B19325_078E'].astype(int) +
                           B19325['B19325_079E'].astype(int))
B19325['idincome10000-14999'] = (B19325['B19325_010E'].astype(int) +
                                 B19325['B19325_011E'].astype(int) +
                                 B19325['B19325_033E'].astype(int) +
                                 B19325['B19325_034E'].astype(int) +
                                B19325['B19325_057E'].astype(int) +
                                 B19325['B19325_058E'].astype(int) +
                                 B19325['B19325_080E'].astype(int) +
                                 B19325['B19325_081E'].astype(int))
B19325['idincome15000-24999'] = (B19325['B19325_012E'].astype(int) +
                                 B19325['B19325_013E'].astype(int) +
                                 B19325['B19325_014E'].astype(int) +
                                 B19325['B19325_015E'].astype(int) +
                                 B19325['B19325_035E'].astype(int) +
                                 B19325['B19325_036E'].astype(int) +
                                 B19325['B19325_037E'].astype(int) +
                                 B19325['B19325_038E'].astype(int) +
                                B19325['B19325_059E'].astype(int) +
                                 B19325['B19325_060E'].astype(int) +
                                 B19325['B19325_061E'].astype(int) +
                                 B19325['B19325_062E'].astype(int) +
                                 B19325['B19325_082E'].astype(int) +
                                 B19325['B19325_083E'].astype(int) +
                                 B19325['B19325_084E'].astype(int) +
                                 B19325['B19325_085E'].astype(int))
B19325['idincome25000-34999'] = (B19325['B19325_016E'].astype(int) +
                                 B19325['B19325_017E'].astype(int) +
                                 B19325['B19325_039E'].astype(int) +
                                 B19325['B19325_040E'].astype(int) +
                                B19325['B19325_063E'].astype(int) +
                                 B19325['B19325_064E'].astype(int) +
                                 B19325['B19325_086E'].astype(int) +
                                 B19325['B19325_087E'].astype(int))
B19325['idincome35000-49999'] = (B19325['B19325_018E'].astype(int) +
                                 B19325['B19325_019E'].astype(int) +
                                 B19325['B19325_020E'].astype(int) +
                                 B19325['B19325_041E'].astype(int) +
                                 B19325['B19325_042E'].astype(int) +
                                 B19325['B19325_043E'].astype(int) +
                                B19325['B19325_065E'].astype(int) +
                                 B19325['B19325_066E'].astype(int) +
                                 B19325['B19325_067E'].astype(int) +
                                 B19325['B19325_088E'].astype(int) +
                                 B19325['B19325_089E'].astype(int) +
                                 B19325['B19325_090E'].astype(int))
B19325['idincome50000-64999'] = (B19325['B19325_021E'].astype(int) +
                                 B19325['B19325_022E'].astype(int) +
                                 B19325['B19325_044E'].astype(int) +
                                 B19325['B19325_045E'].astype(int) +
                                B19325['B19325_068E'].astype(int) +
                                 B19325['B19325_069E'].astype(int) +
                                 B19325['B19325_091E'].astype(int) +
                                 B19325['B19325_092E'].astype(int))
B19325['idincome65000-74999'] = (B19325['B19325_023E'].astype(int) +
                                 B19325['B19325_046E'].astype(int) +
                                B19325['B19325_070E'].astype(int) +
                                 B19325['B19325_093E'].astype(int))
B19325['idincome75000+'] = (B19325['B19325_024E'].astype(int) +
                            B19325['B19325_025E'].astype(int) +
                            B19325['B19325_047E'].astype(int) +
                            B19325['B19325_048E'].astype(int) +
                           B19325['B19325_071E'].astype(int) +
                            B19325['B19325_072E'].astype(int) +
                            B19325['B19325_094E'].astype(int) +
                            B19325['B19325_095E'].astype(int))

B19325.drop(labels=B19325.columns.difference(B19325_colnames), axis=1, inplace=True)

# Combine columns to match bins used in survey
B19325['idincome50000-64999'] = B19325['idincome50000-64999'].astype(int) + B19325['idincome65000-74999'].astype(int)
B19325.drop('idincome65000-74999', axis=1, inplace=True)
B19325.rename(columns={'idincome50000-64999':'idincome50000-74999'}, inplace=True)

In [16]:
# Join data from different census tables
acs_dataframes = [S1501, S0101, B19325]
acs_data = functools.reduce(lambda left,right: pd.merge(left,right,on='GEO_ID'), acs_dataframes)

# Get TRACT id from the extended geo id, remove acs labels, save marginal counts for popsim
acs_data.dropna(inplace=True)
acs_data['TRACT'] = acs_data['GEO_ID'].str.slice(-11,).astype(int)
acs_data = acs_data[acs_data['TRACT'].isin(tract_list)]
acs_data.to_csv('./populationsim-master/example_msa_survey/data/control_totals_tract.csv', index=False)
acs_data

Unnamed: 0,GEO_ID,edu1-15,edu16-17,edu18-19,edu20,edu21,edu22-24,TOTAL_P,age0-19,age20-24,...,sexF,sexM,idincome-9999,idincome10000-14999,idincome15000-24999,idincome25000-34999,idincome35000-49999,idincome50000-74999,idincome75000+,TRACT
27494,1400000US26011970500,316,1223,684,350,266,80,4068,1027,291,...,1986,2081,925,412,575,440,488,384,174,26011970500
27503,1400000US26015010500,106,688,557,333,493,287,3480,928,241,...,1687,1792,679,330,251,461,527,471,403,26015010500
27506,1400000US26017280300,117,145,252,62,183,72,1338,485,146,...,846,491,324,161,278,107,113,92,62,26017280300
27507,1400000US26017280400,168,486,480,191,154,91,2555,799,375,...,1179,1375,687,256,360,280,204,337,66,26017280400
27508,1400000US26017280500,185,435,608,231,565,268,3196,749,182,...,1524,1671,686,129,316,257,320,432,394,26017280500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61266,1400000US55109120502,130,786,1083,609,814,326,5665,2006,324,...,2832,2832,633,349,695,647,832,1180,514,55109120502
61267,1400000US55109120700,206,1258,696,431,391,138,4534,1416,245,...,2231,2302,735,332,479,418,614,599,479,55109120700
61268,1400000US55109120901,76,627,540,471,824,343,4320,1351,341,...,2330,1989,519,232,298,359,627,745,587,55109120901
61269,1400000US55109120904,81,629,641,398,707,343,4211,1488,161,...,2045,2165,645,228,227,446,400,648,773,55109120904


#### Region

In [17]:
# Sum the values across all census tracts to get region marginals
region_data = acs_data.iloc[:,1:-1].apply(pd.to_numeric, errors='coerce')
region_data = pd.DataFrame(region_data.sum()).transpose()
region_data['REGION'] = '1'
region_data.dropna(inplace=True)
region_data.to_csv('./populationsim-master/example_msa_survey/data/control_totals_region.csv', index=False)
region_data

Unnamed: 0,edu1-15,edu16-17,edu18-19,edu20,edu21,edu22-24,TOTAL_P,age0-19,age20-24,age25-44,...,sexF,sexM,idincome-9999,idincome10000-14999,idincome15000-24999,idincome25000-34999,idincome35000-49999,idincome50000-74999,idincome75000+,REGION
0,1053898,2843890,2192697,1055291,2056134,1315058,15499884,4763641,1129985,3989237,...,7866903,7628984,3280703,1000776,1684806,1485963,1702037,1759115,1833368,1


In [18]:
region_data.columns

Index(['edu1-15', 'edu16-17', 'edu18-19', 'edu20', 'edu21', 'edu22-24',
       'TOTAL_P', 'age0-19', 'age20-24', 'age25-44', 'age45-54', 'age55-59',
       'age60+', 'sexF', 'sexM', 'idincome-9999', 'idincome10000-14999',
       'idincome15000-24999', 'idincome25000-34999', 'idincome35000-49999',
       'idincome50000-74999', 'idincome75000+', 'REGION'],
      dtype='object')