In [1]:
import functools
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Geographic Relationships

In [2]:
# Clean up the file generated from QGIS spatial join
geo = pd.read_csv('./data/TIGER/msa_puma_tract_join.csv', usecols=['jmsa_GEOID','jpuma_GEOI','GEOID']).dropna()
geo['REGION'] = '1'
geo = geo[['REGION','jmsa_GEOID','GEOID']].astype(int)
geo.columns = ['REGION','MSA','TRACT']

# Get rid of Micropolitan Statistical Areas
msa_ids = pd.read_csv('./data/TIGER/msa_list.csv')
geo = geo[geo['MSA'].isin(msa_ids['GEOID'])]
full_msa_list = pd.unique(geo['MSA'])

# Limit to certain MSAs if desired, there are 925 total MSA + MicroSA, 384 MSA
msas_to_use = full_msa_list[50:99]
geo = geo[geo['MSA'].isin(msas_to_use)]

geo.to_csv('./populationsim-master/example_msa_survey/data/geo_cross_walk.csv', index=False)

# Get the list of msas, and tracts that are being synthesized
msa_list = list(pd.unique(geo['MSA']))
msa_list = [int(i) for i in msa_list]
tract_list = list(pd.unique(geo['TRACT']))
tract_list = [int(i) for i in tract_list]

## Seed Samples from Survey

#### Persons/Households

In [3]:
p_data = pd.read_csv('./data/scoot_socio.csv', dtype=str)
p_data.reset_index(inplace=True)

In [4]:
# Check for NA values
p_data.isna().sum()

index       0
user        0
age         0
gender      0
hispanic    0
race        0
usborn      0
edu         0
student     0
work        0
zipcode     0
hhsize      0
child       0
hhincome    0
idincome    0
disable     0
veh         0
bike        0
dtype: int64

In [5]:
# Fake household for each person
h_data = p_data[['index','user']].copy()
h_data['REGION'] = '1'
h_data['WGTP'] = 1.0

# Allocate all sample data to every MSA (seed geography)
# Every person is alone in a single household (id=index)
# Every person/household is repeated for every MSA
p_data_list = []
h_data_list = []
for msa in msa_list:
    p_data_new = p_data.copy()
    p_data_new['MSA'] = msa
    p_data_list.append(p_data_new)
    h_data_new = h_data.copy()
    h_data_new['MSA'] = msa
    h_data_list.append(h_data_new)
p_data = pd.concat(p_data_list)
p_data.reset_index(inplace=True)
p_data.drop(['level_0','index'], axis=1, inplace=True)
p_data.reset_index(inplace=True)
h_data = pd.concat(h_data_list)
h_data.reset_index(inplace=True)
h_data.drop(['level_0','index'], axis=1, inplace=True)
h_data.reset_index(inplace=True)

In [6]:
p_data

Unnamed: 0,index,user,age,gender,hispanic,race,usborn,edu,student,work,zipcode,hhsize,child,hhincome,idincome,disable,veh,bike,MSA
0,0,04RbJrWYLoUs721TfrKU,61,F,N,1,Y,4,N,2,29576,1,0,3,3,no,2,no,31460
1,1,05jO2rxIAAUb56QPb0se,42,M,N,4,N,6,N,1,54956,4,2,9,9,no,2,yes,31460
2,2,0a6oBOvnElapBG5P96Bv,58,F,N,4,P,5,N,3,94709,2,0,4,2,no,1,no,31460
3,3,0ALR8Wou755wsfL2CcTe,36,M,N,1,Y,4,N,7,78634,3,0,7,1,no,4,yes,31460
4,4,0bgDWIbxXUIBvjlUbPPN,33,M,N,1,Y,3,N,1,12084,1,0,6,6,no,1,yes,31460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86921,86921,Zz7HWhA1rLeaq0SZvtHQ,46,M,N,1,Y,5,N,1,92122,4,2,10,7,no,2,no,12060
86922,86922,ZZhIMbD8AGPTEy5FthwW,34,M,N,1,Y,5,N,1,89141,3,1,9,9,no,2,yes,12060
86923,86923,zzJNMD87EWq41ChYzGgD,33,F,N,1,Y,4,P,1,85746,4,2,7,3,no,3,yes,12060
86924,86924,ZzqLinqwV4Q9GGtl5iLq,40,M,Y,1,Y,5,F,1,68701,4,2,5,5,yes,4,yes,12060


In [7]:
h_data

Unnamed: 0,index,user,REGION,WGTP,MSA
0,0,04RbJrWYLoUs721TfrKU,1,1.0,31460
1,1,05jO2rxIAAUb56QPb0se,1,1.0,31460
2,2,0a6oBOvnElapBG5P96Bv,1,1.0,31460
3,3,0ALR8Wou755wsfL2CcTe,1,1.0,31460
4,4,0bgDWIbxXUIBvjlUbPPN,1,1.0,31460
...,...,...,...,...,...
86921,86921,Zz7HWhA1rLeaq0SZvtHQ,1,1.0,12060
86922,86922,ZZhIMbD8AGPTEy5FthwW,1,1.0,12060
86923,86923,zzJNMD87EWq41ChYzGgD,1,1.0,12060
86924,86924,ZzqLinqwV4Q9GGtl5iLq,1,1.0,12060


In [8]:
# Save seed data to file
h_data.to_csv('./populationsim-master/example_msa_survey/data/seed_households.csv', index=False)
p_data.to_csv('./populationsim-master/example_msa_survey/data/seed_persons.csv', index=False)

## Control Totals from ACS

#### MSA

In [9]:
S1501_cols = ['GEO_ID',
             'S1501_C01_007E',
             'S1501_C01_008E',
             'S1501_C01_009E',
             'S1501_C01_010E',
             'S1501_C01_011E',
             'S1501_C01_012E',
             'S1501_C01_013E']
S1501_colnames = ['GEO_ID',
                 'edu1-11',
                 'edu12-15',
                 'edu16-17',
                 'edu18-19',
                 'edu20',
                 'edu21',
                 'edu22-24']
S1501 = pd.read_csv('./data/ACS/MSA/S1501.csv', low_memory=False, usecols=S1501_cols).iloc[1:,:]
S1501.columns = S1501_colnames

# Combine columns to match bins used in survey
S1501['edu1-11'] = S1501['edu1-11'].astype(int) + S1501['edu12-15'].astype(int)
S1501.drop('edu12-15', axis=1, inplace=True)
S1501.rename(columns={'edu1-11':'edu1-15'}, inplace=True)

In [10]:
S0101_cols = ['GEO_ID',
             'S0101_C01_001E',
             'S0101_C01_002E',
             'S0101_C01_003E',
             'S0101_C01_004E',
             'S0101_C01_005E',
             'S0101_C01_006E',
             'S0101_C01_007E',
             'S0101_C01_008E',
             'S0101_C01_009E',
             'S0101_C01_010E',
             'S0101_C01_011E',
             'S0101_C01_012E',
             'S0101_C01_013E',
             'S0101_C01_014E',
             'S0101_C01_015E',
             'S0101_C01_016E',
             'S0101_C01_017E',
             'S0101_C01_018E',
             'S0101_C01_019E',
             'S0101_C01_033E',
             'S0101_C01_034E']
S0101_colnames = ['GEO_ID',
                 'TOTAL_P',
                 'age0-5',
                 'age5-9',
                 'age10-14',
                 'age15-19',
                 'age20-24',
                 'age25-29',
                 'age30-34',
                 'age35-39',
                 'age40-44',
                 'age45-49',
                 'age50-54',
                 'age55-59',
                 'age60-64',
                 'age65-69',
                 'age70-74',
                 'age75-79',
                 'age80-84',
                 'age80+',
                 'sexRatio',
                 'sexM']
S0101 = pd.read_csv('./data/ACS/MSA/S0101.csv', low_memory=False, usecols=S0101_cols).iloc[1:,:]
S0101.columns = S0101_colnames
S0101['sexRatio'] = pd.to_numeric(S0101['sexRatio'], errors='coerce').fillna(0)

# Gender counts are in percentage
S0101['sexRatio'] = S0101['TOTAL_P'].astype(float) / (S0101['sexRatio'].astype(float)/100 + 1.0)
S0101['sexM'] = S0101['TOTAL_P'].astype(float) - S0101['sexRatio'].astype(float)
S0101.rename(columns={'sexRatio':'sexF'}, inplace=True)
S0101['sexM'] = S0101['sexM'].astype(int)
S0101['sexF'] = S0101['sexF'].astype(int)

# Combine columns to get wider bins and match controls/PUMS
S0101['age15-19'] = (S0101['age0-5'].astype(int) +
                     S0101['age0-5'].astype(int) +
                     S0101['age5-9'].astype(int) +
                     S0101['age10-14'].astype(int) +
                     S0101['age15-19'].astype(int))
S0101.drop(['age0-5', 'age5-9', 'age10-14'], axis=1, inplace=True)
S0101.rename(columns={'age15-19':'age0-19'}, inplace=True)

S0101['age40-44'] = (S0101['age25-29'].astype(int) +
                     S0101['age30-34'].astype(int) +
                     S0101['age35-39'].astype(int) +
                     S0101['age40-44'].astype(int))
S0101.drop(['age25-29', 'age30-34', 'age35-39'], axis=1, inplace=True)
S0101.rename(columns={'age40-44':'age25-44'}, inplace=True)

S0101['age50-54'] = (S0101['age45-49'].astype(int) +
                     S0101['age50-54'].astype(int))
S0101.drop(['age45-49'], axis=1, inplace=True)
S0101.rename(columns={'age50-54':'age45-54'}, inplace=True)

S0101['age80+'] = (S0101['age60-64'].astype(int) + 
                   S0101['age65-69'].astype(int) + 
                   S0101['age70-74'].astype(int) + 
                   S0101['age75-79'].astype(int) + 
                   S0101['age80-84'].astype(int) + 
                   S0101['age80+'].astype(int))
S0101.drop(['age60-64', 'age65-69', 'age70-74', 'age75-79', 'age80-84'], axis=1, inplace=True)
S0101.rename(columns={'age80+':'age60+'}, inplace=True)

In [11]:
B19325_colnames = ['GEO_ID',
                 'idincome-9999',
                 'idincome10000-14999',
                 'idincome15000-24999',
                 'idincome25000-34999',
                 'idincome35000-49999',
                 'idincome50000-64999',
                 'idincome65000-74999',
                 'idincome75000+']
B19325 = pd.read_csv('./data/ACS/MSA/B19325.csv', low_memory=False).iloc[1:,:]

B19325['idincome-9999'] = (B19325['B19325_004E'].astype(int) +
                           B19325['B19325_006E'].astype(int) +
                           B19325['B19325_007E'].astype(int) +
                           B19325['B19325_008E'].astype(int) +
                           B19325['B19325_009E'].astype(int) +
                           B19325['B19325_027E'].astype(int) +
                           B19325['B19325_029E'].astype(int) +
                           B19325['B19325_030E'].astype(int) +
                           B19325['B19325_031E'].astype(int) +
                           B19325['B19325_032E'].astype(int) +
                          B19325['B19325_051E'].astype(int) +
                           B19325['B19325_053E'].astype(int) +
                           B19325['B19325_054E'].astype(int) +
                           B19325['B19325_055E'].astype(int) +
                           B19325['B19325_056E'].astype(int) +
                           B19325['B19325_074E'].astype(int) +
                           B19325['B19325_076E'].astype(int) +
                           B19325['B19325_077E'].astype(int) +
                           B19325['B19325_078E'].astype(int) +
                           B19325['B19325_079E'].astype(int))
B19325['idincome10000-14999'] = (B19325['B19325_010E'].astype(int) +
                                 B19325['B19325_011E'].astype(int) +
                                 B19325['B19325_033E'].astype(int) +
                                 B19325['B19325_034E'].astype(int) +
                                B19325['B19325_057E'].astype(int) +
                                 B19325['B19325_058E'].astype(int) +
                                 B19325['B19325_080E'].astype(int) +
                                 B19325['B19325_081E'].astype(int))
B19325['idincome15000-24999'] = (B19325['B19325_012E'].astype(int) +
                                 B19325['B19325_013E'].astype(int) +
                                 B19325['B19325_014E'].astype(int) +
                                 B19325['B19325_015E'].astype(int) +
                                 B19325['B19325_035E'].astype(int) +
                                 B19325['B19325_036E'].astype(int) +
                                 B19325['B19325_037E'].astype(int) +
                                 B19325['B19325_038E'].astype(int) +
                                B19325['B19325_059E'].astype(int) +
                                 B19325['B19325_060E'].astype(int) +
                                 B19325['B19325_061E'].astype(int) +
                                 B19325['B19325_062E'].astype(int) +
                                 B19325['B19325_082E'].astype(int) +
                                 B19325['B19325_083E'].astype(int) +
                                 B19325['B19325_084E'].astype(int) +
                                 B19325['B19325_085E'].astype(int))
B19325['idincome25000-34999'] = (B19325['B19325_016E'].astype(int) +
                                 B19325['B19325_017E'].astype(int) +
                                 B19325['B19325_039E'].astype(int) +
                                 B19325['B19325_040E'].astype(int) +
                                B19325['B19325_063E'].astype(int) +
                                 B19325['B19325_064E'].astype(int) +
                                 B19325['B19325_086E'].astype(int) +
                                 B19325['B19325_087E'].astype(int))
B19325['idincome35000-49999'] = (B19325['B19325_018E'].astype(int) +
                                 B19325['B19325_019E'].astype(int) +
                                 B19325['B19325_020E'].astype(int) +
                                 B19325['B19325_041E'].astype(int) +
                                 B19325['B19325_042E'].astype(int) +
                                 B19325['B19325_043E'].astype(int) +
                                B19325['B19325_065E'].astype(int) +
                                 B19325['B19325_066E'].astype(int) +
                                 B19325['B19325_067E'].astype(int) +
                                 B19325['B19325_088E'].astype(int) +
                                 B19325['B19325_089E'].astype(int) +
                                 B19325['B19325_090E'].astype(int))
B19325['idincome50000-64999'] = (B19325['B19325_021E'].astype(int) +
                                 B19325['B19325_022E'].astype(int) +
                                 B19325['B19325_044E'].astype(int) +
                                 B19325['B19325_045E'].astype(int) +
                                B19325['B19325_068E'].astype(int) +
                                 B19325['B19325_069E'].astype(int) +
                                 B19325['B19325_091E'].astype(int) +
                                 B19325['B19325_092E'].astype(int))
B19325['idincome65000-74999'] = (B19325['B19325_023E'].astype(int) +
                                 B19325['B19325_046E'].astype(int) +
                                B19325['B19325_070E'].astype(int) +
                                 B19325['B19325_093E'].astype(int))
B19325['idincome75000+'] = (B19325['B19325_024E'].astype(int) +
                            B19325['B19325_025E'].astype(int) +
                            B19325['B19325_047E'].astype(int) +
                            B19325['B19325_048E'].astype(int) +
                           B19325['B19325_071E'].astype(int) +
                            B19325['B19325_072E'].astype(int) +
                            B19325['B19325_094E'].astype(int) +
                            B19325['B19325_095E'].astype(int))

B19325.drop(labels=B19325.columns.difference(B19325_colnames), axis=1, inplace=True)

# Combine columns to match bins used in survey
B19325['idincome50000-64999'] = B19325['idincome50000-64999'].astype(int) + B19325['idincome65000-74999'].astype(int)
B19325.drop('idincome65000-74999', axis=1, inplace=True)
B19325.rename(columns={'idincome50000-64999':'idincome50000-74999'}, inplace=True)

In [12]:
# Join data from different census tables
acs_dataframes = [S1501, S0101, B19325]
acs_data = functools.reduce(lambda left,right: pd.merge(left,right,on='GEO_ID'), acs_dataframes)

# Get MSA id from the extended geo id, remove acs labels, save marginal counts for popsim
acs_data.dropna(inplace=True)
acs_data['MSA'] = acs_data['GEO_ID'].str.slice(-5,).astype(int)
acs_data = acs_data[acs_data['MSA'].isin(msa_list)]
acs_data.to_csv('./populationsim-master/example_msa_survey/data/control_totals_msa.csv', index=False)
acs_data

Unnamed: 0,GEO_ID,edu1-15,edu16-17,edu18-19,edu20,edu21,edu22-24,TOTAL_P,age0-19,age20-24,...,sexF,sexM,idincome-9999,idincome10000-14999,idincome15000-24999,idincome25000-34999,idincome35000-49999,idincome50000-74999,idincome75000+,MSA
47,310M500US12060,403985,927197,757835,292896,930303,566050,5862424,1975516,381717,...,3028111,2834312,1307620,339117,580501,515153,563348,600925,762425,12060
105,310M500US14500,10596,24916,32127,13045,72262,59809,322510,91073,33477,...,160293,162216,73289,17311,28891,25558,28985,34620,62885,14500
115,310M500US14860,64195,138177,91683,40831,172191,136208,943926,292982,59791,...,484312,459613,204660,47610,84212,67902,76850,90331,198044,14860
122,310M500US15260,11379,25620,18644,6583,12412,8319,117400,34596,6367,...,61305,56094,27107,8582,15178,11787,11960,11005,10733,15260
139,310M500US15980,64308,171793,112872,49167,97615,59091,737468,180532,36434,...,376259,361208,160813,57015,100894,83439,80817,72041,73894,15980
163,310M500US17020,15864,33186,43620,14690,26518,13631,225817,65794,24898,...,114106,111710,57316,22525,28324,18291,19595,20029,21985,17020
182,310M500US17820,26525,96694,113175,53246,110796,70410,723498,242491,58226,...,358167,365330,147783,45905,72390,67193,75867,78012,91160,17820
206,310M500US18880,16083,48905,46857,20406,37022,20833,272056,81936,17511,...,134083,137972,56888,18191,32499,27534,27573,27098,32289,18880
225,310M500US19660,44240,155010,112182,54587,75207,39955,646288,158720,36000,...,332452,313835,153075,52932,92613,75220,68035,61222,49621,19660
227,310M500US19740,175319,398856,396154,153003,554780,321178,2892066,898265,169997,...,1446756,1445309,526490,142133,260357,257783,313428,355872,486663,19740


#### Tract

In [13]:
S1501_cols = ['GEO_ID',
             'S1501_C01_007E',
             'S1501_C01_008E',
             'S1501_C01_009E',
             'S1501_C01_010E',
             'S1501_C01_011E',
             'S1501_C01_012E',
             'S1501_C01_013E']
S1501_colnames = ['GEO_ID',
                 'edu1-11',
                 'edu12-15',
                 'edu16-17',
                 'edu18-19',
                 'edu20',
                 'edu21',
                 'edu22-24']
S1501 = pd.read_csv('./data/ACS/TRACT/S1501.csv', low_memory=False, usecols=S1501_cols).iloc[1:,:]
S1501.columns = S1501_colnames

# Combine columns to match bins used in survey
S1501['edu1-11'] = S1501['edu1-11'].astype(int) + S1501['edu12-15'].astype(int)
S1501.drop('edu12-15', axis=1, inplace=True)
S1501.rename(columns={'edu1-11':'edu1-15'}, inplace=True)

In [14]:
S0101_cols = ['GEO_ID',
             'S0101_C01_001E',
             'S0101_C01_002E',
             'S0101_C01_003E',
             'S0101_C01_004E',
             'S0101_C01_005E',
             'S0101_C01_006E',
             'S0101_C01_007E',
             'S0101_C01_008E',
             'S0101_C01_009E',
             'S0101_C01_010E',
             'S0101_C01_011E',
             'S0101_C01_012E',
             'S0101_C01_013E',
             'S0101_C01_014E',
             'S0101_C01_015E',
             'S0101_C01_016E',
             'S0101_C01_017E',
             'S0101_C01_018E',
             'S0101_C01_019E',
             'S0101_C01_033E',
             'S0101_C01_034E']
S0101_colnames = ['GEO_ID',
                 'TOTAL_P',
                 'age0-5',
                 'age5-9',
                 'age10-14',
                 'age15-19',
                 'age20-24',
                 'age25-29',
                 'age30-34',
                 'age35-39',
                 'age40-44',
                 'age45-49',
                 'age50-54',
                 'age55-59',
                 'age60-64',
                 'age65-69',
                 'age70-74',
                 'age75-79',
                 'age80-84',
                 'age80+',
                 'sexRatio',
                 'sexM']
S0101 = pd.read_csv('./data/ACS/TRACT/S0101.csv', low_memory=False, usecols=S0101_cols).iloc[1:,:]
S0101.columns = S0101_colnames
S0101['sexRatio'] = pd.to_numeric(S0101['sexRatio'], errors='coerce').fillna(0)

# Gender counts are in percentage
S0101['sexRatio'] = S0101['TOTAL_P'].astype(float) / (S0101['sexRatio'].astype(float)/100 + 1.0)
S0101['sexM'] = S0101['TOTAL_P'].astype(float) - S0101['sexRatio'].astype(float)
S0101.rename(columns={'sexRatio':'sexF'}, inplace=True)
S0101['sexM'] = S0101['sexM'].astype(int)
S0101['sexF'] = S0101['sexF'].astype(int)

# Combine columns to get wider bins
S0101['age15-19'] = (S0101['age0-5'].astype(int) + 
                              S0101['age0-5'].astype(int) + 
                              S0101['age5-9'].astype(int) + 
                              S0101['age10-14'].astype(int) + 
                              S0101['age15-19'].astype(int))
S0101.drop(['age0-5', 'age5-9', 'age10-14'], axis=1, inplace=True)
S0101.rename(columns={'age15-19':'age0-19'}, inplace=True)

S0101['age40-44'] = (S0101['age25-29'].astype(int) +
                     S0101['age30-34'].astype(int) +
                     S0101['age35-39'].astype(int) +
                     S0101['age40-44'].astype(int))
S0101.drop(['age25-29', 'age30-34', 'age35-39'], axis=1, inplace=True)
S0101.rename(columns={'age40-44':'age25-44'}, inplace=True)

S0101['age50-54'] = (S0101['age45-49'].astype(int) +
                     S0101['age50-54'].astype(int))
S0101.drop(['age45-49'], axis=1, inplace=True)
S0101.rename(columns={'age50-54':'age45-54'}, inplace=True)

S0101['age80+'] = (S0101['age60-64'].astype(int) + 
                            S0101['age65-69'].astype(int) + 
                            S0101['age70-74'].astype(int) + 
                            S0101['age75-79'].astype(int) + 
                            S0101['age80-84'].astype(int) + 
                            S0101['age80+'].astype(int))
S0101.drop(['age60-64', 'age65-69', 'age70-74', 'age75-79', 'age80-84'], axis=1, inplace=True)
S0101.rename(columns={'age80+':'age60+'}, inplace=True)

In [15]:
B19325_colnames = ['GEO_ID',
                 'idincome-9999',
                 'idincome10000-14999',
                 'idincome15000-24999',
                 'idincome25000-34999',
                 'idincome35000-49999',
                 'idincome50000-64999',
                 'idincome65000-74999',
                 'idincome75000+']
B19325 = pd.read_csv('./data/ACS/TRACT/B19325.csv', low_memory=False).iloc[1:,:]

B19325['idincome-9999'] = (B19325['B19325_004E'].astype(int) +
                           B19325['B19325_006E'].astype(int) +
                           B19325['B19325_007E'].astype(int) +
                           B19325['B19325_008E'].astype(int) +
                           B19325['B19325_009E'].astype(int) +
                           B19325['B19325_027E'].astype(int) +
                           B19325['B19325_029E'].astype(int) +
                           B19325['B19325_030E'].astype(int) +
                           B19325['B19325_031E'].astype(int) +
                           B19325['B19325_032E'].astype(int) +
                          B19325['B19325_051E'].astype(int) +
                           B19325['B19325_053E'].astype(int) +
                           B19325['B19325_054E'].astype(int) +
                           B19325['B19325_055E'].astype(int) +
                           B19325['B19325_056E'].astype(int) +
                           B19325['B19325_074E'].astype(int) +
                           B19325['B19325_076E'].astype(int) +
                           B19325['B19325_077E'].astype(int) +
                           B19325['B19325_078E'].astype(int) +
                           B19325['B19325_079E'].astype(int))
B19325['idincome10000-14999'] = (B19325['B19325_010E'].astype(int) +
                                 B19325['B19325_011E'].astype(int) +
                                 B19325['B19325_033E'].astype(int) +
                                 B19325['B19325_034E'].astype(int) +
                                B19325['B19325_057E'].astype(int) +
                                 B19325['B19325_058E'].astype(int) +
                                 B19325['B19325_080E'].astype(int) +
                                 B19325['B19325_081E'].astype(int))
B19325['idincome15000-24999'] = (B19325['B19325_012E'].astype(int) +
                                 B19325['B19325_013E'].astype(int) +
                                 B19325['B19325_014E'].astype(int) +
                                 B19325['B19325_015E'].astype(int) +
                                 B19325['B19325_035E'].astype(int) +
                                 B19325['B19325_036E'].astype(int) +
                                 B19325['B19325_037E'].astype(int) +
                                 B19325['B19325_038E'].astype(int) +
                                B19325['B19325_059E'].astype(int) +
                                 B19325['B19325_060E'].astype(int) +
                                 B19325['B19325_061E'].astype(int) +
                                 B19325['B19325_062E'].astype(int) +
                                 B19325['B19325_082E'].astype(int) +
                                 B19325['B19325_083E'].astype(int) +
                                 B19325['B19325_084E'].astype(int) +
                                 B19325['B19325_085E'].astype(int))
B19325['idincome25000-34999'] = (B19325['B19325_016E'].astype(int) +
                                 B19325['B19325_017E'].astype(int) +
                                 B19325['B19325_039E'].astype(int) +
                                 B19325['B19325_040E'].astype(int) +
                                B19325['B19325_063E'].astype(int) +
                                 B19325['B19325_064E'].astype(int) +
                                 B19325['B19325_086E'].astype(int) +
                                 B19325['B19325_087E'].astype(int))
B19325['idincome35000-49999'] = (B19325['B19325_018E'].astype(int) +
                                 B19325['B19325_019E'].astype(int) +
                                 B19325['B19325_020E'].astype(int) +
                                 B19325['B19325_041E'].astype(int) +
                                 B19325['B19325_042E'].astype(int) +
                                 B19325['B19325_043E'].astype(int) +
                                B19325['B19325_065E'].astype(int) +
                                 B19325['B19325_066E'].astype(int) +
                                 B19325['B19325_067E'].astype(int) +
                                 B19325['B19325_088E'].astype(int) +
                                 B19325['B19325_089E'].astype(int) +
                                 B19325['B19325_090E'].astype(int))
B19325['idincome50000-64999'] = (B19325['B19325_021E'].astype(int) +
                                 B19325['B19325_022E'].astype(int) +
                                 B19325['B19325_044E'].astype(int) +
                                 B19325['B19325_045E'].astype(int) +
                                B19325['B19325_068E'].astype(int) +
                                 B19325['B19325_069E'].astype(int) +
                                 B19325['B19325_091E'].astype(int) +
                                 B19325['B19325_092E'].astype(int))
B19325['idincome65000-74999'] = (B19325['B19325_023E'].astype(int) +
                                 B19325['B19325_046E'].astype(int) +
                                B19325['B19325_070E'].astype(int) +
                                 B19325['B19325_093E'].astype(int))
B19325['idincome75000+'] = (B19325['B19325_024E'].astype(int) +
                            B19325['B19325_025E'].astype(int) +
                            B19325['B19325_047E'].astype(int) +
                            B19325['B19325_048E'].astype(int) +
                           B19325['B19325_071E'].astype(int) +
                            B19325['B19325_072E'].astype(int) +
                            B19325['B19325_094E'].astype(int) +
                            B19325['B19325_095E'].astype(int))

B19325.drop(labels=B19325.columns.difference(B19325_colnames), axis=1, inplace=True)

# Combine columns to match bins used in survey
B19325['idincome50000-64999'] = B19325['idincome50000-64999'].astype(int) + B19325['idincome65000-74999'].astype(int)
B19325.drop('idincome65000-74999', axis=1, inplace=True)
B19325.rename(columns={'idincome50000-64999':'idincome50000-74999'}, inplace=True)

In [16]:
# Join data from different census tables
acs_dataframes = [S1501, S0101, B19325]
acs_data = functools.reduce(lambda left,right: pd.merge(left,right,on='GEO_ID'), acs_dataframes)

# Get TRACT id from the extended geo id, remove acs labels, save marginal counts for popsim
acs_data.dropna(inplace=True)
acs_data['TRACT'] = acs_data['GEO_ID'].str.slice(-11,).astype(int)
acs_data = acs_data[acs_data['TRACT'].isin(tract_list)]
acs_data.to_csv('./populationsim-master/example_msa_survey/data/control_totals_tract.csv', index=False)
acs_data

Unnamed: 0,GEO_ID,edu1-15,edu16-17,edu18-19,edu20,edu21,edu22-24,TOTAL_P,age0-19,age20-24,...,sexF,sexM,idincome-9999,idincome10000-14999,idincome15000-24999,idincome25000-34999,idincome35000-49999,idincome50000-74999,idincome75000+,TRACT
3276,1400000US06007000102,121,421,810,315,549,332,4032,1514,260,...,1947,2084,714,458,467,267,416,378,403,6007000102
3277,1400000US06007000103,222,421,707,194,1169,542,5393,2340,404,...,2832,2560,1100,346,532,310,412,607,928,6007000103
3278,1400000US06007000104,369,625,1112,384,904,414,5517,1270,593,...,3124,2392,867,551,708,1081,574,463,520,6007000104
3279,1400000US06007000201,247,684,879,213,507,228,4075,1125,432,...,1890,2184,558,289,722,299,534,394,516,6007000201
3280,1400000US06007000202,189,565,839,345,565,223,3703,786,321,...,2037,1665,824,349,408,332,585,432,349,6007000202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,1400000US54037972505,320,590,506,75,246,231,2545,591,107,...,1275,1269,509,195,90,407,177,406,272,54037972505
59942,1400000US54037972506,252,819,492,109,557,308,3725,1193,182,...,2033,1691,727,337,474,298,288,384,541,54037972506
59943,1400000US54037972601,77,460,321,97,277,131,1922,561,116,...,961,961,402,88,159,182,256,352,290,54037972601
59946,1400000US54037972702,341,961,802,235,488,186,4235,1096,296,...,2052,2182,1089,191,456,221,350,686,529,54037972702


#### Region

In [17]:
# Sum the values across all census tracts to get region marginals
region_data = acs_data.iloc[:,1:-1].apply(pd.to_numeric, errors='coerce')
region_data = pd.DataFrame(region_data.sum()).transpose()
region_data['REGION'] = '1'
region_data.dropna(inplace=True)
region_data.to_csv('./populationsim-master/example_msa_survey/data/control_totals_region.csv', index=False)
region_data

Unnamed: 0,edu1-15,edu16-17,edu18-19,edu20,edu21,edu22-24,TOTAL_P,age0-19,age20-24,age25-44,...,sexF,sexM,idincome-9999,idincome10000-14999,idincome15000-24999,idincome25000-34999,idincome35000-49999,idincome50000-74999,idincome75000+,REGION
0,4220910,9341787,6405193,2844621,8071982,5644771,52485079,15639705,3388652,14030156,...,26943315,25529283,11531827,3171676,5272952,4524492,4946067,5606139,8368107,1


In [18]:
region_data.columns

Index(['edu1-15', 'edu16-17', 'edu18-19', 'edu20', 'edu21', 'edu22-24',
       'TOTAL_P', 'age0-19', 'age20-24', 'age25-44', 'age45-54', 'age55-59',
       'age60+', 'sexF', 'sexM', 'idincome-9999', 'idincome10000-14999',
       'idincome15000-24999', 'idincome25000-34999', 'idincome35000-49999',
       'idincome50000-74999', 'idincome75000+', 'REGION'],
      dtype='object')