In [1]:
import functools
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Geographic Relationships

In [2]:
# Clean up the file generated from QGIS spatial join
geo = pd.read_csv('./data/TIGER/msa_puma_tract_join.csv', usecols=['jmsa_GEOID','jpuma_GEOI','GEOID']).dropna()
geo['REGION'] = '1'
geo = geo[['REGION','jmsa_GEOID','jpuma_GEOI','GEOID']].astype(int)
geo.columns = ['REGION','MSA','PUMA','TRACT']

# Get rid of Micropolitan Statistical Areas
msa_ids = pd.read_csv('./data/TIGER/msa_list.csv')
geo = geo[geo['MSA'].isin(msa_ids['GEOID'])]
full_msa_list = pd.unique(geo['MSA'])

# Limit to certain MSAs if desired, there are 925 total MSA + MicroSA, 384 MSA
msas_to_use = full_msa_list[0:49]
geo = geo[geo['MSA'].isin(msas_to_use)]

# Remove select problem PUMAs not Matched to Tracts/MSAs in ACS Tables
geo = geo[geo['PUMA']!=2101600]
geo = geo[geo['PUMA']!=4701400]
geo = geo[geo['PUMA']!=4501300]
geo = geo[geo['PUMA']!=5151145]

geo.to_csv('./populationsim-master/example_msa/data/geo_cross_walk.csv', index=False)

# Get the list of pumas, msas, and tracts that are being synthesized
puma_list = list(pd.unique(geo['PUMA']))
puma_list = [int(i) for i in puma_list]
msa_list = list(pd.unique(geo['MSA']))
msa_list = [int(i) for i in msa_list]
tract_list = list(pd.unique(geo['TRACT']))
tract_list = [int(i) for i in tract_list]

## Seed Samples from PUMS

#### Persons

In [3]:
p_data = []
pums_file_list=['./data/PUMS/psam_pusa.csv','./data/PUMS/psam_pusb.csv','./data/PUMS/psam_pusc.csv','./data/PUMS/psam_pusd.csv']
for file in pums_file_list:
    print(file)
    p_df = pd.read_csv(file,
                       usecols=['SERIALNO','ST','PUMA','PWGTP','AGEP','SEX','RAC1P','COW','PINCP','SCHL'],
                       chunksize=100000,
                       dtype=str)
    chunk_list = []
    for chunk in p_df:
        chunk.dropna(inplace=True) # Side effect limits to >16yrs old
        chunk['ST'] = chunk['ST'].astype(str)
        chunk['PUMA'] = chunk['PUMA'].astype(str)
        chunk['PUMA'] = chunk['PUMA'].str.zfill(5)
        chunk['PUMA'] = chunk['ST'] + chunk['PUMA']
        chunk['PUMA'] = chunk['PUMA'].astype(int)
        chunk = chunk[chunk['PUMA'].isin(puma_list)]
        chunk_list.append(chunk)
    p_df = pd.concat(chunk_list)
    p_data.append(p_df)

./data/PUMS/psam_pusa.csv
./data/PUMS/psam_pusb.csv
./data/PUMS/psam_pusc.csv
./data/PUMS/psam_pusd.csv


In [4]:
p_data = pd.concat(p_data)
p_data

Unnamed: 0,SERIALNO,PUMA,ST,PWGTP,AGEP,COW,SCHL,SEX,PINCP,RAC1P
0,2015000000067,102701,01,9,56,6,14,1,3450,2
3,2015000000067,102701,01,14,61,1,19,1,35000,2
6,2015000000463,102701,01,7,53,7,21,1,150000,1
7,2015000000463,102701,01,9,53,1,19,2,6000,1
10,2015000000518,100100,01,38,46,1,14,2,29400,1
...,...,...,...,...,...,...,...,...,...,...
2800411,2019HU1405425,4953001,49,25,45,1,16,2,40000,1
2800412,2019HU1405425,4953001,49,34,19,1,16,2,33300,1
2800500,2019HU1409016,4953001,49,19,36,3,16,1,47950,1
2800501,2019HU1409255,4953001,49,40,65,1,16,1,56300,8


In [5]:
# Check for NA values
p_data['SCHL'] = p_data['SCHL'].fillna(value=-1) # 1
p_data['PINCP'] = p_data['PINCP'].fillna(value=-10000) # 0
p_data['COW'] = p_data['COW'].fillna(value=-1) # 9
p_data.isna().sum()

SERIALNO    0
PUMA        0
ST          0
PWGTP       0
AGEP        0
COW         0
SCHL        0
SEX         0
PINCP       0
RAC1P       0
dtype: int64

#### Households

In [6]:
h_data = []
pums_file_list=['./data/PUMS/psam_husa.csv','./data/PUMS/psam_husb.csv','./data/PUMS/psam_husc.csv','./data/PUMS/psam_husd.csv']
for file in pums_file_list:
    print(file)
    h_df = pd.read_csv(file,
                       usecols=['SERIALNO','ST','PUMA','WGTP','NP','VEH'],
                       chunksize=100000,
                       dtype=str)
    chunk_list = []
    for chunk in h_df:
        chunk.dropna(inplace=True) # Side effect limits to >16yrs old
        chunk = chunk[chunk['NP'].astype(int) > 0] # Remove households with 0 people in them
        chunk['ST'] = chunk['ST'].astype(str)
        chunk['PUMA'] = chunk['PUMA'].astype(str)
        chunk['PUMA'] = chunk['PUMA'].str.zfill(5)
        chunk['PUMA'] = chunk['ST'] + chunk['PUMA']
        chunk['PUMA'] = chunk['PUMA'].astype(int)
        chunk = chunk[chunk['PUMA'].isin(puma_list)]
        chunk_list.append(chunk)
    h_df = pd.concat(chunk_list)
    h_data.append(h_df)

./data/PUMS/psam_husa.csv
./data/PUMS/psam_husb.csv
./data/PUMS/psam_husc.csv
./data/PUMS/psam_husd.csv


In [7]:
h_data = pd.concat(h_data)
h_data

Unnamed: 0,SERIALNO,PUMA,ST,WGTP,NP,VEH
1,2015000000067,102701,01,9,4,1
3,2015000000160,101000,01,15,1,0
4,2015000000345,101000,01,29,1,1
5,2015000000463,102701,01,6,3,3
6,2015000000477,101100,01,13,1,1
...,...,...,...,...,...,...
1319186,2019HU1404737,4953001,49,53,6,6
1319196,2019HU1405375,4953001,49,20,2,2
1319197,2019HU1405425,4953001,49,33,3,3
1319229,2019HU1409016,4953001,49,19,1,1


In [8]:
# Check for NA values
h_data.isna().sum()

SERIALNO    0
PUMA        0
ST          0
WGTP        0
NP          0
VEH         0
dtype: int64

In [9]:
# Remove records that are unable to be joined
z = pd.merge(h_data, p_data, on='SERIALNO')
serials_to_keep = list(z['SERIALNO'])
h_data = h_data[h_data['SERIALNO'].isin(serials_to_keep)]
p_data = p_data[p_data['SERIALNO'].isin(serials_to_keep)]
print(len(h_data))
print(len(p_data))

819252
1498954


In [10]:
# Reset household ids
h_data.reset_index(inplace=True)
h_data.reset_index(inplace=True)
h_data['index'] = h_data['level_0']
h_data.drop(labels=['level_0'], axis=1)
p_data = pd.merge(h_data[['index','SERIALNO']], p_data, on='SERIALNO')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  h_data['index'] = h_data['level_0']


In [11]:
# Check for data issues
print(h_data.duplicated('index').any())

False


In [12]:
# Save seed data to file
h_data.to_csv('./populationsim-master/example_msa/data/seed_households.csv', index=False)
p_data.to_csv('./populationsim-master/example_msa/data/seed_persons.csv', index=False)

## Control Totals from ACS

#### MSA

In [13]:
DP03_cols = ['GEO_ID',
            'DP03_0047E',
            'DP03_0048E',
            'DP03_0049E',
            'DP03_0050E']
DP03_colnames = ['GEO_ID',
                'COW1-2',
                'COW3-5',
                'COW6-7',
                'COW8']
DP03 = pd.read_csv('./data/ACS/MSA/DP03.csv', low_memory=False, usecols=DP03_cols)
DP03.columns = DP03_colnames

In [14]:
DP03

Unnamed: 0,GEO_ID,COW1-2,COW3-5,COW6-7,COW8
0,id,Estimate!!CLASS OF WORKER!!Civilian employed p...,Estimate!!CLASS OF WORKER!!Civilian employed p...,Estimate!!CLASS OF WORKER!!Civilian employed p...,Estimate!!CLASS OF WORKER!!Civilian employed p...
1,310M500US10100,17901,2750,2089,123
2,310M500US10140,19655,6885,1693,90
3,310M500US10180,57562,11820,4692,212
4,310M500US10220,11653,4832,852,127
...,...,...,...,...,...
934,310M500US49660,199098,28474,13411,321
935,310M500US49700,50432,12962,5216,256
936,310M500US49740,56673,15964,3044,222
937,310M500US49780,31827,4319,2152,33


In [15]:
S1501_cols = ['GEO_ID',
             'S1501_C01_007E',
             'S1501_C01_008E',
             'S1501_C01_009E',
             'S1501_C01_010E',
             'S1501_C01_011E',
             'S1501_C01_012E',
             'S1501_C01_013E']
S1501_colnames = ['GEO_ID',
                 'SCHL1-11',
                 'SCHL12-15',
                 'SCHL16-17',
                 'SCHL18-19',
                 'SCHL20',
                 'SCHL21',
                 'SCHL22-24']
S1501 = pd.read_csv('./data/ACS/MSA/S1501.csv', low_memory=False, usecols=S1501_cols)
S1501.columns = S1501_colnames

In [16]:
S0802_cols = ['GEO_ID',
             'S0802_C01_001E',
             'S0802_C01_002E',
             'S0802_C01_003E',
             'S0802_C01_004E',
             'S0802_C01_005E',
             'S0802_C01_006E',
             'S0802_C01_007E',
             'S0802_C01_009E',
             'S0802_C01_010E',
             'S0802_C01_012E',
             'S0802_C01_013E',
             'S0802_C01_014E',
             'S0802_C01_015E',
             'S0802_C01_016E',
             'S0802_C01_017E',
             'S0802_C01_018E',
             'S0802_C01_029E',
             'S0802_C01_030E',
             'S0802_C01_031E',
             'S0802_C01_032E',
             'S0802_C01_033E',
             'S0802_C01_034E',
             'S0802_C01_035E',
             'S0802_C01_036E']
S0802_colnames = ['GEO_ID',
                 'TOTAL_P',
                 'AGEP16-19',
                 'AGEP20-24',
                 'AGEP25-44',
                 'AGEP45-54',
                 'AGEP55-59',
                 'AGEP60+',
                 'SEXM',
                 'SEXF',
                 'RAC1P1',
                 'RAC1P2',
                 'RAC1P3-5',
                 'RAC1P6',
                 'RAC1P7',
                 'RAC1P8',
                 'RAC1P9',
                 'PINCP-9999',
                 'PINCP10000-14999',
                 'PINCP15000-24999',
                 'PINCP25000-34999',
                 'PINCP35000-49999',
                 'PINCP50000-64999',
                 'PINCP65000-74999',
                 'PINCP75000+']
S0802 = pd.read_csv('./data/ACS/MSA/S0802.csv', low_memory=False, usecols=S0802_cols)
S0802.columns = S0802_colnames

# All S0802 counts are in percentage, must multiply by the total to get counts
pcts = S0802.iloc[1:,1:].astype(float)
pcts = (pcts / 100).multiply(pcts['TOTAL_P'], axis="index").astype(int)
S0802.iloc[1:,2:] = pcts.iloc[:,1:]

In [17]:
B08201_cols = ['GEO_ID',
              'B08201_001E',
              'B08201_002E',
              'B08201_003E',
              'B08201_004E',
              'B08201_005E',
              'B08201_006E',
              'B08201_007E',
              'B08201_013E',
              'B08201_019E',
              'B08201_025E']
B08201_colnames = ['GEO_ID',
                  'TOTAL_HH',
                  'VEH0',
                  'VEH1',
                  'VEH2',
                  'VEH3',
                  'VEH4+',
                  'NP1',
                  'NP2',
                  'NP3',
                  'NP4+']
B08201 = pd.read_csv('./data/ACS/MSA/B08201.csv', low_memory=False, usecols=B08201_cols)
B08201.columns = B08201_colnames

In [18]:
# Join data from different census tables
acs_dataframes = [DP03, S1501, S0802, B08201]
acs_data = functools.reduce(lambda left,right: pd.merge(left,right,on='GEO_ID'), acs_dataframes)

# Get MSA id from the extended geo id, remove acs labels, save marginal counts for popsim
acs_data.dropna(inplace=True)
acs_data = acs_data.iloc[1:,:]
acs_data['MSA'] = acs_data['GEO_ID'].str.slice(-5,).astype(int)
acs_data = acs_data[acs_data['MSA'].isin(msa_list)]
acs_data.to_csv('./populationsim-master/example_msa/data/control_totals_msa.csv', index=False)
acs_data

Unnamed: 0,GEO_ID,COW1-2,COW3-5,COW6-7,COW8,SCHL1-11,SCHL12-15,SCHL16-17,SCHL18-19,SCHL20,...,VEH0,VEH1,VEH2,VEH3,VEH4+,NP1,NP2,NP3,NP4+,MSA
29,310M500US11260,140473,39588,11362,380,6000,10509,67489,68262,23789,...,7520,40759,55079,23150,11276,35188,46380,22680,33536,11260
33,310M500US11500,36518,8757,2569,52,4081,8234,25940,19759,6474,...,2659,13546,16817,7497,4086,12522,15561,7156,9366,11500
53,310M500US12220,57278,13826,3496,170,2750,6099,23026,21993,8914,...,2650,19351,23127,9981,5128,17918,19866,10233,12220,12220
59,310M500US12540,262672,57362,22747,664,72345,66971,149493,119987,40909,...,18572,83311,101891,44356,22152,56033,74753,43532,95964,12540
91,310M500US13820,408613,65329,23379,849,25497,55956,203223,160401,64376,...,25554,132542,157116,69454,32896,123195,141853,69069,83445,13820
186,310M500US17980,96915,23098,6090,196,8307,19083,60219,52609,19842,...,11306,40567,42403,16290,8214,37394,39237,17954,24195,17980
217,310M500US19300,75806,12725,6435,125,3467,10426,41797,32998,14276,...,2656,26040,34844,12503,4887,23955,31304,10710,14961,19300
221,310M500US19460,52961,8155,3123,109,6603,11359,34680,22710,9034,...,3028,16623,21523,11230,6191,16134,22001,8909,11551,19460
235,310M500US20020,49883,7681,3886,138,5185,11072,33233,23491,9967,...,3483,19117,20385,9435,3904,16799,20763,8304,10458,20020
255,310M500US20940,40387,15530,4053,104,17182,16154,26782,26061,7186,...,3497,12939,16037,8587,3769,9235,11633,7847,16114,20940


#### Tract

In [19]:
DP03_cols = ['GEO_ID',
            'DP03_0047E',
            'DP03_0048E',
            'DP03_0049E',
            'DP03_0050E']
DP03_colnames = ['GEO_ID',
                'COW1-2',
                'COW3-5',
                'COW6-7',
                'COW8']
DP03 = pd.read_csv('./data/ACS/TRACT/DP03.csv', low_memory=False, usecols=DP03_cols)
DP03.columns = DP03_colnames

In [20]:
S1501_cols = ['GEO_ID',
             'S1501_C01_007E',
             'S1501_C01_008E',
             'S1501_C01_009E',
             'S1501_C01_010E',
             'S1501_C01_011E',
             'S1501_C01_012E',
             'S1501_C01_013E']
S1501_colnames = ['GEO_ID',
                 'SCHL1-11',
                 'SCHL12-15',
                 'SCHL16-17',
                 'SCHL18-19',
                 'SCHL20',
                 'SCHL21',
                 'SCHL22-24']
S1501 = pd.read_csv('./data/ACS/TRACT/S1501.csv', low_memory=False, usecols=S1501_cols)
S1501.columns = S1501_colnames

In [21]:
S0802_cols = ['GEO_ID',
             'S0802_C01_001E',
             'S0802_C01_002E',
             'S0802_C01_003E',
             'S0802_C01_004E',
             'S0802_C01_005E',
             'S0802_C01_006E',
             'S0802_C01_007E',
             'S0802_C01_009E',
             'S0802_C01_010E',
             'S0802_C01_012E',
             'S0802_C01_013E',
             'S0802_C01_014E',
             'S0802_C01_015E',
             'S0802_C01_016E',
             'S0802_C01_017E',
             'S0802_C01_018E',
             'S0802_C01_029E',
             'S0802_C01_030E',
             'S0802_C01_031E',
             'S0802_C01_032E',
             'S0802_C01_033E',
             'S0802_C01_034E',
             'S0802_C01_035E',
             'S0802_C01_036E']
S0802_colnames = ['GEO_ID',
                 'TOTAL_P',
                 'AGEP16-19',
                 'AGEP20-24',
                 'AGEP25-44',
                 'AGEP45-54',
                 'AGEP55-59',
                 'AGEP60+',
                 'SEXM',
                 'SEXF',
                 'RAC1P1',
                 'RAC1P2',
                 'RAC1P3-5',
                 'RAC1P6',
                 'RAC1P7',
                 'RAC1P8',
                 'RAC1P9',
                 'PINCP-9999',
                 'PINCP10000-14999',
                 'PINCP15000-24999',
                 'PINCP25000-34999',
                 'PINCP35000-49999',
                 'PINCP50000-64999',
                 'PINCP65000-74999',
                 'PINCP75000+']
S0802 = pd.read_csv('./data/ACS/TRACT/S0802.csv', low_memory=False, usecols=S0802_cols)
S0802.columns = S0802_colnames

# All S0802 counts are in percentage, must multiply by the total to get counts
pcts = S0802.iloc[1:,1:]
pcts = pcts.replace('-','0', regex=True).astype(float)
pcts = (pcts / 100).multiply(pcts['TOTAL_P'], axis="index").astype(int)
S0802.iloc[1:,2:] = pcts.iloc[:,1:]

In [22]:
B08201_cols = ['GEO_ID',
              'B08201_001E',
              'B08201_002E',
              'B08201_003E',
              'B08201_004E',
              'B08201_005E',
              'B08201_006E',
              'B08201_007E',
              'B08201_013E',
              'B08201_019E',
              'B08201_025E']
B08201_colnames = ['GEO_ID',
                  'TOTAL_HH',
                  'VEH0',
                  'VEH1',
                  'VEH2',
                  'VEH3',
                  'VEH4+',
                  'NP1',
                  'NP2',
                  'NP3',
                  'NP4+']
B08201 = pd.read_csv('./data/ACS/TRACT/B08201.csv', low_memory=False, usecols=B08201_cols)
B08201.columns = B08201_colnames

In [23]:
# Join data from different census tables
acs_dataframes = [DP03, S1501, S0802, B08201]
acs_data = functools.reduce(lambda left,right: pd.merge(left,right,on='GEO_ID'), acs_dataframes)

# Get MSA id from the extended geo id, remove acs labels, save marginal counts for popsim
acs_data.dropna(inplace=True)
acs_data = acs_data.iloc[1:,:]
acs_data['TRACT'] = acs_data['GEO_ID'].str.slice(-11,).astype(int)
acs_data = acs_data[acs_data['TRACT'].isin(tract_list)]
acs_data.to_csv('./populationsim-master/example_msa/data/control_totals_tract.csv', index=False)
acs_data

Unnamed: 0,GEO_ID,COW1-2,COW3-5,COW6-7,COW8,SCHL1-11,SCHL12-15,SCHL16-17,SCHL18-19,SCHL20,...,VEH0,VEH1,VEH2,VEH3,VEH4+,NP1,NP2,NP3,NP4+,TRACT
1,1400000US01001020100,757,137,58,0,60,106,463,203,132,...,7,204,235,131,132,121,321,83,184,1001020100
2,1400000US01001020200,585,101,37,0,37,171,646,240,69,...,98,258,207,77,48,281,169,93,145,1001020200
3,1400000US01001020300,1260,393,62,12,89,286,940,593,135,...,63,539,454,200,104,365,433,282,280,1001020300
4,1400000US01001020400,1223,417,85,0,18,215,788,542,299,...,21,562,759,289,44,476,706,218,275,1001020400
5,1400000US01001020500,3759,1348,83,0,136,405,1396,1226,751,...,199,1609,1968,540,167,1361,1428,855,839,1001020500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63113,1400000US48037011700,544,271,36,0,108,76,552,363,114,...,62,213,394,220,70,220,473,122,144,48037011700
68373,1400000US49053271600,1871,258,210,0,110,148,1486,1608,519,...,148,1264,946,484,171,793,1672,104,444,49053271600
68374,1400000US49053271701,3138,499,211,0,67,101,1148,1524,795,...,44,645,1099,834,362,545,1020,302,1117,49053271701
68375,1400000US49053271702,3763,409,100,0,70,313,997,1692,693,...,20,541,1439,522,294,348,956,300,1212,49053271702


#### Region

In [24]:
# Sum the values across all census tracts to get region marginals
region_data = acs_data.iloc[:,1:-1].apply(pd.to_numeric, errors='coerce')
region_data = pd.DataFrame(region_data.sum()).transpose()
region_data['REGION'] = '1'
region_data.dropna(inplace=True)
region_data.to_csv('./populationsim-master/example_msa/data/control_totals_region.csv', index=False)
region_data

Unnamed: 0,COW1-2,COW3-5,COW6-7,COW8,SCHL1-11,SCHL12-15,SCHL16-17,SCHL18-19,SCHL20,SCHL21,...,VEH0,VEH1,VEH2,VEH3,VEH4+,NP1,NP2,NP3,NP4+,REGION
0,18966103,3263127,1742155,41415,2757645,2611701,7660908,7448776,2723094,7058309,...,1215412,5553514,6562054,2761802,1461372,4398865,5532872,2861924,4760493,1
