### Introduction
This notebook is to allocate the synthesized population from block groups to the block level

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Function to change the sf1 data structure to crerate one row per households
def disaggregate_sf1(code_num,sf1):
    sf_i=[]
    for tenure in range (1,3):
        for size in range (1,8):
            if tenure==1:
                col_name='own'+str(size)
            else:
                col_name='rent'+str(size)
                
            sf1_filter=sf1[sf1[col_name]>0]
            hh=np.repeat(sf1_filter.reset_index()['blockid10'], sf1_filter[col_name]).tolist()
            if len(hh)>0:
                sf1_dis=pd.DataFrame(index=[list(range(len(hh)))],columns=['block_id','tenure','size'])
                sf1_dis.loc[:,'block_id']=hh
                sf1_dis.loc[:,'state']=sf1_dis['block_id'].apply(lambda x: x[0:2])
                sf1_dis.loc[:,'county']=sf1_dis['block_id'].apply(lambda x: x[2:5])
                sf1_dis.loc[:,'tract']=sf1_dis['block_id'].apply(lambda x: x[5:11])
                sf1_dis.loc[:,'block_group']=sf1_dis['block_id'].apply(lambda x: x[11])
                sf1_dis.loc[:,'tenure']=tenure
                sf1_dis.loc[:,'size']=size
                sf_i.append(sf1_dis)
    return pd.concat(sf_i).reset_index().drop('level_0',axis=1)

In [3]:
# Allocate household from synthetic population to sf1 based household type
def allocate_from_blockgroup_by_hhtype(sf1_county,synth_hh_county,county):
    list_sf1_filt=[]

    for tract in sf1_county['tract'].unique():
        sf1_tract=sf1_county[sf1_county['tract']==tract]
        for group in sf1_tract['block_group'].unique():
            for tenure in range (1,3):
                for size in range (1,8):
                    # Filter the synthpop and sf1 by the case we are studying, block group, tenure and size
                    synth_hh_filt=synth_hh_county[(synth_hh_county['tract']==tract)&\
                                           (synth_hh_county['block group']==int(group))&
                                           (synth_hh_county['size_group']==size)&(synth_hh_county['tenure']==tenure)]

                    sf1_filt=sf1_county[(sf1_county['tract']==tract)&(sf1_county['block_group']==group)&\
                                         (sf1_county['tenure']==tenure)&(sf1_county['size']==size)]


                    if not (synth_hh_filt.empty | sf1_filt.empty):
                        # If sf1 smaller than hh_block, all sf1 hh will be filled with hh from synthpop
                        if len(sf1_filt) < len(synth_hh_filt):
                            sf1_filt.loc[:,'synth_hh_id']=np.random.choice(synth_hh_filt['household_id'],\
                                                                       size=len(sf1_filt), replace=False)
                        # If sf1 greater than hh_block, not all sf1 hh will be filled in with the hh from syntpop. 
                        # First we will allocate by hh type as we did in the above step      
                        else:
                            # random sample the households that will get an agent in this step
                            sf1_filt_sample=sf1_filt.sample(n=len(synth_hh_filt))
                            sf1_filt_sample['synth_hh_id']=np.random.choice(synth_hh_filt['household_id'],\
                                                                        size=len(sf1_filt_sample), replace=False)

                            sf1_filt['synth_hh_id']=sf1_filt_sample['synth_hh_id']         


                        list_sf1_filt.append(sf1_filt)

    df_sf1_filt=pd.concat(list_sf1_filt) 
    sf1_county.loc[df_sf1_filt.index,'synth_hh_id']=df_sf1_filt['synth_hh_id']

    return sf1_county                       
            

In [4]:
def allocate_from_tract_by_hhtype(sf1_county,synth_hh_county,county):
    list_sf1_null_filt=[]
    # Get all the records from sf1 that were not filled in from the country we are looking at.
    sf1_county_null=sf1_county[(sf1_county['synth_hh_id'].isnull())]
    # Get all the records from the synthpop that were not used yet
    synth_hh_county_notused=synth_hh_county[(~synth_hh_county['household_id'].isin(sf1_county['synth_hh_id'].tolist()))]

    for tract in sf1_county_null['tract'].unique():
        for tenure in range (1,3):
            for size in range (1,8):
                synth_hh_notused_filt=synth_hh_county_notused[(synth_hh_county_notused['tract']==tract)&\
                                                             (synth_hh_county_notused['size_group']==size)&\
                                                             (synth_hh_county_notused['tenure']==tenure)]

                sf1_county_null_filt=sf1_county_null[(sf1_county_null['tract']==tract)&\
                                                     (sf1_county_null['tenure']==tenure)&(sf1_county_null['size']==size)]
                
                if not (synth_hh_notused_filt.empty | sf1_county_null_filt.empty):
                    # If sf1 smaller than what is in the tract, all sf1 will be filled with synth pop from the tract
                    if len(sf1_county_null_filt) < len(synth_hh_notused_filt):
                        sf1_county_null_filt.loc[:,'synth_hh_id']=np.random.choice(synth_hh_notused_filt['household_id'],\
                                                                       size=len(sf1_county_null_filt), replace=False)
                    else:
                        sf1_filt_sample=sf1_county_null_filt.sample(n=len(synth_hh_notused_filt))
                        sf1_filt_sample['synth_hh_id']=np.random.choice(synth_hh_notused_filt['household_id'],\
                                                                        size=len(sf1_filt_sample), replace=False)
                                                
                        sf1_county_null_filt['synth_hh_id']=sf1_filt_sample['synth_hh_id']   


                list_sf1_null_filt.append(sf1_county_null_filt)
                
    df_sf1_null_filt=pd.concat(list_sf1_null_filt) 
    sf1_county.loc[df_sf1_null_filt.index,'synth_hh_id']=df_sf1_null_filt['synth_hh_id']
    return sf1_county

In [5]:
def any_notused_hh_tract(sf1_county,synth_hh_county):
    list_sf1_null_filt=[]
    # Get all the records from sf1 that were not filled in from the country we are looking at.
    sf1_county_null=sf1_county[(sf1_county['synth_hh_id'].isnull())]
    # Get all the records from the synthpop that were not used yet
    synth_hh_county_notused=synth_hh_county[(~synth_hh_county['household_id'].isin(sf1_county['synth_hh_id'].tolist()))]
    
    for tract in sf1_county_null['tract'].unique():
        synth_hh_county_notused_filt=synth_hh_county_notused[(synth_hh_county_notused['tract']==tract)]

        sf1_county_null_filt=sf1_county_null[(sf1_county_null['tract']==tract)]    
        
    
    if not (synth_hh_county_notused_filt.empty | sf1_county_null_filt.empty):
        # If sf1 smaller than what is in the tract, all sf1 will be filled with synth pop from the tract
        if len(sf1_county_null_filt) < len(synth_hh_county_notused_filt):
            sf1_county_null_filt.loc[:,'synth_hh_id']=np.random.choice(synth_hh_county_notused_filt['household_id'],\
                                                                       size=len(sf1_county_null_filt), replace=False)
    
        else:
            sf1_filt_sample=sf1_county_null_filt.sample(n=len(synth_hh_county_notused_filt))
            sf1_filt_sample['synth_hh_id']=np.random.choice(synth_hh_county_notused_filt['household_id'],\
                                                            size=len(sf1_filt_sample), replace=False)

            sf1_county_null_filt['synth_hh_id']=sf1_filt_sample['synth_hh_id']   
    
    
    list_sf1_null_filt.append(sf1_county_null_filt)
                
    df_sf1_null_filt=pd.concat(list_sf1_null_filt) 
    sf1_county.loc[df_sf1_null_filt.index,'synth_hh_id']=df_sf1_null_filt['synth_hh_id']
    return sf1_county    

In [6]:
def repeat_from_tract_hhtype(sf1_county,synth_hh_county):
    list_sf1_null_filt=[]
    # Get all the records from sf1 that were not filled in from the country we are looking at.
    sf1_county_null=sf1_county[(sf1_county['synth_hh_id'].isnull())]

    for tract in sf1_county_null['tract'].unique():
        for tenure in range (1,3):
            for size in range (1,8):
                synth_hh_filt=synth_hh_county[(synth_hh_county['tract']==tract)&\
                                                (synth_hh_county['size_group']==size)&\
                                                (synth_hh_county['tenure']==tenure)]

                sf1_county_null_filt=sf1_county_null[(sf1_county_null['tract']==tract)&\
                                                     (sf1_county_null['tenure']==tenure)&(sf1_county_null['size']==size)]
                
                if not (synth_hh_filt.empty | sf1_county_null_filt.empty):
                    sf1_county_null_filt.loc[:,'synth_hh_id']=np.random.choice(synth_hh_filt['household_id'],\
                                                                       size=len(sf1_county_null_filt), replace=True)
    
                list_sf1_null_filt.append(sf1_county_null_filt)
                
    df_sf1_null_filt=pd.concat(list_sf1_null_filt) 
    sf1_county.loc[df_sf1_null_filt.index,'synth_hh_id']=df_sf1_null_filt['synth_hh_id']
    return sf1_county    

In [7]:
def repeat_from_tract(sf1_county,synth_hh_county):
    list_sf1_null_filt=[]
    # Get all the records from sf1 that were not filled in from the country we are looking at.
    sf1_county_null=sf1_county[(sf1_county['synth_hh_id'].isnull())]

    for tract in sf1_county_null['tract'].unique():
        synth_hh_filt=synth_hh_county[(synth_hh_county['tract']==tract)]

        sf1_county_null_filt=sf1_county_null[(sf1_county_null['tract']==tract)]

        if not (synth_hh_filt.empty | sf1_county_null_filt.empty):
            sf1_county_null_filt.loc[:,'synth_hh_id']=np.random.choice(synth_hh_filt['household_id'],\
                                                               size=len(sf1_county_null_filt), replace=True)

        list_sf1_null_filt.append(sf1_county_null_filt)
                
    df_sf1_null_filt=pd.concat(list_sf1_null_filt) 
    sf1_county.loc[df_sf1_null_filt.index,'synth_hh_id']=df_sf1_null_filt['synth_hh_id']
    return sf1_county    

In [8]:
code_num={        
        'Cook': '031',
        'DuPage': '043',
        'Kane': '089',
        'Kendall': '093',
        'Lake': '097',
        'McHenry': '111',
        'Will': '197',
        }

In [9]:
# Create the tenure list to filter sf1 
tenure_list=[]
for i in range(1,8):
    tenure_list.append('own'+str(i))
    tenure_list.append('rent'+str(i))

In [82]:
# Upload the sf1 table 
sf1=pd.read_csv('./illinois_sf1.csv', dtype={'blockid10':'str','state':'str','county':'str','tract':'str'})
sf1.loc[:,'total']=sf1[tenure_list].sum(axis=1)
sf1['blockid10']=sf1['blockid10'].astype('str')
sf1.set_index('blockid10',inplace=True)
sf1_mpo=sf1[sf1['county'].isin(code_num.values())]
sf_counties=disaggregate_sf1(code_num,sf1_mpo)

In [11]:
dis=disaggregate_sf1(code_num,sf1_mpo)

In [12]:
# Upload the synthetic population hh
hh=pd.read_csv('synthetic_households.csv',
    dtype={'serialno':'str','state':'str','county':'str','tract':'str'})

In [13]:
hh.columns

Index(['household_id', 'serialno', 'persons', 'group_quarters',
       'building_type', 'cars', 'income', 'race_of_head', 'hispanic_head',
       'age_of_head', 'workers', 'state', 'county', 'tract', 'block group',
       'children', 'tenure', 'recent_mover'],
      dtype='object')

In [None]:
sf1_all_counties=[]
for key,value in code_num.items():
    county=value
    print (key)
    synth_hh=hh[hh['county']==county]
    print ('The number of sf1 hh is ', sf1[sf1['county']==county][tenure_list].sum().sum())
    print ('The number of synthetic hh is',synth_hh['block group'].value_counts().sum())
    # Create the size_groups to match sf1
    size_group=pd.cut(synth_hh.persons,[0,2,3,4,5,6,7,1000],right=False,labels=[1,2,3,4,5,6,7])
    synth_hh['size_group']=size_group
    
    sf1_county=sf_counties[sf_counties['county']==county]
    synth_hh_county=synth_hh[synth_hh['county']==county]
    
    print('Running allocation')
    sf1_county=allocate_from_blockgroup_by_hhtype(sf1_county,synth_hh_county,county)  
    
    if not (sf1_county[sf1_county['synth_hh_id'].isnull()].empty):        
        sf1_county=allocate_from_tract_by_hhtype(sf1_county,synth_hh_county,county)
        
    if not (sf1_county[sf1_county['synth_hh_id'].isnull()].empty):        
        sf1_county=any_notused_hh_tract(sf1_county,synth_hh_county)
        
    if not (sf1_county[sf1_county['synth_hh_id'].isnull()].empty):    
        sf1_county=repeat_from_tract_hhtype(sf1_county,synth_hh_county)  
        
    if not (sf1_county[sf1_county['synth_hh_id'].isnull()].empty):    
        sf1_county=repeat_from_tract(sf1_county,synth_hh_county) 
        
    sf1_all_counties.append(sf1_county)

Cook
The number of sf1 hh is  1966356
The number of synthetic hh is 1947737


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Running allocation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
key

'Will'

In [None]:
sf_counties=pd.concat(sf1_all_counties)

In [None]:
sf_counties.to_csv('il_counties.csv')

In [None]:
sf1.loc['250158208021005']

In [None]:
hh[(hh['county']=='015')&(hh['tract']=='820802')]

# Check the data

In [83]:
sf_counties=pd.read_csv('il_counties.csv', dtype={'block_id':'str','state':'str','county':'str',\
                                                         'tract':'str','block_group':'str'})

In [76]:
a=pd.merge(sf_counties,hh[['household_id','state','county','tract','block group']],left_on='synth_hh_id',right_on='household_id',how='left')

In [77]:
# Check the allocation worked by county
a[a['county_x']!=a['county_y']]

Unnamed: 0.1,Unnamed: 0,block_id,tenure,size,state_x,county_x,tract_x,block_group,synth_hh_id,household_id,state_y,county_y,tract_y,block group
2706254,2394617,170978630051000,2,1,17,97,863005,1,,,,,,


In [78]:
# Check the allocation worked by tract
a[a['tract_x']!=a['tract_y']]

Unnamed: 0.1,Unnamed: 0,block_id,tenure,size,state_x,county_x,tract_x,block_group,synth_hh_id,household_id,state_y,county_y,tract_y,block group
2706254,2394617,170978630051000,2,1,17,97,863005,1,,,,,,


In [79]:
# Check if a county was not allocated
sf_counties[sf_counties['synth_hh_id'].isnull()]

Unnamed: 0.1,Unnamed: 0,block_id,tenure,size,state,county,tract,block_group,synth_hh_id
2706254,2394617,170978630051000,2,1,17,97,863005,1,


In [80]:
# check the amount of records that were duplicated
sf_counties[sf_counties['synth_hh_id'].duplicated()].shape

(389274, 9)

In [84]:
sf1[sf1['county'].isin(code_num.values())]['total'].sum()-len(sf_counties)

0

Delete the value with null results since this is part of a campus and does not have any value in the synthetic population

In [85]:
sf_counties=sf_counties[sf_counties['synth_hh_id'].notnull()]

# Process the data

In [118]:
# Add the rest of the household characteristic to the sf_counties frame
final_hh=pd.merge(hh[['household_id', 'serialno', 'persons', 'group_quarters',
       'building_type', 'cars', 'income', 'race_of_head', 'hispanic_head',
       'age_of_head', 'workers', 'children', 'tenure', 'recent_mover']],\
                  sf_counties[['block_id','state','county','tract','block_group','synth_hh_id']]\
                  ,left_on='household_id',right_on='synth_hh_id',how='right')

In [119]:
final_hh.shape,hh.shape,sf_counties.shape

((3088155, 20), (3075951, 18), (3088155, 9))

In [120]:
final_hh.drop('synth_hh_id',axis=1,inplace=True)

In [121]:
final_hh['block_group']=final_hh['block_group'].astype('str')

In [122]:
ppl=pd.read_csv('./synthetic_persons.csv', dtype={'serialno':'str','county':'str'})

In [123]:
ppl.shape

(7955397, 19)

In [124]:
final_pp=ppl[ppl['household_id'].isin(final_hh['household_id'].tolist())]

In [125]:
final_pp.shape

(6996488, 19)

Reindex

In [127]:
final_hh.loc[:,'household_id_new']=list(range(len(final_hh)))

In [129]:
final_hh.shape

(3088155, 20)

In [128]:
final_pp.loc[:,'person_id_new']=list(range(len(final_pp)))

In [130]:
final_pp.shape

(6996488, 20)

In [131]:
final_pp=pd.merge(final_pp,final_hh[['household_id','household_id_new']],
                  left_on='household_id',right_on='household_id',how='left')

In [135]:
final_pp.shape

(8262885, 21)

In [136]:
final_hh.drop('household_id',inplace=True,axis=1)

In [137]:
final_hh.rename(columns={'household_id_new':'household_id'},inplace=True)

In [138]:
final_hh.set_index('household_id',inplace=True)

In [139]:
final_pp.drop(['household_id','person_id'],axis=1,inplace=True)

In [140]:
final_pp.rename(columns={'household_id_new':'household_id','person_id_new':'person_id'},inplace=True)

In [141]:
final_pp.set_index('person_id',inplace=True)

In [142]:
final_pp.shape,final_hh['persons'].sum()

((8262885, 18), 8262885)

In [143]:
final_pp.head()

Unnamed: 0_level_0,member_id,age,relate,edu,sex,hours,hispanic,earning,race_id,hispanic.1,student,work_at_home,worker,self_employed,unemployed,nilf,county,household_id
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1,30,0,21.0,1,45.0,1,59000.0,1,no,0,0,1,0,0,0,31,54
1,2,30,1,21.0,2,35.0,1,35000.0,1,no,0,0,1,0,0,0,31,54
2,2,28,12,21.0,1,50.0,1,100000.0,1,no,1,0,1,0,0,0,31,136
3,2,28,12,21.0,1,50.0,1,100000.0,1,no,1,0,1,0,0,0,31,137
4,2,28,12,21.0,1,50.0,1,100000.0,1,no,1,0,1,0,0,0,31,289


# Check we reached the desired targets at each block group

In [144]:
group_hh=final_hh.groupby(['county','tract','state','block_group']).size()
sf1_mpo['blkgrp']=sf1_mpo['blkgrp'].astype('str')
sf1=sf1_mpo.groupby(['county','tract','state','blkgrp'])[['total']].sum()
sf1['total_2']=group_hh
sf1['diff']=sf1['total']-sf1['total_2']
sf1['diff'].sum()

0.0

In [145]:
sf1_mpo[(sf1_mpo['tract']=='007801')&(sf1_mpo['blkgrp']=='2')]['total'].sum()

0

# Check we reached the desired targets at each block

In [146]:
group_block_final=final_hh.groupby('block_id').size()

In [147]:
sf1_mpo['allocation']=group_block_final
sf1_mpo['allocation'].fillna(0,inplace=True)

In [148]:
sf1_mpo['error']=sf1_mpo['total']-sf1_mpo['allocation']

In [149]:
sf1_mpo['error'].sum()

1.0

In [150]:
pd.set_option('display.max_columns',500)
sf1_mpo[sf1_mpo['error']==1]

Unnamed: 0_level_0,state,county,tract,blkgrp,own1,own2,own3,own4,own5,own6,own7,rent1,rent2,rent3,rent4,rent5,rent6,rent7,residential_units,state_county,total_own,total_rent,total,allocation,error
blockid10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
170978630051000,17,97,863005,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,17097,0,1,1,0.0,1.0


# Check the tracts

In [151]:
final_hh['test_tract']=final_hh['block_id'].apply(lambda x:str(x)[5:11])
final_hh[final_hh['test_tract']!=final_hh['tract']]

Unnamed: 0_level_0,serialno,persons,group_quarters,building_type,cars,income,race_of_head,hispanic_head,age_of_head,workers,children,tenure,recent_mover,block_id,state,county,tract,block_group,test_tract
household_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


In [152]:
len(final_pp['household_id'].unique())

3088155

In [153]:
len(final_hh.index.unique())

3088155

In [154]:
final_pp.shape

(8262885, 18)

In [155]:
final_pp.head()

Unnamed: 0_level_0,member_id,age,relate,edu,sex,hours,hispanic,earning,race_id,hispanic.1,student,work_at_home,worker,self_employed,unemployed,nilf,county,household_id
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1,30,0,21.0,1,45.0,1,59000.0,1,no,0,0,1,0,0,0,31,54
1,2,30,1,21.0,2,35.0,1,35000.0,1,no,0,0,1,0,0,0,31,54
2,2,28,12,21.0,1,50.0,1,100000.0,1,no,1,0,1,0,0,0,31,136
3,2,28,12,21.0,1,50.0,1,100000.0,1,no,1,0,1,0,0,0,31,137
4,2,28,12,21.0,1,50.0,1,100000.0,1,no,1,0,1,0,0,0,31,289


# Check the serialno

In [None]:
# pums_pp=pd.read_csv('puma_p_08.csv',dtype={'serialno':'str'})

In [None]:
# pums_hh=pd.read_csv('puma_h_08.csv',dtype={'serialno':'str'})

In [None]:
# merge=pd.merge(final_hh,pums_hh,left_on='serialno',right_on='serialno',how='left')

In [None]:
# # if the resulting df is empty it means the serialno works
# merge[merge['WGTP'].isnull()]

In [None]:
# # add seriano to people
# merge_p=pd.merge(final_pp,final_hh[['serialno']],left_on='household_id',right_index=True,how='left')

In [None]:
# merge_p.shape,final_pp.shape

In [None]:
# (merge_p['serialno'][0])

In [None]:
# (pums_pp['serialno'][0])

In [None]:
# merge_pp=pd.merge(merge_p,pums_pp[['serialno','PUMA00']],left_on='serialno',right_on='serialno',how='left')

In [None]:
# merge_pp[merge_pp['PUMA00'].isnull()]

# Save the data

In [156]:
final_hh.to_csv('./data/final_hh.csv')

In [157]:
final_pp.to_csv('./data/final_pp.csv')