In [1]:
import io
import pathlib
import urllib.request
import zipfile

import numpy as np
import pandas as pd

# Background

## This takes lots of RAM and time

Summary file 1:
https://www.census.gov/prod/cen2010/doc/sf1.pdf

Data File Segement contains population table 3 "P3" to population table 9 "P9". "P" prefixes denote census block population tables. "PCT" prefixes denote census tract tables. The target tagble is P5 (census doc page 184)

Census file 3:
    P3. RACE Universe: Total population (8)
    P4. HISPANIC OR LATINO ORIGIN Universe: Total population (3)
    P5. HISPANIC OR LATINO ORIGIN BY RACE Universe: Total population (17)

P5 data:
    Total: P0050001 03 9 
        Not Hispanic or Latino: P0050002 03 9 
            White alone P0050003 03 9 
            Black or African American alone P0050004 03 9 
            American Indian and Alaska Native alone P0050005 03 9 
            Asian alone P0050006 03 9 
            Native Hawaiian and Other Pacific Islander alone P0050007 03 9 
            Some Other Race alone P0050008 03 9 
            Two or More Races P0050009 03 9 
        Hispanic or Latino: P0050010 03 9 
            White alone P0050011 03 9 
            Black or African American alone P0050012 03 9 
            American Indian and Alaska Native alone P0050013 03 9 
            Asian alone P0050014 03 9 
            Native Hawaiian and Other Pacific Islander alone P0050015 03 9 
            Some Other Race alone P0050016 03 9 
            Two or More Races P0050017 03 9

Levels:
871 State-5-Digit ZIP Code Tabulation Area 
881 State-5-Digit ZIP Code Tabulation Area-County

All hispanic aggregated
Some other race alone (iterative prop)
Asian and API combined

In [2]:
# https://www.census.gov/prod/cen2010/doc/sf2.pdf
GEO_MAP_2010 = {
    'FILEID'  : (1  , 7  ),
    'STUSAB'  : (7  , 9  ),
    'SUMLEV'  : (9  , 12 ),
    'GEOCOMP' : (12 , 14 ),
    'CHARITER': (14 , 17 ),
    'CIFSN'   : (17 , 19 ),
    'LOGRECNO': (19 , 26 ),
    'REGION'  : (26 , 27 ),
    'DIVISION': (27 , 28 ),
    'STATE'   : (28 , 30 ),
    'COUNTY'  : (30 , 33 ),
    'COUNTYCC': (33 , 35 ),
    'COUNTYSC': (35 , 37 ),
    'COUSUB'  : (37 , 42 ),
    'COUSUBCC': (42 , 44 ),
    'COUSUBSC': (44 , 46 ),
    'PLACE'   : (46 , 51 ),
    'PLACECC' : (51 , 53 ),
    'PLACESC' : (53 , 55 ),
    'TRACT'   : (55 , 61 ),
    'BLKGRP'  : (61 , 62 ),
    'BLOCK'   : (62 , 66 ),
    'IUC'     : (66 , 68 ),
    'CONCIT'  : (68 , 73 ),
    'CONCITCC': (73 , 75 ),
    'CONCITSC': (75 , 77 ),
    'AIANHH'  : (77 , 81 ),
    'AIANHHFP': (81 , 86 ),
    'AIANHHCC': (86 , 88 ),
    'AIHHTLI' : (88 , 89 ),
    'AITSCE'  : (89 , 92 ),
    'AITS'    : (92 , 97 ),
    'AITSCC'  : (97 , 99 ),
    'TTRACT'  : (99 , 105),
    'TBLKGRP' : (105, 106),
    'ANRC'    : (106, 111),
    'ANRCCC'  : (111, 113),
    'CBSA'    : (113, 118),
    'CBSASC'  : (118, 120),
    'METDIV'  : (120, 125),
    'CSA'     : (125, 128),
    'NECTA'   : (128, 133),
    'NECTASC' : (133, 135),
    'NECTADIV': (135, 140),
    'CNECTA'  : (140, 143),
    'CBSAPCI' : (143, 144),
    'NECTAPCI': (144, 145),
    'UA'      : (145, 150),
    'UASC'    : (150, 152),
    'UATYPE'  : (152, 153),
    'UR'      : (153, 154),
    'CD'      : (154, 156),
    'SLDU'    : (156, 159),
    'SLDL'    : (159, 162),
    'VTD'     : (162, 168),
    'VTDI'    : (168, 169),
    'RESERVE2': (169, 172),
    'ZCTA5'   : (172, 177),
    'SUBMCD'  : (177, 182),
    'SUBMCDCC': (182, 184),
    'SDELM'   : (184, 189),
    'SDSEC'   : (189, 194),
    'SDUNI'   : (194, 199),
    'AREALAND': (119, 213),
    'AREAWATR': (213, 227),
    'NAME'    : (227, 317),
    'FUNCSTAT': (317, 318),
    'GCUNI'   : (318, 319),
    'POP100'  : (319, 328),
    'HU100'   : (328, 337),
    'INTPTLAT': (337, 348),
    'INTPTLON': (348, 360),
    'LSADC'   : (360, 362),
    'PARTFLAG': (362, 363),
    'RESERVE3': (363, 369),
    'UGA'     : (369, 374),
    'STATENS' : (374, 382),
    'COUNTYNS': (382, 390),
    'COUSUBNS': (390, 398),
    'PLACENS' : (398, 406),
    'CONCITNS': (406, 414),
    'AIANHHNS': (414, 422),
    'AITSNS'  : (422, 430),
    'ANRCNS'  : (430, 438),
    'SUBMCDNS': (438, 446),
    'CD113'   : (446, 448),
    'CD114'   : (448, 450),
    'CD115'   : (450, 452),
    'SLDU2'   : (452, 455),
    'SLDU3'   : (455, 458),
    'SLDU4'   : (458, 461),
    'SLDL2'   : (461, 464),
    'SLDL3'   : (464, 467),
    'SLDL4'   : (467, 470),
    'AIANHHSC': (470, 472),
    'CSASC'   : (472, 476),
    'CNECTASC': (474, 477),
    'MEMI'    : (476, 478),
    'NMEMI'   : (477, 478),
    'PUMA'    : (478, 483),
    'RESERVED': (483, 501),
}

# https://www.census.gov/prod/cen2010/doc/sf2.pdf
FILE_1_DATA_COLS = [
    'FILEID',
    'STUSAB',
    'CHARITER',
    'CIFSN',
    'LOGRECNO',
    'TOTAL_POPULATION',
]

STATES = {
   'AL': 'Alabama',
   'AK': 'Alaska',
#    'AZ': 'Arizona',
#     'AR': 'Arkansas',
#     'CA': 'California',
#     'CO': 'Colorado',
#     'CT': 'Connecticut',
#     'DE': 'Delaware',
#     'DC': 'District_of_Columbia',
#     'FL': 'Florida',
#     'GA': 'Georgia',
#     'HI': 'Hawaii',
#     'ID': 'Idaho',
#     'IL': 'Illinois',
#     'IN': 'Indiana',
#     'IA': 'Iowa',
#     'KS': 'Kansas',
#     'KY': 'Kentucky',
#     'LA': 'Louisiana',
#     'ME': 'Maine',
#     'MD': 'Maryland',
#     'MA': 'Massachusetts',
#     'MI': 'Michigan',
#     'MN': 'Minnesota',
#     'MS': 'Mississippi',
#     'MO': 'Missouri',
#     'MT': 'Montana',
#     'NE': 'Nebraska',
#     'NV': 'Nevada',
#     'NH': 'New_Hampshire',
#     'NJ': 'New_Jersey',
#     'NM': 'New_Mexico',
#     'NY': 'New_York',
#     'NC': 'North_Carolina',
#     'ND': 'North_Dakota',
#     'OH': 'Ohio',
#     'OK': 'Oklahoma',
#     'OR': 'Oregon',
#     'PA': 'Pennsylvania',
#     'PR': 'Puerto_Rico',
#     'RI': 'Rhode_Island',
#     'SC': 'South_Carolina',
#     'SD': 'South_Dakota',
#     'TN': 'Tennessee',
#     'TX': 'Texas',
#     'UT': 'Utah',
#     'VT': 'Vermont',
#     'VA': 'Virginia',
#     'WA': 'Washington',
#     'WV': 'West_Virginia',
#     'WI': 'Wisconsin',
#     'WY': 'Wyoming',    
}

URL_TEMPLATE_ZIP = 'https://www2.census.gov/census_2010/04-Summary_File_1/{state}/{state_abbrev}2010.sf1.zip'

# Create URLS
urls = [
    URL_TEMPLATE_ZIP.format(state_abbrev=code.lower(), state=name)
    for code, name
    in STATES.items()
]

In [11]:
def dl_file(url):
    with urllib.request.urlopen(url) as f:
        data = io.BytesIO(f.read())
        return data

def make_geo_df(data):
    with zipfile.ZipFile(data) as zf:
        # Filter out everything except the ZipInfo for csv we want
        target = zf.filelist[0]
        # Read that CSV into BytesIO object
        geo_data = io.BytesIO(zf.read(target))
        geo_df = pd.read_fwf(
            geo_data, 
            header=None,
            # Subtract 1
            colspecs=[
                (tuple_[0] - 1, tuple_[1] - 1)
                for tuple_
                in GEO_MAP_2010.values()
            ],
            dtype=str
        )
    geo_df.columns = tuple(GEO_MAP_2010.keys())
    geo_df = geo_df.loc[geo_df.SUMLEV == '871']    
    geo_df = geo_df[['STUSAB', 'LOGRECNO', 'ZCTA5']].dropna(subset=['ZCTA5'])
    return geo_df

def make_pop_df(data):
    with zipfile.ZipFile(data) as zf:
        # Filter out everything except the ZipInfo for csv we want
        target = zf.filelist[3]
        # Read that CSV into BytesIO object
        pop_data = io.BytesIO(zf.read(target))
        pop_df = pd.read_csv(
            pop_data, 
            header=None,
            dtype=str
        )
        pop_df = pop_df[[1, 4, 18, 19, 20, 21, 22, 23, 24, 25]]
        pop_df.columns = [
            'STUSAB',
            'LOGRECNO',
            'white',
            'black',
            'native',
            'asian',
            'pi',
            'other',
            'multiple',
            'hispanic',
        ]
        return pop_df

def merge_frames(geo_df, pop_df):
    merged = geo_df.merge(pop_df)
    merged = merged.set_index('ZCTA5')
    merged = merged.sort_index()
    return merged
    
def create_df(url):
    print('.', end='')
    data   = dl_file(url)
    geo_df = make_geo_df(data)
    pop_df = make_pop_df(data)
    df     = merge_frames(geo_df, pop_df)
    df     = df.sort_index()
    df = df.iloc[:, 2:]
    df = df.astype(np.float64)
    return df

data = [
    create_df(url)
    for url
    in urls
]

print('Download complete.')

..

In [33]:
# PROCESS
df = pd.concat(data)
df.head()

Unnamed: 0_level_0,white,black,native,asian,pi,other,multiple,hispanic
ZCTA5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
30165,66.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
31905,2.0,1.0,1.0,0.0,1.0,0.0,0.0,4.0
35004,8947.0,931.0,28.0,167.0,0.0,10.0,130.0,214.0
35005,4655.0,2986.0,38.0,16.0,1.0,8.0,81.0,157.0
35006,2868.0,167.0,23.0,1.0,0.0,2.0,31.0,29.0


In [34]:
def process_frame(df):
    '''This collapses asian and pi, and splits other in accordance percentages'''
    # Store totals
    totals = df.sum(axis=1)
    # Store other
    other = df['other']
    # Create API
    df['api'] = df['asian'] + df['pi']
    # Drop columns
    df = df.drop(columns=['other', 'asian', 'pi'])
    # Store percentages
    percentages = df.divide(totals, axis='rows')
    # Apportioned others
    apportioned_other = percentages.multiply(other, axis='rows').head()
    # Add apportioned other to frame
    df += apportioned_other
    return df

df = process_frame(df)
df.head()

Unnamed: 0_level_0,white,black,native,asian,pi,other,multiple,hispanic,api
ZCTA5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
30165,66.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31905,2.0,1.0,1.0,0.0,1.0,0.0,0.0,4.0,1.0
35004,8947.0,931.0,28.0,167.0,0.0,10.0,130.0,214.0,167.0
35005,4655.0,2986.0,38.0,16.0,1.0,8.0,81.0,157.0,17.0
35006,2868.0,167.0,23.0,1.0,0.0,2.0,31.0,29.0,1.0


# Write data to module as CSV

In [12]:
current_directory = pathlib.Path().cwd()
project_directory = current_directory.parents[0]
data_directory    = project_directory / 'surgeo' / 'data'
df_path           = data_directory / 'population_2010.csv'
df.to_csv(df_path)