## Census Data

Use the censusdata module to import census data

https://pypi.org/project/CensusData/


In [None]:
# pip install censusdata

In [None]:
import pandas as pd
import censusdata
# from tabulate import tabulate

## Understanding the Census Bureau labyrinth
Learn about the data:
https://www.census.gov/programs-surveys/acs/technical-documentation/table-shells.html

Search for data tables:
https://data.census.gov/cedsci/

Here are some of the main tables of interest:

Population sizes: 

* Total Population: B01001e1
* Population by Age and Sex: B01001*
* Population by Household Incomes: B19001*
* Population by Education Level: B15003*
* Population by Race: B02001*
* Population by Hispanic Ethnicity: B03003*
* Population by Race & Hispanic Ethnicity: B03002*
* Population by Household Type: B11001e*

Summary Statistics: 

* Median Household Income: B19013e1
* Aggregate Household Income:  B19025e1
* Per Capita Income: B19301e1
* Median Age: B01002e1

For this workshop, let's work with "Population by race," or, in census lingo, Table ID B02001.

In [None]:
# fetch the metadata describing census table B02001 (2018, 'acs5'),
# then print it in a readable layout
race_table = censusdata.censustable('acs5', 2018, 'B02001')
censusdata.printtable(race_table)

In [None]:
# download the eight race variables of table B02001 (B02001_001E through
# B02001_008E) for every county ('*') in state '06'
df_counties = censusdata.download(
    'acs5',
    2018,
    censusdata.censusgeo([('state', '06'), ('county', '*')]),
    [f'B02001_{n:03d}E' for n in range(1, 9)],
)

In [None]:
# preview the first rows of the county-level download
df_counties.head()

In [None]:
# download the eight race variables of table B02001 (B02001_001E through
# B02001_008E) for every tract ('*') in county '037' of state '06'
df_tracts = censusdata.download(
    'acs5',
    2018,
    censusdata.censusgeo([('state', '06'), ('county', '037'), ('tract', '*')]),
    [f'B02001_{n:03d}E' for n in range(1, 9)],
)

In [None]:
# preview the first rows of the tract-level download
df_tracts.head()

## Changing column names


In [None]:
# Replace the raw census variable codes with readable labels. The labels are
# positional: they must stay in the same order as the variables requested in
# the download calls (B02001_001E through B02001_008E).
# 'nhop' is presumably "Native Hawaiian and Other Pacific Islander" and
# 'other alone' is "Some other race alone" (B02001_007E).
column_names = ['total','white','black','am_indian_alaskan','asian','nhop','other alone','two or more']

# both dataframes were downloaded with the same variable list, so they share
# the same labels
df_counties.columns = column_names
df_tracts.columns = column_names
df_counties.head()

## Adding a new index and name column

In [None]:
# look at a specific row (here the first one, selected by position)
df_counties.iloc[0]

In [None]:
# selecting a single column returns it together with the index
df_counties['total'].head()

In [None]:
# turn the index into a plain Python list
list(df_counties.index)

In [None]:
# What is each index entry made of? For every entry, print the entry
# itself, then its .geo attribute, then its .name attribute (one per line).
for entry in df_counties.index.tolist():
    print(entry, entry.geo, entry.name, sep='\n')

In [None]:
# Build two parallel lists from the county index entries:
#   * state_county_fips: the codes of the first two geography levels
#     (state, county) concatenated into one string
#   * county_names: the part of each entry's name before the first comma
county_geos = df_counties.index.tolist()
state_county_fips = [entry.geo[0][1] + entry.geo[1][1] for entry in county_geos]
county_names = [entry.name.split(',')[0] for entry in county_geos]


In [None]:
# Same idea for the census tracts, except the FIPS code concatenates the
# codes of three geography levels (state, county, tract).
#   * tract_fips: the concatenated codes
#   * tract_names: the part of each entry's name before the first comma
tract_geos = df_tracts.index.tolist()
tract_fips = [
    entry.geo[0][1] + entry.geo[1][1] + entry.geo[2][1]
    for entry in tract_geos
]
tract_names = [entry.name.split(',')[0] for entry in tract_geos]


In [None]:
# Inspect both FIPS lists. Each one is printed explicitly because a notebook
# cell only echoes its final expression, so a bare `state_county_fips` on an
# earlier line would display nothing.
print(state_county_fips)
print(tract_fips)

In [None]:
# replace the index of censusgeo objects with the plain FIPS code strings
# built earlier (the list is applied positionally, one code per row)
df_counties.index = state_county_fips

# add a county name column (also assigned positionally from the list)
df_counties['county_name'] = county_names

In [None]:
# check the result: FIPS codes as the index, plus the county_name column
df_counties.head()

In [None]:
# replace the index of censusgeo objects with the plain FIPS code strings
# built earlier (the list is applied positionally, one code per row)
df_tracts.index = tract_fips

# add a tract name column (also assigned positionally from the list)
df_tracts['tract_name'] = tract_names

In [None]:
# check the result: FIPS codes as the index, plus the tract_name column
df_tracts.head()

## Mapping our census data

In [None]:
# download the US county boundaries (GeoJSON) and parse them into a dict
import json
from urllib.request import urlopen

county_geojson_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(county_geojson_url) as resp:
    counties = json.loads(resp.read())

In [None]:
# inspect the first feature of the county GeoJSON to see its structure
counties["features"][0]

In [None]:
# download the census tract boundaries (GeoJSON) and parse them into a dict
tract_geojson_url = 'https://opendata.arcgis.com/datasets/152f90d3a34a43ef998448281505d45e_0.geojson'
with urlopen(tract_geojson_url) as resp:
    tracts = json.loads(resp.read())


In [None]:
# let's look at what one of these features looks like
tracts["features"][0]

In [None]:

import plotly.express as px

# Choropleth of the raw 'black' counts per census tract. Rows are joined to
# the GeoJSON features by matching the dataframe index against each feature's
# properties.FIPS value.
fig = px.choropleth(
    df_tracts,
    geojson=tracts,
    locations=df_tracts.index,
    featureidkey="properties.FIPS",  # this is the join column
    color='black',
    color_continuous_scale="Viridis",
    projection="mercator",
)

# zoom to the geographies with data, remove the margins, then render
(
    fig.update_geos(fitbounds="locations", visible=False)
    .update_layout(margin=dict(r=0, t=0, l=0, b=0))
    .show()
)

In [None]:
import plotly.express as px

# Choropleth of the raw 'black' counts per county. No featureidkey is given,
# so the dataframe index (county FIPS codes) is matched against plotly's
# default feature key.
fig = px.choropleth(
    df_counties,
    geojson=counties,
    locations=df_counties.index,
    color='black',
    color_continuous_scale="Viridis",
    scope="usa",
)

# show just the geographies with data, remove the margins, then render
(
    fig.update_geos(fitbounds="locations", visible=False)
    .update_layout(margin=dict(r=0, t=0, l=0, b=0))
    .show()
)

In [None]:
# Normalize the data: express the white and black counts as a percentage of
# the total, for both the county and the tract dataframes.
for frame in (df_counties, df_tracts):
    for race in ('white', 'black'):
        frame[f'percent_{race}'] = frame[race] / frame['total'] * 100

In [None]:
# check that the percent_white / percent_black columns were added
df_tracts.head()

In [None]:
import plotly.express as px

# Choropleth of the normalized 'percent_black' column per census tract. Rows
# are joined to the GeoJSON features by matching the dataframe index against
# each feature's properties.FIPS value.
fig = px.choropleth(
    df_tracts,
    geojson=tracts,
    locations=df_tracts.index,
    featureidkey="properties.FIPS",  # this is the join column
    color='percent_black',
    color_continuous_scale="Viridis",
    projection="mercator",
)

# show just the geographies with data, remove the margins, then render
(
    fig.update_geos(fitbounds="locations", visible=False)
    .update_layout(margin=dict(r=0, t=0, l=0, b=0))
    .show()
)

In [None]:
import plotly.express as px

# Choropleth of the normalized 'percent_black' column per county. No
# featureidkey is given, so the dataframe index (county FIPS codes) is
# matched against plotly's default feature key.
fig = px.choropleth(
    df_counties,
    geojson=counties,
    locations=df_counties.index,
    color='percent_black',
    color_continuous_scale="Viridis",
    scope="usa",
)

# show just the geographies with data, remove the margins, then render
(
    fig.update_geos(fitbounds="locations", visible=False)
    .update_layout(margin=dict(r=0, t=0, l=0, b=0))
    .show()
)

In [None]:
# Same county data drawn on a mapbox tile basemap, colored by the
# 'percent_white' column. The polygons are half-transparent (opacity=0.5)
# and the initial view is set explicitly with zoom and center.
fig = px.choropleth_mapbox(
    df_counties,
    geojson=counties,
    locations=df_counties.index,
    color='percent_white',
    color_continuous_scale="Viridis",
    mapbox_style="carto-positron",
    zoom=3,
    center=dict(lat=37.0902, lon=-120),
    opacity=0.5,
)

# remove the margins, then render
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0)).show()