in this notebook, I walk through how to download ACS ses data from CensusData package in python. At the bottom are some questions for next steps. -val

In [66]:
from statsmodels.graphics.tsaplots import plot_acf
import numpy as np
import scipy.stats as st
from scipy.io import loadmat
import statistics as stats
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.interpolate import interp1d
import math

import pandas as pd
import scipy.io as sio
import scipy.stats as st


## helpful documentation: https://jtleider.github.io/censusdata/

# 1. Install CensusData Package

In [2]:
# will need to install every few hours, consider making a covidcrew env. to keep installed packages
!pip install CensusData

Collecting CensusData
  Downloading CensusData-1.8.tar.gz (23.2 MB)
[K     |████████████████████████████████| 23.2 MB 3.3 MB/s eta 0:00:01
Building wheels for collected packages: CensusData
  Building wheel for CensusData (setup.py) ... [?25ldone
[?25h  Created wheel for CensusData: filename=CensusData-1.8-py3-none-any.whl size=24706120 sha256=fece451bdf6fa7aec2989adf4e67d4bb9d7073826e97d3ff642e4502547c962c
  Stored in directory: /home/jovyan/.cache/pip/wheels/eb/74/d3/75a737e0305a81270bd9a0129077c208a4334e3c202e9d4274
Successfully built CensusData
Installing collected packages: CensusData
Successfully installed CensusData-1.8


In [3]:
# We also need to import these two

import pandas as pd
import censusdata

# 2. Determine the data you want:

### To find the code for the marker you're interested, download and look at the excel sheet 'table shells' at https://www.census.gov/programs-surveys/acs/technical-documentation/summary-file-documentation.html

In [4]:
# here's what that table shells sheet looks like
table_shells = pd.read_excel('ACS2018_Table_Shells.xlsx')
table_shells[:10]

Unnamed: 0,Table ID,Line,UniqueID,Stub,Data Release
0,,,,,
1,B00001,,,UNWEIGHTED SAMPLE COUNT OF THE POPULATION,15.0
2,B00001,,,Universe: Total population,
3,B00001,1.0,B00001_001,Total,
4,,,,,
5,B00002,,,UNWEIGHTED SAMPLE HOUSING UNITS,15.0
6,B00002,,,Universe: Housing units,
7,B00002,1.0,B00002_001,Total,
8,,,,,
9,B01001,,,SEX BY AGE,15.0


In [5]:
markers_codes = ['B01001_001E', 'B01002_001E', 'B19013_001E', 'B02001_002E','B02001_003E','B02001_004E','B02001_005E','B02001_006E', 'B02001_007E','B02001_008E']

In [6]:
markers = ['population size', 'medium age', 'medium household income', 'White alone', 'Black or African American alone','American Indian and Alaska Native alone',
'Asian alone', 'Native Hawaiian and Other Pacific Islander alone','Some other race alone','Two or more races']


# 3. Determine the geographies you want

In [7]:
# find the state code for MA
censusdata.geographies(censusdata.censusgeo([('state', '*')]), 'acs5', 2015)

{'Alabama': censusgeo((('state', '01'),)),
 'Alaska': censusgeo((('state', '02'),)),
 'Arizona': censusgeo((('state', '04'),)),
 'Arkansas': censusgeo((('state', '05'),)),
 'California': censusgeo((('state', '06'),)),
 'Colorado': censusgeo((('state', '08'),)),
 'Connecticut': censusgeo((('state', '09'),)),
 'Delaware': censusgeo((('state', '10'),)),
 'District of Columbia': censusgeo((('state', '11'),)),
 'Florida': censusgeo((('state', '12'),)),
 'Georgia': censusgeo((('state', '13'),)),
 'Hawaii': censusgeo((('state', '15'),)),
 'Idaho': censusgeo((('state', '16'),)),
 'Illinois': censusgeo((('state', '17'),)),
 'Indiana': censusgeo((('state', '18'),)),
 'Iowa': censusgeo((('state', '19'),)),
 'Kansas': censusgeo((('state', '20'),)),
 'Kentucky': censusgeo((('state', '21'),)),
 'Louisiana': censusgeo((('state', '22'),)),
 'Maine': censusgeo((('state', '23'),)),
 'Maryland': censusgeo((('state', '24'),)),
 'Massachusetts': censusgeo((('state', '25'),)),
 'Michigan': censusgeo((('stat

### MA state code: 'Massachusetts': ('state', '25')

In [8]:
# find the city code for counties in MA
censusdata.geographies(censusdata.censusgeo([('state','25'),('county', '*')]), 'acs5', 2015)

{'Barnstable County, Massachusetts': censusgeo((('state', '25'), ('county', '001'))),
 'Berkshire County, Massachusetts': censusgeo((('state', '25'), ('county', '003'))),
 'Bristol County, Massachusetts': censusgeo((('state', '25'), ('county', '005'))),
 'Dukes County, Massachusetts': censusgeo((('state', '25'), ('county', '007'))),
 'Essex County, Massachusetts': censusgeo((('state', '25'), ('county', '009'))),
 'Franklin County, Massachusetts': censusgeo((('state', '25'), ('county', '011'))),
 'Hampden County, Massachusetts': censusgeo((('state', '25'), ('county', '013'))),
 'Hampshire County, Massachusetts': censusgeo((('state', '25'), ('county', '015'))),
 'Middlesex County, Massachusetts': censusgeo((('state', '25'), ('county', '017'))),
 'Nantucket County, Massachusetts': censusgeo((('state', '25'), ('county', '019'))),
 'Norfolk County, Massachusetts': censusgeo((('state', '25'), ('county', '021'))),
 'Plymouth County, Massachusetts': censusgeo((('state', '25'), ('county', '023'

### We're interested in Norfolk (021), Suffolk (025), and Middlesex (017)

# 4. Download data you want

### You can download data for the state of MA by county name 

In [9]:
# for Norfolk, county code = 021
acs_norfolk = censusdata.download('acs5', 2015,censusdata.censusgeo([('state', '25'), ('county','021')]), markers_codes)
acs_norfolk.columns=markers
acs_norfolk

Unnamed: 0,population size,medium age,medium household income,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races
"Norfolk County, Massachusetts: Summary level: 050, state:25> county:021",687721,40.9,88262,555100,43069,564,66682,35,8096,14175


In [72]:
# I'm still learning to code so let's do this to work with the data easier:
data_norfolk = np.array([687721, 40.9 ,88262 ,555100 ,43069 ,564 ,66682 ,35 ,8096 ,14175])
data_norfolk

array([6.87721e+05, 4.09000e+01, 8.82620e+04, 5.55100e+05, 4.30690e+04,
       5.64000e+02, 6.66820e+04, 3.50000e+01, 8.09600e+03, 1.41750e+04])

In [11]:
# for Suffolk, county code = 025
acs_suffolk = censusdata.download('acs5', 2015,censusdata.censusgeo([('state', '25'), ('county','025')]), markers_codes)
acs_suffolk.columns=markers
acs_suffolk

Unnamed: 0,population size,medium age,medium household income,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races
"Suffolk County, Massachusetts: Summary level: 050, state:25> county:025",758919,32.2,55044,421489,169946,2593,65396,107,56190,43198


In [74]:
# I'm still learning to code so let's do this to work with the data easier:
data_suffolk = np.array([758919 ,32.2 ,55044 ,421489 ,169946 ,2593 ,65396 ,107 ,56190 ,43198])
data_suffolk

array([7.58919e+05, 3.22000e+01, 5.50440e+04, 4.21489e+05, 1.69946e+05,
       2.59300e+03, 6.53960e+04, 1.07000e+02, 5.61900e+04, 4.31980e+04])

In [13]:
# for Middlesex, county code = 017
acs_middlesex = censusdata.download('acs5', 2015,censusdata.censusgeo([('state', '25'), ('county','017')]), markers_codes)
acs_middlesex.columns=markers
acs_middlesex

Unnamed: 0,population size,medium age,medium household income,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races
"Middlesex County, Massachusetts: Summary level: 050, state:25> county:017",1556116,38.5,85118,1230158,75980,2074,163386,424,37162,46932


In [75]:
# I'm still learning to code so let's do this to work with the data easier:
data_middlesex = np.array([1556116 ,38.5, 85118 ,1230158 ,75980 ,2074 ,163386 ,424 ,37162 ,46932])
data_middlesex

array([1.556116e+06, 3.850000e+01, 8.511800e+04, 1.230158e+06,
       7.598000e+04, 2.074000e+03, 1.633860e+05, 4.240000e+02,
       3.716200e+04, 4.693200e+04])

## Make a table with the values you want

In [88]:
table_allcounties = pd.DataFrame({"Middlesex": data_middlesex , "Suffolk": data_suffolk, "Norfolk": data_norfolk })
table_allcounties = table_allcounties.T

table_allcounties.columns= markers
table_allcounties

Unnamed: 0,population size,medium age,medium household income,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races
Middlesex,1556116.0,38.5,85118.0,1230158.0,75980.0,2074.0,163386.0,424.0,37162.0,46932.0
Suffolk,758919.0,32.2,55044.0,421489.0,169946.0,2593.0,65396.0,107.0,56190.0,43198.0
Norfolk,687721.0,40.9,88262.0,555100.0,43069.0,564.0,66682.0,35.0,8096.0,14175.0


# 5. Split counties up into North and South Region

In [91]:
# set percentage weights for each county in each region
# this is approximate, we can adjust with more accurate weights
middlesex_north = 0.8
suffolk_north = 0.5
norfolk_north = 0.3

middlesex_south = 0.2
suffolk_south = 0.5
norfolk_south = 0.7

In [95]:
# create a table for the north region with percentage weights
table_north = pd.DataFrame({"Middlesex": middlesex_north*data_middlesex , "Suffolk": suffolk_north*data_suffolk, "Norfolk": norfolk_north*data_norfolk })
table_north = table_north.T

table_north.columns= markers
table_north

Unnamed: 0,population size,medium age,medium household income,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races
Middlesex,1244892.8,30.8,68094.4,984126.4,60784.0,1659.2,130708.8,339.2,29729.6,37545.6
Suffolk,379459.5,16.1,27522.0,210744.5,84973.0,1296.5,32698.0,53.5,28095.0,21599.0
Norfolk,206316.3,12.27,26478.6,166530.0,12920.7,169.2,20004.6,10.5,2428.8,4252.5


In [112]:
# let's add up the counties counts so we get a total aggregate table
n = table_north.sum(axis = 0, skipna = True)
north = pd.DataFrame({"north_sum": n})
north = north.T
north

Unnamed: 0,population size,medium age,medium household income,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races
north_sum,1830668.6,59.17,122095.0,1361400.9,158677.7,3124.9,183411.4,403.2,60253.4,63397.1


In [142]:
# calculate demographic data as percentages of thet total population
a = np.array([north['White alone'],north['Black or African American alone'],north['American Indian and Alaska Native alone'], north['Asian alone'],north['Native Hawaiian and Other Pacific Islander alone'],north['Some other race alone'],north['Two or more races']])
demographic_north = 100*a/np.array([north['population size']])
np.set_printoptions(suppress=True)
print(demographic_north)

[[74.36632168]
 [ 8.66774576]
 [ 0.1706972 ]
 [10.01882045]
 [ 0.02202474]
 [ 3.29133301]
 [ 3.46305716]]


## This is the demographic data fort

### Do the same for the south region

In [96]:
# create a table for the south region with percentage weights
table_south = pd.DataFrame({"Middlesex": middlesex_south*data_middlesex , "Suffolk": suffolk_south*data_suffolk, "Norfolk": norfolk_south*data_norfolk })
table_south = table_south.T

table_south.columns= markers
table_south

Unnamed: 0,population size,medium age,medium household income,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races
Middlesex,311223.2,7.7,17023.6,246031.6,15196.0,414.8,32677.2,84.8,7432.4,9386.4
Suffolk,379459.5,16.1,27522.0,210744.5,84973.0,1296.5,32698.0,53.5,28095.0,21599.0
Norfolk,481404.7,28.63,61783.4,388570.0,30148.3,394.8,46677.4,24.5,5667.2,9922.5


In [148]:
# let's add up the counties counts so we get a total aggregate table
s = table_south.sum(axis = 0, skipna = True)
south = pd.DataFrame({"south_sum": s})
south = south.T
south

Unnamed: 0,population size,medium age,medium household income,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races
south_sum,1172087.4,52.43,106329.0,845346.1,130317.3,2106.1,112052.6,162.8,41194.6,40907.9


In [156]:
# calculate demographic data as percentages of thet total population
b = np.array([south['White alone'],south['Black or African American alone'],south['American Indian and Alaska Native alone'], south['Asian alone'],south['Native Hawaiian and Other Pacific Islander alone'],south['Some other race alone'],south['Two or more races']])
demographic_south = 100*b/np.array([south['population size']])
np.set_printoptions(suppress=True)
print(demographic_south)

[[72.12312836]
 [11.11839441]
 [ 0.17968797]
 [ 9.56008912]
 [ 0.01388975]
 [ 3.51463551]
 [ 3.49017488]]


# Questions:

1. how do we select just the cities grouped by North and South regions from this list?
- Joey providing county names for north and south locations
- consider weighted average of sorts 
2. How do we access demographic data (race/ethnicity? - the documentation I'm following suggests this data is published in the centennial census data? code: 'sf1'. 
- Pranjali will find code for demographic data, which may only be present at county level