The Census Bureau publishes estimates of the number of people
with health insurance in small areas (ie - counties) based on survey data
every year: https://www.census.gov/data/datasets/time-series/demo/sahie/estimates-acs.html
We want to know what regions of the country have the highest percentage
of unemployed residents

In [1]:
#first import our libraries
#pandas will let us turn our csv into a dataframe,
#which is way easier to work with
import pandas as pd

#requests will go to the web for us and grab whatever is at a URL
#and bring it back to our file
import requests

#in Python 2, you'd use these libraries to turn the response 
#into a string and then unzip it
#import zipfile, StringIO

#Use these imports in Python3 to use the same libraries with slightly
#different names
from io import BytesIO
import zipfile

In [2]:
#here's where our data zipfile lives
url = "https://www2.census.gov/programs-surveys/sahie/datasets/time-series/estimates-acs/sahie-2015-csv.zip"

In [3]:
#use the requests library's .get() method to make a get request
#to the url we declared in the variable above
r = requests.get(url)
#if you have problems with this request, try this: 
# https://stackoverflow.com/questions/31649390/python-requests-ssl-handshake-failure
#pip install requests[security]

In [6]:
#in python3, take the content of the response of 
#our get request (r.content) and use BytesIO to read it in
#then use the zipfile library's .ZipFile() method to unzip it
z = zipfile.ZipFile(BytesIO(r.content))

#in python2, these are slightly renamed
#z = zipfile.ZipFile(StringIO.StringIO(r.content))

In [9]:
#take our z zipfile object and open just the csv file within it
#that we want, sahie_2015.csv
#then use the .readlines() method on that file object,
#which gives us a list of strings for each row in the csv
#chop off the first 79 rows, which are documentation
#wickedly stacked at the top of the file.
#[79:] means keep only the lines from row 80 to the end 
my_data = z.open('sahie_2015.csv').readlines()[79:]

In [13]:
#now we have to convert each row, which is a string with commas in it,
#to a list of data cells

#create a new empty list that we'll put our cleaned rows into
new_data = []

#then loop through our my_data list of strings
for row in my_data:
    #in Python3, for each row, use the .decode() method to convert it
    #from bytes to a string, then use the .split() method to separate
    #it by the commas
    new_row = row.decode().split(',')
    #in Python 2, no need to .decode()
    #new_row = row.split(',')
    
    #take that new row, which is a list of data cells, and append it
    #to our new_data list
    new_data.append(new_row)

In [14]:
#look at new_data.
#now this looks like what pandas needs:
#each bit of data is its own item in a list for each row in the table
new_data

[['year',
  'version',
  'statefips',
  'countyfips',
  'geocat',
  'agecat',
  'racecat',
  'sexcat',
  'iprcat',
  'NIPR',
  'nipr_moe',
  'NUI',
  'nui_moe',
  'NIC',
  'nic_moe',
  'PCTUI',
  'pctui_moe',
  'PCTIC',
  'pctic_moe',
  'PCTELIG',
  'pctelig_moe',
  'PCTLIIC',
  'pctliic_moe',
  'state_name',
  'county_name',
  '\n'],
 ['2015',
  '        ',
  '01',
  '000',
  '40',
  '0',
  '0',
  '0',
  '0',
  ' 3994181',
  '       0',
  '  475233',
  '   12979',
  ' 3518948',
  '   12979',
  ' 11.9',
  '  0.3',
  ' 88.1',
  '  0.3',
  ' 11.9',
  '  0.3',
  ' 88.1',
  '  0.3',
  'Alabama                                                               ',
  '                                             ',
  '\n'],
 ['2015',
  '        ',
  '01',
  '000',
  '40',
  '0',
  '0',
  '0',
  '1',
  ' 1588535',
  '   13145',
  '  315278',
  '   10117',
  ' 1273257',
  '   14077',
  ' 19.8',
  '  0.6',
  ' 80.2',
  '  0.6',
  '  7.9',
  '  0.3',
  ' 31.9',
  '  0.4',
  'Alabama                   

In [15]:
#now create a dataframe, using pandas .DataFrame() method
#give it our data (everything in new_data except the 0 row, which is our
#column headers)
sahie_df = pd.DataFrame(new_data[1:], columns = new_data[0])

In [16]:
#look at the top of our new dataframe. 
#now we're getting somewhere
sahie_df.head()

Unnamed: 0,year,version,statefips,countyfips,geocat,agecat,racecat,sexcat,iprcat,NIPR,...,pctui_moe,PCTIC,pctic_moe,PCTELIG,pctelig_moe,PCTLIIC,pctliic_moe,state_name,county_name,Unnamed: 21
0,2015,,1,0,40,0,0,0,0,3994181,...,0.3,88.1,0.3,11.9,0.3,88.1,0.3,Alabama ...,,\n
1,2015,,1,0,40,0,0,0,1,1588535,...,0.6,80.2,0.6,7.9,0.3,31.9,0.4,Alabama ...,,\n
2,2015,,1,0,40,0,0,0,2,1948390,...,0.5,81.4,0.5,9.1,0.3,39.7,0.4,Alabama ...,,\n
3,2015,,1,0,40,0,0,0,3,1104165,...,0.7,78.8,0.7,5.9,0.2,21.8,0.3,Alabama ...,,\n
4,2015,,1,0,40,0,0,0,4,2798291,...,0.4,84.5,0.4,10.8,0.3,59.2,0.4,Alabama ...,,\n


In [18]:
#that last column is just noise. We can use
#the sahie_df dataframe's .drop() method to thorw it out
sahie_df.drop(['\n'], axis=1, inplace=True)

In [19]:
#let's look at our dataframe's state_name column
#those look OK but there's an awful lot of whitespace in there.
sahie_df.state_name

0         Alabama                                       ...
1         Alabama                                       ...
2         Alabama                                       ...
3         Alabama                                       ...
4         Alabama                                       ...
5         Alabama                                       ...
6         Alabama                                       ...
7         Alabama                                       ...
8         Alabama                                       ...
9         Alabama                                       ...
10        Alabama                                       ...
11        Alabama                                       ...
12        Alabama                                       ...
13        Alabama                                       ...
14        Alabama                                       ...
15        Alabama                                       ...
16        Alabama                       

In [34]:
#we want to create a new column for the cleaned-up state names.
#so take the dataframe's state_name column 
#and apply the lambda function to strip the whitespace off the string
#with the .strip() method. This should not be clean enough to join
#with our descriptive state_table
sahie_df['clean_state_name'] = sahie_df.state_name.apply(
    lambda x: x.strip())

In [21]:
#go to statetable.com and download a csv
#be sure to put in the directory where this notebook is

#use pandas' .read_csv() method to create a new dataframe
state_df = pd.read_csv('state_table.csv')

In [22]:
#look at the top of the state_df dataframe
state_df.head()

Unnamed: 0,id,name,abbreviation,country,type,sort,status,occupied,notes,fips_state,assoc_press,standard_federal_region,census_region,census_region_name,census_division,census_division_name,circuit_court
0,1,Alabama,AL,USA,state,10,current,occupied,,1,Ala.,IV,3,South,6,East South Central,11
1,2,Alaska,AK,USA,state,10,current,occupied,,2,Alaska,X,4,West,9,Pacific,9
2,3,Arizona,AZ,USA,state,10,current,occupied,,4,Ariz.,IX,4,West,8,Mountain,9
3,4,Arkansas,AR,USA,state,10,current,occupied,,5,Ark.,VI,3,South,7,West South Central,8
4,5,California,CA,USA,state,10,current,occupied,,6,Calif.,IX,4,West,9,Pacific,9


In [24]:
#take our sahie_df dataframe and use its .merge() method
#to add the data from the state_df and align it on the right-hand side.
#join these tables where the sahie_df.clean_state_name column 
#matches the state_df.name column
#assign the resulting dataframe to the sahie_df variable,
#overwriting our original dataframe
sahie_df = sahie_df.merge(
    state_df, left_on="clean_state_name", right_on="name")

In [25]:
#let's look at our columns now. 
#see all the new ones from state_df are on the bottom (right)
#of our dataframe?
sahie_df.columns

Index([u'year', u'version', u'statefips', u'countyfips', u'geocat', u'agecat',
       u'racecat', u'sexcat', u'iprcat', u'NIPR', u'nipr_moe', u'NUI',
       u'nui_moe', u'NIC', u'nic_moe', u'PCTUI', u'pctui_moe', u'PCTIC',
       u'pctic_moe', u'PCTELIG', u'pctelig_moe', u'PCTLIIC', u'pctliic_moe',
       u'state_name', u'county_name', u'clean_state_name', u'id', u'name',
       u'abbreviation', u'country', u'type', u'sort', u'status', u'occupied',
       u'notes', u'fips_state', u'assoc_press', u'standard_federal_region',
       u'census_region', u'census_region_name', u'census_division',
       u'census_division_name', u'circuit_court'],
      dtype='object')

As always, be sure to read the data dictionary
to understand each column and the codes and possible values it can have
https://www2.census.gov/programs-surveys/sahie/technical-documentation/file-layouts/sahie-file-layout-2008-2015.pdf

In [30]:
#let's filter our many, many rows of data to just the top-level
#numbers for each state, making a new dataframe called "states"
#filter our sahie_df dataframe
states = sahie_df[
    #for just states, where geocat == "40"
    (sahie_df.geocat=='40')&
    #for all age categories
    (sahie_df.agecat=='0')&
    #for all race categories
    (sahie_df.racecat=='0')&
    #for all sex categories
    (sahie_df.sexcat=="0")&
    #for all income categories
    (sahie_df.iprcat=='0')]

In [31]:
#now the NIC (insured) and NUI (uninsured) columns contain numbers that 
#are stored as strings
#so let's convert them to numbers we can do math on
states['NIC'] = pd.to_numeric(states.NIC)
states['NUI'] = pd.to_numeric(states.NUI)

#now, let's group the states dataframe by the census_region_name column
#and sum up each numeric column
regionals = states.groupby('census_region_name').sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [32]:
#here's what it looks like.
#most of these are nonsensical—you can sum an id or fips code—
#but the NUI and NIC numbers are good
regionals

Unnamed: 0_level_0,NUI,NIC,id,sort,notes,fips_state,census_region,census_division,circuit_court
census_region_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Midwest,4647881,51552713,314,120,,365,24,43,91
Northeast,3538552,42635713,260,90,,296,9,12,16
South,14446160,85538660,399,160,,458,48,92,101
West,6508436,57608342,302,130,,347,52,109,121


In [33]:
#now, just divide the uninsured column (NUI)
#by the total number of people in each region, adding
#the NIC and NUI columns together for the denominator
regionals.NUI/(regionals.NIC+regionals.NUI)

census_region_name
Midwest      0.082702
Northeast    0.076635
South        0.144484
West         0.101509
dtype: float64

In [None]:
#what other questions would you want to ask this data?