# Examine census response rates, with data at state and census tract levels
## 2020 response rates from: https://2020census.gov/en/response-rates.html
## 2010 response rates from: https://api.census.gov/data/2010/dec/responserate/variables.html
## Demographic information in 2014-2018 ACS 5-year-estimate from: https://data2.nhgis.org/main

In [99]:
import pandas as pd
import numpy as np

# in excel changed original dates from m/d/yyyy to yyyy-mm-dd
initial_df = pd.read_csv('decennialrr2020_working.csv')
# had to resave as UTF-8 CSV, hence the 2
crosswalk = pd.read_csv('decennialrr2020_crosswalkfile2.csv')
# states paired with region as defined by census map at https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf
regions = pd.read_csv('state_region.csv')

## Columns


GEO_ID = Geographic Identifier

RESP_DATE = Posting Date

State = name of state (one of the 50 states, District of Columbia, Puerto Rico, or NaN)

Geo_Name = name of the tract, county, state

Region = region of the U.S. in which state is located as defined by census map at https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf

Geo_Type = type of geography; possible answers include Census Tract, Congressional District, Consolidated City, Country, County, County Subdivision, Place, Region, State, Tribal Tract, Tribal Area

DRRINT = Daily Self-Response Rate - Internet

DRRALL = Daily Self-Response Rate – Overall

CRRINT = Cumulative Self-Response Rate - Internet

not_int = new calculated column showing response rate NOT from internet

CRRALL = Cumulative Self-Response Rate – Overall

DINTMIN = Minimum Daily Internet Self-Response Rate

DMIN = Minimum Daily Overall Self-Response Rate

CINTMIN = Minimum Cumulative Internet Self-Response Rate

CMIN = Minimum Cumulative Overall Self-Response Rate

DINTMAX = Maximum Daily Internet Self-Response Rate

DMAX = Maximum Daily Overall Self-Response Rate

CINTMAX = Maximum Cumulative Internet Self-Response Rate

CMAX = Maximum Cumulative Overall Self-Response Rate

DINTAVG = Average Daily Internet Self-Response Rate

DAVG = Average Daily Overall Self-Response Rate

CINTAVG = Average Cumulative Internet Self-Response Rate

CAVG = Average Cumulative Overall Self-Response Rate

DINTMED = Median Daily Internet Self-Response Rate

DMED = Median Daily Overall Self-Response Rate

CINTMED = Median Cumulative Internet Self-Response Rate

CMED = Median Cumulative Overall Self-Response Rate

In [110]:
# merge responses and crosswalk
merged1 = pd.merge(initial_df, crosswalk, on='GEO_ID')

#merge merged1 with region data
merged = pd.merge(merged1, regions, on='State')

# create column showing responses not from internet
merged['not_int'] = merged.CRRALL - merged.CRRINT

#reorder columns to move State, Geo_Name and Geo_Type to front; also going to drop some values
cols = merged.columns.tolist()
cols = ['GEO_ID', 'RESP_DATE', 'State', 'Geo_Name', 'Region', 'Geo_Type', 
        'CRRINT', 'not_int', 'CRRALL']
merged = merged[cols]
merged = merged.rename(columns={'CRRINT':'internet', 'CRRALL':'2020_rate'})
merged

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,internet,not_int,2020_rate
0,0100000US,6/9/2020,,United States,na,Country,48.9,11.9,60.8
1,0200000US1,6/9/2020,,Northeast,na,Region,48.9,11.3,60.2
2,0200000US2,6/9/2020,,Midwest,na,Region,52.6,13.4,66.0
3,0200000US3,6/9/2020,,South,na,Region,45.2,12.8,58.0
4,0200000US4,6/9/2020,,West,na,Region,51.7,9.4,61.1
...,...,...,...,...,...,...,...,...,...
123245,2560000US4755T00100,6/9/2020,Tribal Land,"T001, Ysleta del Sur",Tribal Land,Tribal Tract,31.8,0.9,32.7
123246,2560000US4760T00100,6/9/2020,Tribal Land,"T001, Yurok",Tribal Land,Tribal Tract,10.1,0.1,10.2
123247,2560000US4770T00100,6/9/2020,Tribal Land,"T001, Zia",Tribal Land,Tribal Tract,16.4,0.5,16.9
123248,2560000US4785T00100,6/9/2020,Tribal Land,"T001, Zuni",Tribal Land,Tribal Tract,7.8,0.1,7.9


## Census Tracts

In [111]:
# just census tract geo types
tracts = merged[merged['Geo_Type'].str.contains("Tract")]

# highest cumulative response rate (CRRALL)
tracts.sort_values(by='2020_rate', ascending=False)

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,internet,not_int,2020_rate
26459,1400000US13215010606,6/9/2020,Georgia,"Tract 106.06, Muscogee",South,Census Tract,0.0,98.1,98.1
49682,1400000US25013812903,6/9/2020,Massachusetts,"Tract 8129.03, Hampden",Midwest,Census Tract,86.2,6.9,93.1
53285,1400000US26099223801,6/9/2020,Michigan,"Tract 2238.01, Macomb",Midwest,Census Tract,86.1,6.5,92.6
54097,1400000US26139021605,6/9/2020,Michigan,"Tract 216.05, Ottawa",Midwest,Census Tract,85.4,7.0,92.4
113068,1400000US51059492202,6/9/2020,Virginia,"Tract 4922.02, Fairfax",South,Census Tract,88.9,3.3,92.2
...,...,...,...,...,...,...,...,...,...
74413,1400000US36061000500,6/9/2020,New York,"Tract 5, New York",Northeast,Census Tract,0.0,0.0,0.0
7920,1400000US06037277400,6/9/2020,California,"Tract 2774, Los Angeles",West,Census Tract,0.0,0.0,0.0
122998,2560000US2147T00100,6/9/2020,Tribal Land,"T001, Mashpee Wampanoag",Tribal Land,Tribal Tract,0.0,0.0,0.0
3402,1400000US04017940010,6/9/2020,Arizona,"Tract 9400.10, Navajo",West,Census Tract,0.0,0.0,0.0


#### Tribal tracts

In [113]:
# response rates in tribal tracts
tribal = tracts[tracts['Geo_Type'].str.contains("Tribal")]
tribal.sort_values(by='2020_rate', ascending=False)

tribal

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,internet,not_int,2020_rate
122824,2560000US0010T00100,6/9/2020,Tribal Land,"T001, Acoma",Tribal Land,Tribal Tract,8.2,1.3,9.5
122825,2560000US0020T00100,6/9/2020,Tribal Land,"T001, Agua Caliente",Tribal Land,Tribal Tract,41.8,6.6,48.4
122826,2560000US0020T00200,6/9/2020,Tribal Land,"T002, Agua Caliente",Tribal Land,Tribal Tract,33.7,5.5,39.2
122827,2560000US0020T00300,6/9/2020,Tribal Land,"T003, Agua Caliente",Tribal Land,Tribal Tract,49.6,11.1,60.7
122828,2560000US0020T00400,6/9/2020,Tribal Land,"T004, Agua Caliente",Tribal Land,Tribal Tract,42.2,9.0,51.2
...,...,...,...,...,...,...,...,...,...
123245,2560000US4755T00100,6/9/2020,Tribal Land,"T001, Ysleta del Sur",Tribal Land,Tribal Tract,31.8,0.9,32.7
123246,2560000US4760T00100,6/9/2020,Tribal Land,"T001, Yurok",Tribal Land,Tribal Tract,10.1,0.1,10.2
123247,2560000US4770T00100,6/9/2020,Tribal Land,"T001, Zia",Tribal Land,Tribal Tract,16.4,0.5,16.9
123248,2560000US4785T00100,6/9/2020,Tribal Land,"T001, Zuni",Tribal Land,Tribal Tract,7.8,0.1,7.9


In [114]:
### tribal areas and tracts stats

#mean non internet response
tribal.mean(axis=0)['not_int']

6.115962441314556

In [115]:
# mean internet response rate
tribal.mean(axis=0)['internet']

19.630281690140855

In [116]:
# mean overall response rate
tribal.mean(axis=0)['2020_rate']

25.7462441314554

#### Tracts with 0 overall response rate

In [117]:
## Tracts with 0 cumulative response rate: 28
is_zero = tracts['2020_rate'] == 0.0
zeros = tracts[is_zero]
zeros.sort_values(by='State')


Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,internet,not_int,2020_rate
2218,1400000US04001944202,6/9/2020,Arizona,"Tract 9442.02, Apache",West,Census Tract,0.0,0.0,0.0
2220,1400000US04001944302,6/9/2020,Arizona,"Tract 9443.02, Apache",West,Census Tract,0.0,0.0,0.0
2300,1400000US04005942202,6/9/2020,Arizona,"Tract 9422.02, Coconino",West,Census Tract,0.0,0.0,0.0
2305,1400000US04005980000,6/9/2020,Arizona,"Tract 9800, Coconino",West,Census Tract,0.0,0.0,0.0
3401,1400000US04017940008,6/9/2020,Arizona,"Tract 9400.08, Navajo",West,Census Tract,0.0,0.0,0.0
3402,1400000US04017940010,6/9/2020,Arizona,"Tract 9400.10, Navajo",West,Census Tract,0.0,0.0,0.0
7920,1400000US06037277400,6/9/2020,California,"Tract 2774, Los Angeles",West,Census Tract,0.0,0.0,0.0
9345,1400000US06037980020,6/9/2020,California,"Tract 9800.20, Los Angeles",West,Census Tract,0.0,0.0,0.0
13265,1400000US06083980100,6/9/2020,California,"Tract 9801, Santa Barbara",West,Census Tract,0.0,0.0,0.0
17975,1400000US10001980000,6/9/2020,Delaware,"Tract 9800, Kent",South,Census Tract,0.0,0.0,0.0


## States

In [119]:
# response rate by state
is_states = merged['Geo_Type'] == 'State'
states = merged[is_states]
states = states.rename(columns={"internet": "state_internet", "not_int" : "state_not_int", "2020_rate": "2020_state_rate"})

# highest cumulative response rate (CRRALL)
states.sort_values(by='2020_state_rate', ascending=False)

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,state_internet,state_not_int,2020_state_rate
55595,0400000US27,6/9/2020,Minnesota,Minnesota,Midwest,State,59.5,11.0,70.5
117774,0400000US55,6/9/2020,Wisconsin,Wisconsin,Midwest,State,56.2,11.7,67.9
38079,0400000US19,6/9/2020,Iowa,Iowa,Midwest,State,53.1,14.3,67.4
50831,0400000US26,6/9/2020,Michigan,Michigan,Midwest,State,52.9,14.4,67.3
64533,0400000US31,6/9/2020,Nebraska,Nebraska,Midwest,State,53.4,13.5,66.9
114850,0400000US53,6/9/2020,Washington,Washington,West,State,57.7,8.4,66.1
112324,0400000US51,6/9/2020,Virginia,Virginia,South,State,54.2,11.7,65.9
82562,0400000US39,6/9/2020,Ohio,Ohio,Midwest,State,50.9,14.9,65.8
28633,0400000US17,6/9/2020,Illinois,Illinois,Midwest,State,53.9,11.9,65.8
34724,0400000US18,6/9/2020,Indiana,Indiana,Midwest,State,50.4,15.1,65.5


In [120]:
# highest non-internet response rate (not_int)
states.sort_values(by='state_not_int', ascending=False)

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,state_internet,state_not_int,2020_state_rate
59796,0400000US28,6/9/2020,Mississippi,Mississippi,South,State,34.0,22.0,56.0
4047,0400000US05,6/9/2020,Arkansas,Arkansas,South,State,37.2,18.2,55.4
5,0400000US01,6/9/2020,Alabama,Alabama,South,State,40.9,17.9,58.8
42735,0400000US21,6/9/2020,Kentucky,Kentucky,South,State,47.8,16.5,64.3
100434,0400000US47,6/9/2020,Tennessee,Tennessee,South,State,45.1,15.7,60.8
44576,0400000US22,6/9/2020,Louisiana,Louisiana,South,State,39.7,15.3,55.0
34724,0400000US18,6/9/2020,Indiana,Indiana,Midwest,State,50.4,15.1,65.5
82562,0400000US39,6/9/2020,Ohio,Ohio,Midwest,State,50.9,14.9,65.8
97521,0400000US45,6/9/2020,South Carolina,South Carolina,South,State,41.2,14.4,55.6
50831,0400000US26,6/9/2020,Michigan,Michigan,Midwest,State,52.9,14.4,67.3


#### State stats

In [12]:
states.mean(axis=0)['state_not_int']

11.63076923076923

In [121]:
states.mean(axis=0)['state_internet']

47.101923076923065

In [122]:
states.mean(axis=0)['2020_state_rate']

58.73269230769233

In [123]:
# average by region
# NOTE as tribal tracts are not assigned to a state they do not have a corresponding region and thus are not counted in the regional calculations
states.groupby('Region').mean().sort_values(by='2020_state_rate', ascending=False)

Unnamed: 0_level_0,state_internet,state_not_int,2020_state_rate
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Midwest,52.238462,12.815385,65.053846
Northeast,47.3375,11.2,58.5375
South,44.152941,13.705882,57.858824
West,48.646154,8.846154,57.492308
Puerto Rico,8.5,0.6,9.1


#### State stats without Puerto Rico

In [124]:
no_pr = states[states.State != 'Puerto Rico']

# average non internet response rate
no_pr.mean(axis=0)['state_not_int']

11.84705882352941

In [125]:
# average internet response
no_pr.mean(axis=0)['state_internet']

47.85882352941175

In [126]:
# average overall response rate
no_pr.mean(axis=0)['2020_state_rate']

59.7058823529412

In [127]:
# average region response rate (so south now excludes Puerto Rico)
no_pr.groupby('Region').mean().sort_values(by='2020_state_rate', ascending=False)

Unnamed: 0_level_0,state_internet,state_not_int,2020_state_rate
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Midwest,52.238462,12.815385,65.053846
Northeast,47.3375,11.2,58.5375
South,44.152941,13.705882,57.858824
West,48.646154,8.846154,57.492308


## Compare to 2010 response rates

### 2010 to 2020 State Response Differences

In [128]:
# read in csvs with 2010 response data for tracts and states
rates2010 = pd.read_csv('2010responserate.csv')
rates2010 = rates2010.rename(columns={'FSRR2010':'2010_rate'})
rates2010

Unnamed: 0,NAME,county,State,GEO_ID,2010_rate,state_num,county_num,tract
0,Census Tract 201,Autauga County,Alabama,1400000US01001020100,70.6,1,1,20100
1,Census Tract 202,Autauga County,Alabama,1400000US01001020200,70.1,1,1,20200
2,Census Tract 203,Autauga County,Alabama,1400000US01001020300,73.6,1,1,20300
3,Census Tract 204,Autauga County,Alabama,1400000US01001020400,78.4,1,1,20400
4,Census Tract 205.01,Autauga County,Alabama,1400000US01001020501,81.2,1,1,20501
...,...,...,...,...,...,...,...,...
84514,Census Tract 7505.01,Yauco Municipio,Puerto Rico,1400000US72153750501,70.4,72,153,750501
84515,Census Tract 7505.02,Yauco Municipio,Puerto Rico,1400000US72153750502,73.4,72,153,750502
84516,Census Tract 7505.03,Yauco Municipio,Puerto Rico,1400000US72153750503,63.4,72,153,750503
84517,Census Tract 7506.01,Yauco Municipio,Puerto Rico,1400000US72153750601,67.1,72,153,750601


In [129]:
## difference in row numbers: both tract dfs have 84519 rows, but when joined only 84093
# Identify what values are in rates2010 and not in tracts
key_diff1 = set(rates2010.GEO_ID).difference(tracts.GEO_ID)
len(key_diff1)
key_diff1

# Identify what values are in tracts and not in rates2010
key_diff2 = set(tracts.GEO_ID).difference(rates2010.GEO_ID)
len(key_diff2)
key_diff2

{'2560000US0010T00100',
 '2560000US0020T00100',
 '2560000US0020T00200',
 '2560000US0020T00300',
 '2560000US0020T00400',
 '2560000US0020T00500',
 '2560000US0020T00600',
 '2560000US0020T00700',
 '2560000US0020T00800',
 '2560000US0020T00900',
 '2560000US0020T01000',
 '2560000US0050T00100',
 '2560000US0080T00100',
 '2560000US0080T00200',
 '2560000US0110T00100',
 '2560000US0115T00100',
 '2560000US0140T00100',
 '2560000US0155T00100',
 '2560000US0165T00100',
 '2560000US0170T00100',
 '2560000US0185T00100',
 '2560000US0200T00100',
 '2560000US0225T00100',
 '2560000US0250T00100',
 '2560000US0265T00100',
 '2560000US0275T00100',
 '2560000US0290T00100',
 '2560000US0305T00100',
 '2560000US0305T00200',
 '2560000US0305T00300',
 '2560000US0325T00100',
 '2560000US0335T00100',
 '2560000US0350T00100',
 '2560000US0360T00100',
 '2560000US0400T00100',
 '2560000US0415T00100',
 '2560000US0435T00100',
 '2560000US0440T00100',
 '2560000US0450T00100',
 '2560000US0510T00100',
 '2560000US0525T00100',
 '2560000US0540T

##### Conclusion: 2010 rates do not include tribal tracts while 2020 tracts are missing some tracts in a multitude of states, likely due to a change in tract boundaries. Those differences account for 426 tracts, which is .5% of the original 84519 tracts. As these tracts are small percentage of all tracts, they can be dropped. 

In [160]:
# merge with 2020 tracts
tracts_merged = pd.merge(tracts, rates2010, on='GEO_ID')

cols = tracts_merged.columns.tolist()
cols = ['GEO_ID','Geo_Name','county', 'State_y', 'Region', '2020_rate', '2010_rate']
tracts_merged = tracts_merged[cols]
tracts_merged = tracts_merged.rename(columns={'State_y':'State'})
tracts_merged.sort_values(by='2010_rate', ascending=False)

Unnamed: 0,GEO_ID,Geo_Name,county,State,Region,2020_rate,2010_rate
10379,1400000US06071010906,"Tract 109.06, San Bernardino",San Bernardino County,California,West,2.9,100.0
48849,1400000US35035940000,"Tract 9400, Otero",Otero County,New Mexico,West,15.8,100.0
13467,1400000US08005980000,"Tract 9800, Arapahoe",Arapahoe County,Colorado,West,87.5,100.0
8287,1400000US06053980000,"Tract 9800, Monterey",Monterey County,California,West,0.7,100.0
6449,1400000US06037265301,"Tract 2653.01, Los Angeles",Los Angeles County,California,West,4.8,100.0
...,...,...,...,...,...,...,...
83055,1400000US56013940202,"Tract 9402.02, Fremont",Fremont County,Wyoming,West,17.9,
83056,1400000US56013940301,"Tract 9403.01, Fremont",Fremont County,Wyoming,West,21.5,
83057,1400000US56013940302,"Tract 9403.02, Fremont",Fremont County,Wyoming,West,31.6,
83058,1400000US56013940400,"Tract 9404, Fremont",Fremont County,Wyoming,West,43.1,


In [161]:
#how many null 2010 response values are there: 531, which is .6% of all rows, 84093
is_no_2010 = tracts_merged.isnull()
no_2010 = is_no_2010.any(axis=1)
no_2010 = tracts_merged[no_2010]
no_2010.sort_values(by="2020_rate")

#due to the low percentage, these null values will be discarded
tracts_merged = tracts_merged.dropna(axis=0)
#check they'd discraded
tracts_merged.sort_values(by='2010_rate')
#there are no 2010 na values, so all were dropped

Unnamed: 0,GEO_ID,Geo_Name,county,State,Region,2020_rate,2010_rate
77029,1400000US50027966300,"Tract 9663, Windsor",Windsor County,Vermont,Northeast,17.3,0.0
48861,1400000US35039940800,"Tract 9408, Rio Arriba",Rio Arriba County,New Mexico,West,20.5,0.0
6111,1400000US06037207400,"Tract 2074, Los Angeles",Los Angeles County,California,West,25.8,0.0
21943,1400000US13089023115,"Tract 231.15, DeKalb",DeKalb County,Georgia,South,10.0,0.0
63439,1400000US42027012200,"Tract 122, Centre",Centre County,Pennsylvania,Northeast,17.6,0.0
...,...,...,...,...,...,...,...
13467,1400000US08005980000,"Tract 9800, Arapahoe",Arapahoe County,Colorado,West,87.5,100.0
24218,1400000US16001000703,"Tract 7.03, Ada",Ada County,Idaho,West,77.3,100.0
73376,1400000US48215020207,"Tract 202.07, Hidalgo",Hidalgo County,Texas,South,36.3,100.0
50116,1400000US36035970400,"Tract 9704, Fulton",Fulton County,New York,Northeast,13.4,100.0


In [162]:
tracts_merged['10-20 difference'] = tracts_merged['2010_rate'] - tracts_merged['2020_rate']
tracts_merged.sort_values(by='10-20 difference', ascending=False)

Unnamed: 0,GEO_ID,Geo_Name,county,State,Region,2020_rate,2010_rate,10-20 difference
8287,1400000US06053980000,"Tract 9800, Monterey",Monterey County,California,West,0.7,100.0,99.3
53795,1400000US36103159510,"Tract 1595.10, Suffolk",Suffolk County,New York,Northeast,0.8,100.0,99.2
7980,1400000US06037980021,"Tract 9800.21, Los Angeles",Los Angeles County,California,West,1.6,100.0,98.4
49059,1400000US36001000404,"Tract 4.04, Albany",Albany County,New York,Northeast,1.8,100.0,98.2
10379,1400000US06071010906,"Tract 109.06, San Bernardino",San Bernardino County,California,West,2.9,100.0,97.1
...,...,...,...,...,...,...,...,...
21302,1400000US13051010707,"Tract 107.07, Chatham",Chatham County,Georgia,South,67.7,2.4,-65.3
40288,1400000US26163982400,"Tract 9824, Wayne",Wayne County,Michigan,Midwest,83.1,16.7,-66.4
65389,1400000US42101036400,"Tract 364, Philadelphia",Philadelphia County,Pennsylvania,Northeast,80.2,13.5,-66.7
22281,1400000US13121003700,"Tract 37, Fulton",Fulton County,Georgia,South,67.3,0.0,-67.3


In [163]:
# average difference as of 6/9/20
tracts_merged.mean(axis=0)['10-20 difference']

7.510194825399082

In [137]:
# average difference by region
tracts_merged.groupby('Region').mean().sort_values(by='10-20 difference', ascending=False)

Unnamed: 0_level_0,2020_rate,2010_rate,10-20 difference
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Puerto Rico,9.131379,56.631705,47.500326
Northeast,59.578297,67.728176,8.149879
South,57.894541,66.022076,8.127535
West,61.518009,67.693454,6.175444
Midwest,65.48441,70.971303,5.486893


### 2010 to 2020 State Response Differences

In [165]:
# read in csvs with 2010 response data for states
states2010 = pd.read_csv('states2010.csv')

# merge with 2020 states
states_merged = pd.merge(states, states2010, on='State')
cols = states_merged.columns.tolist()
cols = ['GEO_ID', 'State', 'Region',
 '2020_state_rate', '2000_rate', '2010_rate']
states_merged = states_merged[cols]
states_merged['10-20 difference'] = states_merged['2010_rate'] - states_merged['2020_state_rate']
states_merged.sort_values(by='10-20 difference', ascending=False)

Unnamed: 0,GEO_ID,State,Region,2020_state_rate,2000_rate,2010_rate,10-20 difference
51,0400000US72,Puerto Rico,Puerto Rico,9.1,54,54,44.9
1,0400000US02,Alaska,West,41.5,67,64,22.5
40,0400000US45,South Carolina,South,55.6,68,75,19.4
33,0400000US37,North Carolina,South,57.1,69,76,18.9
50,0400000US56,Wyoming,West,50.8,75,69,18.2
45,0400000US50,Vermont,Northeast,51.5,68,69,17.5
19,0400000US23,Maine,Northeast,50.9,67,68,17.1
48,0400000US54,West Virginia,South,48.2,68,65,16.8
31,0400000US35,New Mexico,West,48.4,68,65,16.6
26,0400000US30,Montana,West,51.8,72,68,16.2


In [145]:
# average difference as of 6/9/20
states_merged.mean(axis=0)['10-20 difference']

14.094230769230771

In [146]:
# average difference by region
states_merged.groupby('Region').mean().sort_values(by='10-20 difference', ascending=False)

Unnamed: 0_level_0,2020_state_rate,2000_rate,2010_rate,10-20 difference
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Puerto Rico,9.1,54.0,54.0,44.9
South,57.858824,70.647059,72.176471,14.317647
Northeast,58.5375,72.875,72.375,13.8375
West,57.492308,72.538462,70.923077,13.430769
Midwest,65.053846,79.230769,77.307692,12.253846


## Demographic Data

In [155]:
#load both sets of demographic data, join
demo1 = pd.read_csv('demographics1_working.csv')
demo2 = pd.read_csv('demographics2_working.csv')
demo = pd.merge(demo1, demo2, on=['GISJOIN', "YEAR", "STATE", "STATEA", 
                                              'COUNTY', 'COUNTYA', 'TRACTA', 
                                              'Geo_Name', 'NAME_E'])
demo

Unnamed: 0,GISJOIN,YEAR,STATE,STATEA,COUNTY,COUNTYA,TRACTA,Geo_Name,NAME_E,total_population,...,us_territory_born,us_born_abroad,us_naturalization,not_us_citizen,total_pr,pr_born,pr_us_born,pr_born_abroad,pr_naturalization,pr_not_us_citizen
0,G0100010020100,2014-2018,Alabama,1,Autauga County,1,20100,"Tract 201, Autauga","Tract 201, Autauga Alabama",1923,...,0.0,11.0,18.0,24.0,,,,,,
1,G0100010020200,2014-2018,Alabama,1,Autauga County,1,20200,"Tract 202, Autauga","Tract 202, Autauga Alabama",2028,...,0.0,10.0,15.0,10.0,,,,,,
2,G0100010020300,2014-2018,Alabama,1,Autauga County,1,20300,"Tract 203, Autauga","Tract 203, Autauga Alabama",3476,...,5.0,60.0,49.0,131.0,,,,,,
3,G0100010020400,2014-2018,Alabama,1,Autauga County,1,20400,"Tract 204, Autauga","Tract 204, Autauga Alabama",3831,...,0.0,30.0,30.0,51.0,,,,,,
4,G0100010020500,2014-2018,Alabama,1,Autauga County,1,20500,"Tract 205, Autauga","Tract 205, Autauga Alabama",9883,...,0.0,124.0,328.0,121.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73971,G7201530750501,2014-2018,Puerto Rico,72,Yauco Municipio,153,750501,"Tract 7505.01, Yauco","Tract 7505.01, Yauco Puerto Rico",6303,...,,,,,6303.0,6148.0,135.0,0.0,0.0,20.0
73972,G7201530750502,2014-2018,Puerto Rico,72,Yauco Municipio,153,750502,"Tract 7505.02, Yauco","Tract 7505.02, Yauco Puerto Rico",2316,...,,,,,2316.0,2201.0,97.0,0.0,13.0,5.0
73973,G7201530750503,2014-2018,Puerto Rico,72,Yauco Municipio,153,750503,"Tract 7505.03, Yauco","Tract 7505.03, Yauco Puerto Rico",2244,...,,,,,2244.0,2164.0,35.0,34.0,6.0,5.0
73974,G7201530750601,2014-2018,Puerto Rico,72,Yauco Municipio,153,750601,"Tract 7506.01, Yauco","Tract 7506.01, Yauco Puerto Rico",4107,...,,,,,4107.0,4007.0,100.0,0.0,0.0,0.0


In [164]:
df = pd.merge(tracts_merged, demo, on='Geo_Name')
df

Unnamed: 0,GEO_ID,Geo_Name,county,State,Region,2020_rate,2010_rate,10-20 difference,GISJOIN,YEAR,...,us_territory_born,us_born_abroad,us_naturalization,not_us_citizen,total_pr,pr_born,pr_us_born,pr_born_abroad,pr_naturalization,pr_not_us_citizen
0,1400000US01001020100,"Tract 201, Autauga",Autauga County,Alabama,South,64.3,70.6,6.3,G0100010020100,2014-2018,...,0.0,11.0,18.0,24.0,,,,,,
1,1400000US01001020200,"Tract 202, Autauga",Autauga County,Alabama,South,65.6,70.1,4.5,G0100010020200,2014-2018,...,0.0,10.0,15.0,10.0,,,,,,
2,1400000US01001020300,"Tract 203, Autauga",Autauga County,Alabama,South,73.8,73.6,-0.2,G0100010020300,2014-2018,...,5.0,60.0,49.0,131.0,,,,,,
3,1400000US01001020400,"Tract 204, Autauga",Autauga County,Alabama,South,77.6,78.4,0.8,G0100010020400,2014-2018,...,0.0,30.0,30.0,51.0,,,,,,
4,1400000US01001020600,"Tract 206, Autauga",Autauga County,Alabama,South,66.1,71.9,5.8,G0100010020600,2014-2018,...,0.0,25.0,21.0,12.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64408,1400000US72153750501,"Tract 7505.01, Yauco",Yauco Municipio,Puerto Rico,Puerto Rico,16.7,70.4,53.7,G7201530750501,2014-2018,...,,,,,6303.0,6148.0,135.0,0.0,0.0,20.0
64409,1400000US72153750502,"Tract 7505.02, Yauco",Yauco Municipio,Puerto Rico,Puerto Rico,21.4,73.4,52.0,G7201530750502,2014-2018,...,,,,,2316.0,2201.0,97.0,0.0,13.0,5.0
64410,1400000US72153750503,"Tract 7505.03, Yauco",Yauco Municipio,Puerto Rico,Puerto Rico,9.9,63.4,53.5,G7201530750503,2014-2018,...,,,,,2244.0,2164.0,35.0,34.0,6.0,5.0
64411,1400000US72153750601,"Tract 7506.01, Yauco",Yauco Municipio,Puerto Rico,Puerto Rico,11.1,67.1,56.0,G7201530750601,2014-2018,...,,,,,4107.0,4007.0,100.0,0.0,0.0,0.0
