# Examine census response rates, with data at state and census tract levels
## Data from: https://2020census.gov/en/response-rates.html

In [112]:
import pandas as pd

# Response-rate data; the dates were converted in Excel from m/d/yyyy to
# yyyy-mm-dd before loading.
initial_df = pd.read_csv(filepath_or_buffer='decennialrr2020_working.csv')
# Crosswalk file; the trailing "2" marks the copy re-saved as a UTF-8 CSV.
crosswalk = pd.read_csv(filepath_or_buffer='decennialrr2020_crosswalkfile2.csv')
# State-to-region pairs, following the census map at
# https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf
regions = pd.read_csv(filepath_or_buffer='state_region.csv')

## Columns


GEO_ID = Geographic Identifier

RESP_DATE = Posting Date

State = name of state (one of the 50 states, District of Columbia, Puerto Rico, or NaN)

Geo_Name = name of the tract, county, state

Region = region of the U.S. in which state is located as defined by census map at https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf

Geo_Type = type of geography; possible answers include Census Tract, Congressional District, Consolidated City, Country, County, County Subdivision, Place, Region, State, Tribal Tract, Tribal Area

DRRINT = Daily Self-Response Rate - Internet

DRRALL = Daily Self-Response Rate – Overall

CRRINT = Cumulative Self-Response Rate - Internet

not_int = new calculated column showing the cumulative response rate NOT from internet (CRRALL - CRRINT)

CRRALL = Cumulative Self-Response Rate – Overall

DINTMIN = Minimum Daily Internet Self-Response Rate

DMIN = Minimum Daily Overall Self-Response Rate

CINTMIN = Minimum Cumulative Internet Self-Response Rate

CMIN = Minimum Cumulative Overall Self-Response Rate

DINTMAX = Maximum Daily Internet Self-Response Rate

DMAX = Maximum Daily Overall Self-Response Rate

CINTMAX = Maximum Cumulative Internet Self-Response Rate

CMAX = Maximum Cumulative Overall Self-Response Rate

DINTAVG = Average Daily Internet Self-Response Rate

DAVG = Average Daily Overall Self-Response Rate

CINTAVG = Average Cumulative Internet Self-Response Rate

CAVG = Average Cumulative Overall Self-Response Rate

DINTMED = Median Daily Internet Self-Response Rate

DMED = Median Daily Overall Self-Response Rate

CINTMED = Median Cumulative Internet Self-Response Rate

CMED = Median Cumulative Overall Self-Response Rate

In [158]:
# Attach the crosswalk columns to each response row, joining on GEO_ID.
# pd.merge defaults to an inner join, so only GEO_IDs present in both frames
# are kept.
merged1 = pd.merge(initial_df, crosswalk, on='GEO_ID')

# Add the region for each state (same default inner join, keyed on State).
merged = pd.merge(merged1, regions, on='State')

# Cumulative overall rate minus cumulative internet rate, i.e. the part of
# the cumulative response rate that did not come from the internet.
merged['not_int'] = merged.CRRALL - merged.CRRINT

# Keep only the columns used below, with the identifying columns
# (State, Geo_Name, Region, Geo_Type) moved ahead of the rate columns.
cols = ['GEO_ID', 'RESP_DATE', 'State', 'Geo_Name', 'Region', 'Geo_Type',
        'DRRINT', 'DRRALL', 'CRRINT', 'not_int', 'CRRALL']
merged = merged[cols]
merged

# save merged df to csv
#merged.to_csv('merged2020rates.csv')

## Census Tracts

In [136]:
# Rows whose geography type contains "Tract"; this keeps both the
# "Census Tract" and "Tribal Tract" geography types.
tracts = merged.loc[merged['Geo_Type'].str.contains("Tract")]

# Rank tracts by daily overall self-response rate (DRRALL) on 6/9, highest first
tracts.sort_values('DRRALL', ascending=False)

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,DRRINT,DRRALL,CRRINT,not_int,CRRALL
100700,1400000US46037952800,6/9/2020,South Dakota,"Tract 9528, Day",Midwest,Census Tract,0.1,10.6,19.1,46.3,65.4
26189,1400000US13121003700,6/9/2020,Georgia,"Tract 37, Fulton",South,Census Tract,0.0,9.4,17.0,50.3,67.3
24103,1400000US12119910800,6/9/2020,Florida,"Tract 9108, Sumter",South,Census Tract,0.0,8.3,24.5,51.7,76.2
42486,1400000US20125950800,6/9/2020,Kansas,"Tract 9508, Montgomery",Midwest,Census Tract,0.0,7.9,26.2,42.3,68.5
100720,1400000US46059975700,6/9/2020,South Dakota,"Tract 9757, Hand",Midwest,Census Tract,0.1,7.9,17.8,40.5,58.3
...,...,...,...,...,...,...,...,...,...,...,...
49248,1400000US24047951300,6/9/2020,Maryland,"Tract 9513, Worcester",South,Census Tract,0.0,0.0,47.3,13.2,60.5
49250,1400000US24047951500,6/9/2020,Maryland,"Tract 9515, Worcester",South,Census Tract,0.0,0.0,46.6,13.3,59.9
49251,1400000US24047951700,6/9/2020,Maryland,"Tract 9517, Worcester",South,Census Tract,0.0,0.0,31.8,6.5,38.3
49254,1400000US24510010300,6/9/2020,Maryland,"Tract 103, Baltimore",South,Census Tract,0.0,0.0,61.3,4.1,65.4


In [115]:
# Rank tracts by cumulative overall self-response rate (CRRALL), highest first
tracts.sort_values('CRRALL', ascending=False)

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,DRRINT,DRRALL,CRRINT,not_int,CRRALL
27200,1400000US13215010606,6/9/2020,Georgia,"Tract 106.06, Muscogee",South,Census Tract,0.0,0.0,0.0,98.1,98.1
50423,1400000US25013812903,6/9/2020,Massachusetts,"Tract 8129.03, Hampden",Midwest,Census Tract,0.0,0.0,86.2,6.9,93.1
54026,1400000US26099223801,6/9/2020,Michigan,"Tract 2238.01, Macomb",Midwest,Census Tract,0.0,0.0,86.1,6.5,92.6
54838,1400000US26139021605,6/9/2020,Michigan,"Tract 216.05, Ottawa",Midwest,Census Tract,0.0,0.0,85.4,7.0,92.4
113809,1400000US51059492202,6/9/2020,Virginia,"Tract 4922.02, Fairfax",South,Census Tract,0.0,0.0,88.9,3.3,92.2
...,...,...,...,...,...,...,...,...,...,...,...
54231,1400000US26099982100,6/9/2020,Michigan,"Tract 9821, Macomb",Midwest,Census Tract,0.0,0.0,0.0,0.0,0.0
3046,1400000US04005980000,6/9/2020,Arizona,"Tract 9800, Coconino",West,Census Tract,0.0,0.0,0.0,0.0,0.0
483,2560000US1995T00100,6/9/2020,,"T001, Los Coyotes",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
497,2560000US2190T00100,6/9/2020,,"T001, Mesa Grande",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0


#### Tribal tracts

In [116]:
# Tribal tracts only, ranked by cumulative overall response rate (highest first)
tribal = tracts.loc[tracts['Geo_Type'].str.contains("Tribal")]
tribal.sort_values('CRRALL', ascending=False)

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,DRRINT,DRRALL,CRRINT,not_int,CRRALL
566,2560000US2560T00400,6/9/2020,,"T004, Oneida (WI)",na,Tribal Tract,0.0,0.1,78.7,6.7,85.4
452,2560000US1610T00600,6/9/2020,,"T006, Isabella",na,Tribal Tract,0.0,0.0,69.0,10.6,79.6
565,2560000US2560T00300,6/9/2020,,"T003, Oneida (WI)",na,Tribal Tract,0.0,0.0,68.4,9.7,78.1
604,2560000US3000T00200,6/9/2020,,"T002, Puyallup",na,Tribal Tract,0.1,0.1,71.3,6.2,77.5
564,2560000US2560T00200,6/9/2020,,"T002, Oneida (WI)",na,Tribal Tract,0.1,0.1,67.1,9.5,76.6
...,...,...,...,...,...,...,...,...,...,...,...
497,2560000US2190T00100,6/9/2020,,"T001, Mesa Grande",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
494,2560000US2147T00100,6/9/2020,,"T001, Mashpee Wampanoag",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
483,2560000US1995T00100,6/9/2020,,"T001, Los Coyotes",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
582,2560000US2695T00100,6/9/2020,,"T001, Passamaquoddy",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0


In [64]:
### tribal tracts stats

# Mean non-internet response rate across tribal tracts.
# Only the needed column is averaged: calling DataFrame.mean() on the whole
# frame also tries to average the text columns, which newer pandas rejects.
tribal['not_int'].mean()

6.115962441314556

In [65]:
# Mean cumulative internet response rate across tribal tracts.
# Only the needed column is averaged, so the text columns are never touched.
tribal['CRRINT'].mean()

19.630281690140855

In [66]:
# Mean cumulative overall response rate across tribal tracts.
# Only the needed column is averaged, so the text columns are never touched.
tribal['CRRALL'].mean()

25.7462441314554

#### Tracts with 0 overall response rate

In [117]:
## Tracts whose cumulative overall response rate (CRRALL) is exactly 0
is_zero = tracts['CRRALL'].eq(0.0)
zeros = tracts.loc[is_zero]
zeros

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,DRRINT,DRRALL,CRRINT,not_int,CRRALL
433,2560000US1440T00100,6/9/2020,,"T001, Havasupai",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
483,2560000US1995T00100,6/9/2020,,"T001, Los Coyotes",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
494,2560000US2147T00100,6/9/2020,,"T001, Mashpee Wampanoag",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
497,2560000US2190T00100,6/9/2020,,"T001, Mesa Grande",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
529,2560000US2430T01800,6/9/2020,,"T018, Navajo Nation",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
582,2560000US2695T00100,6/9/2020,,"T001, Passamaquoddy",na,Tribal Tract,0.0,0.0,0.0,0.0,0.0
2959,1400000US04001944202,6/9/2020,Arizona,"Tract 9442.02, Apache",West,Census Tract,0.0,0.0,0.0,0.0,0.0
2961,1400000US04001944302,6/9/2020,Arizona,"Tract 9443.02, Apache",West,Census Tract,0.0,0.0,0.0,0.0,0.0
3041,1400000US04005942202,6/9/2020,Arizona,"Tract 9422.02, Coconino",West,Census Tract,0.0,0.0,0.0,0.0,0.0
3046,1400000US04005980000,6/9/2020,Arizona,"Tract 9800, Coconino",West,Census Tract,0.0,0.0,0.0,0.0,0.0


## States

In [137]:
# response rate by state: keep only the rows whose geography type is 'State'
is_states = merged['Geo_Type'] == 'State'
states = merged[is_states]

# highest cumulative response rate (CRRALL) first; as the final expression
# of the cell this is the table that gets displayed
states.sort_values(by='CRRALL', ascending=False)

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,DRRINT,DRRALL,CRRINT,not_int,CRRALL
56336,0400000US27,6/9/2020,Minnesota,Minnesota,Midwest,State,0.0,0.3,59.5,11.0,70.5
118515,0400000US55,6/9/2020,Wisconsin,Wisconsin,Midwest,State,0.0,0.1,56.2,11.7,67.9
38820,0400000US19,6/9/2020,Iowa,Iowa,Midwest,State,0.0,0.3,53.1,14.3,67.4
51572,0400000US26,6/9/2020,Michigan,Michigan,Midwest,State,0.0,0.3,52.9,14.4,67.3
65274,0400000US31,6/9/2020,Nebraska,Nebraska,Midwest,State,0.0,0.5,53.4,13.5,66.9
115591,0400000US53,6/9/2020,Washington,Washington,West,State,0.0,0.1,57.7,8.4,66.1
113065,0400000US51,6/9/2020,Virginia,Virginia,South,State,0.0,0.1,54.2,11.7,65.9
83303,0400000US39,6/9/2020,Ohio,Ohio,Midwest,State,0.0,0.2,50.9,14.9,65.8
29374,0400000US17,6/9/2020,Illinois,Illinois,Midwest,State,0.0,0.1,53.9,11.9,65.8
35465,0400000US18,6/9/2020,Indiana,Indiana,Midwest,State,0.0,0.2,50.4,15.1,65.5


In [119]:
# Rank states by non-internet response rate (not_int), highest first
states.sort_values('not_int', ascending=False)

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,DRRINT,DRRALL,CRRINT,not_int,CRRALL
60537,0400000US28,6/9/2020,Mississippi,Mississippi,South,State,0.0,0.7,34.0,22.0,56.0
4788,0400000US05,6/9/2020,Arkansas,Arkansas,South,State,0.0,0.4,37.2,18.2,55.4
746,0400000US01,6/9/2020,Alabama,Alabama,South,State,0.0,0.4,40.9,17.9,58.8
43476,0400000US21,6/9/2020,Kentucky,Kentucky,South,State,0.0,0.2,47.8,16.5,64.3
101175,0400000US47,6/9/2020,Tennessee,Tennessee,South,State,0.0,0.2,45.1,15.7,60.8
45317,0400000US22,6/9/2020,Louisiana,Louisiana,South,State,0.0,0.4,39.7,15.3,55.0
35465,0400000US18,6/9/2020,Indiana,Indiana,Midwest,State,0.0,0.2,50.4,15.1,65.5
83303,0400000US39,6/9/2020,Ohio,Ohio,Midwest,State,0.0,0.2,50.9,14.9,65.8
98262,0400000US45,6/9/2020,South Carolina,South Carolina,South,State,0.0,0.3,41.2,14.4,55.6
51572,0400000US26,6/9/2020,Michigan,Michigan,Midwest,State,0.0,0.3,52.9,14.4,67.3


#### State stats

In [48]:
# Mean non-internet response rate across the state-level rows.
# Only the needed column is averaged, so the text columns are never touched.
states['not_int'].mean()

11.63076923076923

In [49]:
# Mean cumulative internet response rate across the state-level rows.
# Only the needed column is averaged, so the text columns are never touched.
states['CRRINT'].mean()

47.101923076923065

In [50]:
# Mean cumulative overall response rate across the state-level rows.
# Only the needed column is averaged, so the text columns are never touched.
states['CRRALL'].mean()

58.73269230769233

In [141]:
# average by region, highest cumulative overall rate first
# NOTE as tribal tracts are not assigned to a state they do not have a corresponding region and thus are not counted in the regional calculations
# numeric_only=True restricts the average to the rate columns, so the text
# columns (GEO_ID, State, ...) are skipped explicitly rather than implicitly.
states.groupby('Region').mean(numeric_only=True).sort_values(by='CRRALL', ascending=False)

Unnamed: 0_level_0,DRRINT,DRRALL,CRRINT,not_int,CRRALL
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Midwest,0.0,0.330769,52.238462,12.815385,65.053846
Northeast,0.0,0.1,47.3375,11.2,58.5375
West,0.0,0.169231,48.646154,8.846154,57.492308
South,0.0,0.266667,42.172222,12.977778,55.15


#### State stats without Puerto Rico

In [120]:
# State-level rows with Puerto Rico removed
no_pr = states[states.State != 'Puerto Rico']

# average non internet response rate
# Only the needed column is averaged, so the text columns are never touched.
no_pr['not_int'].mean()

11.84705882352941

In [None]:
# average internet response (Puerto Rico excluded)
# Only the needed column is averaged, so the text columns are never touched.
no_pr['CRRINT'].mean()

In [None]:
# average overall response rate (Puerto Rico excluded)
# Only the needed column is averaged, so the text columns are never touched.
no_pr['CRRALL'].mean()

In [142]:
# average region response rate (so south now excludes Puerto Rico)
# numeric_only=True restricts the average to the rate columns, so the text
# columns are skipped explicitly rather than implicitly.
no_pr.groupby('Region').mean(numeric_only=True).sort_values(by='CRRALL', ascending=False)

Unnamed: 0_level_0,DRRINT,DRRALL,CRRINT,not_int,CRRALL
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Midwest,0.0,0.330769,52.238462,12.815385,65.053846
Northeast,0.0,0.1,47.3375,11.2,58.5375
South,0.0,0.282353,44.152941,13.705882,57.858824
West,0.0,0.169231,48.646154,8.846154,57.492308


## Compare to 2010 response rates

### 2010 to 2020 Tract Response Differences

In [163]:
# Load the csv with 2010 tract-level response data and display it
rates2010 = pd.read_csv(filepath_or_buffer='2010responserate.csv')
rates2010

Unnamed: 0,NAME,county,State,GEO_ID,FSRR2010,state_num,county_num,tract
0,Census Tract 201,Autauga County,Alabama,1400000US01001020100,70.6,1,1,20100
1,Census Tract 202,Autauga County,Alabama,1400000US01001020200,70.1,1,1,20200
2,Census Tract 203,Autauga County,Alabama,1400000US01001020300,73.6,1,1,20300
3,Census Tract 204,Autauga County,Alabama,1400000US01001020400,78.4,1,1,20400
4,Census Tract 205.01,Autauga County,Alabama,1400000US01001020501,81.2,1,1,20501
...,...,...,...,...,...,...,...,...
84514,Census Tract 7505.01,Yauco Municipio,Puerto Rico,1400000US72153750501,70.4,72,153,750501
84515,Census Tract 7505.02,Yauco Municipio,Puerto Rico,1400000US72153750502,73.4,72,153,750502
84516,Census Tract 7505.03,Yauco Municipio,Puerto Rico,1400000US72153750503,63.4,72,153,750503
84517,Census Tract 7506.01,Yauco Municipio,Puerto Rico,1400000US72153750601,67.1,72,153,750601


In [161]:
## difference in row numbers: both tract dfs have 84519 rows, but when joined only 84093
# GEO_IDs that are in rates2010 and not in tracts
key_diff1 = set(rates2010.GEO_ID).difference(tracts.GEO_ID)

# GEO_IDs that are in tracts and not in rates2010
key_diff2 = set(tracts.GEO_ID).difference(rates2010.GEO_ID)

# Only the final expression of a cell is displayed, so the two counts are
# printed explicitly instead of being left as bare expressions.
print(len(key_diff1), len(key_diff2))
key_diff2

426

##### Conclusion: The 2010 rates do not include tribal tracts, while the 2020 data is missing some tracts across many states, likely because tract boundaries changed. These differences account for 426 tracts, which is about 0.5% of the original 84,519 tracts.

In [164]:
# Join the 2020 tract rates with the 2010 rates on GEO_ID (default inner join).
# Both frames carry a State column, so pandas suffixes them; State_y is the
# one that came from rates2010.
tracts_merged = pd.merge(tracts, rates2010, on='GEO_ID')
cols = ['GEO_ID','NAME','county', 'State_y', 'Region', 'CRRALL', 'FSRR2010']
# .copy() yields an independent frame, so the column added below is written
# to it directly rather than to a selection of the merge result.
tracts_merged = tracts_merged[cols].copy()
# Positive values mean the 2010 rate (FSRR2010) is higher than the 2020
# cumulative rate (CRRALL).
tracts_merged['10-20 difference'] = tracts_merged['FSRR2010'] - tracts_merged['CRRALL']
tracts_merged.sort_values(by='10-20 difference', ascending=False)

Unnamed: 0,GEO_ID,NAME,county,State_y,Region,CRRALL,FSRR2010,10-20 difference
8287,1400000US06053980000,Census Tract 9800,Monterey County,California,West,0.7,100.0,99.3
53795,1400000US36103159510,Census Tract 1595.10,Suffolk County,New York,Northeast,0.8,100.0,99.2
7980,1400000US06037980021,Census Tract 9800.21,Los Angeles County,California,West,1.6,100.0,98.4
49059,1400000US36001000404,Census Tract 4.04,Albany County,New York,Northeast,1.8,100.0,98.2
10379,1400000US06071010906,Census Tract 109.06,San Bernardino County,California,West,2.9,100.0,97.1
...,...,...,...,...,...,...,...,...
83055,1400000US56013940202,Census Tract 9402.02,Fremont County,Wyoming,West,17.9,,
83056,1400000US56013940301,Census Tract 9403.01,Fremont County,Wyoming,West,21.5,,
83057,1400000US56013940302,Census Tract 9403.02,Fremont County,Wyoming,West,31.6,,
83058,1400000US56013940400,Census Tract 9404,Fremont County,Wyoming,West,43.1,,


In [165]:
# average 2010-minus-2020 difference across tracts as of 6/9/20
# Only the needed column is averaged, so the text columns are never touched.
tracts_merged['10-20 difference'].mean()

7.510194825399082

In [None]:
# average difference by region, at tract level
# Uses tracts_merged (the tract-level comparison built in this section);
# states_merged is only created in a later cell.
# numeric_only=True restricts the average to the numeric rate columns.
tracts_merged.groupby('Region').mean(numeric_only=True)

### 2010 to 2020 State Response Differences

In [131]:
# read in csv with 2010 response data for states
states2010 = pd.read_csv('states2010.csv')

# merge with 2020 states on the State name (default inner join)
states_merged = pd.merge(states, states2010, on='State')
cols = ['GEO_ID', 'State', 'Region',
 'CRRALL', 'response2000', 'response2010']
# .copy() yields an independent frame, so the column added below is written
# to it directly rather than to a selection of the merge result.
states_merged = states_merged[cols].copy()
# Positive values mean the 2010 rate is higher than the 2020 cumulative rate.
states_merged['10-20 difference'] = states_merged['response2010'] - states_merged['CRRALL']
states_merged.sort_values(by='10-20 difference', ascending=False)

Unnamed: 0,GEO_ID,State,Region,CRRALL,response2000,response2010,10-20 difference
51,0400000US72,Puerto Rico,South,9.1,54.0,54.0,44.9
1,0400000US02,Alaska,West,41.5,67.0,64.0,22.5
40,0400000US45,South Carolina,South,55.6,68.0,75.0,19.4
33,0400000US37,North Carolina,South,57.1,69.0,76.0,18.9
50,0400000US56,Wyoming,West,50.8,75.0,69.0,18.2
45,0400000US50,Vermont,Northeast,51.5,68.0,69.0,17.5
19,0400000US23,Maine,Northeast,50.9,67.0,68.0,17.1
48,0400000US54,West Virginia,South,48.2,68.0,65.0,16.8
31,0400000US35,New Mexico,West,48.4,68.0,65.0,16.6
26,0400000US30,Montana,West,51.8,72.0,68.0,16.2


In [130]:
# average 2010-minus-2020 difference across states as of 6/9/20
# Only the needed column is averaged, so the text columns are never touched.
states_merged['10-20 difference'].mean()

14.094230769230771

In [133]:
# average difference by region, at state level
# numeric_only=True restricts the average to the numeric rate columns, so the
# text columns (GEO_ID, State) are skipped explicitly rather than implicitly.
states_merged.groupby('Region').mean(numeric_only=True)

Unnamed: 0_level_0,CRRALL,response2000,response2010,10-20 difference
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Midwest,65.053846,79.230769,77.307692,12.253846
Northeast,58.5375,72.875,72.375,13.8375
South,55.15,69.722222,71.166667,16.016667
West,57.492308,72.538462,70.923077,13.430769
