<a href="https://colab.research.google.com/github/tdiffendal/USAT/blob/master/census-responses/Copy_of_census_responses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2020 Census Response Rate Analysis

### Theresa Diffendal, USA Today data intern, 06/2020

#### 2020 response rates from: https://2020census.gov/en/response-rates.html
#### 2010 response rates from: https://api.census.gov/data/2010/dec/responserate/variables.html
#### Demographic information in 2014-2018 ACS 5-year-estimate from: https://data2.nhgis.org/main

## Column Names

GEO_ID = Geographic Identifier

RESP_DATE = Posting Date

State = name of state (one of the 50 states, District of Columbia, Puerto Rico, or NaN)

Geo_Name = name of the tract, county, state

Region = region of the U.S. in which state is located as defined by census map at https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf

Geo_Type = type of geography; possible answers include Census Tract, Congressional District, Consolidated City, Country, County, County Subdivision, Place, Region, State, Tribal Tract, Tribal Area

DRRINT = Daily Self-Response Rate - Internet

DRRALL = Daily Self-Response Rate – Overall

CRRINT = Cumulative Self-Response Rate - Internet; renamed internet

not_int = new calculated column showing response rate NOT from internet

CRRALL = Cumulative Self-Response Rate – Overall; renamed 2020_rate

DINTMIN = Minimum Daily Internet Self-Response Rate

DMIN = Minimum Daily Overall Self-Response Rate

CINTMIN = Minimum Cumulative Internet Self-Response Rate

CMIN = Minimum Cumulative Overall Self-Response Rate

DINTMAX = Maximum Daily Internet Self-Response Rate

DMAX = Maximum Daily Overall Self-Response Rate

CINTMAX = Maximum Cumulative Internet Self-Response Rate

CMAX = Maximum Cumulative Overall Self-Response Rate

DINTAVG = Average Daily Internet Self-Response Rate

DAVG = Average Daily Overall Self-Response Rate

CINTAVG = Average Cumulative Internet Self-Response Rate

CAVG = Average Cumulative Overall Self-Response Rate

DINTMED = Median Daily Internet Self-Response Rate

DMED = Median Daily Overall Self-Response Rate

CINTMED = Median Cumulative Internet Self-Response Rate

CMED = Median Cumulative Overall Self-Response Rate

## Read, Merge, Clean Data

### Initial Load and Merge

In [58]:
import pandas as pd
import numpy as np

# read in 2020 response rates
initial_df = pd.read_csv('https://www2.census.gov/programs-surveys/decennial/2020/data/2020map/2020/decennialrr2020.csv')
# had to download from https://www2.census.gov/programs-surveys/decennial/2020/data/2020map/2020/ resave as UTF-8 CSV, hence the 2
crosswalk = pd.read_csv('https://www2.census.gov/programs-surveys/decennial/2020/data/2020map/2020/decennialrr2020_crosswalkfile.csv', encoding='latin-1')
# states paired with region as defined by census map at https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf
regions = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/data/state_region.csv')

# merge responses and crosswalk
temp = pd.merge(initial_df, crosswalk, on='GEO_ID')

#merge merged1 with region data
merged = pd.merge(temp, regions, on='State')

# create column showing responses not from internet
merged['not_int'] = merged.CRRALL - merged.CRRINT
merged['not_int_pct'] = (merged.not_int) * 100 / merged.CRRALL

#reorder columns to move State, Geo_Name and Geo_Type to front; also going to drop some values
cols = merged.columns.tolist()
cols = ['GEO_ID', 'RESP_DATE', 'State', 'Geo_Name', 'Region', 'Geo_Type', 
        'CRRINT', 'not_int', 'not_int_pct', 'CRRALL']
merged = merged[cols]
merged = merged.rename(columns={'CRRINT':'internet', 'CRRALL':'2020_rate'})
merged

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,internet,not_int,not_int_pct,2020_rate
0,0100000US,2020-06-30,,United States,na,Country,49.3,12.5,20.226537,61.8
1,0200000US1,2020-06-30,,Northeast,na,Region,49.5,11.8,19.249592,61.3
2,0200000US2,2020-06-30,,Midwest,na,Region,53.0,13.8,20.658683,66.8
3,0200000US3,2020-06-30,,South,na,Region,45.6,13.3,22.580645,58.9
4,0200000US4,2020-06-30,,West,na,Region,52.3,10.1,16.185897,62.4
...,...,...,...,...,...,...,...,...,...,...
123245,1400000US72153750502,2020-06-30,Puerto Rico,"Tract 7505.02, Yauco",Puerto Rico,Census Tract,22.8,16.4,41.836735,39.2
123246,1400000US72153750503,2020-06-30,Puerto Rico,"Tract 7505.03, Yauco",Puerto Rico,Census Tract,12.1,13.8,53.281853,25.9
123247,1400000US72153750601,2020-06-30,Puerto Rico,"Tract 7506.01, Yauco",Puerto Rico,Census Tract,14.3,12.7,47.037037,27.0
123248,1400000US72153750602,2020-06-30,Puerto Rico,"Tract 7506.02, Yauco",Puerto Rico,Census Tract,6.4,11.8,64.835165,18.2


### States

#### 2020 data

In [59]:
# create df with response rate by state
states2020 = merged[merged['Geo_Type'] == 'State']
states2020 = states2020.rename(columns={"internet": "state_internet", 
                                        "not_int" : "state_not_int", 
                                        'not_int_pct' : 'state_not_int_pct',
                                        "2020_rate" : "2020_state_rate"})

# print df and sort by highest cumulative response rate
#states2020.sort_values(by='2020_state_rate', ascending=False)

#### Join 2010 states

In [60]:
# read in csvs with 2010 response data for states
states2010 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/data/states2010.csv')

# merge with 2020 states
states = pd.merge(states2020, states2010, on='State')
#get the column names
cols = states.columns.tolist()
#only select columns we want
cols = ['GEO_ID', 'State', 'Region',
 '2020_state_rate', '2010_rate', '2000_rate']
states = states[cols]
states = states.rename(columns={'2000_rate':'2000_state_rate', '2010_rate':'2010_state_rate'})

#create column with difference in 2010 vs 2020 response rate
states['10_20_state_difference'] = (states['2020_state_rate'] - states['2010_state_rate']) * 100 / states['2010_state_rate']

#print table sorted by 10-20 difference largest ---> smallest
states.sort_values(by=['2000_state_rate'], ascending=True)

Unnamed: 0,GEO_ID,State,Region,2020_state_rate,2010_state_rate,2000_state_rate,10_20_state_difference
51,0400000US72,Puerto Rico,Puerto Rico,22.6,54,54,-58.148148
11,0400000US15,Hawaii,West,58.0,68,66,-14.705882
1,0400000US02,Alaska,West,47.8,64,67,-25.3125
19,0400000US23,Maine,Northeast,53.3,68,67,-21.617647
48,0400000US54,West Virginia,South,53.4,65,68,-17.846154
45,0400000US50,Vermont,Northeast,55.1,69,68,-20.144928
40,0400000US45,South Carolina,South,56.3,75,68,-24.933333
31,0400000US35,New Mexico,West,50.9,65,68,-21.692308
18,0400000US22,Louisiana,South,56.0,65,68,-13.846154
0,0400000US01,Alabama,South,59.5,72,68,-17.361111


In [61]:
# print above dataframe to csv
#from google.colab import drive
#drive.mount('/gdrive')
#4/1QHjzPlDYwEyGbg5xGgBbeLxQRP_D-nPkF9bC8U5zL5ShJLIeEW_UiE
states.to_csv('/gdrive/My Drive/0USA Today/state_rates.csv')

### Census Tracts

#### 2020 Tracts

In [62]:
# select just census tract geo types
tracts2020 = merged[merged['Geo_Type'].str.contains("Tract")]
#rename column
tracts2020 = tracts2020.rename(columns={"2020_rate": "2020_tract_rate",
                                        'not_int':'tract_not_int',
                                        'not_int_pct':'tract_not_int_pct'})
# sort by highest cumulative response rate
tracts2020.sort_values(by='2020_tract_rate', ascending=False)

Unnamed: 0,GEO_ID,RESP_DATE,State,Geo_Name,Region,Geo_Type,internet,tract_not_int,tract_not_int_pct,2020_tract_rate
27200,1400000US13215010606,2020-06-30,Georgia,"Tract 106.06, Muscogee",South,Census Tract,0.0,98.1,100.000000,98.1
50423,1400000US25013812903,2020-06-30,Massachusetts,"Tract 8129.03, Hampden",Midwest,Census Tract,86.2,6.9,7.411386,93.1
54026,1400000US26099223801,2020-06-30,Michigan,"Tract 2238.01, Macomb",Midwest,Census Tract,86.4,6.6,7.096774,93.0
54838,1400000US26139021605,2020-06-30,Michigan,"Tract 216.05, Ottawa",Midwest,Census Tract,85.7,7.0,7.551241,92.7
113809,1400000US51059492202,2020-06-30,Virginia,"Tract 4922.02, Fairfax",South,Census Tract,89.3,3.4,3.667745,92.7
...,...,...,...,...,...,...,...,...,...,...
89622,1400000US40109980003,2020-06-30,Oklahoma,"Tract 9800.03, Oklahoma",South,Census Tract,0.0,0.0,,0.0
46742,1400000US22125951703,2020-06-30,Louisiana,"Tract 9517.03, West Feliciana",South,Census Tract,0.0,0.0,,0.0
94380,1400000US42019980100,2020-06-30,Pennsylvania,"Tract 9801, Butler",Northeast,Census Tract,0.0,0.0,,0.0
89623,1400000US40109980005,2020-06-30,Oklahoma,"Tract 9800.05, Oklahoma",South,Census Tract,0.0,0.0,,0.0


In [63]:
#tract rates compared to state averages
tract2020states = pd.merge(tracts2020, states, on=['State', 'Region'])
tract2020states = tract2020states[['GEO_ID_x', 'State', 'Geo_Name', 'Geo_Type', 'Region','2020_tract_rate', '2020_state_rate', '2010_state_rate', '10_20_state_difference']]
tract2020states = tract2020states.rename(columns={'GEO_ID_x':'GEO_ID'})
tract2020states['2020_tract_st_diff'] = tract2020states['2020_tract_rate'] - tract2020states['2020_state_rate']
tract2020states.sort_values(by=['2020_tract_st_diff'])

print(
    "Difference in records:", len(tracts2020) - len(tract2020states), "\n",
    "Number of tribal tracts:", len(tracts2020[tracts2020['Geo_Type'].str.contains("Tribal")])
)
#merging tracts with states will drop tribal tracts (as they have no state), so those are examined separately below

Difference in records: 426 
 Number of tribal tracts: 426


#### Join 2010 tract rates

In [64]:
# read in csvs with 2010 response data for tracts and states
tracts2010 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/data/2010responserate.csv')
#rename this column
tracts2010 = tracts2010.rename(columns={'FSRR2010':'2010_tract_rate'})
#tracts2010

In [65]:
## difference in row numbers: both tract dfs have 84519 rows, but when joined only 84093
# Identify what values are in tracts2010 and not in tracts2020
#key_diff1 = set(tracts2010.GEO_ID).difference(tracts2020.GEO_ID)
#len(key_diff1)
#key_diff1

# Identify what values are in tracts2020 and not in tracts2010
#key_diff2 = set(tracts2020.GEO_ID).difference(tracts2010.GEO_ID)
#len(key_diff2)
#key_diff2

# 2010 rates do not include the 426 tribal tracts
# Those differences account for 426 tracts, which is .5% of the original 84519 
# tracts. These tracts are dropped in the comparative analyses and are analyzed separately

In [66]:
# merge with 2020 tracts
tracts = pd.merge(tract2020states, tracts2010, on='GEO_ID')
#select only columns we want
cols = ['Geo_Name','county', 'State_y', 'Region', 'Geo_Type', '2020_tract_rate', '2010_tract_rate', '2020_tract_st_diff', '2020_state_rate', '2010_state_rate', '10_20_state_difference', 'GEO_ID']
tracts = tracts[cols]
#rename weird column name
tracts = tracts.rename(columns={'State_y':'State'})
#print df sorted largest --> smallest 2010 rate
#tracts.sort_values(by='2010_tract_rate', ascending=False)

In [67]:
#how many null 2010 response values are there
is_temp = tracts.isnull()
no_2010 = is_temp.any(axis=1)
no_2010 = tracts[no_2010]
no_2010.sort_values(by="2010_tract_rate")

#tracts2010[tracts2010['2010_tract_rate'] == 0]

print(
    "There are", len(no_2010), "null 2010 response rate values out of", 
    len(tracts2010), "total 2010 observations, \n or",
    len(no_2010) * 100 / len(tracts2010) , "% \n",
    "and", len(no_2010) * 100 / len(tracts), "% of the", len(tracts),
    "total of 2020 and 2010 tracts"
)


#how many 0 rate tracts in each state
print(no_2010['State'].value_counts())

There are 531 null 2010 response rate values out of 84519 total 2010 observations, 
 or 0.6282611010542009 % 
 and 0.6314437586957298 % of the 84093 total of 2020 and 2010 tracts
 Wisconsin               92
 Florida                 56
 Texas                   54
 California              54
 New York                50
 Arizona                 50
 New Mexico              30
 Massachusetts           21
 Washington              17
 Montana                 14
 South Dakota            13
 North Carolina           8
 Alabama                  7
 Minnesota                7
 Idaho                    6
 Utah                     6
 North Dakota             6
 Colorado                 6
 Wyoming                  6
 Nevada                   3
 Vermont                  3
 Virginia                 3
 Maine                    3
 New Hampshire            3
 Oklahoma                 2
 New Jersey               2
 Nebraska                 2
 Rhode Island             1
 Alaska                   1
 District

In [68]:
#create column with difference in 2010 vs 2020 response rate
tracts['10_20_tract_difference'] = (tracts['2020_tract_rate'] - tracts['2010_tract_rate']) / tracts['2010_tract_rate']
#sort df largest --> smallest 10-20 difference
tracts.sort_values(by='10_20_tract_difference', ascending=False)

Unnamed: 0,Geo_Name,county,State,Region,Geo_Type,2020_tract_rate,2010_tract_rate,2020_tract_st_diff,2020_state_rate,2010_state_rate,10_20_state_difference,GEO_ID,10_20_tract_difference
12328,"Tract 1103.01, Santa Cruz",Santa Cruz County,California,West,Census Tract,45.9,0.0,-17.0,62.9,73,-13.835616,1400000US06087110301,inf
53804,"Tract 1596.02, Suffolk",Suffolk County,New York,Northeast,Census Tract,60.8,0.0,3.5,57.3,69,-16.956522,1400000US36103159602,inf
53322,"Tract 605.05, Saratoga",Saratoga County,New York,Northeast,Census Tract,19.4,0.0,-37.9,57.3,69,-16.956522,1400000US36091060505,inf
53425,"Tract 7403, Schoharie",Schoharie County,New York,Northeast,Census Tract,8.3,0.0,-49.0,57.3,69,-16.956522,1400000US36095740300,inf
21943,"Tract 231.15, DeKalb",DeKalb County,Georgia,South,Census Tract,10.0,0.0,-47.7,57.7,72,-19.861111,1400000US13089023115,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83055,"Tract 9402.02, Fremont",Fremont County,Wyoming,West,Census Tract,30.8,,-25.1,55.9,69,-18.985507,1400000US56013940202,
83056,"Tract 9403.01, Fremont",Fremont County,Wyoming,West,Census Tract,37.6,,-18.3,55.9,69,-18.985507,1400000US56013940301,
83057,"Tract 9403.02, Fremont",Fremont County,Wyoming,West,Census Tract,48.1,,-7.8,55.9,69,-18.985507,1400000US56013940302,
83058,"Tract 9404, Fremont",Fremont County,Wyoming,West,Census Tract,59.8,,3.9,55.9,69,-18.985507,1400000US56013940400,


### Demographic Data

In [117]:
#load both sets of demographic data, join
temp1 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/data/nhgis0003_csv/nhgis0003_ds239_20185_2018_tract.csv', 
                    encoding='latin-1')
temp2 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/data/nhgis0003_csv/nhgis0003_ds240_20185_2018_tract.csv', 
                    encoding='latin-1')
demo = pd.merge(temp1, temp2, on=['GISJOIN', 'YEAR', 'REGIONA', 'DIVISIONA',
 'STATE', 'STATEA', 'COUNTY', 'COUNTYA', 'COUSUBA', 'PLACEA', 'TRACTA',
  'CONCITA', 'AIANHHA', 'RES_ONLYA', 'TRUSTA', 'AITSCEA', 'ANRCA', 'CBSAA',
 'CSAA', 'METDIVA', 'NECTAA', 'CNECTAA', 'NECTADIVA', 'UAA', 'CDCURRA',
 'SLDUA', 'SLDLA', 'ZCTA5A', 'SUBMCDA', 'SDELMA', 'SDSECA', 'SDUNIA',
 'PUMA5A', 'BTTRA', 'NAME_E'])

#create a new edited column so we can join with the response rate dfs
# many different lengths so selecting substr that's always the same
demo['tractNum'] = demo['NAME_E'].apply(lambda x: ' '.join(x.split(' ')[1:4]))
tracts['tractNum'] = tracts['Geo_Name'].apply(lambda x: ' '.join(x.split(' ')[0:3]))

# merge to create new df with response rates and demos for all tracts
temp = pd.merge(tracts, demo, on='tractNum')

#create new column adding up pop with rent > 30% income (homelessness marker)
#temp['rent_30_more'] = (temp['rent_30_34.9'] + temp['rent_35_39.9'] 
#+ temp['rent_40_49.9'] + temp['rent_50_over'])

#create column with % population black, hispanic, non-white
#temp['non_white_pct'] = (temp['total_population'] - temp['white_alone']) * 100 / temp['total_population']
#temp['black_pct'] = (temp['black_alone'] * 100) / temp['total_population']

# check to see if column created
#pd.set_option('display.max_columns', None)
#list(temp.columns)
temp

Unnamed: 0,Geo_Name,county,State,Region,Geo_Type,2020_tract_rate,2010_tract_rate,2020_tract_st_diff,2020_state_rate,2010_state_rate,10_20_state_difference,GEO_ID,10_20_tract_difference,tractNum,GISJOIN,YEAR,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,COUNTYA,COUSUBA,PLACEA,TRACTA,BLKGRPA,CONCITA,AIANHHA,RES_ONLYA,TRUSTA,AITSCEA,ANRCA,CBSAA,CSAA,METDIVA,NECTAA,CNECTAA,NECTADIVA,UAA,CDCURRA,...,AJ3KM003,AJ3KM004,AJ3KM005,AJ3KM006,AJ3KM007,AJ3KM008,AJ3KM009,AJ3KM010,AJ3KM011,AJ38M001,AJ38M002,AJ38M003,AJ38M004,AJ38M005,AJ38M006,AJ68E001,AJ68E002,AJ68E003,AJ68E004,AJ68E005,AJ68E006,AJ69E001,AJ69E002,AJ69E003,AJ69E004,AJ69E005,AJ69E006,NAME_M_y,AJ68M001,AJ68M002,AJ68M003,AJ68M004,AJ68M005,AJ68M006,AJ69M001,AJ69M002,AJ69M003,AJ69M004,AJ69M005,AJ69M006
0,"Tract 201, Autauga",Autauga County,Alabama,South,Census Tract,64.9,70.6,5.4,59.5,72,-17.361111,1400000US01001020100,-0.080737,"Tract 201, Autauga",G0100010020100,2014-2018,,,Alabama,1,Autauga County,1,,,20100,,,,,,,,,,,,,,,,...,18.0,12.0,10.0,12.0,7.0,21.0,38.0,40.0,22.0,73,79,12,89,43,50,1923.0,1870.0,0.0,11.0,18.0,24.0,,,,,,,"Census Tract 201, Autauga County, Alabama",253.0,253.0,12.0,12.0,18.0,19.0,,,,,,
1,"Tract 202, Autauga",Autauga County,Alabama,South,Census Tract,65.6,70.1,6.1,59.5,72,-17.361111,1400000US01001020200,-0.064194,"Tract 202, Autauga",G0100010020200,2014-2018,,,Alabama,1,Autauga County,1,,,20200,,,,,,,,,,,,,,,,...,27.0,16.0,14.0,49.0,32.0,26.0,7.0,53.0,13.0,87,83,12,73,39,75,2028.0,1993.0,0.0,10.0,15.0,10.0,,,,,,,"Census Tract 202, Autauga County, Alabama",192.0,193.0,12.0,12.0,16.0,16.0,,,,,,
2,"Tract 203, Autauga",Autauga County,Alabama,South,Census Tract,74.0,73.6,14.5,59.5,72,-17.361111,1400000US01001020300,0.005435,"Tract 203, Autauga",G0100010020300,2014-2018,,,Alabama,1,Autauga County,1,,,20300,,,,,,,,,,,,,,,,...,11.0,42.0,47.0,54.0,43.0,37.0,28.0,76.0,69.0,110,125,12,123,84,70,3476.0,3231.0,5.0,60.0,49.0,131.0,,,,,,,"Census Tract 203, Autauga County, Alabama",433.0,377.0,11.0,50.0,56.0,122.0,,,,,,
3,"Tract 204, Autauga",Autauga County,Alabama,South,Census Tract,78.1,78.4,18.6,59.5,72,-17.361111,1400000US01001020400,-0.003827,"Tract 204, Autauga",G0100010020400,2014-2018,,,Alabama,1,Autauga County,1,,,20400,,,,,,,,,,,,,,,,...,30.0,32.0,61.0,35.0,63.0,29.0,11.0,45.0,21.0,144,151,11,148,78,66,3831.0,3720.0,0.0,30.0,30.0,51.0,,,,,,,"Census Tract 204, Autauga County, Alabama",337.0,318.0,12.0,27.0,43.0,76.0,,,,,,
4,"Tract 206, Autauga",Autauga County,Alabama,South,Census Tract,66.7,71.9,7.2,59.5,72,-17.361111,1400000US01001020600,-0.072323,"Tract 206, Autauga",G0100010020600,2014-2018,,,Alabama,1,Autauga County,1,,,20600,,,,,,,,,,,,,,,,...,87.0,47.0,13.0,19.0,12.0,12.0,44.0,67.0,47.0,122,121,12,125,73,84,3705.0,3647.0,0.0,25.0,21.0,12.0,,,,,,,"Census Tract 206, Autauga County, Alabama",342.0,336.0,12.0,27.0,18.0,19.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65852,"Tract 7505.01, Yauco",Yauco Municipio,Puerto Rico,Puerto Rico,Census Tract,33.2,70.4,10.6,22.6,54,-58.148148,1400000US72153750501,-0.528409,"Tract 7505.01, Yauco",G7201530750501,2014-2018,,,Puerto Rico,72,Yauco Municipio,153,,,750501,,,,,,,,,,,,,,,,...,15.0,30.0,21.0,19.0,29.0,37.0,24.0,29.0,113.0,162,161,30,152,59,124,,,,,,,6303.0,6148.0,135.0,0.0,0.0,20.0,"Census Tract 7505.01, Yauco Municipio, Puerto ...",,,,,,,872.0,868.0,68.0,19.0,19.0,22.0
65853,"Tract 7505.02, Yauco",Yauco Municipio,Puerto Rico,Puerto Rico,Census Tract,39.2,73.4,16.6,22.6,54,-58.148148,1400000US72153750502,-0.465940,"Tract 7505.02, Yauco",G7201530750502,2014-2018,,,Puerto Rico,72,Yauco Municipio,153,,,750502,,,,,,,,,,,,,,,,...,9.0,22.0,23.0,14.0,14.0,23.0,14.0,25.0,52.0,85,78,17,77,18,54,,,,,,,2316.0,2201.0,97.0,0.0,13.0,5.0,"Census Tract 7505.02, Yauco Municipio, Puerto ...",,,,,,,386.0,356.0,89.0,14.0,20.0,7.0
65854,"Tract 7505.03, Yauco",Yauco Municipio,Puerto Rico,Puerto Rico,Census Tract,25.9,63.4,3.3,22.6,54,-58.148148,1400000US72153750503,-0.591483,"Tract 7505.03, Yauco",G7201530750503,2014-2018,,,Puerto Rico,72,Yauco Municipio,153,,,750503,,,,,,,,,,,,,,,,...,14.0,19.0,19.0,14.0,14.0,14.0,11.0,14.0,56.0,82,70,29,66,24,65,,,,,,,2244.0,2164.0,35.0,34.0,6.0,5.0,"Census Tract 7505.03, Yauco Municipio, Puerto ...",,,,,,,451.0,435.0,27.0,53.0,8.0,7.0
65855,"Tract 7506.01, Yauco",Yauco Municipio,Puerto Rico,Puerto Rico,Census Tract,27.0,67.1,4.4,22.6,54,-58.148148,1400000US72153750601,-0.597615,"Tract 7506.01, Yauco",G7201530750601,2014-2018,,,Puerto Rico,72,Yauco Municipio,153,,,750601,,,,,,,,,,,,,,,,...,14.0,25.0,13.0,14.0,14.0,13.0,30.0,14.0,52.0,166,148,44,126,74,118,,,,,,,4107.0,4007.0,100.0,0.0,0.0,0.0,"Census Tract 7506.01, Yauco Municipio, Puerto ...",,,,,,,668.0,650.0,91.0,14.0,14.0,14.0


In [70]:
print(
    "There are", len(tracts), "observations in the tracts df but only \n",
    len(temp), "after joining with the demographic data, \n",
    "a difference of", len(tracts) - len(temp)
)

There are 84093 observations in the tracts df but only 
 65857 after joining with the demographic data, 
 a difference of 18236


In [124]:
## difference in row numbers
# Identify what values are in tracts and not in demo
key_diff11 = set(tracts.tractNum).difference(demo.tractNum)
key_diff1 = pd.DataFrame(sorted(list(key_diff11)))
key_diff1 = key_diff1.rename(columns={0:'tractNum'})

# Identify what values are in demo and not in tracts
key_diff22 = set(demo.tractNum).difference(tracts.tractNum)
key_diff2 = pd.DataFrame(list(key_diff22))
key_diff2 = key_diff2.rename(columns={0:'tractNum'})

#key_diff1.to_csv("/gdrive/My Drive/0USA Today/unpaired_tracts.csv")
#key_diff2.to_csv("/gdrive/My Drive/0USA Today/unpaired_demos.csv")
#temp['tractNum'].to_csv("/gdrive/My Drive/0USA Today/paired.csv")

#print(key_diff1, "\n\n",
 #     key_diff2)
print("There are", len(key_diff1), 
      "tracts in the tracts df that are not in the demographics df \n",
      "And there are", len(key_diff2), 
      "tracts in the demo df not in the tracts df \n",
      "That's", len(key_diff1) + len(key_diff2), "unpaired values, which is \n",
      (len(key_diff1) + len(key_diff2)) * 100 / len(tracts), "% of observations in the tracts df \n\n")

print(
    '# Tracts per State not paired from demo df:\n',
    pd.merge(key_diff2, demo[['STATE', 'tractNum']], on='tractNum').STATE.value_counts(), "\n\n",
    '# Tracts per State not paired from tracts df: \n',
    pd.merge(key_diff1, tracts[['State', 'tractNum']], on='tractNum').State.value_counts()
)

There are 21914 tracts in the tracts df that are not in the demographics df 
 And there are 11616 tracts in the demo df not in the tracts df 
 That's 33530 unpaired values, which is 
 39.87252208863996 % of observations in the tracts df 


# Tracts per State not paired from demo df:
 Texas                   1459
California              1135
Florida                  951
Georgia                  703
New York                 573
North Carolina           453
Michigan                 362
Ohio                     337
Pennsylvania             293
Washington               287
Virginia                 284
Missouri                 250
Louisiana                238
Arizona                  230
Tennessee                221
South Carolina           216
Alabama                  216
Indiana                  195
Colorado                 189
New Jersey               185
Mississippi              182
Oklahoma                 179
Kentucky                 171
Massachusetts            163
Minnesota          

In [92]:
demo[key_diff2].STATE.value_counts()

KeyboardInterrupt: ignored

In [None]:
#df.std()Returns the standard deviation of each column
#df.corr()Returns the correlation between columns in a data frame

#In order to get # of null/missing values for each column, run 
pd.set_option('display.max_rows', 100)
pd.isnull(temp).sum()

# make a dataframe of all rows with na value
temp1 = temp[temp.isna().any(axis=1)]
temp1
# how many nas in each state
temp2 = temp1['State'].value_counts()
temp2 = pd.DataFrame(temp2)
temp2 = temp2.reset_index().rename(columns={'index':'state', 'State':'count_na'})
temp2
#compare to total 
temp3 = temp['State'].value_counts()
temp3 = pd.DataFrame(temp3)
temp3 = temp3.reset_index().rename(columns={'index':'state', 'State':'count'})
temp3

#see what percent of state values of na
#Puerto Rico will lose the greatest % if these are dropped
temp4 = pd.merge(temp2, temp3, on=['state'])
temp4['na_percent'] = (temp4['count_na']*100) / temp4['count']
temp4

#how many total nas? --> 1171
sum(temp4['count_na'])

#nas account for what total % of all rows --> 1.82
(sum(temp4['count_na'])*100) / sum(temp4['count'])

#Less than 2% of total, so for now will drop those rows
# before df had 64413 rows
#df = temp.dropna()    #now has 63242 rows, 1171 difference (total # nas)

In [None]:
#check if any nas
#pd.set_option('display.max_rows', None)
#print(pd.isnull(df).sum())

#still 52 states? (50 + DC and PR)
#print("Number states: ", len(df['State'].unique()))

#compare tract numbers now to before na drop
temp1 = pd.DataFrame(temp.groupby('State')['Geo_Name'].nunique()).rename(columns={'Geo_Name':'beforeDrop'})
temp2 = pd.DataFrame(df.groupby('State')['Geo_Name'].nunique()).rename(columns={'Geo_Name':'afterDrop'})
temp3 = pd.merge(temp1, temp2, left_index = True, right_index=True)
#number tracts dropped from each state
temp3['numDrop'] = temp3['beforeDrop'] - temp3['afterDrop']
# the percentage of total tracts dropped
temp3['dropPct'] = temp3['numDrop']*100 / temp3['beforeDrop']
#temp3 #puerto rico loses the most tracts at 5.7%

In [None]:
#return column size
pd.set_option('display.max_columns', 15)
#pd.set_option('display.max_row', 50)

#only select columns we want
cols = ['Geo_Name', 'county', 'State','Region',
'2020_tract_rate', '2010_tract_rate', '2020_tract_st_diff',
'2020_state_rate', '2010_state_rate',
'10_20_state_difference', '10_20_tract_difference',
'total_population', 'white_alone', 'black_alone', 'black_pct',
'amerindian_alone', 'non_white_pct',
'total_education', 'no_school', 'some_school', 'diploma', 
'ged', 'some_college','associate','bachelor',
'master','prof_school','doctorate',
'income_poverty_ratio','language_total',
'lang_english_only','lang_spanish','lang_spanish_limited_english',
'median_household_income','per_capita_income',
'total_houses','occupied_houses','vacant_houses',
'total_occupied_houses','owner_occupied','renter_occupied',
'median_gross_rent','rent_to_income','rent_30_more',
'total_computer_status','has_computer','dial_up_computer',
'broadband_computer','no_internet_computer','no_computer',
'employment_total','labor_force','civilian_labor_force',
'civilian_employed','civilian_unemployed',
'us_pop','us_born','us_territory_born','us_born_abroad',
'us_naturalization','not_us_citizen',
'total_pr','pr_born','pr_us_born','pr_born_abroad',
'pr_naturalization','pr_not_us_citizen']

df = df[cols]
#print df with no nas
df.to_csv("/gdrive/My Drive/0USA Today/all_rates_demos_merged.csv")
df

## Analysis

In [None]:
##see all dfs in memory
%whos DataFrame

#### Existing DFs:

#dataframe with all years, states, tracts, demographics
#df

#2010 and 2020 states
#states

#2010 States
#states2010

#2020 States
#states2020

#2010 and 2020 tracts and states
#tracts

#2020 tracts paired with states, includes int data
#tracts2020states

# ignore all dfs with "temp" in name 

In [None]:
# get average state rate and see how many are above average
i = states.mean(axis=0)['2020_state_rate']
print(
    "61.8% is the current nationwide response rate and",
    np.sum(states2020['2020_state_rate'] > 61.8),
    "states exceed that", "\n")
print(states2020[states2020['2020_state_rate'] > 61.8].State.to_list(), "/n")

temp = states2020['2020_state_rate'] > 61.8

#how many in each region?
states2020[temp].Region.value_counts()

In [None]:
#how many tracts are > than state avg?

#tract2020States.count(tract2020States['2020_tract_st_diff'] > 0)
print(np.sum(tract2020states['2020_tract_st_diff'] > 0), "tracts out of", 
      len(tract2020states), 
      "total (", 
      (np.sum(tract2020states['2020_tract_st_diff'] > 0) * 100) / len(tract2020states), "% )",
      "currently have greater census response rates than their state average")

In [None]:
# get info on 1000 tracks with greatest drop in response rate
big_drop = tracts.sort_values(by='10_20_tract_difference', ascending=True).head(1000)
big_drop

In [None]:
#lowest tract rates
lowest = tracts.sort_values(by='2020_tract_rate', ascending=True).head(1000)
lowest

### National Comparative Rankings 2010 vs 2020

In [None]:
#Discrepancies? Shouldn't these all match up?
# Average is sum / #obs
print(
  "2010 State Resonse Average from states data:",
    states.mean(axis=0)['2010_state_rate'], "\n",
  "2010 States Response Average from df data:",
    df.mean(axis=0)['2010_state_rate'], "\n",
  "2010 Tract Response Average from df data:",
    df.mean(axis=0)['2010_tract_rate'],  "\n"
)
## totals intended to be sum(percentages) / 100 (denominator)
print(
  "2010 State Resonse Total from states data:",
    (states['2010_state_rate'].sum()) / 100, "\n",
  "2010 States Response Total from df data:",
    (df['2010_state_rate'].sum()) / 100, "\n",
  "2010 Tract Response Total from df data:",
    (df['2010_tract_rate'].sum()) / 100
)

In [None]:
# Average difference between current state rates and 2010 state rates as of 6/15/20
#takes a while to run so commented out
#df.mean(axis=0)['10_20_state_difference']

#average 2020 response rate across states?
print(
  "2020 State Resonse Average from states data:",
    states.mean(axis=0)['2020_state_rate'], "\n",
  "2020 States Response Average from df data:",
    df.mean(axis=0)['2020_state_rate'], "\n",
  "2020 Tract Response Average from df data:",
    df.mean(axis=0)['2020_tract_rate'], "\n"
)

#is the total not sum(numerator) / 100? where numerator is percentage response rate?
print(
  "2020 State Resonse Total from states data:",
    (states['2020_state_rate'].sum()) / (100), "\n",
  "2020 States Response Total from df data:",
    (df['2020_state_rate'].sum()) / (100), "\n",
  "2020 Tract Response Total from df data:",
    (df['2020_tract_rate'].sum()) / (100)
)

In [None]:
# average difference by region
states.groupby('Region').mean().sort_values(by='10_20_state_difference', ascending=False)

In [None]:
#assign ranks to states based on comparative response rate
states['2020_rank'] = states['2020_state_rate'].rank(method='max', ascending=False)
states['2010_rank'] = states['2010_state_rate'].rank(method='max', ascending=False)

#pull ranks into separate dataframe
state_ranks = states[['State', '2020_rank', '2010_rank']].sort_values(by='2020_rank')

#show change in rank from 2010 to 2020
#negative number means a state has a lower 2020 response rate and has gone down in rankings
state_ranks['rank_change'] = state_ranks['2010_rank'] - state_ranks['2020_rank']
#state_ranks.sort_values(by='rank_change', ascending=True)

#see how many states only changed 2 or fewer positions
#small_change = state_ranks[state_ranks.rank_change.between(-2, 2, inclusive=True)].sort_values(by='rank_change')
#small_change
#16 states have stayed ~similar in the rankings, and this seems to impact
#states with both high and low response rates
#small_change.mean(axis=0)['2020_rank']

### Internet Usage

Internet usage is only available for 2020 rates

In [None]:
### Percent of response rate not from internet

states2020.sort_values(by='state_not_int_pct', ascending=False)
print(
    "The average state response rate to the census NOT conducted online:",
    states2020.mean(axis=0)['state_not_int_pct'], "/n",
    states2020.mean(axis=0)['state_not_int']
)

In [None]:
# average non internet response rate
states2020.groupby(by='State').mean().sort_values(by='state_internet', ascending=False)

In [None]:
# highest non-internet response rate (not_int)
states2020.sort_values(by='state_not_int', ascending=False)

In [None]:
states2020.mean(axis=0)['state_not_int']

In [None]:
states2020.mean(axis=0)['state_internet']

In [None]:
states2020.mean(axis=0)['2020_state_rate']

In [None]:
### Without Puerto Rico

#make non-pr df
no_pr = states2020[states2020['State'] != 'Puerto Rico']

# average internet response
no_pr.mean(axis=0)['state_internet']

In [None]:
# average overall response rate
no_pr.mean(axis=0)['2020_state_rate']

### Region Analysis

In [None]:
# average by region
# NOTE as tribal tracts are not assigned to a state they do not have a corresponding region and thus are not counted in the regional calculations
states.groupby('Region').mean().sort_values(by='2020_state_rate', ascending=False)

In [None]:
# average difference as of 6/15/20
#this can take a while to run so is commented out unless needed
#tracts.mean(axis=0)['10_20_tract_difference']

In [None]:
# average difference by region
tracts.groupby('Region').mean().sort_values(by='10_20_tract_difference', ascending=False)

In [None]:
#tract average differences vs state rates
tracts.groupby('State').mean().sort_values(by='2020_state_rate', ascending=False)

### Tribal tracts

In [None]:
# create df with response rates in tribal tracts
tribal = tracts2020[tracts2020['Geo_Type'].str.contains("Tribal")]
tribal.sort_values(by='2020_tract_rate', ascending=False)

In [None]:
### tribal areas and tracts stats

#mean non internet response
tribal.mean(axis=0)['tract_not_int'] #8.37%

In [None]:
# mean internet response rate
tribal.mean(axis=0)['internet']

In [None]:
# mean overall response rate
tribal.mean(axis=0)['2020_tract_rate']

### Tracts with 0 overall response rate

In [None]:
## Tracts with 0 cumulative response rate
is_zero = df['2020_tract_rate'] == 0
zeros = df[is_zero]
zeros.sort_values(by='State')

print(
    "Number of tracts with 0 cumulative response rate:", len(zeros), "\n"
    )

#make dataframe of states with # tracts with 0%, number total tracts, and what % of total tracts are 0
temp = pd.DataFrame(zeros['State'].value_counts())
temp2 = pd.DataFrame(tracts['State'].value_counts())
temp3 = pd.merge(temp, temp2, right_index=True, left_index=True)
#rename the columns
temp3 = temp3.rename(columns={"State_x": "0_tracts", "State_y" : "total_tracts"})
#compute percentage
temp3['0_percent'] = temp3['0_tracts'] * 100 / temp3['total_tracts']
temp3

zeros.sort_values(by='State')

In [None]:
zeros.mean('black_pct')

## Regressions

### With Puerto Rico

#### 2020 Regressions

In [None]:
### 2020 Multi-regression

import statsmodels.api as sm

#put all variables for predicting 2020 rates in dataframe
variables20 = df[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target20 = df[["2020_tract_rate"]]

#build model and print summary
model20 = sm.OLS(target20, variables20).fit()
print(model20.summary())

In [None]:
### 2020 linear regressions for each variable

from sklearn import linear_model

#create list of variable names
cols = variables20.columns.tolist()
#build linear model
regr = linear_model.LinearRegression()
#create empty list to store loop results
rows = []
#loop through each variable
for i in cols:
    #fit linear model to variable
    regr.fit(variables20[[i]], target20)
    #save model variable name, intercept, coef and r^2 to list
    rows.append([i, regr.intercept_, regr.coef_, regr.score(variables20[[i]], target20)])

#turn list into df with these column names
linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets lol
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['intercept'] = linears20['intercept'].str.get(0)

#print df sorted coefs largest --> smallest
linears20.sort_values(by='coefficient', ascending=False)

#### Normalized 2020 inputs

In [None]:
### 2020 Multi regression with normalized variables

#normalize variable values
norm_variables20 = (variables20 - variables20.min()) / (variables20.max() - variables20.min())

#build normalized multi-regress model and print summary
norm_model20 = sm.OLS(target20, norm_variables20).fit()
print(norm_model20.summary())

In [None]:
#2020 normalized linear regressions for each variable

# linear regression for each variable 'i'
cols = norm_variables20.columns.tolist()
# create model
norm_regr = linear_model.LinearRegression()
#empty list for loop results
rows = []
#loop through each variable
for i in cols:
    #fit model to each variable
    norm_regr.fit(norm_variables20[[i]], target20)
    #add model results to list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_, 
                 norm_regr.score(norm_variables20[[i]], target20)])

#turn list into dataframe
norm_linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['intercept'] = norm_linears20['intercept'].str.get(0)

#print df ordered by largest to smallest coef
norm_linears20.sort_values(by='coefficient', ascending=False)

#### 2010 Regressions

In [None]:
### 2010 multi regression

#put all variables for predicting 2010 rates in dataframe
variables10 = df[['total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2010 response rates - in separate dataframe
target10 = df[["2010_tract_rate"]]

#create model and print summary table
model10 = sm.OLS(target10, variables10).fit()
print(model10.summary())

In [None]:
### 2010 linear regression for each variable

#list of variable names
cols = variables10.columns.tolist()
#build multi-reg model
regr = linear_model.LinearRegression()
#create empty list for loop results
rows = []

#loop through variables
for i in cols:
    #fit a model to the current variable
    regr.fit(variables10[[i]], target10)
    #save the model's resulting variable name, intercept, coef, and r^2
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables10[[i]], target10)])

#turn list into data frame
linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['intercept'] = linears10['intercept'].str.get(0)

#print data frame ordered coefficient largest --> smallest
linears10.sort_values(by='coefficient', ascending=False)

#### Normalized 2010 Regressions

In [None]:
### 2010 normalized multi-regression

#normalize the variables
norm_variables10 = (variables10 - variables10.min()) / (variables10.max() - variables10.min())

#build normalized model and print summary
norm_model10 = sm.OLS(target10, norm_variables10).fit()
print(norm_model10.summary())

In [None]:
###2010 normalized linear regressions for each variable

#create list of variable names
cols = norm_variables10.columns.tolist()
#build the model
norm_regr = linear_model.LinearRegression()
#create empty list for model results
rows = []
#cycle through variables
for i in cols:
    #do the linear regression on the current variable
    norm_regr.fit(norm_variables10[[i]], target10)
    #add the corresponding variable name, intercept, coefficient and r-squared to the list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_,
                norm_regr.score(norm_variables10[[i]], target10)])

#turn list into data frame with these column names
norm_linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['intercept'] = norm_linears10['intercept'].str.get(0)

#print df sorted by coefficient
norm_linears10.sort_values(by='coefficient', ascending=False)

#### 2020 Edited Regressions
Edited = Run regressions with fewer variables

In [None]:
### 2020 edited multi-regressions

#put all variables for predicting 2020 rates in dataframe
variables_ed = df[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other', 'income_poverty_ratio', 'per_capita_income',
 'civilian_employed', 'civilian_unemployed',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income','rent_30_more',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer',
 'us_born', 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target_ed = df[["2020_tract_rate"]]

#build and fit the multi-regression model
model_ed = sm.OLS(target_ed, variables_ed).fit()
#print out the model summary table
print(model_ed.summary())

In [None]:
### 2020 edited linear regressions

#create list of variable names
cols = variables_ed.columns.tolist()
#build the model
regr = linear_model.LinearRegression()
#create empty list to append results
rows = []

#loop through variables
for i in cols:
    #fit the model
    regr.fit(variables_ed[[i]], target_ed)
    #put model variable name, intercept, coef and r^2 in list
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables_ed[[i]], target_ed)])

#turn list into data frame with these column names
linears_ed = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['intercept'] = linears_ed['intercept'].str.get(0)

#print df sorted by coefficient largest --> smallest
linears_ed.sort_values(by='coefficient', ascending=False)

In [None]:
### normalized 2020 edited variables multi regression

#normalize the variables
norm_variables_ed = (variables_ed - variables_ed.min()) / (variables_ed.max() - variables_ed.min())

#build normalized model and print summary
norm_model_ed = sm.OLS(target_ed, norm_variables_ed).fit()
print(norm_model_ed.summary())

### Without Puerto Rico

#### 2020 regressions

In [None]:
### 2020 Multi-regression

#df without puerto rico for regression 
no_pr = df[df['Region'] != 'Puerto Rico']
#this is different from earlier code no_pr = states[states.State != 'Puerto Rico']
#earlier code omitted Puerto Rico from state level data. this eliminates from tract level data


import statsmodels.api as sm

#put all variables for predicting 2020 rates in dataframe
variables20 = no_pr[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target20 = no_pr[["2020_tract_rate"]]

#build model and print summary
model20 = sm.OLS(target20, variables20).fit()
model20.summary()

In [None]:
### 2020 linear regressions for each variable

from sklearn import linear_model

#create list of variable names
cols = variables20.columns.tolist()
#build linear model
regr = linear_model.LinearRegression()
#create empty list to store loop results
rows = []
#loop through each variable
for i in cols:
    #fit linear model to variable
    regr.fit(variables20[[i]], target20)
    #save model variable name, intercept, coef and r^2 to list
    rows.append([i, regr.intercept_, regr.coef_, regr.score(variables20[[i]], target20)])

#turn list into df with these column names
linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets lol
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['intercept'] = linears20['intercept'].str.get(0)

#print df sorted coefs largest --> smallest
linears20.sort_values(by='coefficient', ascending=False)

#### Normalized 2020 inputs

In [None]:
### 2020 Multi regression with normalized variables

#normalize variable values
norm_variables20 = (variables20 - variables20.min()) / (variables20.max() - variables20.min())

#build normalized multi-regress model and print summary
norm_model20 = sm.OLS(target20, norm_variables20).fit()
norm_model20.summary()

In [None]:
#2020 normalized linear regressions for each variable

# linear regression for each variable 'i'
cols = norm_variables20.columns.tolist()
# create model
norm_regr = linear_model.LinearRegression()
#empty list for loop results
rows = []
#loop through each variable
for i in cols:
    #fit model to each variable
    norm_regr.fit(norm_variables20[[i]], target20)
    #add model results to list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_, 
                 norm_regr.score(norm_variables20[[i]], target20)])

#turn list into dataframe
norm_linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['intercept'] = norm_linears20['intercept'].str.get(0)

#print df ordered by largest to smallest coef
norm_linears20.sort_values(by='coefficient', ascending=False)

#### 2010 Regressions

In [None]:
### 2010 multi regression

#put all variables for predicting 2010 rates in dataframe
variables10 = no_pr[['total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2010 response rates - in separate dataframe
target10 = no_pr[["2010_tract_rate"]]

#create model and print summary table
model10 = sm.OLS(target10, variables10).fit()
model10.summary()

In [None]:
### 2010 linear regression for each variable

#list of variable names
cols = variables10.columns.tolist()
#build multi-reg model
regr = linear_model.LinearRegression()
#create empty list for loop results
rows = []

#loop through variables
for i in cols:
    #fit a model to the current variable
    regr.fit(variables10[[i]], target10)
    #save the model's resulting variable name, intercept, coef, and r^2
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables10[[i]], target10)])

#turn list into data frame
linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['intercept'] = linears10['intercept'].str.get(0)

#print data frame ordered coefficient largest --> smallest
linears10.sort_values(by='coefficient', ascending=False)

#### Normalized 2010 Regressions

In [None]:
### 2010 normalized multi-regression

#normalize the variables
norm_variables10 = (variables10 - variables10.min()) / (variables10.max() - variables10.min())

#build normalized model and print summary
norm_model10 = sm.OLS(target10, norm_variables10).fit()
norm_model10.summary()

In [None]:
###2010 normalized linear regressions for each variable

#create list of variable names
cols = norm_variables10.columns.tolist()
#build the model
norm_regr = linear_model.LinearRegression()
#create empty list for model results
rows = []
#cycle through variables
for i in cols:
    #do the linear regression on the current variable
    norm_regr.fit(norm_variables10[[i]], target10)
    #add the corresponding variable name, intercept, coefficient and r-squared to the list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_,
                norm_regr.score(norm_variables10[[i]], target10)])

#turn list into data frame with these column names
norm_linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['intercept'] = norm_linears10['intercept'].str.get(0)

#print df sorted by coefficient
norm_linears10.sort_values(by='coefficient', ascending=False)

#### 2020 Edited Regressions
Edited = Run regressions with fewer variables

In [None]:
### 2020 edited multi-regressions

#put all variables for predicting 2020 rates in dataframe
variables_ed = no_pr[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other', 'income_poverty_ratio', 'per_capita_income',
 'civilian_employed', 'civilian_unemployed',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income','rent_30_more',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer',
 'us_born', 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target_ed = no_pr[["2020_tract_rate"]]

#build and fit the multi-regression model
model_ed = sm.OLS(target_ed, variables_ed).fit()
#print out the model summary table
model_ed.summary()

In [None]:
### 2020 edited linear regressions

#create list of variable names
cols = variables_ed.columns.tolist()
#build the model
regr = linear_model.LinearRegression()
#create empty list to append results
rows = []

#loop through variables
for i in cols:
    #fit the model
    regr.fit(variables_ed[[i]], target_ed)
    #put model variable name, intercept, coef and r^2 in list
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables_ed[[i]], target_ed)])

#turn list into data frame with these column names
linears_ed = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['intercept'] = linears_ed['intercept'].str.get(0)

#print df sorted by coefficient largest --> smallest
linears_ed.sort_values(by='coefficient', ascending=False)

In [None]:
### normalized 2020 edited variables multi regression

#normalize the variables
norm_variables_ed = (variables_ed - variables_ed.min()) / (variables_ed.max() - variables_ed.min())

#build normalized model and print summary
norm_model_ed = sm.OLS(target_ed, norm_variables_ed).fit()
norm_model_ed.summary()

### Puerto Rico

In [None]:
#create df with just puerto rico
is_pr = df['Region'] == 'Puerto Rico'
pr = df[is_pr]
pr

#### 2020 regressions

In [None]:
### 2020 Multi-regression

import statsmodels.api as sm

#put all variables for predicting 2020 rates in dataframe
variables20 = pr[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target20 = pr[["2020_tract_rate"]]

#build model and print summary
model20 = sm.OLS(target20, variables20).fit()
model20.summary()

In [None]:
### 2020 linear regressions for each variable

from sklearn import linear_model

#create list of variable names
cols = variables20.columns.tolist()
#build linear model
regr = linear_model.LinearRegression()
#create empty list to store loop results
rows = []
#loop through each variable
for i in cols:
    #fit linear model to variable
    regr.fit(variables20[[i]], target20)
    #save model variable name, intercept, coef and r^2 to list
    rows.append([i, regr.intercept_, regr.coef_, regr.score(variables20[[i]], target20)])

#turn list into df with these column names
linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets lol
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['intercept'] = linears20['intercept'].str.get(0)

#print df sorted coefs largest --> smallest
linears20.sort_values(by='coefficient', ascending=False)

#### Normalized 2020 inputs

In [None]:
### 2020 Multi regression with normalized variables

#normalize variable values
norm_variables20 = (variables20 - variables20.min()) / (variables20.max() - variables20.min())

#build normalized multi-regress model and print summary
norm_model20 = sm.OLS(target20, norm_variables20).fit()
norm_model20.summary()

In [None]:
#2020 normalized linear regressions for each variable

# linear regression for each variable 'i'
cols = norm_variables20.columns.tolist()
# create model
norm_regr = linear_model.LinearRegression()
#empty list for loop results
rows = []
#loop through each variable
for i in cols:
    #fit model to each variable
    norm_regr.fit(norm_variables20[[i]], target20)
    #add model results to list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_, 
                 norm_regr.score(norm_variables20[[i]], target20)])

#turn list into dataframe
norm_linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['intercept'] = norm_linears20['intercept'].str.get(0)

#print df ordered by largest to smallest coef
norm_linears20.sort_values(by='coefficient', ascending=False)

#### 2010 Regressions

In [None]:
### 2010 multi regression

#put all variables for predicting 2010 rates in dataframe
variables10 = pr[['total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2010 response rates - in separate dataframe
target10 = pr[["2010_tract_rate"]]

#create model and print summary table
model10 = sm.OLS(target10, variables10).fit()
model10.summary()

In [None]:
### 2010 linear regression for each variable

#list of variable names
cols = variables10.columns.tolist()
#build multi-reg model
regr = linear_model.LinearRegression()
#create empty list for loop results
rows = []

#loop through variables
for i in cols:
    #fit a model to the current variable
    regr.fit(variables10[[i]], target10)
    #save the model's resulting variable name, intercept, coef, and r^2
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables10[[i]], target10)])

#turn list into data frame
linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['intercept'] = linears10['intercept'].str.get(0)

#print data frame ordered coefficient largest --> smallest
linears10.sort_values(by='coefficient', ascending=False)

#### Normalized 2010 Regressions

In [None]:
### 2010 normalized multi-regression

#normalize the variables
norm_variables10 = (variables10 - variables10.min()) / (variables10.max() - variables10.min())

#build normalized model and print summary
norm_model10 = sm.OLS(target10, norm_variables10).fit()
norm_model10.summary()

In [None]:
###2010 normalized linear regressions for each variable

#create list of variable names
cols = norm_variables10.columns.tolist()
#build the model
norm_regr = linear_model.LinearRegression()
#create empty list for model results
rows = []
#cycle through variables
for i in cols:
    #do the linear regression on the current variable
    norm_regr.fit(norm_variables10[[i]], target10)
    #add the corresponding variable name, intercept, coefficient and r-squared to the list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_,
                norm_regr.score(norm_variables10[[i]], target10)])

#turn list into data frame with these column names
norm_linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['intercept'] = norm_linears10['intercept'].str.get(0)

#print df sorted by coefficient
norm_linears10.sort_values(by='coefficient', ascending=False)

#### 2020 Edited Regressions
Edited = Run regressions with fewer variables

In [None]:
### 2020 edited multi-regressions

#put all variables for predicting 2020 rates in dataframe
variables_ed = pr[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other', 'income_poverty_ratio', 'per_capita_income',
 'civilian_employed', 'civilian_unemployed',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income','rent_30_more',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer',
 'us_born', 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target_ed = pr[["2020_tract_rate"]]

#build and fit the multi-regression model
model_ed = sm.OLS(target_ed, variables_ed).fit()
#print out the model summary table
model_ed.summary()

In [None]:
### 2020 edited linear regressions

#create list of variable names
cols = variables_ed.columns.tolist()
#build the model
regr = linear_model.LinearRegression()
#create empty list to append results
rows = []

#loop through variables
for i in cols:
    #fit the model
    regr.fit(variables_ed[[i]], target_ed)
    #put model variable name, intercept, coef and r^2 in list
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables_ed[[i]], target_ed)])

#turn list into data frame with these column names
linears_ed = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['intercept'] = linears_ed['intercept'].str.get(0)

#print df sorted by coefficient largest --> smallest
linears_ed.sort_values(by='coefficient', ascending=False)

In [None]:
### normalized 2020 edited variables multi regression

#normalize the variables
norm_variables_ed = (variables_ed - variables_ed.min()) / (variables_ed.max() - variables_ed.min())

#build normalized model and print summary
norm_model_ed = sm.OLS(target_ed, norm_variables_ed).fit()
norm_model_ed.summary()