<a href="https://colab.research.google.com/github/tdiffendal/USAT/blob/master/census-responses/census_responses_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2020 Census Response Rate Analysis

### Theresa Diffendal, USA Today data intern, 06/2020

#### 2020 response rates from: https://2020census.gov/en/response-rates.html
#### 2010 response rates from: https://api.census.gov/data/2010/dec/responserate/variables.html
#### Demographic information in 2014-2018 ACS 5-year-estimate from: https://data2.nhgis.org/main

## Column Names

GEO_ID = Geographic Identifier

RESP_DATE = Posting Date

State = name of state (one of the 50 states, District of Columbia, Puerto Rico, or NaN)

Geo_Name = name of the tract, county, state

Region = region of the U.S. in which state is located as defined by census map at https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf

Geo_Type = type of geography; possible answers include Census Tract, Congressional District, Consolidated City, Country, County, County Subdivision, Place, Region, State, Tribal Tract, Tribal Area

DRRINT = Daily Self-Response Rate - Internet

DRRALL = Daily Self-Response Rate – Overall

CRRINT = Cumulative Self-Response Rate - Internet; renamed internet

not_int = new calculated column showing response rate NOT from internet

CRRALL = Cumulative Self-Response Rate – Overall; renamed 2020_rate

DINTMIN = Minimum Daily Internet Self-Response Rate

DMIN = Minimum Daily Overall Self-Response Rate

CINTMIN = Minimum Cumulative Internet Self-Response Rate

CMIN = Minimum Cumulative Overall Self-Response Rate

DINTMAX = Maximum Daily Internet Self-Response Rate

DMAX = Maximum Daily Overall Self-Response Rate

CINTMAX = Maximum Cumulative Internet Self-Response Rate

CMAX = Maximum Cumulative Overall Self-Response Rate

DINTAVG = Average Daily Internet Self-Response Rate

DAVG = Average Daily Overall Self-Response Rate

CINTAVG = Average Cumulative Internet Self-Response Rate

CAVG = Average Cumulative Overall Self-Response Rate

DINTMED = Median Daily Internet Self-Response Rate

DMED = Median Daily Overall Self-Response Rate

CINTMED = Median Cumulative Internet Self-Response Rate

CMED = Median Cumulative Overall Self-Response Rate

## Read, Merge, Clean Data

### Initial Load and Merge

In [282]:
# Mount Google Drive (forcing a fresh mount) so files under /content/drive
# are readable from this runtime.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [323]:
import pandas as pd
import numpy as np

# Read the 2020 response rates, forcing the two cumulative-rate columns to
# float. The builtin ``float`` is used because the ``np.float`` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
initial_df = pd.read_csv('https://www2.census.gov/programs-surveys/decennial/2020/data/2020map/2020/decennialrr2020.csv',
                         dtype={'CRRINT': float,
                                'CRRALL': float})
# read in as latin-1 since the automatic detection returns an error
crosswalk = pd.read_csv('https://www2.census.gov/programs-surveys/decennial/2020/data/2020map/2020/decennialrr2020_crosswalkfile.csv', encoding='latin-1')
# states paired with region as defined by census map at https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf
regions = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/data/state_region.csv')

# merge responses and crosswalk (default inner join on GEO_ID)
temp = pd.merge(initial_df, crosswalk, on='GEO_ID')

# merge the response/crosswalk table with the region data (inner join on State)
merged = pd.merge(temp, regions, on='State')

# create columns showing responses not from internet:
#   not_int     = overall cumulative rate minus internet cumulative rate
#   not_int_pct = not_int as a percentage of the overall cumulative rate
merged['not_int'] = merged.CRRALL - merged.CRRINT
merged['not_int_pct'] = (merged.not_int) * 100 / merged.CRRALL

# reorder columns to move State, Geo_Name and Geo_Type to front; every column
# not listed here is dropped
cols = ['GEO_ID', 'RESP_DATE', 'State', 'Geo_Name', 'Region', 'Geo_Type',
        'CRRINT', 'not_int', 'not_int_pct', 'CRRALL']
merged = merged[cols].rename(columns={'CRRINT':'internet', 'CRRALL':'2020_rate'})

### States

#### 2020 data

In [284]:
# Keep only the state-level rows of `merged` and give the rate columns
# state-specific names.
state_column_names = {
    "internet": "state_internet",
    "not_int": "state_not_int",
    'not_int_pct': 'state_not_int_pct',
    "2020_rate": "2020_state_rate",
}
is_state_row = merged['Geo_Type'] == 'State'
states2020 = merged.loc[is_state_row].rename(columns=state_column_names)

# print df and sort by highest cumulative response rate
#states2020.sort_values(by='2020_state_rate', ascending=False)

#### Join 2010 states

In [322]:
# read in csv with 2010 (and 2000) response data for states; the builtin
# ``float`` replaces the ``np.float`` alias, which was removed in NumPy 1.24
states2010 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/data/states2010.csv',
                         dtype={'2010_rate': float,
                                '2000_rate': float})

# merge with 2020 states (default inner join on State)
states = pd.merge(states2020, states2010, on='State')
#only select columns we want
cols = ['GEO_ID', 'State', 'Region',
 '2020_state_rate', '2010_rate', '2000_rate']
states = states[cols].rename(
    columns={'2000_rate':'2000_state_rate', '2010_rate':'2010_state_rate'})

# percent change from the 2010 state rate to the 2020 state rate,
# expressed relative to the 2010 rate
states['10_20_state_difference'] = (
    states['2020_state_rate'] - states['2010_state_rate']
    ) * 100 / states['2010_state_rate']

#print table sorted by 10-20 difference largest ---> smallest
#states.sort_values(by=['2000_state_rate'], ascending=True)

### Census Tracts

#### 2020 Tracts

In [286]:
# Keep every row whose Geo_Type contains "Tract", then give the rate
# columns tract-specific names.
is_tract_row = merged['Geo_Type'].str.contains("Tract")
tract_column_names = {
    "2020_rate": "2020_tract_rate",
    'not_int': 'tract_not_int',
    'not_int_pct': 'tract_not_int_pct',
}
tracts2020 = merged[is_tract_row].rename(columns=tract_column_names)
# sort by highest cumulative response rate
#tracts2020.sort_values(by='2020_tract_rate', ascending=False)

In [287]:
#tract rates compared to state averages
# Inner join on State and Region: tract rows without a matching state row
# are dropped (the print below reports how many).
tract2020states = pd.merge(tracts2020, states, on=['State', 'Region'])
tract2020states = tract2020states[['GEO_ID_x', 'State', 'Geo_Name', 'Geo_Type',
                                   'Region','2020_tract_rate', '2020_state_rate',
                                   '2010_state_rate', '10_20_state_difference']
                                  ].rename(columns={'GEO_ID_x':'GEO_ID'})
# percentage-point gap between each tract's rate and its state's rate
tract2020states['2020_tract_st_diff'] = tract2020states['2020_tract_rate'] - tract2020states['2020_state_rate']

print(
    "Difference in records:", len(tracts2020) - len(tract2020states), "\n",
    "Number of tribal tracts:", len(tracts2020[tracts2020['Geo_Type'].str.contains("Tribal")]),
    "\n\n", "merging tracts with states will drop tribal tracts", "\n",
    "(as they have no state), so those are examined separately below"
)

Difference in records: 426 
 Number of tribal tracts: 426 

 merging tracts with states will drop tribal tracts 
 (as they have no state), so those are examined separately below


#### Join 2010 tract rates

In [321]:
# read in csv with 2010 response data for tracts; FSRR2010 is parsed as
# float and renamed 2010_tract_rate. The builtin ``float`` replaces the
# ``np.float`` alias, which was removed in NumPy 1.24.
tracts2010 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/data/2010responserate.csv',
                         dtype={'FSRR2010': float}).rename(
                             columns={'FSRR2010':'2010_tract_rate'})
#tracts2010

In [289]:
## difference in row numbers: both tract dfs have 84519 rows, but when joined only 84093
# Identify what values are in tracts2010 and not in tracts2020
#key_diff1 = set(tracts2010.GEO_ID).difference(tracts2020.GEO_ID)
#len(key_diff1)
#key_diff1

# Identify what values are in tracts2020 and not in tracts2010
#key_diff2 = set(tracts2020.GEO_ID).difference(tracts2010.GEO_ID)
#len(key_diff2)
#key_diff2

# 2010 rates do not include the 426 tribal tracts
# Those differences account for 426 tracts, which is .5% of the original 84519 
# tracts. These tracts are dropped in the comparative analyses and are analyzed separately

In [290]:
# Join the 2020 tract/state table to the 2010 tract rates on GEO_ID
# (inner join), then keep a fixed set of columns.
joined_2010_2020 = tract2020states.merge(tracts2010, on='GEO_ID')
cols = ['Geo_Name','county', 'State_y', 'Region', 'Geo_Type', '2020_tract_rate',
        '2010_tract_rate', '2020_tract_st_diff', '2020_state_rate',
        '2010_state_rate', '10_20_state_difference', 'GEO_ID']
# State_y is the State column contributed by the 2010 file in the merge
tracts = joined_2010_2020[cols].rename(columns={'State_y':'State'})
#print df sorted largest --> smallest 2010 rate
#tracts.sort_values(by='2010_tract_rate', ascending=False)

In [291]:
#how many null 2010 response values are there
# Test only the 2010 rate column: a null in any other column must not be
# reported as a missing 2010 response rate.
no_2010 = tracts[tracts['2010_tract_rate'].isnull()]

#tracts2010[tracts2010['2010_tract_rate'] == 0]

print(
    "There are", len(no_2010), "null 2010 response rate values out of", 
    len(tracts2010), "total 2010 observations, \n or",
    len(no_2010) * 100 / len(tracts2010) , "% \n",
    "and", len(no_2010) * 100 / len(tracts), "% of the", len(tracts),
    "total of 2020 and 2010 tracts \n\n",
    "null 2010 response rate tracts in each state:", "\n",
    no_2010['State'].value_counts())

There are 531 null 2010 response rate values out of 84519 total 2010 observations, 
 or 0.6282611010542009 % 
 and 0.6314437586957298 % of the 84093 total of 2020 and 2010 tracts 

 0 response rate traacts in each state: 
  Wisconsin               92
 Florida                 56
 California              54
 Texas                   54
 Arizona                 50
 New York                50
 New Mexico              30
 Massachusetts           21
 Washington              17
 Montana                 14
 South Dakota            13
 North Carolina           8
 Alabama                  7
 Minnesota                7
 Utah                     6
 Colorado                 6
 Wyoming                  6
 Idaho                    6
 North Dakota             6
 Virginia                 3
 Maine                    3
 Nevada                   3
 New Hampshire            3
 Vermont                  3
 Oklahoma                 2
 Nebraska                 2
 New Jersey               2
 Alaska              

In [292]:
# Percent change from the 2010 tract rate to the 2020 tract rate, relative
# to the 2010 rate. Scaled by 100 so it is in the same units as
# 10_20_state_difference, which sits beside it in this dataframe.
# NOTE(review): a 2010 rate of 0 yields +/-inf (or NaN for 0/0) here — confirm
# how those tracts should be treated in the averages below.
tracts['10_20_tract_difference'] = (
    tracts['2020_tract_rate'] - tracts['2010_tract_rate']
    ) * 100 / tracts['2010_tract_rate']
#sort df largest --> smallest 10-20 difference
#tracts.sort_values(by='10_20_tract_difference', ascending=False)

### Census API

In [293]:
#try something new
%matplotlib inline
import requests, pandas as pd

key = 'https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/census_key.txt'
api_key = requests.get(key).text

year='2018'
dsource='acs'
dname='acs5'
cols=['GEO_ID', 'TRACT', 'STATE', 'B00001_001E', 'B00002_001E']
#names=['GEO_ID', 'total_pop', 'housing_units',]
state='*'
county='*'
tract='*'

base_url = f'https://api.census.gov/data/{year}/{dsource}/{dname}'


### Demographic Data

In [320]:
#load census demographic data, join
# Each read keeps a handful of columns by position (usecols) and assigns
# them the names given in `names`; because `names` is supplied, pandas does
# not treat any row as a header, so header rows must be skipped explicitly.
# NOTE(review): the commented-out experiments previously in this cell passed
# na_values=['-', '**']; if those markers occur in the data the affected
# columns are read as strings — confirm before relying on numeric dtypes.

## demographics data
# skiprows=2 matches the three sibling reads below; with skiprows=1 the
# second header row was presumably read as a data row, making every column
# mixed-type — TODO confirm against the file.
temp1 = pd.read_csv('/content/drive/Shared drives/Shared Items/census-responses/data/acs_demographics/ACSDP5Y2018.DP05_data_with_overlays_2020-07-02T144029.csv',
                    usecols = [0,1,2,70,148,152,156,176,208,228,232,284,304,342],
                    names = ['GEO_ID', 'NAME','total_population','median_age', 
                             'white_pct','black_pct','native_pct','asian_pct',
                             'pacific_pct','other_pct','two_pct','latino_pct',
                             'notLatino_pct','house_units'], 
                    skiprows=2)

### housing data
temp2 = pd.read_csv('/content/drive/Shared drives/Shared Items/census-responses/data/acs_housing/ACSDP5Y2018.DP04_data_with_overlays_2020-07-02T161352.csv',
                    usecols = [0,1,8,12,28,52,56,184,188,300,354,568],
                    names = ['GEO_ID', 'NAME','occupied_pct','vacant_pct', 
                             'house_pct','bigapt_pct','mobilehome_pct',
                             'owner_pct','renter_pct','no_telephone_pct',
                             'median_value','rent_more_35_pct'],
                    skiprows=2)

## income data
temp3 = pd.read_csv('/content/drive/Shared drives/Shared Items/census-responses/data/acs_income/ACSST5Y2018.S1901_data_with_overlays_2020-07-02T160659.csv',
                    usecols = [0,1,90],
                    names = ['GEO_ID', 'NAME','median_income'],
                    skiprows=2)

## internet access data
temp4 = pd.read_csv('/content/drive/Shared drives/Shared Items/census-responses/data/acs_internet_access/ACSDT5Y2018.B28002_data_with_overlays_2020-07-02T154751.csv',
                    usecols = [0,1,2,26],
                    names = ['GEO_ID', 'NAME','total_comp', 'no_int'],
                    skiprows=2)

#create new column to get % without internet instead of whole number
temp4['no_int_pct'] = temp4.no_int * 100 / temp4.total_comp
# keep only the keys and the derived percentage
cols = ['GEO_ID', 'NAME', 'no_int_pct']
temp4 = temp4[cols]

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
## merge the four ACS tables into one demographics dataframe
from functools import reduce

frames = [temp1, temp2, temp3, temp4]


def _left_join_on_tract(left, right):
    """Left-join `right` onto `left` using the shared GEO_ID and NAME keys."""
    return pd.merge(left, right, on=['GEO_ID', 'NAME'], how='left')


# fold the frames left to right, so every row of temp1 is kept
demo = reduce(_left_join_on_tract, frames)

In [None]:
# Attach the demographic columns to every tract row; the left join keeps
# tracts that have no demographic match.
temp = tracts.merge(demo, on=['GEO_ID'], how='left')

temp

#### Cleaning

In [None]:
## compare which GEO_IDs appear in only one of the two dataframes
geo_in_tracts = set(tracts.GEO_ID)
geo_in_demo = set(demo.GEO_ID)

# GEO_IDs present in tracts but missing from demo
only_in_tracts = list(geo_in_tracts.difference(demo.GEO_ID))
key_diff1 = pd.DataFrame(only_in_tracts).rename(columns={0:'GEO_ID'})

# GEO_IDs present in demo but missing from tracts
only_in_demo = list(geo_in_demo.difference(tracts.GEO_ID))
key_diff2 = pd.DataFrame(only_in_demo).rename(columns={0:'GEO_ID'})

n_tracts = len(tracts)
n_demo = len(demo)
print("There are", n_tracts, "observations in the tracts df and \n",
    "There are", n_demo, "observations in the demographics df, \n",
    "a difference of", n_tracts - n_demo, "\n\n",
    "There are", len(key_diff1), 
    "tracts in the tracts df that are not in the demographics df \n",
    "And there are", len(key_diff2), "tracts in the demo df not in the tracts df")

In [None]:
# make a dataframe of all rows with at least one na value
temp1 = temp[temp.isna().any(axis=1)]

# how many na rows in each state
# rename_axis/reset_index(name=...) names both output columns explicitly, so
# the result does not depend on how value_counts labels its output (those
# labels changed in pandas 2.0).
temp2 = temp1['State'].value_counts().rename_axis('state').reset_index(
    name='count_na')

#compare to total rows per state
temp3 = temp['State'].value_counts().rename_axis('state').reset_index(
    name='count')

#see what percent of each state's rows contain an na
temp4 = pd.merge(temp2, temp3, on=['state'])
temp4['na_percent'] = (temp4['count_na']*100) / temp4['count']

total_na = temp4['count_na'].sum()
print(
    "Total na tracts:", total_na, "\n",
    'NAs as % of all tracts:', (total_na*100) / temp4['count'].sum(),
    "\n")

In [None]:
# count the missing values in each column of temp
#pd.set_option('display.max_rows', None)
null_counts = temp.isnull().sum()
print(null_counts)

#still 52 states? (50 + DC and PR)
#print("Number states: ", len(df['State'].unique()))

#compare tract numbers now to before na drop
#temp1 = pd.DataFrame(temp.groupby('State')['Geo_Name'].nunique()).rename(columns={'Geo_Name':'beforeDrop'})
#temp2 = pd.DataFrame(df.groupby('State')['Geo_Name'].nunique()).rename(columns={'Geo_Name':'afterDrop'})
#temp3 = pd.merge(temp1, temp2, left_index = True, right_index=True)
#number tracts dropped from each state
#temp3['numDrop'] = temp3['beforeDrop'] - temp3['afterDrop']
# the percentage of total tracts dropped
#temp3['dropPct'] = temp3['numDrop']*100 / temp3['beforeDrop']
#temp3

In [None]:
# cap the number of columns pandas displays
pd.set_option('display.max_columns', 15)
#pd.set_option('display.max_row', 50)

# columns carried into the analysis dataframe, in display order
cols = [
    'Geo_Name', 'county', 'State',
    '2020_tract_rate', '2010_tract_rate', '2020_tract_st_diff',
    '2020_state_rate', '2010_state_rate',
    '10_20_state_difference', '10_20_tract_difference',
    'total_population', 'median_age',
    'white_pct', 'black_pct', 'native_pct', 'asian_pct', 'pacific_pct',
    'other_pct', 'two_pct', 'latino_pct', 'notLatino_pct',
    'house_units', 'occupied_pct', 'vacant_pct', 'house_pct', 'bigapt_pct',
    'mobilehome_pct', 'owner_pct', 'renter_pct', 'no_telephone_pct',
    'median_value', 'rent_more_35_pct', 'median_income', 'no_int_pct',
    'GEO_ID', 'Region',
]

df = temp.loc[:, cols]
#df

## Analysis

In [None]:
##see all dfs in memory
# IPython magic (not plain Python): lists the variables in the interactive
# namespace whose type is DataFrame.
%whos DataFrame

#### Existing DFs:

#dataframe with all years, states, tracts, demographics
#df

#2010 and 2020 states
#states

#2010 States
#states2010

#2020 States
#states2020

#2010 and 2020 tracts and states
#tracts

#2020 tracts paired with states, includes int data
#tract2020states

# ignore all dfs with "temp" in name 

In [None]:
# get average state rate and see how many are above average

# Nationwide response rate (percent) used as the comparison threshold; kept
# in one place so the printed message and the filter cannot drift apart.
national_rate = 62.0

# True for each state whose cumulative 2020 rate exceeds the national rate
temp = states2020['2020_state_rate'] > national_rate

print(
    f"{national_rate}% is the current nationwide response rate and",
    np.sum(temp),
    "states exceed that", "\n")
print(states2020[temp].State.to_list(), "\n")

#how many in each region?
states2020[temp].Region.value_counts()

In [None]:
# count the tracts whose 2020 rate is above their state's rate

#tract2020States.count(tract2020States['2020_tract_st_diff'] > 0)
tract_total = len(tract2020states)
above_state = np.sum(tract2020states['2020_tract_st_diff'] > 0)
above_state_pct = (above_state * 100) / tract_total
print(above_state, "tracts out of", 
      tract_total, 
      "total (", 
      above_state_pct,
      "% ) \n",
      "currently have greater census response rates than their state average")

In [None]:
# the 1000 tracts with the smallest (most negative) 2010 -> 2020 change
by_change = tracts.sort_values('10_20_tract_difference')
big_drop = by_change.iloc[:1000]
big_drop

In [None]:
# the 1000 tracts with the lowest 2020 response rate
by_rate = tracts.sort_values('2020_tract_rate')
lowest = by_rate.iloc[:1000]
lowest

### National Comparative Rankings 2010 vs 2020

In [None]:
#Discrepancies? Shouldn't these all match up?
# Average is sum / #obs
#print(
#  "2010 State Resonse Average from states data:",
#    states.mean(axis=0)['2010_state_rate'], "\n",
#  "2010 States Response Average from df data:",
#    df.mean(axis=0)['2010_state_rate'], "\n",
#  "2010 Tract Response Average from df data:",
#    df.mean(axis=0)['2010_tract_rate'],  "\n"
#)
## totals intended to be sum(percentages) / 100 (denominator)
#print(
#  "2010 State Resonse Total from states data:",
#    (states['2010_state_rate'].sum()) / 100, "\n",
#  "2010 States Response Total from df data:",
#    (df['2010_state_rate'].sum()) / 100, "\n",
#  "2010 Tract Response Total from df data:",
#    (df['2010_tract_rate'].sum()) / 100
#)

In [None]:
# Average difference between current state rates and 2010 state rates as of 6/15/20
#takes a while to run so commented out
#df.mean(axis=0)['10_20_state_difference']

#average 2020 response rate across states?
#print(
#  "2020 State Resonse Average from states data:",
#    states.mean(axis=0)['2020_state_rate'], "\n",
#  "2020 States Response Average from df data:",
#   df.mean(axis=0)['2020_state_rate'], "\n",
#  "2020 Tract Response Average from df data:",
#    df.mean(axis=0)['2020_tract_rate'], "\n"
#)

#is the total not sum(numerator) / 100? where numerator is percentage response rate?
#print(
#  "2020 State Resonse Total from states data:",
#    (states['2020_state_rate'].sum()) / (100), "\n",
#  "2020 States Response Total from df data:",
#    (df['2020_state_rate'].sum()) / (100), "\n",
# "2020 Tract Response Total from df data:",
#    (df['2020_tract_rate'].sum()) / (100)
#)

In [None]:
# average difference by region
# numeric_only=True skips the text columns (e.g. GEO_ID, State); without it
# pandas >= 2.0 raises when asked to average them.
states.groupby('Region').mean(numeric_only=True).sort_values(by='10_20_state_difference', ascending=False)

In [None]:
# Rank states by response rate for each census year: the highest rate gets
# rank 1, and tied states all receive the largest rank in their group.
rank_options = dict(method='max', ascending=False)
for census_year in ('2020', '2010'):
    states[f'{census_year}_rank'] = states[f'{census_year}_state_rate'].rank(**rank_options)

# ranks on their own, ordered by 2020 position
state_ranks = states[['State', '2020_rank', '2010_rank']].sort_values(by='2020_rank')

# 2010 rank minus 2020 rank: negative means the state sits lower in the
# 2020 ordering than it did in 2010
state_ranks['rank_change'] = state_ranks['2010_rank'] - state_ranks['2020_rank']
#state_ranks.sort_values(by='rank_change', ascending=True)

#see how many states only changed 2 or fewer positions
#small_change = state_ranks[state_ranks.rank_change.between(-2, 2, inclusive=True)].sort_values(by='rank_change')
#small_change
#16 states have stayed ~similar in the rankings, and this seems to impact
#states with both high and low response rates
#small_change.mean(axis=0)['2020_rank']

### Internet Usage

Internet usage is only available for 2020 rates

In [None]:
### Percent of response rate not from internet

# Average each column directly instead of averaging the whole dataframe and
# indexing the result: states2020 contains text columns, which pandas >= 2.0
# refuses to average. "\n" replaces the "/n" typo so the second value is
# printed on its own line.
print(
    "The average state response rate to the census NOT conducted online:",
    states2020['state_not_int_pct'].mean(), "\n",
    states2020['state_not_int'].mean()
)

In [None]:
# per-state averages of the numeric columns, ordered by internet response
# rate; numeric_only=True skips text columns, which pandas >= 2.0 refuses
# to average
states2020.groupby(by='State').mean(numeric_only=True).sort_values(by='state_internet', ascending=False)

In [None]:
# states ordered from highest to lowest non-internet response rate (not_int)
states2020.sort_values('state_not_int', ascending=False)

In [None]:
# mean non-internet response rate across states; averaging the single column
# avoids averaging the dataframe's text columns (an error in pandas >= 2.0)
states2020['state_not_int'].mean()

In [None]:
# mean internet response rate across states; averaging the single column
# avoids averaging the dataframe's text columns (an error in pandas >= 2.0)
states2020['state_internet'].mean()

In [None]:
# mean overall 2020 response rate across states; averaging the single column
# avoids averaging the dataframe's text columns (an error in pandas >= 2.0)
states2020['2020_state_rate'].mean()

In [None]:
### Without Puerto Rico

#make non-pr df (state-level rows, Puerto Rico excluded)
no_pr = states2020[states2020['State'] != 'Puerto Rico']

# average internet response; the single column is averaged directly because
# averaging the whole dataframe fails on its text columns in pandas >= 2.0
no_pr['state_internet'].mean()

In [None]:
# average overall response rate; the single column is averaged directly
# because averaging the whole dataframe fails on its text columns in
# pandas >= 2.0
no_pr['2020_state_rate'].mean()

### Region Analysis

In [None]:
# average by region
# NOTE as tribal tracts are not assigned to a state they do not have a corresponding region and thus are not counted in the regional calculations
# numeric_only=True skips text columns, which pandas >= 2.0 refuses to average
states.groupby('Region').mean(numeric_only=True).sort_values(by='2020_state_rate', ascending=False)

In [None]:
# average difference as of 6/15/20
#this can take a while to run so is commented out unless needed
#tracts.mean(axis=0)['10_20_tract_difference']

In [None]:
# average difference by region
# numeric_only=True skips text columns (e.g. Geo_Name, State), which
# pandas >= 2.0 refuses to average
tracts.groupby('Region').mean(numeric_only=True).sort_values(by='10_20_tract_difference', ascending=False)

In [None]:
#tract average differences vs state rates
# numeric_only=True skips text columns, which pandas >= 2.0 refuses to average
tracts.groupby('State').mean(numeric_only=True).sort_values(by='2020_state_rate', ascending=False)

### Tribal tracts

In [None]:
# rows of tracts2020 whose Geo_Type contains "Tribal", highest 2020 rate first
is_tribal = tracts2020['Geo_Type'].str.contains("Tribal")
tribal = tracts2020[is_tribal].sort_values('2020_tract_rate', ascending=False)

In [None]:
### tribal areas and tracts stats

#mean non internet response
# the single column is averaged directly because averaging the whole
# dataframe fails on its text columns in pandas >= 2.0
tribal['tract_not_int'].mean() #8.37%

In [None]:
# mean internet response rate; the single column is averaged directly
# because averaging the whole dataframe fails on its text columns in
# pandas >= 2.0
tribal['internet'].mean()

In [None]:
# mean overall response rate; the single column is averaged directly
# because averaging the whole dataframe fails on its text columns in
# pandas >= 2.0
tribal['2020_tract_rate'].mean()

### Tracts with 0 overall response rate

In [None]:
## Tracts whose cumulative 2020 response rate is exactly 0
is_zero = df['2020_tract_rate'].eq(0)
zeros = df.loc[is_zero]

zero_count = len(zeros)
print(
    "Number of tracts with 0 cumulative response rate:", zero_count, "\n"
    )

# show those tracts ordered by state
zeros.sort_values('State')

In [None]:
#make dataframe of states with # tracts with 0%, number total tracts,
#and what % of total tracts are 0
# Each count is named before merging, so the column names do not depend on
# how value_counts labels its output (those labels changed in pandas 2.0,
# which made the former State_x/State_y rename a silent no-op).
zero_counts = zeros['State'].value_counts().rename('0_tracts').to_frame()
total_counts = tracts['State'].value_counts().rename('total_tracts').to_frame()
# inner join on the state index: only states with at least one 0% tract remain
temp = pd.merge(zero_counts, total_counts,
                right_index=True, left_index=True)
#compute percentage
temp['0_percent'] = temp['0_tracts'] * 100 / temp['total_tracts']
temp

## Regressions

### With Puerto Rico

#### 2020 Regressions

In [None]:
# regression input: df without any row that contains a missing value
# (dropna's defaults are axis=0, how='any')
reg_df = df.dropna()

In [None]:
# Inspect dtypes before fitting. reg_df is inspected here because
# variables20 is only created in the following cell, so referencing it at
# this point fails when the notebook is run top to bottom.
print(reg_df.dtypes)
reg_df.info()
print(df.dtypes)

In [None]:
### 2020 Multi-regression

import statsmodels.api as sm

#put all variables for predicting 2020 rates in dataframe
variables20 = reg_df[['2010_tract_rate', '2010_state_rate',
 'total_population', 'median_age', 'white_pct', 'black_pct', 'native_pct', 
 'asian_pct', 'pacific_pct', 'other_pct', 'two_pct', 'latino_pct', 
 'notLatino_pct', 'house_units', 'occupied_pct', 'vacant_pct', 'house_pct', 
 'bigapt_pct', 'mobilehome_pct', 'owner_pct', 'renter_pct', 'no_telephone_pct',
 'median_value', 'rent_more_35_pct', 'median_income', 'no_int_pct']]

#what we want to predict - 2020 response rates - in dataframe
target20 = reg_df[["2020_tract_rate"]]

#build model and print summary
# NOTE(review): sm.OLS does not add an intercept on its own and none is added
# here, so this model is fit through the origin — confirm that is intended
# (sm.add_constant would add one).
model20 = sm.OLS(target20, variables20).fit()
print(model20.summary())

In [None]:
# view the 2020 predictor table as a plain NumPy array
variables20.to_numpy()

In [None]:
### 2020 linear regressions for each variable

from sklearn import linear_model

#create list of variable names
cols = variables20.columns.tolist()
#build linear model (re-fit once per predictor below)
regr = linear_model.LinearRegression()
#create empty list to store loop results
rows = []
#loop through each variable
for name in cols:
    predictor = variables20[[name]]
    #fit linear model to this one variable
    regr.fit(predictor, target20)
    # The target is a one-column dataframe, so intercept_ and coef_ come back
    # wrapped in arrays; pull out the single scalar here instead of storing
    # arrays and unwrapping them afterwards with the string accessor.
    rows.append([name,
                 float(np.ravel(regr.intercept_)[0]),
                 float(np.ravel(regr.coef_)[0]),
                 regr.score(predictor, target20)])

#turn list into df with these column names
linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#print df sorted coefs largest --> smallest
linears20.sort_values(by='coefficient', ascending=False)

#### Normalized 2020 inputs

In [None]:
### 2020 Multi regression with normalized variables

# min-max scale every predictor column: (value - column min) / column range
lowest20 = variables20.min()
spread20 = variables20.max() - variables20.min()
norm_variables20 = (variables20 - lowest20) / spread20

# fit the multi-regression on the scaled predictors and print the summary
norm_fit20 = sm.OLS(target20, norm_variables20)
norm_model20 = norm_fit20.fit()
print(norm_model20.summary())

In [None]:
#2020 normalized linear regressions for each variable

# list of predictor names
cols = norm_variables20.columns.tolist()
# create model (re-fit once per predictor below)
norm_regr = linear_model.LinearRegression()
#empty list for loop results
rows = []
#loop through each variable
for name in cols:
    predictor = norm_variables20[[name]]
    #fit model to this one variable
    norm_regr.fit(predictor, target20)
    # The target is a one-column dataframe, so intercept_ and coef_ come back
    # wrapped in arrays; pull out the single scalar here instead of storing
    # arrays and unwrapping them afterwards with the string accessor.
    rows.append([name,
                 float(np.ravel(norm_regr.intercept_)[0]),
                 float(np.ravel(norm_regr.coef_)[0]),
                 norm_regr.score(predictor, target20)])

#turn list into dataframe
norm_linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#print df ordered by largest to smallest coef
norm_linears20.sort_values(by='coefficient', ascending=False)

#### 2010 Regressions

In [None]:
### 2010 multi regression

# predictor columns for the 2010 model
predictors10 = [
    'total_population', 'median_age', 'white_pct',
    'black_pct', 'native_pct', 'asian_pct', 'pacific_pct',
    'other_pct', 'two_pct', 'latino_pct', 'notLatino_pct',
    'house_units', 'occupied_pct', 'vacant_pct', 'house_pct',
    'bigapt_pct', 'mobilehome_pct', 'owner_pct', 'renter_pct',
    'no_telephone_pct', 'median_value', 'rent_more_35_pct',
    'median_income', 'no_int_pct',
]
variables10 = reg_df[predictors10]

# response: the 2010 tract response rate, kept as a one-column dataframe
target10 = reg_df[["2010_tract_rate"]]

# fit the model and print the summary table
# NOTE(review): no intercept column is added before fitting — confirm a fit
# through the origin is intended.
fit10 = sm.OLS(target10, variables10)
model10 = fit10.fit()
print(model10.summary())

In [None]:
### 2010 linear regression for each variable

#list of variable names
cols = variables10.columns.tolist()
#build the single-variable model (re-fit once per predictor below)
regr = linear_model.LinearRegression()
#create empty list for loop results
rows = []

#loop through variables
for name in cols:
    predictor = variables10[[name]]
    #fit a model to the current variable
    regr.fit(predictor, target10)
    # The target is a one-column dataframe, so intercept_ and coef_ come back
    # wrapped in arrays; pull out the single scalar here instead of storing
    # arrays and unwrapping them afterwards with the string accessor.
    rows.append([name,
                 float(np.ravel(regr.intercept_)[0]),
                 float(np.ravel(regr.coef_)[0]),
                 regr.score(predictor, target10)])

#turn list into data frame
linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#print data frame ordered coefficient largest --> smallest
linears10.sort_values(by='coefficient', ascending=False)

#### Normalized 2010 Regressions

In [None]:
### 2010 normalized multi-regression

# min-max scale every predictor column: (value - column min) / column range
lowest10 = variables10.min()
spread10 = variables10.max() - variables10.min()
norm_variables10 = (variables10 - lowest10) / spread10

# fit the multi-regression on the scaled predictors and print the summary
norm_fit10 = sm.OLS(target10, norm_variables10)
norm_model10 = norm_fit10.fit()
print(norm_model10.summary())

In [None]:
###2010 normalized linear regressions for each variable

#create list of variable names
cols = norm_variables10.columns.tolist()
#build the model (re-fit once per predictor below)
norm_regr = linear_model.LinearRegression()
#create empty list for model results
rows = []
#cycle through variables
for name in cols:
    predictor = norm_variables10[[name]]
    #do the linear regression on the current variable
    norm_regr.fit(predictor, target10)
    # The target is a one-column dataframe, so intercept_ and coef_ come back
    # wrapped in arrays; pull out the single scalar here instead of storing
    # arrays and unwrapping them afterwards with the string accessor.
    rows.append([name,
                 float(np.ravel(norm_regr.intercept_)[0]),
                 float(np.ravel(norm_regr.coef_)[0]),
                 norm_regr.score(predictor, target10)])

#turn list into data frame with these column names
norm_linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#print df sorted by coefficient
norm_linears10.sort_values(by='coefficient', ascending=False)

#### 2020 Edited Regressions
Edited = Run regressions with fewer variables

In [None]:
### 2020 edited multi-regressions

# predictor columns for the "edited" 2020 model
# NOTE(review): this list is the same set of columns used for variables20 in
# the full 2020 model — confirm which variables were meant to be dropped.
predictors_ed = [
    '2010_tract_rate', '2010_state_rate',
    'total_population', 'median_age', 'white_pct', 'black_pct', 'native_pct',
    'asian_pct', 'pacific_pct', 'other_pct', 'two_pct', 'latino_pct',
    'notLatino_pct', 'house_units', 'occupied_pct', 'vacant_pct', 'house_pct',
    'bigapt_pct', 'mobilehome_pct', 'owner_pct', 'renter_pct', 'no_telephone_pct',
    'median_value', 'rent_more_35_pct', 'median_income', 'no_int_pct',
]
variables_ed = reg_df[predictors_ed]

# response: the 2020 tract response rate, kept as a one-column dataframe
target_ed = reg_df[["2020_tract_rate"]]

# fit the multi-regression model and print its summary table
fit_ed = sm.OLS(target_ed, variables_ed)
model_ed = fit_ed.fit()
print(model_ed.summary())

In [None]:
### 2020 edited linear regressions
# Fits one single-predictor linear model per edited variable and collects
# the intercept, slope and R^2 of each fit in one table.

#create list of variable names
cols = variables_ed.columns.tolist()
#one estimator object, re-fit for every predictor
regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []

for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = variables_ed[[col]]
    regr.fit(predictor, target_ed)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, regr.intercept_.item(), regr.coef_.item(),
                 regr.score(predictor, target_ed)])

#turn list into data frame with these column names
linears_ed = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#show df sorted by coefficient largest --> smallest
linears_ed.sort_values(by='coefficient', ascending=False)

In [None]:
### normalized 2020 edited variables multi regression

#min-max scale each edited predictor: subtract the column minimum, then
#divide by the column range (max - min)
norm_variables_ed = variables_ed.sub(variables_ed.min()).div(
    variables_ed.max() - variables_ed.min()
)

#fit OLS of target_ed on the scaled predictors and print the summary table
#(sm.OLS does not add an intercept column on its own)
norm_model_ed = sm.OLS(endog=target_ed, exog=norm_variables_ed).fit()
print(norm_model_ed.summary())

### Without Puerto Rico

#### 2020 regressions

In [None]:
### 2020 Multi-regression

import statsmodels.api as sm

#rows of reg_df outside Puerto Rico, used for every regression in this section
#this reuses the name no_pr: the earlier no_pr = states[states.State != 'Puerto Rico']
#omitted Puerto Rico from the state level data, this one removes it from the
#tract level data
#NOTE(review): the filter is on Region - confirm reg_df labels Puerto Rico
#rows with Region == 'Puerto Rico'
no_pr = reg_df.loc[reg_df['Region'].ne('Puerto Rico')]

#predictors for the 2020 rate, in the same column order as before
variables20 = no_pr.loc[:, [
    '2010_tract_rate', '2010_state_rate',
    'total_population', 'median_age',
    'white_pct', 'black_pct', 'native_pct', 'asian_pct', 'pacific_pct',
    'other_pct', 'two_pct', 'latino_pct', 'notLatino_pct',
    'house_units', 'occupied_pct', 'vacant_pct',
    'house_pct', 'bigapt_pct', 'mobilehome_pct',
    'owner_pct', 'renter_pct', 'no_telephone_pct',
    'median_value', 'rent_more_35_pct', 'median_income', 'no_int_pct',
]]

#response: 2020 response rates, kept as a one-column dataframe
target20 = no_pr.loc[:, ["2020_tract_rate"]]

#fit the multi-regression and show its summary as the cell output
#(sm.OLS does not add an intercept column on its own)
model20 = sm.OLS(endog=target20, exog=variables20).fit()
model20.summary()

In [None]:
### 2020 linear regressions for each variable
# Fits one single-predictor linear model per variable and collects the
# intercept, slope and R^2 of each fit in one table.

from sklearn import linear_model

#create list of variable names
cols = variables20.columns.tolist()
#one estimator object, re-fit for every predictor
regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []
for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = variables20[[col]]
    regr.fit(predictor, target20)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, regr.intercept_.item(), regr.coef_.item(),
                 regr.score(predictor, target20)])

#turn list into df with these column names
linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient',
                                        'r-squared'])

#show df sorted coefs largest --> smallest
linears20.sort_values(by='coefficient', ascending=False)

#### Normalized 2020 inputs

In [None]:
### 2020 Multi regression with normalized variables

#min-max scale each predictor column: subtract the column minimum, then
#divide by the column range (max - min)
norm_variables20 = variables20.sub(variables20.min()).div(
    variables20.max() - variables20.min()
)

#fit OLS of target20 on the scaled predictors and show the summary as the
#cell output (sm.OLS does not add an intercept column on its own)
norm_model20 = sm.OLS(endog=target20, exog=norm_variables20).fit()
norm_model20.summary()

In [None]:
#2020 normalized linear regressions for each variable
# Fits one single-predictor linear model per normalized column and collects
# the intercept, slope and R^2 of each fit in one table.

#create list of variable names
cols = norm_variables20.columns.tolist()
#one estimator object, re-fit for every predictor
norm_regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []
for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = norm_variables20[[col]]
    norm_regr.fit(predictor, target20)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, norm_regr.intercept_.item(), norm_regr.coef_.item(),
                 norm_regr.score(predictor, target20)])

#turn list into dataframe
norm_linears20 = pd.DataFrame(rows, columns=['variable', 'intercept',
                                             'coefficient', 'r-squared'])

#show df ordered by largest to smallest coef
norm_linears20.sort_values(by='coefficient', ascending=False)

#### 2010 Regressions

In [None]:
### 2010 multi regression

#predictors for the 2010 rate, taken from the no-Puerto-Rico dataframe
variables10 = no_pr.loc[:, [
    'total_population', 'median_age',
    'white_pct', 'black_pct', 'native_pct', 'asian_pct', 'pacific_pct',
    'other_pct', 'two_pct', 'latino_pct', 'notLatino_pct',
    'house_units', 'occupied_pct', 'vacant_pct',
    'house_pct', 'bigapt_pct', 'mobilehome_pct',
    'owner_pct', 'renter_pct', 'no_telephone_pct',
    'median_value', 'rent_more_35_pct', 'median_income', 'no_int_pct',
]]

#response: 2010 response rates, kept as a separate one-column dataframe
target10 = no_pr.loc[:, ["2010_tract_rate"]]

#fit the multi-regression and show its summary as the cell output
#(sm.OLS does not add an intercept column on its own)
model10 = sm.OLS(endog=target10, exog=variables10).fit()
model10.summary()

In [None]:
### 2010 linear regression for each variable
# Fits one single-predictor linear model per variable and collects the
# intercept, slope and R^2 of each fit in one table.

#list of variable names
cols = variables10.columns.tolist()
#one simple linear estimator, re-fit for every predictor
regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []

for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = variables10[[col]]
    regr.fit(predictor, target10)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, regr.intercept_.item(), regr.coef_.item(),
                 regr.score(predictor, target10)])

#turn list into data frame
linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient',
                                        'r-squared'])

#show data frame ordered coefficient largest --> smallest
linears10.sort_values(by='coefficient', ascending=False)

#### Normalized 2010 Regressions

In [None]:
### 2010 normalized multi-regression

#min-max scale each predictor column: subtract the column minimum, then
#divide by the column range (max - min)
norm_variables10 = variables10.sub(variables10.min()).div(
    variables10.max() - variables10.min()
)

#fit OLS of target10 on the scaled predictors and show the summary as the
#cell output (sm.OLS does not add an intercept column on its own)
norm_model10 = sm.OLS(endog=target10, exog=norm_variables10).fit()
norm_model10.summary()

In [None]:
### 2010 normalized linear regressions for each variable
# Fits one single-predictor linear model per normalized column and collects
# the intercept, slope and R^2 of each fit in one table.

#create list of variable names
cols = norm_variables10.columns.tolist()
#one estimator object, re-fit for every predictor
norm_regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []
for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = norm_variables10[[col]]
    norm_regr.fit(predictor, target10)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, norm_regr.intercept_.item(), norm_regr.coef_.item(),
                 norm_regr.score(predictor, target10)])

#turn list into data frame with these column names
norm_linears10 = pd.DataFrame(rows, columns=['variable', 'intercept',
                                             'coefficient', 'r-squared'])

#show df sorted by coefficient largest --> smallest
norm_linears10.sort_values(by='coefficient', ascending=False)

#### 2020 Edited Regressions
Edited = regressions re-run with fewer predictor variables (the 2010 tract and state response rates are left out)

In [None]:
### 2020 edited multi-regressions

#edited predictor set for the 2020 rate: the list holds no 2010 response
#rate columns, rows come from the no-Puerto-Rico dataframe
variables_ed = no_pr.loc[:, [
    'total_population', 'median_age',
    'white_pct', 'black_pct', 'native_pct', 'asian_pct', 'pacific_pct',
    'other_pct', 'two_pct', 'latino_pct', 'notLatino_pct',
    'house_units', 'occupied_pct', 'vacant_pct',
    'house_pct', 'bigapt_pct', 'mobilehome_pct',
    'owner_pct', 'renter_pct', 'no_telephone_pct',
    'median_value', 'rent_more_35_pct', 'median_income', 'no_int_pct',
]]

#response: 2020 response rates, kept as a one-column dataframe
target_ed = no_pr.loc[:, ["2020_tract_rate"]]

#fit the multi-regression and show its summary as the cell output
#(sm.OLS does not add an intercept column on its own)
model_ed = sm.OLS(endog=target_ed, exog=variables_ed).fit()
model_ed.summary()

In [None]:
### 2020 edited linear regressions
# Fits one single-predictor linear model per edited variable and collects
# the intercept, slope and R^2 of each fit in one table.

#create list of variable names
cols = variables_ed.columns.tolist()
#one estimator object, re-fit for every predictor
regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []

for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = variables_ed[[col]]
    regr.fit(predictor, target_ed)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, regr.intercept_.item(), regr.coef_.item(),
                 regr.score(predictor, target_ed)])

#turn list into data frame with these column names
linears_ed = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient',
                                         'r-squared'])

#show df sorted by coefficient largest --> smallest
linears_ed.sort_values(by='coefficient', ascending=False)

In [None]:
### normalized 2020 edited variables multi regression

#min-max scale each edited predictor: subtract the column minimum, then
#divide by the column range (max - min)
norm_variables_ed = variables_ed.sub(variables_ed.min()).div(
    variables_ed.max() - variables_ed.min()
)

#fit OLS of target_ed on the scaled predictors and show the summary as the
#cell output (sm.OLS does not add an intercept column on its own)
norm_model_ed = sm.OLS(endog=target_ed, exog=norm_variables_ed).fit()
norm_model_ed.summary()

### Puerto Rico

In [None]:
#keep only the rows of reg_df whose Region is 'Puerto Rico'
#NOTE(review): the filter is on Region - confirm reg_df labels Puerto Rico
#rows with Region == 'Puerto Rico'
pr = reg_df.loc[reg_df['Region'].eq('Puerto Rico')]
#show the subset as the cell output
pr

#### 2020 regressions

In [None]:
### 2020 Multi-regression

import statsmodels.api as sm

#predictors for the 2020 rate, Puerto Rico rows only
#(this list has no '2010_state_rate' column)
variables20 = pr.loc[:, [
    '2010_tract_rate',
    'total_population', 'median_age',
    'white_pct', 'black_pct', 'native_pct', 'asian_pct', 'pacific_pct',
    'other_pct', 'two_pct', 'latino_pct', 'notLatino_pct',
    'house_units', 'occupied_pct', 'vacant_pct',
    'house_pct', 'bigapt_pct', 'mobilehome_pct',
    'owner_pct', 'renter_pct', 'no_telephone_pct',
    'median_value', 'rent_more_35_pct', 'median_income', 'no_int_pct',
]]

#response: 2020 response rates, kept as a one-column dataframe
target20 = pr.loc[:, ["2020_tract_rate"]]

#fit the multi-regression and show its summary as the cell output
#(sm.OLS does not add an intercept column on its own)
model20 = sm.OLS(endog=target20, exog=variables20).fit()
model20.summary()

In [None]:
### 2020 linear regressions for each variable
# Fits one single-predictor linear model per variable and collects the
# intercept, slope and R^2 of each fit in one table.

from sklearn import linear_model

#create list of variable names
cols = variables20.columns.tolist()
#one estimator object, re-fit for every predictor
regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []
for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = variables20[[col]]
    regr.fit(predictor, target20)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, regr.intercept_.item(), regr.coef_.item(),
                 regr.score(predictor, target20)])

#turn list into df with these column names
linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient',
                                        'r-squared'])

#show df sorted coefs largest --> smallest
linears20.sort_values(by='coefficient', ascending=False)

#### Normalized 2020 inputs

In [None]:
### 2020 Multi regression with normalized variables

#min-max scale each predictor column: subtract the column minimum, then
#divide by the column range (max - min)
norm_variables20 = variables20.sub(variables20.min()).div(
    variables20.max() - variables20.min()
)

#fit OLS of target20 on the scaled predictors and show the summary as the
#cell output (sm.OLS does not add an intercept column on its own)
norm_model20 = sm.OLS(endog=target20, exog=norm_variables20).fit()
norm_model20.summary()

In [None]:
#2020 normalized linear regressions for each variable
# Fits one single-predictor linear model per normalized column and collects
# the intercept, slope and R^2 of each fit in one table.

#create list of variable names
cols = norm_variables20.columns.tolist()
#one estimator object, re-fit for every predictor
norm_regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []
for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = norm_variables20[[col]]
    norm_regr.fit(predictor, target20)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, norm_regr.intercept_.item(), norm_regr.coef_.item(),
                 norm_regr.score(predictor, target20)])

#turn list into dataframe
norm_linears20 = pd.DataFrame(rows, columns=['variable', 'intercept',
                                             'coefficient', 'r-squared'])

#show df ordered by largest to smallest coef
norm_linears20.sort_values(by='coefficient', ascending=False)

#### 2010 Regressions

In [None]:
### 2010 multi regression

#predictors for the 2010 rate, Puerto Rico rows only
variables10 = pr.loc[:, [
    'total_population', 'median_age',
    'white_pct', 'black_pct', 'native_pct', 'asian_pct', 'pacific_pct',
    'other_pct', 'two_pct', 'latino_pct', 'notLatino_pct',
    'house_units', 'occupied_pct', 'vacant_pct',
    'house_pct', 'bigapt_pct', 'mobilehome_pct',
    'owner_pct', 'renter_pct', 'no_telephone_pct',
    'median_value', 'rent_more_35_pct', 'median_income', 'no_int_pct',
]]

#response: 2010 response rates, kept as a separate one-column dataframe
target10 = pr.loc[:, ["2010_tract_rate"]]

#fit the multi-regression and show its summary as the cell output
#(sm.OLS does not add an intercept column on its own)
model10 = sm.OLS(endog=target10, exog=variables10).fit()
model10.summary()

In [None]:
### 2010 linear regression for each variable
# Fits one single-predictor linear model per variable and collects the
# intercept, slope and R^2 of each fit in one table.

#list of variable names
cols = variables10.columns.tolist()
#one simple linear estimator, re-fit for every predictor
regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []

for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = variables10[[col]]
    regr.fit(predictor, target10)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, regr.intercept_.item(), regr.coef_.item(),
                 regr.score(predictor, target10)])

#turn list into data frame
linears10 = pd.DataFrame(rows, columns=['variable', 'intercept',
                                        'coefficient', 'r-squared'])

#show data frame ordered coefficient largest --> smallest
linears10.sort_values(by='coefficient', ascending=False)

#### Normalized 2010 Regressions

In [None]:
### 2010 normalized multi-regression

#min-max scale each predictor column: subtract the column minimum, then
#divide by the column range (max - min)
norm_variables10 = variables10.sub(variables10.min()).div(
    variables10.max() - variables10.min()
)

#fit OLS of target10 on the scaled predictors and show the summary as the
#cell output (sm.OLS does not add an intercept column on its own)
norm_model10 = sm.OLS(endog=target10, exog=norm_variables10).fit()
norm_model10.summary()

In [None]:
### 2010 normalized linear regressions for each variable
# Fits one single-predictor linear model per normalized column and collects
# the intercept, slope and R^2 of each fit in one table.

#create list of variable names
cols = norm_variables10.columns.tolist()
#one estimator object, re-fit for every predictor
norm_regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []
for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = norm_variables10[[col]]
    norm_regr.fit(predictor, target10)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, norm_regr.intercept_.item(), norm_regr.coef_.item(),
                 norm_regr.score(predictor, target10)])

#turn list into data frame with these column names
norm_linears10 = pd.DataFrame(rows, columns=['variable', 'intercept',
                                             'coefficient', 'r-squared'])

#show df sorted by coefficient largest --> smallest
norm_linears10.sort_values(by='coefficient', ascending=False)

#### 2020 Edited Regressions
Edited = Run regressions with fewer variables

In [None]:
### 2020 edited multi-regressions

#"edited" predictor set for the 2020 rate, Puerto Rico rows only.
#'2010_tract_rate' is left out: with it the list would be identical to this
#section's full 2020 predictor list and the edited model would just repeat
#model20. The remaining columns match the edited set used in the
#"Without Puerto Rico" section
#NOTE(review): confirm this is the intended reduced variable set
variables_ed = pr[[
    'total_population', 'median_age',
    'white_pct', 'black_pct', 'native_pct', 'asian_pct', 'pacific_pct',
    'other_pct', 'two_pct', 'latino_pct', 'notLatino_pct',
    'house_units', 'occupied_pct', 'vacant_pct',
    'house_pct', 'bigapt_pct', 'mobilehome_pct',
    'owner_pct', 'renter_pct', 'no_telephone_pct',
    'median_value', 'rent_more_35_pct', 'median_income', 'no_int_pct',
]]

#what we want to predict - 2020 response rates - as a one-column dataframe
target_ed = pr[["2020_tract_rate"]]

#build and fit the multi-regression model
#(sm.OLS does not add an intercept column on its own)
model_ed = sm.OLS(target_ed, variables_ed).fit()
#show the model summary table as the cell output
model_ed.summary()

In [None]:
### 2020 edited linear regressions
# Fits one single-predictor linear model per edited variable and collects
# the intercept, slope and R^2 of each fit in one table.

#create list of variable names
cols = variables_ed.columns.tolist()
#one estimator object, re-fit for every predictor
regr = linear_model.LinearRegression()
#one [variable, intercept, coefficient, r-squared] row per predictor
rows = []

for col in cols:
    #double brackets keep the predictor as a one-column dataframe
    predictor = variables_ed[[col]]
    regr.fit(predictor, target_ed)
    #intercept_ and coef_ come back wrapped in one-element arrays; .item()
    #unwraps them to plain floats (and raises if there is ever more than one
    #value), so no bracket-stripping is needed afterwards
    rows.append([col, regr.intercept_.item(), regr.coef_.item(),
                 regr.score(predictor, target_ed)])

#turn list into data frame with these column names
linears_ed = pd.DataFrame(rows, columns=['variable', 'intercept',
                                         'coefficient', 'r-squared'])

#show df sorted by coefficient largest --> smallest
linears_ed.sort_values(by='coefficient', ascending=False)

In [None]:
### normalized 2020 edited variables multi regression

#min-max scale each edited predictor: subtract the column minimum, then
#divide by the column range (max - min)
norm_variables_ed = variables_ed.sub(variables_ed.min()).div(
    variables_ed.max() - variables_ed.min()
)

#fit OLS of target_ed on the scaled predictors and show the summary as the
#cell output (sm.OLS does not add an intercept column on its own)
norm_model_ed = sm.OLS(endog=target_ed, exog=norm_variables_ed).fit()
norm_model_ed.summary()