<a href="https://colab.research.google.com/github/tdiffendal/USAT/blob/master/Copy_of_census_responses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2020 Census Response Rate Analysis

### Theresa Diffendal, USA Today data intern, 06/2020

#### 2020 response rates from: https://2020census.gov/en/response-rates.html
#### 2010 response rates from: https://api.census.gov/data/2010/dec/responserate/variables.html
#### Demographic information in 2014-2018 ACS 5-year-estimate from: https://data2.nhgis.org/main

## Column Names

GEO_ID = Geographic Identifier

RESP_DATE = Posting Date

State = name of state (one of the 50 states, District of Columbia, Puerto Rico, or NaN)

Geo_Name = name of the tract, county, state

Region = region of the U.S. in which state is located as defined by census map at https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf

Geo_Type = type of geography; possible answers include Census Tract, Congressional District, Consolidated City, Country, County, County Subdivision, Place, Region, State, Tribal Tract, Tribal Area

DRRINT = Daily Self-Response Rate - Internet

DRRALL = Daily Self-Response Rate – Overall

CRRINT = Cumulative Self-Response Rate - Internet; renamed internet

not_int = new calculated column showing response rate NOT from internet

CRRALL = Cumulative Self-Response Rate – Overall; renamed 2020_rate

DINTMIN = Minimum Daily Internet Self-Response Rate

DMIN = Minimum Daily Overall Self-Response Rate

CINTMIN = Minimum Cumulative Internet Self-Response Rate

CMIN = Minimum Cumulative Overall Self-Response Rate

DINTMAX = Maximum Daily Internet Self-Response Rate

DMAX = Maximum Daily Overall Self-Response Rate

CINTMAX = Maximum Cumulative Internet Self-Response Rate

CMAX = Maximum Cumulative Overall Self-Response Rate

DINTAVG = Average Daily Internet Self-Response Rate

DAVG = Average Daily Overall Self-Response Rate

CINTAVG = Average Cumulative Internet Self-Response Rate

CAVG = Average Cumulative Overall Self-Response Rate

DINTMED = Median Daily Internet Self-Response Rate

DMED = Median Daily Overall Self-Response Rate

CINTMED = Median Cumulative Internet Self-Response Rate

CMED = Median Cumulative Overall Self-Response Rate

## Read, Merge, Clean Data

### Initial Load and Merge

In [1]:
import pandas as pd
import numpy as np

# read in 2020 response rates
initial_df = pd.read_csv('https://www2.census.gov/programs-surveys/decennial/2020/data/2020map/2020/decennialrr2020.csv')
# had to download from https://www2.census.gov/programs-surveys/decennial/2020/data/2020map/2020/ resave as UTF-8 CSV, hence the 2
crosswalk = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/decennialrr2020_crosswalkfile2.csv')
# states paired with region as defined by census map at https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf
regions = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/state_region.csv')

# merge responses and crosswalk
temp = pd.merge(initial_df, crosswalk, on='GEO_ID')

#merge merged1 with region data
merged = pd.merge(temp, regions, on='State')

# create column showing responses not from internet
merged['not_int'] = merged.CRRALL - merged.CRRINT
merged['not_int_pct'] = (merged.not_int) * 100 / merged.CRRALL

#reorder columns to move State, Geo_Name and Geo_Type to front; also going to drop some values
cols = merged.columns.tolist()
cols = ['GEO_ID', 'RESP_DATE', 'State', 'Geo_Name', 'Region', 'Geo_Type', 
        'CRRINT', 'not_int', 'not_int_pct', 'CRRALL']
merged = merged[cols]
merged = merged.rename(columns={'CRRINT':'internet', 'CRRALL':'2020_rate'})
#merged

### States

#### 2020 data

In [2]:
# create df with response rate by state
states2020 = merged[merged['Geo_Type'] == 'State']
states2020 = states2020.rename(columns={"internet": "state_internet", 
                                        "not_int" : "state_not_int", 
                                        'not_int_pct' : 'state_not_int_pct',
                                        "2020_rate" : "2020_state_rate"})

# print df and sort by highest cumulative response rate
#states2020.sort_values(by='2020_state_rate', ascending=False)

#### Join 2010 states

In [3]:
# read in csvs with 2010 response data for states
states2010 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/states2010.csv')

# merge with 2020 states
states = pd.merge(states2020, states2010, on='State')
#get the column names
cols = states.columns.tolist()
#only select columns we want
cols = ['GEO_ID', 'State', 'Region',
 '2020_state_rate', '2010_rate', '2000_rate']
states = states[cols]
states = states.rename(columns={'2000_rate':'2000_state_rate', '2010_rate':'2010_state_rate'})

#create column with difference in 2010 vs 2020 response rate
states['10_20_state_difference'] = states['2020_state_rate'] - states['2010_state_rate']

#print table sorted by 10-20 difference largest ---> smallest
#states.sort_values(by=['2000_state_rate'], ascending=True)

### Census Tracts

#### 2020 Tracts

In [4]:
# select just census tract geo types
tracts2020 = merged[merged['Geo_Type'].str.contains("Tract")]
#rename column
tracts2020 = tracts2020.rename(columns={"2020_rate": "2020_tract_rate",
                                        'not_int':'tract_not_int',
                                        'not_int_pct':'tract_not_int_pct'})
# sort by highest cumulative response rate
#tracts2020.sort_values(by='2020_tract_rate', ascending=False)

In [5]:
#tract rates compared to state averages
tract2020states = pd.merge(tracts2020, states, on=['State', 'Region'])
tract2020states = tract2020states[['GEO_ID_x', 'State', 'Geo_Name', 'Geo_Type', 'Region','2020_tract_rate', '2020_state_rate', '2010_state_rate', '10_20_state_difference']]
tract2020states = tract2020states.rename(columns={'GEO_ID_x':'GEO_ID'})
tract2020states['2020_tract_st_diff'] = tract2020states['2020_tract_rate'] - tract2020states['2020_state_rate']
#tract2020states.sort_values(by=['2020_tract_st_diff'])
#merging tracts with states will drop tribal tracts (as they have no state), so those are examined separately below

#### Join 2010 tract rates

In [6]:
# read in csvs with 2010 response data for tracts and states
tracts2010 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/2010responserate.csv')
#rename this column
tracts2010 = tracts2010.rename(columns={'FSRR2010':'2010_tract_rate'})
#tracts2010

In [7]:
## difference in row numbers: both tract dfs have 84519 rows, but when joined only 84093
# Identify what values are in tracts2010 and not in tracts2020
key_diff1 = set(tracts2010.GEO_ID).difference(tracts2020.GEO_ID)
len(key_diff1)
#key_diff1

# Identify what values are in tracts2020 and not in tracts2010
key_diff2 = set(tracts2020.GEO_ID).difference(tracts2010.GEO_ID)
len(key_diff2)
#key_diff2

# 2010 rates do not include tribal tracts while 2020 tracts are missing some 
# tracts in a multitude of states, likely due to a change in tract boundaries. 
# Those differences account for 426 tracts, which is .5% of the original 84519 
# tracts. As these tracts comprise a small percentage, they can be dropped. 

426

In [8]:
# merge with 2020 tracts
tracts = pd.merge(tract2020states, tracts2010, on='GEO_ID')
#get column names
cols = tracts.columns.tolist()
#select only columns we want
cols = ['Geo_Name','county', 'State_y', 'Region', 'Geo_Type', '2020_tract_rate', '2010_tract_rate', '2020_tract_st_diff', '2020_state_rate', '2010_state_rate', '10_20_state_difference']
tracts = tracts[cols]
#rename weird column name
tracts = tracts.rename(columns={'State_y':'State'})
#print df sorted largest --> smallest 2010 rate
#tracts.sort_values(by='2010_tract_rate', ascending=False)

In [9]:
#how many null 2010 response values are there: 531, which is .6% of all rows, 84093
is_temp = tracts.isnull()
no_2010 = is_temp.any(axis=1)
no_2010 = tracts[no_2010]
no_2010.sort_values(by="2010_tract_rate")

#due to the low percentage, these null values will be discarded
tracts = tracts.dropna(axis=0)
#check they'd discraded
#tracts.sort_values(by='2010_tract_rate')
#there are no 2010 na values, so all were dropped

In [10]:
#create column with difference in 2010 vs 2020 response rate
tracts['10_20_tract_difference'] = tracts['2020_tract_rate'] - tracts['2010_tract_rate']
#sort df largest --> smallest 10-20 difference
#tracts.sort_values(by='10_20_tract_difference', ascending=False)

### Demographic Data

In [11]:
#load both sets of demographic data, join
temp1 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/demographics1_working.csv')
temp2 = pd.read_csv('https://raw.githubusercontent.com/tdiffendal/USAT/master/census-responses/demographics2_working.csv')
demo = pd.merge(temp1, temp2, on=['GISJOIN', "YEAR", "STATE", "STATEA", 
                                              'COUNTY', 'COUNTYA', 'TRACTA', 
                                              'Geo_Name', 'NAME_E'])

# merge to create new df with response rates and demos for all tracts
temp = pd.merge(tracts, demo, on='Geo_Name')

#create new column adding up pop with rent > 30% income (homelessness marker)
temp['rent_30_more'] = (temp['rent_30_34.9'] + temp['rent_35_39.9'] 
+ temp['rent_40_49.9'] + temp['rent_50_over'])
# check to see if column created
#pd.set_option('display.max_columns', 100)
#temp

In [12]:
#df.std()Returns the standard deviation of each column
#df.corr()Returns the correlation between columns in a data frame

#In order to get # of null/missing values for each column, run 
pd.set_option('display.max_rows', 100)
pd.isnull(temp).sum()

# make a dataframe of all rows with na value
temp1 = temp[temp.isna().any(axis=1)]
temp1
# how many nas in each state
temp2 = temp1['State'].value_counts()
temp2 = pd.DataFrame(temp2)
temp2 = temp2.reset_index().rename(columns={'index':'state', 'State':'count_na'})
temp2
#compare to total 
temp3 = temp['State'].value_counts()
temp3 = pd.DataFrame(temp3)
temp3 = temp3.reset_index().rename(columns={'index':'state', 'State':'count'})
temp3

#see what percent of state values of na
#Puerto Rico will lose the greatest % if these are dropped
temp4 = pd.merge(temp2, temp3, on=['state'])
temp4['na_percent'] = (temp4['count_na']*100) / temp4['count']
temp4

#how many total nas? --> 1171
sum(temp4['count_na'])

#nas account for what total % of all rows --> 1.82
(sum(temp4['count_na'])*100) / sum(temp4['count'])

#Less than 2% of total, so for now will drop those rows
# before df had 64413 rows
df = temp.dropna()    #now has 63242 rows, 1171 difference (total # nas)

In [13]:
#check if any nas
#pd.set_option('display.max_rows', None)
#print(pd.isnull(df).sum())

#still 52 states? (50 + DC and PR)
#print("Number states: ", len(df['State'].unique()))

#compare tract numbers now to before na drop
temp1 = pd.DataFrame(temp.groupby('State')['Geo_Name'].nunique()).rename(columns={'Geo_Name':'beforeDrop'})
temp2 = pd.DataFrame(df.groupby('State')['Geo_Name'].nunique()).rename(columns={'Geo_Name':'afterDrop'})
temp3 = pd.merge(temp1, temp2, left_index = True, right_index=True)
#number tracts dropped from each state
temp3['numDrop'] = temp3['beforeDrop'] - temp3['afterDrop']
# the percentage of total tracts dropped
temp3['dropPct'] = temp3['numDrop']*100 / temp3['beforeDrop']
#temp3 #puerto rico loses the most tracts at 5.7%

In [14]:
#return column size
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_row', 50)

#print df with no nas
#df.to_csv("all_rates_demos_merged.csv")

## Analysis

In [15]:
##see all dfs in memory
%whos DataFrame

#### Existing DFs:

#dataframe with all years, states, tracts, demographics
#df

#2010 and 2020 states
#states

#2010 States
#states2010

#2020 States
#states2020

#2010 and 2020 tracts and states
#tracts

#2020 tracts paired with states, includes int data
#tracts2020states

#ignore: crosswalk, demo, demo1, demo2, df1, df2, df3, df4, df5
        #hm, hmm, hmmm, initial_df, merged1

Variable          Type         Data/Info
----------------------------------------
crosswalk         DataFrame                   GEO_ID    <...>[125338 rows x 4 columns]
demo              DataFrame                  GISJOIN    <...>[73976 rows x 91 columns]
df                DataFrame                       Geo_Na<...>63242 rows x 103 columns]
initial_df        DataFrame                   GEO_ID   R<...>123252 rows x 22 columns]
is_temp           DataFrame           Geo_Name  county  <...>[84093 rows x 11 columns]
merged            DataFrame                         GEO_<...>123250 rows x 10 columns]
no_2010           DataFrame                         Geo_<...>\n[531 rows x 11 columns]
regions           DataFrame              State       Reg<...>\n\n[54 rows x 2 columns]
states            DataFrame             GEO_ID          <...>\n\n[52 rows x 7 columns]
states2010        DataFrame                State Geo_Typ<...>\n\n[52 rows x 4 columns]
states2020        DataFrame                 GEO_

In [16]:
i = states.mean(axis=0)['2020_state_rate']
print(
    i,
    "% is the current nationwide average and",
    np.sum(df['2020_state_rate'] > i),
    "states exceed that")

60.35576923076923 % is the current nationwide average and 39079 states exceed that


In [17]:
#how many tracts are > than state avg?

#tract2020States.count(tract2020States['2020_tract_st_diff'] > 0)
print(np.sum(tract2020states['2020_tract_st_diff'] > 0), "tracts out of", 
      len(tract2020states), 
      "total (", 
      (np.sum(tract2020states['2020_tract_st_diff'] > 0) * 100) / len(tract2020states), "% )",
      "currently have greater census response rates than their state average")

44995 tracts out of 84093 total ( 53.506237142211596 % ) currently have greater census response rates than their state average


### National Comparative Rankings 2010 vs 2020

In [None]:
#Discrepancies? Shouldn't these all match up?

print(
  "2010 State Resonse Average from states data:",
    states.mean(axis=0)['2010_state_rate'], "\n",
  "2010 States Response Average from df data:",
    df.mean(axis=0)['2010_state_rate'], "\n",
  "2010 Tract Response Average from df data:",
    df.mean(axis=0)['2010_tract_rate']
)
## why are these the same as above what's the difference between an average and a totallllllll
print(
  "2010 State Resonse Total from states data:",
    (states['2010_state_rate'].sum()) / (states.shape[0]), "\n",
  "2010 States Response Total from df data:",
    (df['2010_state_rate'].sum()) / (df.shape[0]), "\n",
  "2010 Tract Response Total from df data:",
    (df['2010_tract_rate'].sum()) / (df.shape[0])
)

In [None]:
# Average difference between current state rates and 2010 state rates as of 6/15/20
#takes a while to run so commented out
#df.mean(axis=0)['10_20_state_difference']

#average 2020 response rate across states?
print(
  "2020 State Resonse Average from states data:",
    states.mean(axis=0)['2020_state_rate'], "\n",
  "2020 States Response Average from df data:",
    df.mean(axis=0)['2020_state_rate'], "\n",
  "2020 Tract Response Average from df data:",
    df.mean(axis=0)['2020_tract_rate']
)

print(
  "2020 State Resonse Total from states data:",
    (states['2020_state_rate'].sum()) / (states.shape[0] * 100), "\n",
  "2020 States Response Total from df data:",
    (df['2020_state_rate'].sum()) / (df.shape[0] * 100), "\n",
  "2020 Tract Response Total from df data:",
    (df['2020_tract_rate'].sum()) / (df.shape[0] * 100)
)

In [None]:
# average difference by region
states.groupby('Region').mean().sort_values(by='10_20_state_difference', ascending=False)

In [None]:
#assign ranks to states based on comparative response rate
states['2020_rank'] = states['2020_state_rate'].rank(method='max', ascending=False)
states['2010_rank'] = states['2010_state_rate'].rank(method='max', ascending=False)

#pull ranks into separate dataframe
state_ranks = states[['State', '2020_rank', '2010_rank']].sort_values(by='2020_rank')

#show change in rank from 2010 to 2020
#negative number means a state has a lower 2020 response rate and has gone down in rankings
state_ranks['rank_change'] = state_ranks['2010_rank'] - state_ranks['2020_rank']
state_ranks.sort_values(by='rank_change', ascending=True)

#see how many states only changed 2 or fewer positions
#small_change = state_ranks[state_ranks.rank_change.between(-2, 2, inclusive=True)].sort_values(by='rank_change')
#small_change
#16 states have stayed ~similar in the rankings, and this seems to impact
#states with both high and low response rates
#small_change.mean(axis=0)['2020_rank']

### Internet Usage

Internet usage is only available for 2020 rates

In [None]:
### Percent of response rate not from internet

states2020.sort_values(by='state_not_int_pct', ascending=False)
states2020.mean(axis=0)['state_not_int_pct']
states2020.mean(axis=0)['state_not_int']

In [None]:
# average non internet response rate
states2020.groupby(by='Region').mean().sort_values(by='state_internet', ascending=False)

In [None]:
# highest non-internet response rate (not_int)
states2020.sort_values(by='state_not_int', ascending=False)

In [None]:
states2020.mean(axis=0)['state_not_int']

In [None]:
states2020.mean(axis=0)['state_internet']

In [None]:
states2020.mean(axis=0)['2020_state_rate']

In [None]:
### Without Puerto Rico

#make non-pr df
no_pr = states2020[states2020['State'] != 'Puerto Rico']

# average internet response
no_pr.mean(axis=0)['state_internet']

In [None]:
# average overall response rate
no_pr.mean(axis=0)['2020_state_rate']

### Region Analysis

In [None]:
# average by region
# NOTE as tribal tracts are not assigned to a state they do not have a corresponding region and thus are not counted in the regional calculations
states.groupby('Region').mean().sort_values(by='2020_state_rate', ascending=False)

In [None]:
# average difference as of 6/15/20
#this can take a while to run so is commented out unless needed
#tracts.mean(axis=0)['10_20_tract_difference']

In [None]:
# average difference by region
tracts.groupby('Region').mean().sort_values(by='10_20_tract_difference', ascending=False)

In [None]:
#tract average differences vs state rates
tracts.groupby('State').mean().sort_values(by='2020_state_rate', ascending=False)

### Tribal tracts

In [None]:
# create df with response rates in tribal tracts
tribal = tracts2020[tracts2020['Geo_Type'].str.contains("Tribal")]
tribal.sort_values(by='2020_tract_rate', ascending=False)

In [None]:
### tribal areas and tracts stats

#mean non internet response
tribal.mean(axis=0)['tract_not_int'] #8.37%

In [None]:
# mean internet response rate
tribal.mean(axis=0)['internet']

In [None]:
# mean overall response rate
tribal.mean(axis=0)['2020_tract_rate']

### Tracts with 0 overall response rate

In [None]:
## Tracts with 0 cumulative response rate: 28
is_zero = tracts['2020_tract_rate'] == 0.0
zeros = tracts[is_zero]
zeros.sort_values(by='State')

## Tracts with 0 cumulative response rate: 28

#make dataframe of states with # tracts with 0%, number total tracts, and what % of total tracts are 0
temp = pd.DataFrame(zeros['State'].value_counts())
temp2 = pd.DataFrame(tracts['State'].value_counts())
temp3 = pd.merge(temp, temp2, right_index=True, left_index=True)
#rename the columns
temp3 = temp3.rename(columns={"State_x": "0_tracts", "State_y" : "total_tracts"})
#compute percentage
temp3['0_percent'] = temp3['0_tracts'] * 100 / temp3['total_tracts']
temp3

## Regressions

### With Puerto Rico

#### 2020 Regressions

In [None]:
### 2020 Multi-regression

import statsmodels.api as sm

#put all variables for predicting 2020 rates in dataframe
variables20 = df[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target20 = df[["2020_tract_rate"]]

#build model and print summary
model20 = sm.OLS(target20, variables20).fit()
print(model20.summary())

In [None]:
### 2020 linear regressions for each variable

from sklearn import linear_model

#create list of variable names
cols = variables20.columns.tolist()
#build linear model
regr = linear_model.LinearRegression()
#create empty list to store loop results
rows = []
#loop through each variable
for i in cols:
    #fit linear model to variable
    regr.fit(variables20[[i]], target20)
    #save model variable name, intercept, coef and r^2 to list
    rows.append([i, regr.intercept_, regr.coef_, regr.score(variables20[[i]], target20)])

#turn list into df with these column names
linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets lol
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['intercept'] = linears20['intercept'].str.get(0)

#print df sorted coefs largest --> smallest
linears20.sort_values(by='coefficient', ascending=False)

#### Normalized 2020 inputs

In [None]:
### 2020 Multi regression with normalized variables

#normalize variable values
norm_variables20 = (variables20 - variables20.min()) / (variables20.max() - variables20.min())

#build normalized multi-regress model and print summary
norm_model20 = sm.OLS(target20, norm_variables20).fit()
print(norm_model20.summary())

In [None]:
#2020 normalized linear regressions for each variable

# linear regression for each variable 'i'
cols = norm_variables20.columns.tolist()
# create model
norm_regr = linear_model.LinearRegression()
#empty list for loop results
rows = []
#loop through each variable
for i in cols:
    #fit model to each variable
    norm_regr.fit(norm_variables20[[i]], target20)
    #add model results to list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_, 
                 norm_regr.score(norm_variables20[[i]], target20)])

#turn list into dataframe
norm_linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['intercept'] = norm_linears20['intercept'].str.get(0)

#print df ordered by largest to smallest coef
norm_linears20.sort_values(by='coefficient', ascending=False)

#### 2010 Regressions

In [None]:
### 2010 multi regression

#put all variables for predicting 2010 rates in dataframe
variables10 = df[['total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2010 response rates - in separate dataframe
target10 = df[["2010_tract_rate"]]

#create model and print summary table
model10 = sm.OLS(target10, variables10).fit()
print(model10.summary())

In [None]:
### 2010 linear regression for each variable

#list of variable names
cols = variables10.columns.tolist()
#build multi-reg model
regr = linear_model.LinearRegression()
#create empty list for loop results
rows = []

#loop through variables
for i in cols:
    #fit a model to the current variable
    regr.fit(variables10[[i]], target10)
    #save the model's resulting variable name, intercept, coef, and r^2
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables10[[i]], target10)])

#turn list into data frame
linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['intercept'] = linears10['intercept'].str.get(0)

#print data frame ordered coefficient largest --> smallest
linears10.sort_values(by='coefficient', ascending=False)

#### Normalized 2010 Regressions

In [None]:
### 2010 normalized multi-regression

#normalize the variables
norm_variables10 = (variables10 - variables10.min()) / (variables10.max() - variables10.min())

#build normalized model and print summary
norm_model10 = sm.OLS(target10, norm_variables10).fit()
print(norm_model10.summary())

In [None]:
###2010 normalized linear regressions for each variable

#create list of variable names
cols = norm_variables10.columns.tolist()
#build the model
norm_regr = linear_model.LinearRegression()
#create empty list for model results
rows = []
#cycle through variables
for i in cols:
    #do the linear regression on the current variable
    norm_regr.fit(norm_variables10[[i]], target10)
    #add the corresponding variable name, intercept, coefficient and r-squared to the list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_,
                norm_regr.score(norm_variables10[[i]], target10)])

#turn list into data frame with these column names
norm_linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['intercept'] = norm_linears10['intercept'].str.get(0)

#print df sorted by coefficient
norm_linears10.sort_values(by='coefficient', ascending=False)

#### 2020 Edited Regressions
Edited = Run regressions with fewer variables

In [None]:
### 2020 edited multi-regressions

#put all variables for predicting 2020 rates in dataframe
variables_ed = df[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other', 'income_poverty_ratio', 'per_capita_income',
 'civilian_employed', 'civilian_unemployed',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income','rent_30_more',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer',
 'us_born', 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target_ed = df[["2020_tract_rate"]]

#build and fit the multi-regression model
model_ed = sm.OLS(target_ed, variables_ed).fit()
#print out the model summary table
print(model_ed.summary())

In [None]:
### 2020 edited linear regressions

#create list of variable names
cols = variables_ed.columns.tolist()
#build the model
regr = linear_model.LinearRegression()
#create empty list to append results
rows = []

#loop through variables
for i in cols:
    #fit the model
    regr.fit(variables_ed[[i]], target_ed)
    #put model variable name, intercept, coef and r^2 in list
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables_ed[[i]], target_ed)])

#turn list into data frame with these column names
linears_ed = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['intercept'] = linears_ed['intercept'].str.get(0)

#print df sorted by coefficient largest --> smallest
linears_ed.sort_values(by='coefficient', ascending=False)

In [None]:
### normalized 2020 edited variables multi regression

#normalize the variables
norm_variables_ed = (variables_ed - variables_ed.min()) / (variables_ed.max() - variables_ed.min())

#build normalized model and print summary
norm_model_ed = sm.OLS(target_ed, norm_variables_ed).fit()
print(norm_model_ed.summary())

### Without Puerto Rico

#### 2020 regressions

In [None]:
### 2020 Multi-regression

#df without puerto rico for regression 
no_pr = df[df['Region'] != 'Puerto Rico']
#this is different from earlier code no_pr = states[states.State != 'Puerto Rico']
#earlier code omitted Puerto Rico from state level data. this eliminates from tract level data


import statsmodels.api as sm

#put all variables for predicting 2020 rates in dataframe
variables20 = no_pr[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target20 = no_pr[["2020_tract_rate"]]

#build model and print summary
model20 = sm.OLS(target20, variables20).fit()
model20.summary()

In [None]:
### 2020 linear regressions for each variable

from sklearn import linear_model

#create list of variable names
cols = variables20.columns.tolist()
#build linear model
regr = linear_model.LinearRegression()
#create empty list to store loop results
rows = []
#loop through each variable
for i in cols:
    #fit linear model to variable
    regr.fit(variables20[[i]], target20)
    #save model variable name, intercept, coef and r^2 to list
    rows.append([i, regr.intercept_, regr.coef_, regr.score(variables20[[i]], target20)])

#turn list into df with these column names
linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets lol
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['intercept'] = linears20['intercept'].str.get(0)

#print df sorted coefs largest --> smallest
linears20.sort_values(by='coefficient', ascending=False)

#### Normalized 2020 inputs

In [None]:
### 2020 Multi regression with normalized variables

#normalize variable values
norm_variables20 = (variables20 - variables20.min()) / (variables20.max() - variables20.min())

#build normalized multi-regress model and print summary
norm_model20 = sm.OLS(target20, norm_variables20).fit()
norm_model20.summary()

In [None]:
#2020 normalized linear regressions for each variable

# linear regression for each variable 'i'
cols = norm_variables20.columns.tolist()
# create model
norm_regr = linear_model.LinearRegression()
#empty list for loop results
rows = []
#loop through each variable
for i in cols:
    #fit model to each variable
    norm_regr.fit(norm_variables20[[i]], target20)
    #add model results to list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_, 
                 norm_regr.score(norm_variables20[[i]], target20)])

#turn list into dataframe
norm_linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['intercept'] = norm_linears20['intercept'].str.get(0)

#print df ordered by largest to smallest coef
norm_linears20.sort_values(by='coefficient', ascending=False)

#### 2010 Regressions

In [None]:
### 2010 multi regression

#put all variables for predicting 2010 rates in dataframe
variables10 = no_pr[['total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2010 response rates - in separate dataframe
target10 = no_pr[["2010_tract_rate"]]

#create model and print summary table
model10 = sm.OLS(target10, variables10).fit()
model10.summary()

In [None]:
### 2010 linear regression for each variable

#list of variable names
cols = variables10.columns.tolist()
#build multi-reg model
regr = linear_model.LinearRegression()
#create empty list for loop results
rows = []

#loop through variables
for i in cols:
    #fit a model to the current variable
    regr.fit(variables10[[i]], target10)
    #save the model's resulting variable name, intercept, coef, and r^2
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables10[[i]], target10)])

#turn list into data frame
linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['intercept'] = linears10['intercept'].str.get(0)

#print data frame ordered coefficient largest --> smallest
linears10.sort_values(by='coefficient', ascending=False)

#### Normalized 2010 Regressions

In [None]:
### 2010 normalized multi-regression

#normalize the variables
norm_variables10 = (variables10 - variables10.min()) / (variables10.max() - variables10.min())

#build normalized model and print summary
norm_model10 = sm.OLS(target10, norm_variables10).fit()
norm_model10.summary()

In [None]:
###2010 normalized linear regressions for each variable

#create list of variable names
cols = norm_variables10.columns.tolist()
#build the model
norm_regr = linear_model.LinearRegression()
#create empty list for model results
rows = []
#cycle through variables
for i in cols:
    #do the linear regression on the current variable
    norm_regr.fit(norm_variables10[[i]], target10)
    #add the corresponding variable name, intercept, coefficient and r-squared to the list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_,
                norm_regr.score(norm_variables10[[i]], target10)])

#turn list into data frame with these column names
norm_linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['intercept'] = norm_linears10['intercept'].str.get(0)

#print df sorted by coefficient
norm_linears10.sort_values(by='coefficient', ascending=False)

#### 2020 Edited Regressions
Edited = Run regressions with fewer variables

In [None]:
### 2020 edited multi-regressions

#put all variables for predicting 2020 rates in dataframe
variables_ed = no_pr[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other', 'income_poverty_ratio', 'per_capita_income',
 'civilian_employed', 'civilian_unemployed',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income','rent_30_more',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer',
 'us_born', 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target_ed = no_pr[["2020_tract_rate"]]

#build and fit the multi-regression model
model_ed = sm.OLS(target_ed, variables_ed).fit()
#print out the model summary table
model_ed.summary()

In [None]:
### 2020 edited linear regressions

#create list of variable names
cols = variables_ed.columns.tolist()
#build the model
regr = linear_model.LinearRegression()
#create empty list to append results
rows = []

#loop through variables
for i in cols:
    #fit the model
    regr.fit(variables_ed[[i]], target_ed)
    #put model variable name, intercept, coef and r^2 in list
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables_ed[[i]], target_ed)])

#turn list into data frame with these column names
linears_ed = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['intercept'] = linears_ed['intercept'].str.get(0)

#print df sorted by coefficient largest --> smallest
linears_ed.sort_values(by='coefficient', ascending=False)

In [None]:
### normalized 2020 edited variables multi regression

#normalize the variables
norm_variables_ed = (variables_ed - variables_ed.min()) / (variables_ed.max() - variables_ed.min())

#build normalized model and print summary
norm_model_ed = sm.OLS(target_ed, norm_variables_ed).fit()
norm_model_ed.summary()

### Puerto Rico

In [None]:
#create df with just puerto rico
is_pr = df['Region'] == 'Puerto Rico'
pr = df[is_pr]
pr

#### 2020 regressions

In [None]:
### 2020 Multi-regression

import statsmodels.api as sm

#put all variables for predicting 2020 rates in dataframe
variables20 = pr[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target20 = pr[["2020_tract_rate"]]

#build model and print summary
model20 = sm.OLS(target20, variables20).fit()
model20.summary()

In [None]:
### 2020 linear regressions for each variable

from sklearn import linear_model

#create list of variable names
cols = variables20.columns.tolist()
#build linear model
regr = linear_model.LinearRegression()
#create empty list to store loop results
rows = []
#loop through each variable
for i in cols:
    #fit linear model to variable
    regr.fit(variables20[[i]], target20)
    #save model variable name, intercept, coef and r^2 to list
    rows.append([i, regr.intercept_, regr.coef_, regr.score(variables20[[i]], target20)])

#turn list into df with these column names
linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets lol
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['coefficient'] = linears20['coefficient'].str.get(0)
linears20['intercept'] = linears20['intercept'].str.get(0)

#print df sorted coefs largest --> smallest
linears20.sort_values(by='coefficient', ascending=False)

#### Normalized 2020 inputs

In [None]:
### 2020 Multi regression with normalized variables

#normalize variable values
norm_variables20 = (variables20 - variables20.min()) / (variables20.max() - variables20.min())

#build normalized multi-regress model and print summary
norm_model20 = sm.OLS(target20, norm_variables20).fit()
norm_model20.summary()

In [None]:
#2020 normalized linear regressions for each variable

# linear regression for each variable 'i'
cols = norm_variables20.columns.tolist()
# create model
norm_regr = linear_model.LinearRegression()
#empty list for loop results
rows = []
#loop through each variable
for i in cols:
    #fit model to each variable
    norm_regr.fit(norm_variables20[[i]], target20)
    #add model results to list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_, 
                 norm_regr.score(norm_variables20[[i]], target20)])

#turn list into dataframe
norm_linears20 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['coefficient'] = norm_linears20['coefficient'].str.get(0)
norm_linears20['intercept'] = norm_linears20['intercept'].str.get(0)

#print df ordered by largest to smallest coef
norm_linears20.sort_values(by='coefficient', ascending=False)

#### 2010 Regressions

In [None]:
### 2010 multi regression

#put all variables for predicting 2010 rates in dataframe
variables10 = pr[['total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone',
 'other_alone', 'two_or_more', 'two_more_including_other',
 'two_more_excluding_other', 'total_education', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master', 'prof_school', 'doctorate', 'language_total',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other_indo_euro', 'lang_asian_pacific_island',
 'lang_other', 'income_poverty_ratio', 'income_poverty_under_half',
 'income_poverty_half_.99', 'income_povery_1_1.24',
 'income_poverty_1.25_1.49', 'income_poverty_1.5_1.84',
 'income_poverty_1.85_1.99', 'income_poverty_2_over',
 'median_household_income', 'per_capita_income',
 'employment_total', 'labor_force', 'civilian_labor_force',
 'civilian_employed', 'civilian_unemployed', 'armed_forces',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'total_occupied_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income',
 'rent_less_10', 'rent_10_14.9', 'rent_15_19.9', 'rent_20_24.9',
 'rent_25_29.9', 'rent_30_34.9', 'rent_35_39.9', 'rent_40_49.9',
 'rent_50_over', 'rent_30_more', 'rent_not_computed', 'total_computer_status',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer', 'us_pop',
 'us_born', 'us_territory_born', 'us_born_abroad',
 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2010 response rates - in separate dataframe
target10 = pr[["2010_tract_rate"]]

#create model and print summary table
model10 = sm.OLS(target10, variables10).fit()
model10.summary()

In [None]:
### 2010 linear regression for each variable

#list of variable names
cols = variables10.columns.tolist()
#build multi-reg model
regr = linear_model.LinearRegression()
#create empty list for loop results
rows = []

#loop through variables
for i in cols:
    #fit a model to the current variable
    regr.fit(variables10[[i]], target10)
    #save the model's resulting variable name, intercept, coef, and r^2
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables10[[i]], target10)])

#turn list into data frame
linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['coefficient'] = linears10['coefficient'].str.get(0)
linears10['intercept'] = linears10['intercept'].str.get(0)

#print data frame ordered coefficient largest --> smallest
linears10.sort_values(by='coefficient', ascending=False)

#### Normalized 2010 Regressions

In [None]:
### 2010 normalized multi-regression

#normalize the variables
norm_variables10 = (variables10 - variables10.min()) / (variables10.max() - variables10.min())

#build normalized model and print summary
norm_model10 = sm.OLS(target10, norm_variables10).fit()
norm_model10.summary()

In [None]:
###2010 normalized linear regressions for each variable

#create list of variable names
cols = norm_variables10.columns.tolist()
#build the model
norm_regr = linear_model.LinearRegression()
#create empty list for model results
rows = []
#cycle through variables
for i in cols:
    #do the linear regression on the current variable
    norm_regr.fit(norm_variables10[[i]], target10)
    #add the corresponding variable name, intercept, coefficient and r-squared to the list
    rows.append([i, norm_regr.intercept_, norm_regr.coef_,
                norm_regr.score(norm_variables10[[i]], target10)])

#turn list into data frame with these column names
norm_linears10 = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['coefficient'] = norm_linears10['coefficient'].str.get(0)
norm_linears10['intercept'] = norm_linears10['intercept'].str.get(0)

#print df sorted by coefficient
norm_linears10.sort_values(by='coefficient', ascending=False)

#### 2020 Edited Regressions
Edited = Run regressions with fewer variables

In [None]:
### 2020 edited multi-regressions

#put all variables for predicting 2020 rates in dataframe
variables_ed = pr[['2010_tract_rate','total_population', 
 'white_alone', 'black_alone', 'amerindian_alone', 'asian_alone', 
 'pacific_islander_alone', 'no_school',
 'some_school', 'diploma', 'ged', 'some_college', 'associate',
 'bachelor', 'master',
 'lang_english_only', 'lang_spanish', 'lang_spanish_limited_english', 
 'lang_other', 'income_poverty_ratio', 'per_capita_income',
 'civilian_employed', 'civilian_unemployed',
 'not_labor_force', 'total_houses', 'occupied_houses',
 'vacant_houses', 'owner_occupied',
 'renter_occupied', 'median_gross_rent', 'rent_to_income','rent_30_more',
 'has_computer', 'dial_up_computer', 'broadband_computer',
 'no_internet_computer', 'no_computer',
 'us_born', 'us_naturalization', 'not_us_citizen']]

#what we want to predict - 2020 response rates - in dataframe
target_ed = pr[["2020_tract_rate"]]

#build and fit the multi-regression model
model_ed = sm.OLS(target_ed, variables_ed).fit()
#print out the model summary table
model_ed.summary()

In [None]:
### 2020 edited linear regressions

#create list of variable names
cols = variables_ed.columns.tolist()
#build the model
regr = linear_model.LinearRegression()
#create empty list to append results
rows = []

#loop through variables
for i in cols:
    #fit the model
    regr.fit(variables_ed[[i]], target_ed)
    #put model variable name, intercept, coef and r^2 in list
    rows.append([i, regr.intercept_, regr.coef_,
                regr.score(variables_ed[[i]], target_ed)])

#turn list into data frame with these column names
linears_ed = pd.DataFrame(rows, columns=['variable', 'intercept', 'coefficient', 'r-squared'])

#remove square brackets
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['coefficient'] = linears_ed['coefficient'].str.get(0)
linears_ed['intercept'] = linears_ed['intercept'].str.get(0)

#print df sorted by coefficient largest --> smallest
linears_ed.sort_values(by='coefficient', ascending=False)

In [None]:
### normalized 2020 edited variables multi regression

#normalize the variables
norm_variables_ed = (variables_ed - variables_ed.min()) / (variables_ed.max() - variables_ed.min())

#build normalized model and print summary
norm_model_ed = sm.OLS(target_ed, norm_variables_ed).fit()
norm_model_ed.summary()