### Gather dependencies

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import json
import matplotlib
# from sodapy import Socrata
from config import census_key
import pprint
import requests
import csv
import os

### Function for quickly calling a DataFrame

In [2]:
# Function for quickly assempling a DataFrame
def json_to_dataframe(response):
    return pd.DataFrame(response.json()[1:], columns=response.json()[0])

### Create DataFrame for CDC Data

In [3]:
# CDC DataFrame by county in Minnesota
cdc_data = pd.read_csv("../Project_1/cdc_data.csv")
cdc_data
cdc_df = pd.DataFrame(cdc_data)
# cdc_df.columns
cdc_df = cdc_df[['case_month', 'res_state', 'res_county', 'age_group', 'sex', 'race', 'ethnicity', 'current_status']]
cdc_df = cdc_df.rename(columns={
    'res_state': 'State',
    'res_county': 'County',
    'ethnicity': 'Ethnicity',
    'current_status': 'Infection Status',
    'race':'Race'
})
# Drop "Unkown" race from dataset
cdc_df = cdc_df.loc[cdc_df['Race']!= 'Unknown',:]
cdc_df['Infections'] = 1
cdc_df['Race'] = cdc_df['Race'].replace('White', 'Caucasian')
cdc_df['Race'] = cdc_df['Race'].replace('Black', 'African American')
cdc_df['Race'] = cdc_df['Race'].replace('American Indian/Alaska Native', 'Native American')
cdc_df['Race'] = cdc_df['Race'].replace('Multiple/Other', 'Other')
cdc_df['Race2'] = np.where(np.logical_and(cdc_df['Race'] == 'Caucasian', cdc_df['Ethnicity'] == 'Hispanic/Latino'), 'Hispanic', cdc_df['Race'])
cdc_df = cdc_df[['County','Race2','Infections']]
cdc_df

Unnamed: 0,County,Race2,Infections
0,STEELE,Caucasian,1
2,STEARNS,Caucasian,1
3,KANDIYOHI,Caucasian,1
4,ANOKA,,1
5,CLAY,Caucasian,1
...,...,...,...
539745,SCOTT,Asian,1
539746,GOODHUE,Caucasian,1
539747,DAKOTA,Caucasian,1
539748,DAKOTA,Caucasian,1


### Drop NaN and compile totals by county and race

In [6]:
# Drop NaN
cdc_group_df = cdc_df[['County','Race2','Infections']]
cdc_group_df = cdc_group_df.dropna()
cdc_group_df.to_csv("../Project_1/all0.csv", index=False)
#cdc_group_df.append(pd.DataFrame(cdc_group_df.sum()['Infections'], index=["Total"], columns=["Infections"]))
#cdc_group_df
# County totals by race
cdc_county_totals_df = cdc_group_df.groupby(["County", "Race2"], as_index=False)['Infections'].sum()
cdc_county_totals_df#.to_csv("../Project_1/county_total.csv", index=False)


Unnamed: 0,County,Race2,Infections
0,ANOKA,African American,2935
1,ANOKA,Asian,1781
2,ANOKA,Caucasian,24621
3,ANOKA,Hispanic,746
4,ANOKA,Native American,103
...,...,...,...
180,WRIGHT,African American,160
181,WRIGHT,Asian,68
182,WRIGHT,Caucasian,10764
183,WRIGHT,Hispanic,109


### Create poverty DataFrame based on US Census

In [37]:
# Poverty DataFrame By Race - 2020

url = "https://api.census.gov/data/2020/acs/acs5?get=NAME,B17001_002E,B17001A_002E,B17001B_002E,B17001C_002E,B17001D_002E,B17001E_002E,B17001F_002E,B17001G_002E,B17001H_002E,B17001I_002E&for=county:*&in=state:27&key={0}".format(census_key)
response = requests.request("GET", url)
response
poverty_df = json_to_dataframe(response)

# Split NAME into county & state
name = poverty_df['NAME'].str.split(",", n=1, expand=True)
COUNTY = poverty_df['County']=name[0]
STATE = poverty_df['State']=name[1]
poverty_df.drop(columns=["NAME"], inplace=True)
poverty_df['Caucasian2'] = poverty_df['B17001A_002E'].astype(int) - poverty_df['B17001I_002E'].astype(int)
poverty_df['Other'] = poverty_df['B17001F_002E'].astype(int) + poverty_df['B17001G_002E'].astype(int)
# Rename columns to something intelligible
poverty_df = poverty_df[['County','B17001_002E','Caucasian2', 'B17001B_002E','B17001C_002E','B17001D_002E','B17001E_002E','Other','B17001I_002E']]

poverty_df = poverty_df.rename(columns={
    'B17001_002E':'Poverty Total',
    'Caucasian2':'Caucasian',
    'B17001B_002E':'African American',
    'B17001C_002E':'Native American',
    'B17001D_002E':'Asian',
    'B17001E_002E':'Hawaiian',
    'B17001I_002E':'Hispanic' 
    })

# # Remove ...County from County in dataframe
poverty_df['County'] = poverty_df['County'].replace('County', '', regex=True)

# # Upper case for County
poverty_df['County'] = poverty_df['County'].str.upper()
cols = ['County','Caucasian', 'African American', 'Native American', 'Asian', 'Hawaiian', 'Hispanic', 'Other']
poverty_df[cols].head(2)
poverty_df[cols].set_index('County').head(4)
poverty_df[cols].set_index('County').stack().reset_index().head(5)
poverty_df = poverty_df[cols].set_index('County').stack().reset_index()
poverty_df = poverty_df.rename(columns={
    'level_1': 'Race2',
    0: 'Population'
})
poverty_df



Unnamed: 0,County,Race2,Population
0,ANOKA,Caucasian,11778
1,ANOKA,African American,3697
2,ANOKA,Native American,327
3,ANOKA,Asian,953
4,ANOKA,Hawaiian,0
...,...,...,...
604,TODD,Native American,0
605,TODD,Asian,6
606,TODD,Hawaiian,0
607,TODD,Hispanic,230


### Create population DataFrame by county and race

In [27]:
# Population by county, by race
url = "https://api.census.gov/data/2020/acs/acs5?get=NAME,B03002_001E,B03002_003E,B03002_004E,B03002_005E,B03002_006E,B03002_007E,B03002_008E,B03002_009E,B03002_012E&for=county:*&in=state:27&key={0}".format(census_key)


response = requests.request("GET", url)
response
response_df = json_to_dataframe(response)
response_df = response_df.rename(columns={
    'B03002_001E': 'Population Total',
    'B03002_003E': 'Caucasian',
    'B03002_004E': 'African American',
    'B03002_005E': 'Native American',
    'B03002_006E': 'Asian',
    'B03002_007E': 'Hawaiian',
    'B03002_012E': 'Hispanic'
})
response_df['Other'] = response_df['B03002_008E'].astype(int) + response_df['B03002_009E'].astype(int)
mn_pop_df = response_df

# Split NAME into county & state
name = mn_pop_df['NAME'].str.split(",", n=1, expand=True)
COUNTY = mn_pop_df['County']=name[0]
STATE = mn_pop_df['State']=name[1]
mn_pop_df.drop(columns=["NAME"], inplace=True)

# # Remove ...County from County in dataframe
mn_pop_df['County'] = mn_pop_df['County'].replace('County', '', regex=True)

# # Upper case for County
mn_pop_df['County'] = mn_pop_df['County'].str.upper()
mn_pop_df = mn_pop_df[['County', 'Population Total', 'Caucasian', 'African American', 'Native American', 'Asian', 'Hawaiian', 'Hispanic', 'Other']]

mn_pop_df#.to_csv("../Project_1/mn_population by race.csv", index=False)
cols = ['County','Caucasian', 'African American', 'Native American', 'Asian', 'Hawaiian', 'Hispanic', 'Other']
mn_pop_df[cols].head(2)
mn_pop_df[cols].set_index('County').head(4)
mn_pop_df[cols].set_index('County').stack().reset_index().head(5)
mn_pop_df = mn_pop_df[cols].set_index('County').stack().reset_index()
mn_pop_df = mn_pop_df.rename(columns={
    'level_1': 'Race2',
    0: 'Population'
})
mn_pop_df



Unnamed: 0,County,Race2,Population
0,ANOKA,Caucasian,283436
1,ANOKA,African American,23534
2,ANOKA,Native American,1788
3,ANOKA,Asian,16082
4,ANOKA,Hawaiian,86
...,...,...,...
604,TODD,Native American,104
605,TODD,Asian,160
606,TODD,Hawaiian,7
607,TODD,Hispanic,1588


### Coordinates for heat map -- if time permits

In [None]:
# Get the coordinates of counties in MN
url = "https://en.wikipedia.org/wiki/User:Michael_J/County_table"

table = pd.read_html(url)
df = table[0]
Counties = df.loc[df['State'] == "MN", : ]
counties_df = Counties[['County [2]', 'Latitude', 'Longitude']]
counties_df#.to_csv("../Project_1/county_geo.csv")  

In [49]:
mn_pop_df = mn_pop_df.rename(columns={
    'Population':'Total Population'
    })
poverty_df = poverty_df.rename(columns={
    'Population': 'Poverty Population'
    })
poverty_df

merge_df = pd.merge(mn_pop_df, poverty_df, how="inner", on=['County', 'Race2'])
merge_df


Unnamed: 0,County,Race2,Total Population,Poverty Population
0,ANOKA,Caucasian,283436,11778
1,ANOKA,African American,23534,3697
2,ANOKA,Native American,1788,327
3,ANOKA,Asian,16082,953
4,ANOKA,Hawaiian,86,0
...,...,...,...,...
604,TODD,Native American,104,0
605,TODD,Asian,160,6
606,TODD,Hawaiian,7,0
607,TODD,Hispanic,1588,230
