# -------------------------------------**Sung** work below-----------------------------------

# Importing from CSV file
## = CDC.gov Dataset =

### Gather dependencies

In [31]:
# Dependencies
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sodapy import Socrata
from config import census_key
import pprint
import requests
import os
import scipy.stats as st
from scipy.stats import linregress

### Create DataFrame for CDC Data

In [4]:
cdc_data = pd.read_csv("../Project_1/cdc_data.csv")
cdc_data
cdc_df = pd.DataFrame(cdc_data)
cdc_df

# Choose the columns we want and rename anything that isn't clear
cdc_df = cdc_df[['case_month', 'res_state', 'res_county', 'age_group', 'sex', 'race', 'ethnicity', 'current_status']]
cdc_df = cdc_df.rename(columns={
    'res_state': 'State',
    'res_county': 'County',
    'ethnicity': 'Ethnicity',
    'current_status': 'Infection Status',
    'race':'Race'
})

# Drop "Unkown" race from dataset
cdc_df = cdc_df.loc[cdc_df['Race']!= 'Unknown', :]

# Set Infections to 1 and replace the various races with values that are compatible with the census data
cdc_df['Infections'] = 1
cdc_df['Race'] = cdc_df['Race'].replace('White', 'Caucasian')
cdc_df['Race'] = cdc_df['Race'].replace('Black', 'African American')
cdc_df['Race'] = cdc_df['Race'].replace('American Indian/Alaska Native', 'Native American')
cdc_df['Race'] = cdc_df['Race'].replace('Multiple/Other', 'Other')
cdc_df['Race'] = np.where(np.logical_and(cdc_df['Race'] == 'Caucasian', cdc_df['Ethnicity'] == 'Hispanic/Latino'), 'Hispanic', cdc_df['Race'])

# NOTE: It's not clear what happens with Hawaiian here. 

cdc_df

Unnamed: 0,case_month,State,County,age_group,sex,Race,Ethnicity,Infection Status,Infections
0,2021-11,MN,STEELE,18 to 49 years,Male,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
2,2021-11,MN,STEARNS,50 to 64 years,Female,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
3,2020-05,MN,KANDIYOHI,18 to 49 years,Male,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
4,2021-05,MN,ANOKA,18 to 49 years,Male,,,Laboratory-confirmed case,1
5,2020-04,MN,CLAY,65+ years,Female,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
...,...,...,...,...,...,...,...,...,...
539745,2022-01,MN,SCOTT,50 to 64 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539746,2020-11,MN,GOODHUE,18 to 49 years,Female,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539747,2020-11,MN,DAKOTA,18 to 49 years,Male,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539748,2022-01,MN,DAKOTA,50 to 64 years,Male,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1


In [5]:
cdc_race = cdc_df['Race'].unique()
cdc_race

array(['Caucasian', nan, 'Hispanic', 'Asian', 'African American', 'Other',
       'Native American'], dtype=object)

## % of Asians in Hennepin County

In [6]:
cdc_hennepin = cdc_df.loc[cdc_df['County'] == 'HENNEPIN']
cdc_hennepin

Unnamed: 0,case_month,State,County,age_group,sex,Race,Ethnicity,Infection Status,Infections
13,2022-01,MN,HENNEPIN,0 - 17 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
15,2021-08,MN,HENNEPIN,18 to 49 years,Male,African American,Missing,Laboratory-confirmed case,1
16,2020-11,MN,HENNEPIN,18 to 49 years,Female,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
22,2021-03,MN,HENNEPIN,0 - 17 years,Female,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
29,2021-12,MN,HENNEPIN,18 to 49 years,Male,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
...,...,...,...,...,...,...,...,...,...
539729,2020-09,MN,HENNEPIN,18 to 49 years,Female,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539730,2020-12,MN,HENNEPIN,50 to 64 years,Female,African American,Non-Hispanic/Latino,Laboratory-confirmed case,1
539734,2020-03,MN,HENNEPIN,65+ years,Male,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539739,2020-12,MN,HENNEPIN,18 to 49 years,Male,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1


In [7]:
sample_population_hennepin = cdc_hennepin['State'].count()
print(f"Total sample population in Hennepin County in this randomized dataset is {sample_population_hennepin} according to the CDC.gov website")


Total sample population in Hennepin County in this randomized dataset is 105700 according to the CDC.gov website


In [8]:
#count of Asian in Hennepin county
cdc_hennepin_asian_df = cdc_hennepin.loc[cdc_hennepin['Race'] == 'Asian']
cdc_hennepin_asian_df

Unnamed: 0,case_month,State,County,age_group,sex,Race,Ethnicity,Infection Status,Infections
13,2022-01,MN,HENNEPIN,0 - 17 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
311,2021-12,MN,HENNEPIN,18 to 49 years,Male,Asian,Missing,Laboratory-confirmed case,1
317,2022-01,MN,HENNEPIN,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
368,2022-01,MN,HENNEPIN,65+ years,Female,Asian,Non-Hispanic/Latino,Probable Case,1
408,2022-06,MN,HENNEPIN,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
...,...,...,...,...,...,...,...,...,...
539592,2022-01,MN,HENNEPIN,50 to 64 years,Male,Asian,Missing,Laboratory-confirmed case,1
539607,2021-07,MN,HENNEPIN,18 to 49 years,Female,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539640,2022-01,MN,HENNEPIN,65+ years,Male,Asian,Missing,Laboratory-confirmed case,1
539700,2021-02,MN,HENNEPIN,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1


In [9]:
# count of Asian in Hennepin county
cdc_hennepin_asian_count = cdc_hennepin_asian_df['State'].count()
cdc_hennepin_asian_count
print(f"Out of a randomized CDC dataframe, {cdc_hennepin_asian_count} asians in the Hennepin County responded they have been infected with Covid-19")

Out of a randomized CDC dataframe, 7526 asians in the Hennepin County responded they have been infected with Covid-19


In [10]:
Asian_in_hennepin = cdc_hennepin_asian_count/sample_population_hennepin*100
rounded_Asian_in_hennepin = "{:.2f}".format(Asian_in_hennepin)
print(f"{rounded_Asian_in_hennepin}% of Asians are represented in the Hennepin County in the CDC dataset")

7.12% of Asians are represented in the Hennepin County in the CDC dataset


## % of Asians in Anoka County

In [11]:
cdc_anoka = cdc_df.loc[cdc_df['County'] == 'ANOKA']
cdc_anoka

Unnamed: 0,case_month,State,County,age_group,sex,Race,Ethnicity,Infection Status,Infections
4,2021-05,MN,ANOKA,18 to 49 years,Male,,,Laboratory-confirmed case,1
8,2021-12,MN,ANOKA,18 to 49 years,Female,,,Laboratory-confirmed case,1
11,2021-10,MN,ANOKA,50 to 64 years,Male,,,Laboratory-confirmed case,1
61,2022-01,MN,ANOKA,50 to 64 years,Male,Caucasian,Non-Hispanic/Latino,Probable Case,1
93,2021-12,MN,ANOKA,18 to 49 years,Male,Caucasian,Non-Hispanic/Latino,Probable Case,1
...,...,...,...,...,...,...,...,...,...
539709,2021-05,MN,ANOKA,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539713,2022-05,MN,ANOKA,18 to 49 years,Female,Caucasian,Missing,Laboratory-confirmed case,1
539714,2021-03,MN,ANOKA,18 to 49 years,Male,Caucasian,Missing,Laboratory-confirmed case,1
539726,2021-07,MN,ANOKA,18 to 49 years,Male,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1


In [12]:
sample_population_anoka = cdc_anoka['State'].count()
print(f"Total sample population in Anoka County in this randomized dataset is {sample_population_anoka} according to the CDC.gov website")

Total sample population in Anoka County in this randomized dataset is 33688 according to the CDC.gov website


In [13]:
#count of Asian in Anoka county
cdc_anoka_asian_df = cdc_anoka.loc[cdc_anoka['Race'] == 'Asian']
cdc_anoka_asian_df

Unnamed: 0,case_month,State,County,age_group,sex,Race,Ethnicity,Infection Status,Infections
104,2021-12,MN,ANOKA,50 to 64 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
214,2020-10,MN,ANOKA,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
977,2021-09,MN,ANOKA,0 - 17 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
1708,2021-03,MN,ANOKA,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
1907,2022-05,MN,ANOKA,0 - 17 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
...,...,...,...,...,...,...,...,...,...
539264,2021-12,MN,ANOKA,0 - 17 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539387,2020-11,MN,ANOKA,18 to 49 years,Female,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539424,2021-08,MN,ANOKA,18 to 49 years,Female,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539692,2021-10,MN,ANOKA,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1


In [14]:
# count of Asian in Anoka county
cdc_anoka_asian_count = cdc_anoka_asian_df['State'].count()
cdc_anoka_asian_count
print(f"Out of a randomized CDC dataframe, {cdc_anoka_asian_count} asians in the Anoka County responded they have been infected with Covid-19")

Out of a randomized CDC dataframe, 1781 asians in the Anoka County responded they have been infected with Covid-19


In [15]:
Asian_in_anoka = cdc_anoka_asian_count/sample_population_anoka*100
rounded_Asian_in_anoka = "{:.2f}".format(Asian_in_anoka)
print(f"{rounded_Asian_in_anoka}% of Asians are represented in the Anoka County in the CDC dataset")

5.29% of Asians are represented in the Anoka County in the CDC dataset


## %Asians in Ramsey County

In [16]:
cdc_ramsey = cdc_df.loc[cdc_df['County'] == 'RAMSEY']
cdc_ramsey

Unnamed: 0,case_month,State,County,age_group,sex,Race,Ethnicity,Infection Status,Infections
12,2022-05,MN,RAMSEY,0 - 17 years,Female,Caucasian,Non-Hispanic/Latino,Probable Case,1
18,2020-12,MN,RAMSEY,50 to 64 years,Male,Asian,Missing,Laboratory-confirmed case,1
27,2020-11,MN,RAMSEY,18 to 49 years,Female,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
33,2020-06,MN,RAMSEY,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
37,2021-11,MN,RAMSEY,50 to 64 years,Male,Caucasian,Missing,Probable Case,1
...,...,...,...,...,...,...,...,...,...
539707,2022-02,MN,RAMSEY,18 to 49 years,Female,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539710,2022-05,MN,RAMSEY,18 to 49 years,Male,Caucasian,Non-Hispanic/Latino,Probable Case,1
539724,2020-12,MN,RAMSEY,18 to 49 years,Female,Caucasian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539731,2020-11,MN,RAMSEY,65+ years,Male,Caucasian,Missing,Laboratory-confirmed case,1


In [17]:
sample_population_ramsey = cdc_ramsey['State'].count()
print(f"Total sample population in Ramsey County in this randomized dataset is {sample_population_ramsey} according to the CDC.gov website")

Total sample population in Ramsey County in this randomized dataset is 44356 according to the CDC.gov website


In [18]:
#count of Asian in Ramsey county
cdc_ramsey_asian_df = cdc_ramsey.loc[cdc_ramsey['Race'] == 'Asian']
cdc_ramsey_asian_df

Unnamed: 0,case_month,State,County,age_group,sex,Race,Ethnicity,Infection Status,Infections
18,2020-12,MN,RAMSEY,50 to 64 years,Male,Asian,Missing,Laboratory-confirmed case,1
33,2020-06,MN,RAMSEY,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
48,2021-12,MN,RAMSEY,18 to 49 years,Female,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
137,2021-09,MN,RAMSEY,18 to 49 years,Female,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
491,2022-06,MN,RAMSEY,50 to 64 years,Male,Asian,Non-Hispanic/Latino,Probable Case,1
...,...,...,...,...,...,...,...,...,...
539413,2020-11,MN,RAMSEY,0 - 17 years,Male,Asian,Missing,Laboratory-confirmed case,1
539546,2020-11,MN,RAMSEY,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1
539568,2021-06,MN,RAMSEY,18 to 49 years,Female,Asian,Non-Hispanic/Latino,Probable Case,1
539681,2020-11,MN,RAMSEY,18 to 49 years,Male,Asian,Non-Hispanic/Latino,Laboratory-confirmed case,1


In [19]:
# count of Asian in Ramsey county
cdc_ramsey_asian_count = cdc_ramsey_asian_df['State'].count()
cdc_ramsey_asian_count
print(f"Out of a randomized CDC dataframe, {cdc_ramsey_asian_count} asians in the Ramsey County responded they have been infected with Covid-19")

Out of a randomized CDC dataframe, 6919 asians in the Ramsey County responded they have been infected with Covid-19


In [20]:
Asian_in_ramsey = cdc_ramsey_asian_count/sample_population_ramsey*100
rounded_Asian_in_ramsey = "{:.2f}".format(Asian_in_ramsey)
print(f"{rounded_Asian_in_ramsey}% of Asians are represented in the Ramsey County in the CDC dataset")

15.60% of Asians are represented in the Ramsey County in the CDC dataset


In [21]:
# Observed data in a CDC dataset
observed = pd.Series([rounded_Asian_in_hennepin,rounded_Asian_in_anoka,rounded_Asian_in_ramsey], index=["%Asians in Hennepin", "%Asians in Anoka", "%Asians in Ramsey"])

In [22]:
# Create a data frame
observed_df = pd.DataFrame([observed]).T
observed_df

Unnamed: 0,0
%Asians in Hennepin,7.12
%Asians in Anoka,5.29
%Asians in Ramsey,15.6


In [23]:
# Create a data frame
df = pd.DataFrame([observed]).T

In [24]:
expected = pd.Series([7.15, 4.55, 14.74], index=["%Asians in Hennepin", "%Asians in Anoka", "%Asians in Ramsey"])
expected = [7.15, 4.55, 14.74]
df[1] = expected
df[1]

%Asians in Hennepin     7.15
%Asians in Anoka        4.55
%Asians in Ramsey      14.74
Name: 1, dtype: float64

In [27]:
# Rename columns
#I gotta find a way to remove row number and dtype indications..
df.columns = ["% of population infected with COVID-19 in CDC dataset", "Population Census"]
df

Unnamed: 0,% of population infected with COVID-19 in CDC dataset,Population Census
%Asians in Hennepin,7.12,7.15
%Asians in Anoka,5.29,4.55
%Asians in Ramsey,15.6,14.74


In [32]:
# The degree of freedom is 3-1 = 2
# With a p-value of 0.05, the confidence level is 1.00-0.05 = 0.95.
critical_value = st.chi2.ppf(q = 0.95, df = 2)

In [33]:
# The critical value
critical_value

5.991464547107979

In [34]:
# Run the chi square test with stats.chisquare()
st.chisquare(df["% of population infected with COVID-19 in CDC dataset"], df["Population Census"])

ValueError: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are:
0.05937972768532528

In [35]:
# Drop NaNs and then group by County and Race
cdc_df = cdc_df.dropna(how='any')
cdc_df = cdc_df[['County', 'Race', 'Infections']]
cdc_grouped_county = cdc_df.groupby(['County', 'Race'])
cdc_df = cdc_grouped_county.count()
cdc_df.reset_index(inplace=True)
cdc_df


Unnamed: 0,County,Race,Infections
0,ANOKA,African American,2864
1,ANOKA,Asian,1732
2,ANOKA,Caucasian,24501
3,ANOKA,Hispanic,746
4,ANOKA,Native American,73
...,...,...,...
172,WRIGHT,African American,107
173,WRIGHT,Asian,49
174,WRIGHT,Caucasian,10656
175,WRIGHT,Hispanic,109


# Pulling data using API with the help of JSON
## = Census Dataset =

### Drop NaN and compile totals by county and race

### Create poverty DataFrame based on US Census

In [None]:
# Poverty DataFrame By Race - 2020

url = "https://api.census.gov/data/2020/acs/acs5?get=NAME,B17001_002E,B17001A_002E,B17001B_002E,B17001C_002E,B17001D_002E,B17001E_002E,B17001F_002E,B17001G_002E,B17001H_002E,B17001I_002E&for=county:*&in=state:27&key={0}".format(census_key)
response = requests.request("GET", url)
response
poverty_df = json_to_dataframe(response)

# Split NAME into county & state
name = poverty_df['NAME'].str.split(",", n=1, expand=True)
COUNTY = poverty_df['County'] = name[0]
STATE = poverty_df['State'] = name[1]
poverty_df.drop(columns=["NAME"], inplace=True)
poverty_df['Caucasian2'] = poverty_df['B17001A_002E'].astype(int) - poverty_df['B17001I_002E'].astype(int)
poverty_df['Other'] = poverty_df['B17001F_002E'].astype(int) + poverty_df['B17001G_002E'].astype(int)

# Get the columns we want
poverty_df = poverty_df[['County', 'State','B17001_002E','Caucasian2', 'B17001B_002E','B17001C_002E','B17001D_002E','B17001E_002E','Other','B17001I_002E']]

# Rename columns to something intelligible
poverty_df = poverty_df.rename(columns={
    'B17001_002E':'Poverty Total',
    'Caucasian2':'Caucasian',
    'B17001B_002E':'African American',
    'B17001C_002E':'Native American',
    'B17001D_002E':'Asian',
    'B17001E_002E':'Hawaiian', # I had to change this from Native Hawaiian to Hawaiian for compatibility with the rest - Dom
    'B17001I_002E':'Hispanic' 
    })

# # Remove ...County from County in dataframe
poverty_df['County'] = poverty_df['County'].replace('County', '', regex=True)
# NOTE: The line above accidentally leaves a blank space at the end of these county names! That space caused a merge to fail
# on any of these counties because, for example, "AITKIN " is not the same as "AITKIN". 

# Remove any blank space that may be present at the begining or end of the county name
poverty_df['County'] = poverty_df['County'].str.strip()

# And then convert the county name to uppercase
poverty_df['County'] = poverty_df['County'].str.upper()

poverty_df['Year'] = 2020
#poverty_df.to_csv("../Project_1/population_in_poverty.csv")
poverty_df



In [None]:
# Choose only the columns we need
poverty_df = poverty_df[['County', 'Caucasian', 'African American', 'Native American', 'Asian', 'Hawaiian', 'Other', 'Hispanic']]
poverty_df

In [None]:
# Use melt to convert this into a format similar to the CDC data
poverty_df = poverty_df.melt(id_vars='County', value_vars=['Caucasian', 'African American', 'Native American', 'Asian', \
                                                    'Hawaiian', 'Other', 'Hispanic'], \
                                                     var_name='Race', value_name='Poverty')
poverty_gb = poverty_df.groupby(['County', 'Race'])
poverty_df = poverty_gb.sum()
poverty_df.reset_index(inplace=True)
poverty_df

### Function for quickly calling a DataFrame

### Create population DataFrame by county and race

In [36]:
# Function for quickly assempling a DataFrame
def json_to_dataframe(response):
    return pd.DataFrame(response.json()[1:], columns=response.json()[0])

In [37]:
# Population by county, by race
url = "https://api.census.gov/data/2020/acs/acs5?get=NAME,B03002_001E,B03002_003E,B03002_004E,B03002_005E,B03002_006E,B03002_007E,B03002_008E,B03002_009E,B03002_012E&for=county:*&in=state:27&key={0}".format(census_key)


response = requests.request("GET", url)
response
response_df = json_to_dataframe(response)
response_df = response_df.rename(columns={
    'B03002_001E': 'Population Total',
    'B03002_003E': 'Caucasian',
    'B03002_004E': 'African American',
    'B03002_005E': 'Native American',
    'B03002_006E': 'Asian',
    'B03002_007E': 'Hawaiian',
    'B03002_012E': 'Hispanic'
})
response_df['Other'] = response_df['B03002_008E'].astype(int) + response_df['B03002_009E'].astype(int)
mn_pop_df = response_df

# Split NAME into county & state
name = mn_pop_df['NAME'].str.split(",", n=1, expand=True)
COUNTY = mn_pop_df['County']=name[0]
STATE = mn_pop_df['State']=name[1]
mn_pop_df.drop(columns=["NAME"], inplace=True)

# # Remove ...County from County in dataframe
mn_pop_df['County'] = mn_pop_df['County'].replace('County', '', regex=True)
# NOTE: The line above accidentally leaves a blank space at the end of these county names! That space caused a merge to fail
# on any of these counties because, for example, "AITKIN " is not the same as "AITKIN". 

# Remove any blank space that may be present at the begining or end of the county name
mn_pop_df['County'] = mn_pop_df['County'].str.strip()

# # Upper case for County
mn_pop_df['County'] = mn_pop_df['County'].str.upper()
mn_pop_df = mn_pop_df[['County', 'Population Total', 'Caucasian', 'African American', 'Native American', 'Asian', 'Hawaiian', 'Hispanic', 'Other']]
mn_pop_df.to_csv("../Project_1/mn_population by race.csv")
mn_pop_df


Unnamed: 0,County,Population Total,Caucasian,African American,Native American,Asian,Hawaiian,Hispanic,Other
0,ANOKA,353775,283436,23534,1788,16082,86,16769,12080
1,BECKER,34227,29290,146,1938,209,6,738,1900
2,BELTRAMI,46784,33506,399,8920,295,36,1114,2514
3,BENTON,40476,35979,1876,49,378,0,1086,1108
4,BIG STONE,4974,4787,3,3,0,0,102,79
...,...,...,...,...,...,...,...,...,...
82,RENVILLE,14572,12741,67,186,108,0,1268,202
83,ROSEAU,15259,13876,124,254,399,0,223,383
84,SHERBURNE,96015,86342,2658,403,1206,0,2577,2829
85,STEELE,36710,31640,1225,119,150,6,2913,657


In [39]:
#Population in Hennepin County by race
mn_pop_hennepin_df = mn_pop_df.loc[  mn_pop_df['County'] == 'HENNEPIN'  ]
mn_pop_hennepin_df

Unnamed: 0,County,Population Total,Caucasian,African American,Native American,Asian,Hawaiian,Hispanic,Other
69,HENNEPIN,1255296,852045,163432,6801,89753,360,87216,55689


In [40]:
mn_pop_hennepin_asian_count = mn_pop_hennepin_df['Asian']
mn_pop_hennepin_asian_count = mn_pop_hennepin_asian_count.astype(str).astype(int)
mn_pop_hennepin_asian_count
# mn_pop_census_hennepin_asian_per_df = 

69    89753
Name: Asian, dtype: int32

In [43]:
mn_pop_hennepin_total_population = mn_pop_hennepin_df['Population Total']
mn_pop_hennepin_total_population = mn_pop_hennepin_total_population.astype(str).astype(int)
mn_pop_hennepin_total_population

69    1255296
Name: Population Total, dtype: int32

In [44]:
mn_pop_census_hennepin_asian_per = mn_pop_hennepin_asian_count/mn_pop_hennepin_total_population*100
mn_pop_census_hennepin_asian_per = round(mn_pop_census_hennepin_asian_per, 2) 
mn_pop_census_hennepin_asian_per

69    7.15
dtype: float64

In [46]:
#Population in Anoka County by race
mn_pop_anoka_df = mn_pop_df.loc[  mn_pop_df['County'] == 'ANOKA'  ]
mn_pop_anoka_df

Unnamed: 0,County,Population Total,Caucasian,African American,Native American,Asian,Hawaiian,Hispanic,Other
0,ANOKA,353775,283436,23534,1788,16082,86,16769,12080


In [47]:
mn_pop_anoka_asian_count = mn_pop_anoka_df['Asian']
mn_pop_anoka_asian_count = mn_pop_anoka_asian_count.astype(str).astype(int)
mn_pop_anoka_asian_count
# mn_pop_census_hennepin_asian_per_df = 

0    16082
Name: Asian, dtype: int32

In [48]:
mn_pop_anoka_total_population = mn_pop_anoka_df['Population Total']
mn_pop_anoka_total_population = mn_pop_anoka_total_population.astype(str).astype(int)
mn_pop_anoka_total_population

0    353775
Name: Population Total, dtype: int32

In [49]:
mn_pop_census_anoka_asian_per = mn_pop_anoka_asian_count/mn_pop_anoka_total_population*100
mn_pop_census_anoka_asian_per = round(mn_pop_census_anoka_asian_per, 2) 
mn_pop_census_anoka_asian_per

0    4.55
dtype: float64

In [50]:
#Population in Ramsey County by race
mn_pop_ramsey_df = mn_pop_df.loc[  mn_pop_df['County'] == 'RAMSEY'  ]
mn_pop_ramsey_df

Unnamed: 0,County,Population Total,Caucasian,African American,Native American,Asian,Hawaiian,Hispanic,Other
81,RAMSEY,546598,332580,62897,2551,80543,141,41189,26697


In [51]:
mn_pop_ramsey_asian_count = mn_pop_ramsey_df['Asian']
mn_pop_ramsey_asian_count = mn_pop_ramsey_asian_count.astype(str).astype(int)
mn_pop_ramsey_asian_count
# mn_pop_census_hennepin_asian_per_df = 

81    80543
Name: Asian, dtype: int32

In [53]:
mn_pop_ramsey_total_population = mn_pop_ramsey_df['Population Total']
mn_pop_ramsey_total_population = mn_pop_ramsey_total_population.astype(str).astype(int)
mn_pop_ramsey_total_population

81    546598
Name: Population Total, dtype: int32

In [54]:
mn_pop_census_ramsey_asian_per = mn_pop_ramsey_asian_count/mn_pop_ramsey_total_population*100
mn_pop_census_ramsey_asian_per = round(mn_pop_census_ramsey_asian_per, 2) 
mn_pop_census_ramsey_asian_per

81    14.74
dtype: float64

### Coordinates for heat map -- if time permits

In [None]:
# Get the coordinates of counties in MN
url = "https://en.wikipedia.org/wiki/User:Michael_J/County_table"

table = pd.read_html(url)
df = table[0]
Counties = df.loc[df['State'] == "MN", : ]
counties_df = Counties[['County [2]', 'Latitude', 'Longitude']]
counties_df#.to_csv("../Project_1/county_geo.csv")  

In [None]:
# Choose only the columns we need
mn_pop_df = mn_pop_df[['County', 'Caucasian', 'African American', 'Native American', 'Asian', 'Hawaiian', 'Other', 'Hispanic']]
mn_pop_df


In [None]:
# Use melt to convert this into a format similar to the CDC data
pop_df = mn_pop_df.melt(id_vars='County', value_vars=['Caucasian', 'African American', 'Native American', 'Asian', \
                                                      'Hawaiian', 'Other', 'Hispanic'], \
                                                      var_name='Race', value_name='Population')
pop_gb = pop_df.groupby(['County', 'Race'])
pop_df = pop_gb.sum()
pop_df.reset_index(inplace=True)
pop_df

### Merge the datasets

In [None]:
merged_df = pd.merge(pop_df, poverty_df, on=['County', 'Race'])
merged_df = pd.merge(merged_df, cdc_df, on=['County', 'Race'])
merged_df