# ------------------------------**Sung** Work Begins Here--------------------------------

# High Level Summary
 ### keywords: Population in Minnesota (MN), Race, Poverty, Infection rate, Poverty rate
 ### Data pulled from CDC.gov and the U.S. Census Bureau API datasets.

## Number of Infections by Race and County
   a) Number of infections for African Americans in each County
![](output/African%20American%20number%20of%20Infections.PNG)

   b) Number of infections for Asians in each County
![](output/Asian%20number%20of%20Infections.PNG)

## Comprehensive table of Number of Infections by Race and County
![](output/Summary%20table%20of%20total%20number%20of%20infections%20by%20county%20and%20race.PNG)

## Summary table of the Infection rate, % (number of infections / Total Census Population of each Racial Group) of different racial groups in Minnesota from March 2020 to April 2022 
![](output/infection%20rate-percentage.PNG)

## Population in MN by race
![](output/Population%20in%20MN.PNG)

## Infection rate, % (number of infections / race population) of Caucasian and Non-Caucasian in Minnesota from March 2020 to April 2022 
![](output/infection%20rate-percentage-two%20groups.PNG)


## Infection ratio (Infection rate / % population of race) of Caucasian and Non-Caucasian in Minnesota
![](output/infection%20rate-ratio-corrected-two%20groups.PNG)

## Summary table of infection ratio (Infection rate / % population of race) of caucasian and non-caucasian groups by County
![](output/infection%20rate-ratio-corrected-two%20groups%20by%20county.PNG)

## Hypothesis Test #1
### $H_{0}$: The mean COVID infection ratio of the Caucasian group is equal to the mean infection ratio of the non-Caucasian group.
### Independent t-test
![](output/T-test-two%20race%20groups.PNG)

## Summary table of infection ratio (Infection rate / % population of race) of different race groups by County
![](output/infection%20rate-ratio-corrected-all%20race%20groups%20by%20county.PNG)

## Hypothesis Test #2
### ANOVA test
![](output/ANOVA%20test-infection%20ratio-corrected.PNG)

## Poverty rate (number of people in poverty / race population for each county) of Caucasian and Non-Caucasian groups in Minnesota
![](output/poverty%20rate-two%20racial%20groups.png)

## Hypothesis Test #3
### Poverty rate independent t-test
![](output/poverty%20t-test.PNG)

# Importing from CSV file
## = CDC.gov Dataset =

### Gather dependencies

In [None]:
# Dependencies
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
# from sodapy import Socrata
from config import census_key
import pprint
import requests
import os
import scipy.stats as st
from scipy.stats import linregress

### Create DataFrame for CDC Data

In [None]:
# Load the CDC extract; read_csv already returns a DataFrame, so no extra wrapping is needed.
cdc_data = pd.read_csv("../Project_1/cdc_data.csv")

# Choose the columns we want and rename anything that isn't clear.
# .copy() gives an independent frame so the column assignments below do not
# write into a view of cdc_data.
cdc_df = cdc_data[['case_month', 'res_state', 'res_county', 'age_group', 'sex', 'race', 'ethnicity', 'current_status']].copy()
cdc_df = cdc_df.rename(columns={
    'res_state': 'State',
    'res_county': 'County',
    'ethnicity': 'Ethnicity',
    'current_status': 'Infection Status',
    'race': 'Race'
})

# Drop "Unknown" race from the dataset (copy again: the filter returns a slice)
cdc_df = cdc_df.loc[cdc_df['Race'] != 'Unknown', :].copy()

# Every row counts as one infection.
cdc_df['Infections'] = 1

# Replace the CDC race labels with the labels used for the census data.
cdc_df['Race'] = cdc_df['Race'].replace({
    'White': 'Caucasian',
    'Black': 'African American',
    'American Indian/Alaska Native': 'Native American',
    'Multiple/Other': 'Other',
})

# Rows labelled Caucasian whose ethnicity is Hispanic/Latino are reclassified as Hispanic.
is_hispanic_caucasian = (cdc_df['Race'] == 'Caucasian') & (cdc_df['Ethnicity'] == 'Hispanic/Latino')
cdc_df['Race'] = np.where(is_hispanic_caucasian, 'Hispanic', cdc_df['Race'])

# NOTE: It's not clear what happens with Hawaiian here: no CDC label is mapped to
# the 'Hawaiian' category used on the census side.

cdc_df

## Infections by race in MN

In [None]:
# Remove every row that has a missing value in any column, then narrow the
# frame to the four columns used by the rest of the analysis.
complete_cases = cdc_df.dropna(how='any')
cdc_df = complete_cases[['case_month', 'County', 'Race', 'Infections']]

cdc_df

In [None]:
# Alphabetically ordered list of the distinct race labels present in cdc_df.
race_mn = sorted(set(cdc_df['Race'].unique()))
race_mn

In [None]:

# Distinct case_month values in ascending order.
cdc_date = np.sort(cdc_df['case_month'].unique())
print(cdc_date)

In [None]:
# Order the case records by month, breaking ties by county name.
sort_keys = ['case_month', 'County']
cdc_df = cdc_df.sort_values(by=sort_keys)
cdc_df

## Number of Infections by Race and County
   ### a) Total infections for African American.

In [None]:
# African American infection records, grouped by county.
# The race is selected by its explicit label rather than by its position in the
# sorted race_mn list, so an extra category in the data cannot shift the lookup.
cdc_infections_african_american_df = cdc_df.loc[cdc_df['Race'] == 'African American'].groupby(['County', 'Race'])

# Sum only the Infections counter so the case_month column is left out of the aggregation.
cdc_infections_african_american_sum_df = cdc_infections_african_american_df[['Infections']].sum()
cdc_infections_african_american_sum_df.head(50)

In [None]:
# Statewide African American infection count: add up the per-county sums.
cdc_infections_african_american_total = cdc_infections_african_american_sum_df.loc[:, 'Infections'].sum()
cdc_infections_african_american_total

### b) Total infections for Asian.

In [None]:
# Asian infection records, grouped by county.
# The race is selected by its explicit label rather than by its position in the
# sorted race_mn list, so an extra category in the data cannot shift the lookup.
cdc_infections_asian_df = cdc_df.loc[cdc_df['Race'] == 'Asian'].groupby(['County', 'Race'])

# Sum only the Infections counter so the case_month column is left out of the aggregation.
cdc_infections_asian_sum_df = cdc_infections_asian_df[['Infections']].sum()
cdc_infections_asian_sum_df.head(50)

In [None]:
# Statewide Asian infection count: add up the per-county sums.
cdc_infections_asian_total = cdc_infections_asian_sum_df.loc[:, 'Infections'].sum()
cdc_infections_asian_total

### c) Total infections for Caucasian American.

In [None]:
# Caucasian infection records, grouped by county.
# The race is selected by its explicit label rather than by its position in the
# sorted race_mn list, so an extra category in the data cannot shift the lookup.
cdc_infections_caucasian_df = cdc_df.loc[cdc_df['Race'] == 'Caucasian'].groupby(['County', 'Race'])

# Sum only the Infections counter so the case_month column is left out of the aggregation.
cdc_infections_caucasian_sum_df = cdc_infections_caucasian_df[['Infections']].sum()
cdc_infections_caucasian_sum_df.head(50)

In [None]:
# Statewide Caucasian infection count: add up the per-county sums.
cdc_infections_caucasian_total = cdc_infections_caucasian_sum_df.loc[:, 'Infections'].sum()
cdc_infections_caucasian_total

### d) Total infections for Hispanic.

In [None]:
# Hispanic infection records, grouped by county.
# The race is selected by its explicit label rather than by its position in the
# sorted race_mn list, so an extra category in the data cannot shift the lookup.
cdc_infections_hispanic_df = cdc_df.loc[cdc_df['Race'] == 'Hispanic'].groupby(['County', 'Race'])

# Sum only the Infections counter so the case_month column is left out of the aggregation.
cdc_infections_hispanic_sum_df = cdc_infections_hispanic_df[['Infections']].sum()
cdc_infections_hispanic_sum_df.head(50)

In [None]:
# Statewide Hispanic infection count: add up the per-county sums.
cdc_infections_hispanic_total = cdc_infections_hispanic_sum_df.loc[:, 'Infections'].sum()
cdc_infections_hispanic_total

### e) Total infections for Native American.

In [None]:
# Native American infection records, grouped by county.
# The race is selected by its explicit label rather than by its position in the
# sorted race_mn list, so an extra category in the data cannot shift the lookup.
cdc_infections_native_american_df = cdc_df.loc[cdc_df['Race'] == 'Native American'].groupby(['County', 'Race'])

# Sum only the Infections counter so the case_month column is left out of the aggregation.
cdc_infections_native_american_sum_df = cdc_infections_native_american_df[['Infections']].sum()
cdc_infections_native_american_sum_df.head(50)

In [None]:
# Statewide Native American infection count: add up the per-county sums.
cdc_infections_native_american_total = cdc_infections_native_american_sum_df.loc[:, 'Infections'].sum()
cdc_infections_native_american_total

### f) Total infections for other.

In [None]:
# "Other" race infection records, grouped by county.
# The race is selected by its explicit label rather than by its position in the
# sorted race_mn list, so an extra category in the data cannot shift the lookup.
cdc_infections_other_df = cdc_df.loc[cdc_df['Race'] == 'Other'].groupby(['County', 'Race'])

# Sum only the Infections counter so the case_month column is left out of the aggregation.
cdc_infections_other_sum_df = cdc_infections_other_df[['Infections']].sum()
cdc_infections_other_sum_df.head(50)

In [None]:
# Statewide "Other" infection count: add up the per-county sums.
cdc_infections_other_total = cdc_infections_other_sum_df.loc[:, 'Infections'].sum()
cdc_infections_other_total

In [None]:
# Summary of the number of infections by race.
total_infection_byrace = [cdc_infections_african_american_total, cdc_infections_asian_total,
                          cdc_infections_caucasian_total, cdc_infections_hispanic_total,
                          cdc_infections_native_american_total, cdc_infections_other_total]

# Labels are spelled out in the same order as the totals above. Pairing by the
# sorted race_mn list would mislabel totals if the data held any extra category.
total_infection_labels = ['African American', 'Asian', 'Caucasian',
                          'Hispanic', 'Native American', 'Other']
for (race, infection) in zip(total_infection_labels, total_infection_byrace):
    print(f" {race}: {infection}")

In [None]:
# Build the county-by-race infection table one race at a time.
# Each race's count column is renamed to a unique label before merging, so the
# repeated merges never have to disambiguate several columns all called
# 'Infections' with the default '_x' / '_y' suffixes (which collide from the
# third merge onward).
african_american_asian_combined_df = pd.merge(
    cdc_infections_african_american_sum_df[['Infections']].rename(columns={'Infections': 'African American Infections'}),
    cdc_infections_asian_sum_df[['Infections']].rename(columns={'Infections': 'Asian Infections'}),
    on=['County'], how="outer")
african_american_asian_caucasian_combined_df = pd.merge(
    african_american_asian_combined_df,
    cdc_infections_caucasian_sum_df[['Infections']].rename(columns={'Infections': 'Caucasian American Infections'}),
    on=['County'], how="outer")
african_american_asian_caucasian_hispanic_combined_df = pd.merge(
    african_american_asian_caucasian_combined_df,
    cdc_infections_hispanic_sum_df[['Infections']].rename(columns={'Infections': 'Hispanic Infections'}),
    on=['County'], how="outer")
african_american_asian_caucasian_hispanic_native_american_combined_df = pd.merge(
    african_american_asian_caucasian_hispanic_combined_df,
    cdc_infections_native_american_sum_df[['Infections']].rename(columns={'Infections': 'Native American Infections'}),
    on=['County'], how="outer")

In [None]:
# FINAL summary table: add the "Other" counts to the five-race table, then give
# every column a descriptive name.
infection_column_names = [
    'African American Infections',
    'Asian Infections',
    'Caucasian American Infections',
    'Hispanic Infections',
    'Native American Infections',
    'Other Infections',
]
comprehensive_df = pd.merge(
    left=african_american_asian_caucasian_hispanic_native_american_combined_df,
    right=cdc_infections_other_sum_df,
    how="outer",
    on=['County'],
)
comprehensive_df.columns = infection_column_names
comprehensive_df.head()

# Pulling data using API with the help of JSON
## = Census Dataset =

In [None]:
# Function for quickly assembling a DataFrame
def json_to_dataframe(response):
    """Build a DataFrame from a response whose JSON body is a list of rows.

    The first row of the decoded body supplies the column names and every
    remaining row becomes a record.

    Args:
        response: Any object exposing a ``json()`` method that returns a
            list of equal-length lists.

    Returns:
        pandas.DataFrame with one row per data row in the body.

    Raises:
        IndexError: If the decoded body is an empty list.
    """
    # Decode the body once; the original decoded it separately for header and rows.
    payload = response.json()
    return pd.DataFrame(payload[1:], columns=payload[0])

In [None]:
# Poverty DataFrame By Race - 2020

url = "https://api.census.gov/data/2020/acs/acs5?get=NAME,B17001_002E,B17001A_002E,B17001B_002E,B17001C_002E,B17001D_002E,B17001E_002E,B17001F_002E,B17001G_002E,B17001H_002E,B17001I_002E&for=county:*&in=state:27&key={0}".format(census_key)
response = requests.request("GET", url)
# Fail loudly on an HTTP error instead of trying to parse an error body as data.
response.raise_for_status()
poverty_df = json_to_dataframe(response)

# Split NAME into county & state
name = poverty_df['NAME'].str.split(",", n=1, expand=True)
COUNTY = poverty_df['County'] = name[0]
STATE = poverty_df['State'] = name[1]
poverty_df.drop(columns=["NAME"], inplace=True)

# Caucasian = B17001H ("White alone, not Hispanic or Latino"), which the request
# above already fetches. Subtracting the whole Hispanic count (B17001I) from
# White alone (B17001A) would also remove Hispanics of other races, and would
# not match the non-Hispanic white population column (B03002_003E) that the
# population table uses.
poverty_df['Caucasian2'] = poverty_df['B17001H_002E'].astype(int)
# "Other" combines B17001F and B17001G.
poverty_df['Other'] = poverty_df['B17001F_002E'].astype(int) + poverty_df['B17001G_002E'].astype(int)

# Get the columns we want
poverty_df = poverty_df[['County', 'State', 'B17001_002E', 'Caucasian2', 'B17001B_002E', 'B17001C_002E', 'B17001D_002E', 'B17001E_002E', 'Other', 'B17001I_002E']]

# Rename columns to something intelligible
poverty_df = poverty_df.rename(columns={
    'B17001_002E': 'Poverty Total',
    'Caucasian2': 'Caucasian',
    'B17001B_002E': 'African American',
    'B17001C_002E': 'Native American',
    'B17001D_002E': 'Asian',
    'B17001E_002E': 'Hawaiian',  # Changed from Native Hawaiian to Hawaiian for compatibility with the rest - Dom
    'B17001I_002E': 'Hispanic'
    })

# Normalise the county name so it can be used as a merge key:
#   1. remove the word "County" (this leaves a trailing blank, e.g. "Aitkin "),
#   2. strip leading/trailing blanks ("AITKIN " would not match "AITKIN"),
#   3. convert to upper case.
poverty_df['County'] = (
    poverty_df['County']
    .replace('County', '', regex=True)
    .str.strip()
    .str.upper()
)

poverty_df['Year'] = 2020
#poverty_df.to_csv("../Project_1/population_in_poverty.csv")
poverty_df



In [None]:
# Keep the county key plus one poverty-count column per racial group.
poverty_columns = ['County', 'Caucasian', 'African American', 'Native American',
                   'Asian', 'Hawaiian', 'Other', 'Hispanic']
poverty_df = poverty_df.loc[:, poverty_columns]
poverty_df

In [None]:
# Reshape from one column per race to one row per (County, Race), the same
# long layout the CDC data uses.
poverty_race_columns = ['Caucasian', 'African American', 'Native American', 'Asian',
                        'Hawaiian', 'Other', 'Hispanic']
poverty_df = poverty_df.melt(
    id_vars='County',
    value_vars=poverty_race_columns,
    var_name='Race',
    value_name='Poverty',
)
poverty_gb = poverty_df.groupby(['County', 'Race'])
poverty_df = poverty_gb.sum().reset_index()
poverty_df

In [None]:
# Population by county, by race
url = (
    "https://api.census.gov/data/2020/acs/acs5?get=NAME,B03002_001E,B03002_003E,"
    "B03002_004E,B03002_005E,B03002_006E,B03002_007E,B03002_008E,B03002_009E,"
    "B03002_012E&for=county:*&in=state:27&key={0}"
).format(census_key)

response = requests.request("GET", url)

# Readable labels for the census variable codes that are used directly.
census_column_labels = {
    'B03002_001E': 'Population Total',
    'B03002_003E': 'Caucasian',
    'B03002_004E': 'African American',
    'B03002_005E': 'Native American',
    'B03002_006E': 'Asian',
    'B03002_007E': 'Hawaiian',
    'B03002_012E': 'Hispanic',
}
response_df = json_to_dataframe(response).rename(columns=census_column_labels)

# "Other" is the sum of the two remaining variables, which are not renamed.
response_df['Other'] = response_df['B03002_008E'].astype(int) + response_df['B03002_009E'].astype(int)
mn_pop_df = response_df

# Split NAME into county & state
name = mn_pop_df['NAME'].str.split(",", n=1, expand=True)
COUNTY = name[0]
mn_pop_df['County'] = COUNTY
STATE = name[1]
mn_pop_df['State'] = STATE
mn_pop_df.drop(columns=["NAME"], inplace=True)

# Normalise the county name for merging: remove the word "County", strip the
# blank that removal leaves behind ("AITKIN " would not match "AITKIN"), and
# upper-case the result.
mn_pop_df['County'] = (
    mn_pop_df['County']
    .replace('County', '', regex=True)
    .str.strip()
    .str.upper()
)

population_columns = ['County', 'Population Total', 'Caucasian', 'African American',
                      'Native American', 'Asian', 'Hawaiian', 'Hispanic', 'Other']
mn_pop_df = mn_pop_df[population_columns]
mn_pop_df.to_csv("../Project_1/mn_population by race.csv")
mn_pop_df

### Create population DataFrame by county and race

In [None]:
# Reshape the county population table to one row per (County, Race), the same
# long layout the CDC data uses.
population_race_columns = ['Caucasian', 'African American', 'Native American', 'Asian',
                           'Hawaiian', 'Other', 'Hispanic']
pop_df = mn_pop_df.melt(
    id_vars='County',
    value_vars=population_race_columns,
    var_name='Race',
    value_name='Population',
)
pop_gb = pop_df.groupby(['County', 'Race'])
pop_df = pop_gb.sum().reset_index()
pop_df

In [None]:
# Pull each population column of mn_pop_df out as its own Series.
(mn_pop, mn_caucasian_pop, mn_african_american_pop, mn_native_american_pop,
 mn_asian_pop, mn_hawaiian_pop, mn_hispanic_pop, mn_other_pop) = (
    mn_pop_df[column_name]
    for column_name in ['Population Total', 'Caucasian', 'African American',
                        'Native American', 'Asian', 'Hawaiian', 'Hispanic', 'Other']
)

In [None]:
# Turn each population Series into a plain Python list.
mn_pop_list = mn_pop.tolist()
mn_caucasian_list = mn_caucasian_pop.tolist()
mn_african_american_list = mn_african_american_pop.tolist()
mn_native_american_list = mn_native_american_pop.tolist()
mn_asian_list = mn_asian_pop.tolist()
mn_hawaiian_list = mn_hawaiian_pop.tolist()
mn_hispanic_list = mn_hispanic_pop.tolist()
mn_other_list = mn_other_pop.tolist()

# Show the element type of the total-population list.
print(type(mn_pop_list[0]))

In [None]:
# Convert every element of each population list to int.
mn_pop_int_list = list(map(int, mn_pop_list))
mn_caucasian_int_list = list(map(int, mn_caucasian_list))
mn_african_american_int_list = list(map(int, mn_african_american_list))
mn_native_american_int_list = list(map(int, mn_native_american_list))
mn_asian_int_list = list(map(int, mn_asian_list))
mn_hawaiian_int_list = list(map(int, mn_hawaiian_list))
mn_hispanic_int_list = list(map(int, mn_hispanic_list))
mn_other_int_list = list(map(int, mn_other_list))

# Confirm the element type after conversion.
print(type(mn_pop_int_list[0]))

In [None]:
# Statewide totals: add the county values for each group.
sum_mn_pop = sum(mn_pop_int_list)
sum_mn_caucasian = sum(mn_caucasian_int_list)
sum_mn_african_american = sum(mn_african_american_int_list)
sum_mn_native_american = sum(mn_native_american_int_list)
sum_mn_asian = sum(mn_asian_int_list)
sum_mn_hawaiian = sum(mn_hawaiian_int_list)
sum_mn_hispanic = sum(mn_hispanic_int_list)
sum_mn_other = sum(mn_other_int_list)


def _share_of_state(group_total):
    """Return group_total as a percentage of the statewide population total."""
    return group_total / sum_mn_pop * 100


# Each group's share of the statewide population, in percent.
percent_mn_caucasian = _share_of_state(sum_mn_caucasian)
percent_mn_african_american = _share_of_state(sum_mn_african_american)
percent_mn_native_american = _share_of_state(sum_mn_native_american)
percent_mn_asian = _share_of_state(sum_mn_asian)
percent_mn_hawaiian = _share_of_state(sum_mn_hawaiian)
percent_mn_hispanic = _share_of_state(sum_mn_hispanic)
percent_mn_other = _share_of_state(sum_mn_other)

# Print the totals, a separator, then the percentages (one value per line).
for group_total in (sum_mn_pop, sum_mn_caucasian, sum_mn_african_american,
                    sum_mn_native_american, sum_mn_asian, sum_mn_hawaiian,
                    sum_mn_hispanic, sum_mn_other):
    print(group_total)
print("-----------------")
for group_percent in (percent_mn_caucasian, percent_mn_african_american,
                      percent_mn_native_american, percent_mn_asian,
                      percent_mn_hawaiian, percent_mn_hispanic, percent_mn_other):
    print(group_percent)

In [None]:
# Print each statewide population total next to its label.
MN_population = ['Population', 'Caucasian', 'African American', 'Native American',
                 'Asian', 'Hawaiian', 'Hispanic', 'Other']
population_by_race_summary = [
    sum_mn_pop,
    sum_mn_caucasian,
    sum_mn_african_american,
    sum_mn_native_american,
    sum_mn_asian,
    sum_mn_hawaiian,
    sum_mn_hispanic,
    sum_mn_other,
]
for position, racepopulation in enumerate(MN_population):
    mnsum = population_by_race_summary[position]
    print(f" {racepopulation}: {mnsum}")

In [None]:
def _percent_infected(infections, population):
    """Return infections as a percentage of population."""
    return (infections / population) * 100


# Infection rate (%) of each group: statewide infections over statewide population.
african_american_infection_percent = _percent_infected(cdc_infections_african_american_total, sum_mn_african_american)
asian_infection_percent = _percent_infected(cdc_infections_asian_total, sum_mn_asian)
caucasian_infection_percent = _percent_infected(cdc_infections_caucasian_total, sum_mn_caucasian)
hispanic_infection_percent = _percent_infected(cdc_infections_hispanic_total, sum_mn_hispanic)
native_american_infection_percent = _percent_infected(cdc_infections_native_american_total, sum_mn_native_american)
other_infection_percent = _percent_infected(cdc_infections_other_total, sum_mn_other)

# Rates collected in the order: African American, Asian, Caucasian, Hispanic,
# Native American, Other.
percentage_race_infections = [
    african_american_infection_percent,
    asian_infection_percent,
    caucasian_infection_percent,
    hispanic_infection_percent,
    native_american_infection_percent,
    other_infection_percent,
]


In [None]:
# "Corrected" rate = a group's infection rate (%) divided by that group's share
# of the statewide population (%).

# Caucasian group
caucasian_infection_percent_corrected = caucasian_infection_percent / percent_mn_caucasian

# Non-Caucasian groups
other_infection_percent_corrected = other_infection_percent / percent_mn_other
native_american_infection_percent_corrected = native_american_infection_percent / percent_mn_native_american
hispanic_infection_percent_corrected = hispanic_infection_percent / percent_mn_hispanic
asian_infection_percent_corrected = asian_infection_percent / percent_mn_asian
african_american_infection_percent_corrected = african_american_infection_percent / percent_mn_african_american

## Final Summary table of Infection rate corrected

In [None]:
# One-row table of the infection ratio per group (rate corrected by % population).
# The label column 'Minnesota' is listed first so it is the leading column.
State = ['Infection rate by race group']
race_percent_corrected_df = pd.DataFrame({
    'Minnesota': State,
    'African American': [african_american_infection_percent_corrected],
    'Asian': [asian_infection_percent_corrected],
    'Caucasian American': [caucasian_infection_percent_corrected],
    'Hispanic': [hispanic_infection_percent_corrected],
    'Native American': [native_american_infection_percent_corrected],
    'Other': [other_infection_percent_corrected],
})
race_percent_corrected_df

In [None]:
# One-row table of the uncorrected infection rate (%) per group.
# The label column 'Minnesota' is listed first so it is the leading column.
State = ['Infection rate by race group']
race_percent_df = pd.DataFrame({
    'Minnesota': State,
    'African American': [african_american_infection_percent],
    'Asian': [asian_infection_percent],
    'Caucasian American': [caucasian_infection_percent],
    'Hispanic': [hispanic_infection_percent],
    'Native American': [native_american_infection_percent],
    'Other': [other_infection_percent],
})
race_percent_df

## Final Summary table of Infection rate corrected
    a) Caucasian vs Non-caucasian

In [None]:
# Collapse the five non-Caucasian rates into one column holding their simple
# (unweighted) average, then keep only the two-group comparison.
non_caucasian_list = ['African American', 'Asian', 'Hispanic', 'Native American', 'Other']

non_caucasian_rate_total = race_percent_df[non_caucasian_list].sum(axis=1)
race_percent_df['Non-Caucasian American'] = non_caucasian_rate_total / len(non_caucasian_list)

two_group_columns = ['Minnesota', 'Caucasian American', 'Non-Caucasian American']
race_percent_df = race_percent_df[two_group_columns]
race_percent_df

In [None]:
# Same two-group collapse for the corrected ratios: the non-Caucasian value is
# the simple (unweighted) average of the five non-Caucasian ratios.
non_caucasian_list = ['African American', 'Asian', 'Hispanic', 'Native American', 'Other']

non_caucasian_ratio_total = race_percent_corrected_df[non_caucasian_list].sum(axis=1)
race_percent_corrected_df['Non-Caucasian American'] = non_caucasian_ratio_total / len(non_caucasian_list)

summary_columns = ['Minnesota', 'Caucasian American', 'Non-Caucasian American']
race_percent_summary_df = race_percent_corrected_df[summary_columns]
race_percent_summary_df

## Hypothesis testing

In [None]:
# County populations for the six groups that also appear in the CDC infection table.
hypothesis_population_columns = ['County', 'African American', 'Asian', 'Caucasian',
                                 'Native American', 'Hispanic', 'Other']
mn_pop_race_df = mn_pop_df.loc[:, hypothesis_population_columns]
mn_pop_race_df

In [None]:

# Join infection counts with population counts by county, keeping counties
# found in either table; values missing after the join become 0.
cdc_census_combined_df = comprehensive_df.merge(mn_pop_race_df, how="outer", on=['County'])
cdc_census_clean_combined_df = cdc_census_combined_df.fillna(0)
cdc_census_clean_combined_df

In [None]:
# Display the column labels of the merged infections + population table.
cdc_census_clean_combined_df.columns

In [None]:
# Cast every count column (infections and populations) to int in one assignment.
column_list = [
    'African American Infections', 'Asian Infections',
    'Caucasian American Infections', 'Hispanic Infections',
    'Native American Infections', 'Other Infections',
    'African American', 'Asian', 'Caucasian',
    'Native American', 'Hispanic', 'Other',
]

cdc_census_clean_combined_df[column_list] = cdc_census_clean_combined_df[column_list].astype(int)


In [None]:
# African American population per county (a Series, despite the _df suffix).
cdc_census_clean_combined_african_american_df = cdc_census_clean_combined_df.loc[:, 'African American']
cdc_census_clean_combined_african_american_df


In [None]:
# Population per county for the remaining groups (each a Series, despite the _df suffix).
(cdc_census_clean_combined_asian_df,
 cdc_census_clean_combined_caucasian_df,
 cdc_census_clean_combined_native_american_df,
 cdc_census_clean_combined_hispanic_df,
 cdc_census_clean_combined_other_df) = (
    cdc_census_clean_combined_df[group_name]
    for group_name in ['Asian', 'Caucasian', 'Native American', 'Hispanic', 'Other']
)

# Show the Caucasian population by county.
cdc_census_clean_combined_caucasian_df

In [None]:
# Total population per county: element-wise sum of the six group populations.
cdc_census_clean_combined_sum_df = sum([
    cdc_census_clean_combined_african_american_df,
    cdc_census_clean_combined_caucasian_df,
    cdc_census_clean_combined_asian_df,
    cdc_census_clean_combined_native_american_df,
    cdc_census_clean_combined_hispanic_df,
    cdc_census_clean_combined_other_df,
])
cdc_census_clean_combined_sum_df

In [None]:
# Non-Caucasian population per county: element-wise sum of the five non-Caucasian groups.
cdc_census_clean_combined_sum_non_caucasian_df = sum([
    cdc_census_clean_combined_african_american_df,
    cdc_census_clean_combined_asian_df,
    cdc_census_clean_combined_native_american_df,
    cdc_census_clean_combined_hispanic_df,
    cdc_census_clean_combined_other_df,
])
cdc_census_clean_combined_sum_non_caucasian_df

In [None]:
# Infection counts per county, one Series per group (despite the _df suffix).
(african_american_infection_df,
 asian_infection_df,
 native_american_infection_df,
 hispanic_infection_df,
 other_infection_df,
 caucasian_infection_df) = (
    cdc_census_clean_combined_df[infection_column]
    for infection_column in ['African American Infections', 'Asian Infections',
                             'Native American Infections', 'Hispanic Infections',
                             'Other Infections', 'Caucasian American Infections']
)

# Show the Caucasian infection counts by county.
caucasian_infection_df

In [None]:
# Non-Caucasian infections per county: element-wise sum over the five non-Caucasian groups.
sum_non_caucasian_infection_df = sum([
    african_american_infection_df,
    asian_infection_df,
    native_american_infection_df,
    hispanic_infection_df,
    other_infection_df,
])
sum_non_caucasian_infection_df

In [None]:
# Each group's share of its county's population.
# NOTE: despite the "_percent" names these are fractions (group / total); they
# are not multiplied by 100.

# Non-Caucasian groups
county_asian_pop_percent = cdc_census_clean_combined_asian_df / cdc_census_clean_combined_sum_df
county_african_american_pop_percent = cdc_census_clean_combined_african_american_df / cdc_census_clean_combined_sum_df
county_native_american_pop_percent = cdc_census_clean_combined_native_american_df / cdc_census_clean_combined_sum_df
county_hispanic_pop_percent = cdc_census_clean_combined_hispanic_df / cdc_census_clean_combined_sum_df
# Fixed: this previously divided the Asian population again, so the "Other"
# share (and the non-Caucasian sum below) used the wrong numerator.
county_other_pop_percent = cdc_census_clean_combined_other_df / cdc_census_clean_combined_sum_df

# Combined share of all non-Caucasian groups
sum_non_caucasian_pop_percent_df = county_asian_pop_percent + county_african_american_pop_percent + county_native_american_pop_percent + county_hispanic_pop_percent + county_other_pop_percent

# Caucasian group
county_caucasian_pop_percent = cdc_census_clean_combined_caucasian_df / cdc_census_clean_combined_sum_df
county_caucasian_pop_percent

In [None]:
def _infection_ratio(infections, population_share):
    """Return (infections / county total population) / the group's population share."""
    return (infections / cdc_census_clean_combined_sum_df) / population_share


# Combined non-Caucasian group
non_caucasian_infection_rate_corrected_df = _infection_ratio(sum_non_caucasian_infection_df, sum_non_caucasian_pop_percent_df)

# Individual non-Caucasian groups
african_american_infection_rate_corrected_df = _infection_ratio(african_american_infection_df, county_african_american_pop_percent)
asian_infection_rate_corrected_df = _infection_ratio(asian_infection_df, county_asian_pop_percent)
native_american_infection_rate_corrected_df = _infection_ratio(native_american_infection_df, county_native_american_pop_percent)
hispanic_infection_rate_corrected_df = _infection_ratio(hispanic_infection_df, county_hispanic_pop_percent)
other_infection_rate_corrected_df = _infection_ratio(other_infection_df, county_other_pop_percent)

# Caucasian group
caucasian_infection_rate_corrected_df = _infection_ratio(caucasian_infection_df, county_caucasian_pop_percent)
caucasian_infection_rate_corrected_df

In [None]:
# Start the summary table from the County column only.
# .copy() makes it an independent frame: adding columns to a bare slice of
# cdc_census_clean_combined_df triggers pandas' SettingWithCopyWarning.
cdc_census_combined_summary_df = cdc_census_clean_combined_df[['County']].copy()

# Attach the two per-county infection ratios.
cdc_census_combined_summary_df['Non-Caucasian infection ratio'] = non_caucasian_infection_rate_corrected_df
cdc_census_combined_summary_df['Caucasian infection ratio'] = caucasian_infection_rate_corrected_df
cdc_census_combined_summary_df

In [None]:
# Non-Caucasian infection ratio per county, with missing ratios counted as 0,
# followed by its mean across counties.
non_caucasian_infection_rate_corrected = (
    cdc_census_combined_summary_df.loc[:, 'Non-Caucasian infection ratio'].fillna(0)
)
non_caucasian_infection_rate_corrected.mean()

In [None]:

# Calculate the mean infection ratio of the Caucasian group.
# Missing ratios are filled with 0, matching how the non-Caucasian sample and
# the ANOVA groups are prepared; a NaN left in this sample would make the
# t-test statistic NaN.
caucasian_infection_rate_corrected = cdc_census_combined_summary_df['Caucasian infection ratio'].fillna(0)
caucasian_infection_rate_corrected.mean()


In [None]:
# Independent two-sample t-test on the county infection ratios, without
# assuming equal variances (equal_var=False).
st.ttest_ind(
    a=non_caucasian_infection_rate_corrected,
    b=caucasian_infection_rate_corrected,
    equal_var=False,
)

In [None]:
# Dataframe showing the infection ratio by group (corrected by % population)

# County labels come straight from the merged table that the ratio Series were
# computed from. A hand-typed list is only correct while its order happens to
# match that table's row order.
county_list = cdc_census_clean_combined_df['County'].tolist()

cdc_census_combined_allrace_summary_df = pd.DataFrame({
    # Passed as a Series so it aligns with the ratio Series on the shared index.
    'County': cdc_census_clean_combined_df['County'],
    'African American': african_american_infection_rate_corrected_df,
    'Asian': asian_infection_rate_corrected_df,
    'Caucasian American': caucasian_infection_rate_corrected_df,
    'Hispanic': hispanic_infection_rate_corrected_df,
    'Native American': native_american_infection_rate_corrected_df,
    'Other': other_infection_rate_corrected_df,
})

cdc_census_combined_allrace_summary_df

In [None]:
# ANOVA samples: one per race, with missing ratios counted as 0.
# Order: African American, Asian, Caucasian, Hispanic, Native American, Other.
group0, group1, group2, group3, group4, group5 = (
    ratio_series.fillna(0)
    for ratio_series in (
        african_american_infection_rate_corrected_df,
        asian_infection_rate_corrected_df,
        caucasian_infection_rate_corrected_df,
        hispanic_infection_rate_corrected_df,
        native_american_infection_rate_corrected_df,
        other_infection_rate_corrected_df,
    )
)

In [None]:
# One-way ANOVA across all six race groups.
all_race_groups = (group0, group1, group2, group3, group4, group5)
st.f_oneway(*all_race_groups)

In [None]:
# One-way ANOVA across the first three groups (group0, group1, group2).
first_three_groups = (group0, group1, group2)
st.f_oneway(*first_three_groups)

In [None]:
# One-way ANOVA between group0 and group2 only.
two_groups = (group0, group2)
st.f_oneway(*two_groups)

### Merge the datasets

In [None]:
# One row per (County, Race) carrying population and poverty counts.
merged_df = pd.merge(pop_df, poverty_df, on=['County', 'Race'])

# cdc_df holds one row per reported case (Infections is 1 on every row), so it
# is collapsed to one row per (County, Race) before joining. Joining the raw
# case rows would repeat each county/race row once per case and leave
# Infections at 1, which distorts every rate and test computed from merged_df.
# case_month is kept as the smallest case_month in each group so the column
# still exists for later sorting.
infections_by_county_race = (
    cdc_df.groupby(['County', 'Race'], as_index=False)
          .agg(case_month=('case_month', 'min'), Infections=('Infections', 'sum'))
)
merged_df = pd.merge(merged_df, infections_by_county_race, on=['County', 'Race'], how="left")

# County/race pairs with no recorded cases get 0 infections instead of NaN.
merged_df["Infections"] = merged_df["Infections"].fillna(0)
merged_df

In [None]:
# Number of rows in the merged table.
merged_df.shape[0]

In [None]:
# Label each row "Non-Caucasian" when its Race is one of the six listed groups,
# and "Caucasian" otherwise.
non_caucasian_races = ["African American", "Asian", "Hispanic", "Native American",
                       "Hawaiian", "Other"]
merged_df['Multi-Racial'] = (
    merged_df['Race']
    .isin(non_caucasian_races)
    .map({True: "Non-Caucasian", False: "Caucasian"})
)
merged_df

In [None]:
# Convert all numerical values in merged_df from strings to int
merged_df["Population"] = merged_df["Population"].astype(int)
merged_df["Poverty"] = merged_df["Poverty"].astype(int)
merged_df["Infections"] = merged_df["Infections"].astype(int)

# Calculate poverty and infection rates (in percent) for all rows at once.
# Zero populations are masked to NaN before dividing and the resulting NaN
# rates are set to 0, so rows with no population get a rate of 0. Unlike a
# positional loop over merged_df["Population"][i], this does not depend on the
# frame having a 0..n-1 index.
nonzero_population = merged_df["Population"].where(merged_df["Population"] != 0)
poverty_rate = (merged_df["Poverty"] / nonzero_population * 100).fillna(0)
infection_rate = (merged_df["Infections"] / nonzero_population * 100).fillna(0)

merged_df["Poverty Rate"] = poverty_rate
merged_df["Infection Rate"] = infection_rate

merged_df


In [None]:
# Order the merged rows by month, then by county.
merged_sort_keys = ['case_month', 'County']
merged_sorted_df = merged_df.sort_values(by=merged_sort_keys)
merged_sorted_df

In [None]:
# Keep only the columns needed for the poverty comparison and show the first 50 rows.
poverty_view_columns = ['County', 'Multi-Racial', 'Poverty Rate']
merged_sorted2_df = merged_sorted_df[poverty_view_columns]
merged_sorted2_df.head(50)

In [None]:
# Rows belonging to the Caucasian group.
is_caucasian_row = merged_sorted2_df["Multi-Racial"] == 'Caucasian'
poverty_caucasian_df = merged_sorted2_df[is_caucasian_row]
poverty_caucasian_df

In [None]:
# Group label and poverty rate for the Caucasian rows, with the label column
# renamed from 'Multi-Racial' to 'Race'.
poverty_rate_caucasian_df = (
    poverty_caucasian_df[['Multi-Racial', 'Poverty Rate']]
    .rename(columns={'Multi-Racial': 'Race'})
)

poverty_rate_caucasian_df

In [None]:
# Rows belonging to the Non-Caucasian group.
is_non_caucasian_row = merged_sorted2_df["Multi-Racial"] == 'Non-Caucasian'
poverty_non_caucasian_df = merged_sorted2_df[is_non_caucasian_row]
poverty_non_caucasian_df

In [None]:
# Group label and poverty rate for the Non-Caucasian rows.
non_caucasian_rate_columns = ['Multi-Racial', 'Poverty Rate']
poverty_rate_non_caucasian_df = poverty_non_caucasian_df.loc[:, non_caucasian_rate_columns]
poverty_rate_non_caucasian_df

In [None]:
# Non-Caucasian poverty rates, with missing rates counted as 0, followed by their mean.
non_caucasian_poverty_rate = poverty_rate_non_caucasian_df.loc[:, 'Poverty Rate'].fillna(0)
non_caucasian_poverty_rate.mean()


In [None]:
# Caucasian poverty rates, with missing rates counted as 0, followed by their mean.
caucasian_poverty_rate = poverty_rate_caucasian_df.loc[:, 'Poverty Rate'].fillna(0)
caucasian_poverty_rate.mean()

In [None]:
# Independent two-sample t-test on the poverty rates of the two groups,
# without assuming equal variances (equal_var=False).
st.ttest_ind(
    a=non_caucasian_poverty_rate,
    b=caucasian_poverty_rate,
    equal_var=False,
)


# -------------------------------------

### Coordinates for heat map -- if time permits

In [None]:
# Read the county table from the page below, keep the rows whose State is "MN",
# and retain only the county name and its coordinates.
url = "https://en.wikipedia.org/wiki/User:Michael_J/County_table"

table = pd.read_html(url)
df = table[0]
is_minnesota = df['State'] == "MN"
Counties = df[is_minnesota]
counties_df = Counties.loc[:, ['County [2]', 'Latitude', 'Longitude']]
counties_df  # .to_csv("../Project_1/county_geo.csv")

# -------------------------------------**Sung** Work Ends Here-----------------------------------