#### Notebooks  
- [Data Collection](./01_data_collection.ipynb)
- [Data Cleaning](./02_data_cleaning.ipynb)
- [Data Preprocessing](./03_data_preprocessing.ipynb)
- [EDA Five States](./04_eda_five_states.ipynb)
- [EDA California](./05_eda_ca.ipynb)
- [EDA Florida](./05_eda_fl.ipynb)
- [EDA Illinois](./05_eda_il.ipynb)
- [EDA New York](./05_eda_ny.ipynb)
- [EDA Texas](./05_eda_tx.ipynb)
- [Modeling Five States](./06_modeling_five_states.ipynb)
- [Modeling California](./07_modeling_ca.ipynb)
- [Modeling Florida](./07_modeling_fl.ipynb)
- [Modeling Illinois](./07_modeling_il.ipynb)
- [Modeling New York](./07_modeling_ny.ipynb)
- [Modeling Texas](./07_modeling_tx.ipynb)
- [Conclusions](./08_conclusions.ipynb)

#### This Notebook's Contents  
- [Imports](#Imports) 
- [Combining COVID and Census Data](#Combining-COVID-and-Census-Data) 
- [Feature Engineering](#Feature-Engineering)
- [Break Out the States](#Break-Out-the-States)
- [Export Data for Modeling](#Export-Data-for-Modeling)

# Imports

In [1]:
# Import the required libraries.
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the seven cleaned five-state source tables from the preprocessing
# folder in one unpacking assignment; the files are read in the order listed.
covid, race, sa, land, ins, inc, obes = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preprocessing/cleaned_covid_five_states.csv',
        '../data/preprocessing/cleaned_dp05_race_five_states.csv',
        '../data/preprocessing/cleaned_dp05_sex_age_five_states.csv',
        '../data/preprocessing/cleaned_area_five_states.csv',
        '../data/preprocessing/cleaned_dp03_insurance_five_states.csv',
        '../data/preprocessing/cleaned_dp03_income_five_states.csv',
        '../data/preprocessing/cleaned_obesity_five_states.csv',
    )
)

In [3]:
# Preview the first two rows of the COVID table (cases, fatalities,
# death rate, and tests per geographic area).
covid.head(2)

Unnamed: 0,Geographic Area Name,total_cases,total_fatalities,death_rate,total_tests
0,"Santa Clara County, California",23978.0,388.0,0.016181,839764
1,"San Mateo County, California",10942.0,159.0,0.014531,285657


In [4]:
# Preview the first two rows of the race table (population counts per
# race/ethnicity category for each geographic area).
race.head(2)

Unnamed: 0,Geographic Area Name,race_pop,race_pop_hispanic_or_latino_of_any_race,race_pop_white_alone,race_pop_black_or_african_american_alone,race_pop_american_indian_and_alaska_native_alone,race_pop_asian_alone,race_pop_native_hawaiian_and_other_pacific_islander_alone,race_pop_some_other_race_alone,race_pop_two_or_more_races
0,"Austin County, Texas",29565,7819,18525,2576,48,89,0,52,456
1,"Kenedy County, Texas",595,522,72,0,0,1,0,0,0


In [5]:
# Preview the first two rows of the sex-and-age table (population counts by
# sex and age bracket, plus median age, for each geographic area).
sa.head(2)

Unnamed: 0,Geographic Area Name,sex_age_pop,sex_age_pop_male,sex_age_pop_female,sex_age_pop_under_5,sex_age_pop_5_to_9,sex_age_pop_10_to_14,sex_age_pop_15_to_19,sex_age_pop_20_to_24,sex_age_pop_25_to_34,sex_age_pop_35_to_44,sex_age_pop_45_to_54,sex_age_pop_55_to_59,sex_age_pop_60_to_64,sex_age_pop_65_to_74,sex_age_pop_75_to_84,sex_age_pop_85_and_over,sex_age_median_age_in_years
0,"Austin County, Texas",29565,14684,14881,1780,1960,2118,1861,1712,3339,3275,3821,2327,1978,3243,1532,619,40.7
1,"Kenedy County, Texas",595,286,309,85,37,40,10,10,95,47,75,51,9,85,29,22,39.5


In [6]:
# Preview the first two rows of the land-area table (square miles per
# geographic area).
land.head(2)

Unnamed: 0,Geographic Area Name,sq_mi
0,"Anderson County, Texas",1062.63
1,"Andrews County, Texas",1500.721


In [7]:
# Preview the first two rows of the health-insurance table (population
# counts with and without coverage for each geographic area).
ins.head(2)

Unnamed: 0,Geographic Area Name,health_ins_noninst_pop,health_ins_noninst_pop_cov_yes,health_ins_noninst_pop_cov_no
0,"Austin County, Texas",29298,25749,3549
1,"Kenedy County, Texas",595,467,128


In [8]:
# Preview the first two rows of the income table (household counts per
# income bracket plus median/mean/per-capita income columns).
inc.head(2)

Unnamed: 0,Geographic Area Name,inc_hhlds,inc_hhlds_less_than_10_000,inc_hhlds_10_000_to_14_999,inc_hhlds_15_000_to_24_999,inc_hhlds_25_000_to_34_999,inc_hhlds_35_000_to_49_999,inc_hhlds_50_000_to_74_999,inc_hhlds_75_000_to_99_999,inc_hhlds_100_000_to_149_999,inc_hhlds_150_000_to_199_999,inc_hhlds_200_000_or_more,inc_med_hhld_inc_dol,inc_mean_hhld_inc_dol,inc_per_capita_inc_dol,inc_med_earn_male_full_yr_workers_dol,inc_med_earn_female_full_yr_workers_dol
0,"Austin County, Texas",11041,482,459,1255,927,1186,1851,1651,2150,551,529,65365,80769,30858,55417,38603
1,"Kenedy County, Texas",209,25,4,49,13,71,16,23,8,0,0,36125,40908,15820,40848,23295


In [9]:
# Preview the first two rows of the obesity table (obes_percent per
# geographic area).
obes.head(2)

Unnamed: 0,Geographic Area Name,obes_percent
0,"Anderson County, Texas",0.373
1,"Andrews County, Texas",0.313


# Combining COVID and Census Data

In [10]:
# Join the land-area, obesity, and health-insurance tables onto the COVID
# table, one merge per statement, matching rows on Geographic Area Name.
df = covid.merge(land, on='Geographic Area Name')
df = df.merge(obes, on='Geographic Area Name')
df = df.merge(ins, on='Geographic Area Name')

In [11]:
# Join the income, race, and sex/age tables onto the combined dataframe,
# one merge per statement, matching rows on Geographic Area Name.
df = df.merge(inc, on='Geographic Area Name')
df = df.merge(race, on='Geographic Area Name')
df = df.merge(sa, on='Geographic Area Name')

# Feature Engineering

In [12]:
# Give the geography key column the shorter name county_state.
df = df.rename({'Geographic Area Name': 'county_state'}, axis='columns')

### COVID Metrics

In [13]:
# Deaths per 100 cases: the death_rate column scaled by 100.
df['deaths_per_100_cases'] = df['death_rate'].mul(100)

In [14]:
# Cases per 100 people: total cases divided by the race_pop population
# total, then scaled by 100.
df['cases_per_100_people'] = df['total_cases'].div(df['race_pop']).mul(100)

In [15]:
# Tests per 100 people: total tests divided by the race_pop population
# total, then scaled by 100.
df['tests_per_100_people'] = df['total_tests'].div(df['race_pop']).mul(100)

### Population Density

In [16]:
# County population density: the race_pop population total divided by the
# county's area in square miles.
df['pop_density'] = df['race_pop'].div(df['sq_mi'])

### Create a State Column

In [17]:
# Create a state column from the text after the last comma in county_state
# (e.g. "Austin County, Texas" -> "Texas"). Parsing the state name directly,
# instead of substring-matching four states and defaulting everything else
# to 'Texas', means an unexpected value is never silently labelled as Texas.
df['state'] = df['county_state'].str.rsplit(',', n=1).str[-1].str.strip()

### Create a County Column

In [18]:
# Start the county column as a copy of county_state; the state-specific
# suffixes are stripped from it in the following cells.
df['county'] = df.loc[:, 'county_state']

In [19]:
# Drop the " County, California" suffix so only the county name is left for
# graphing. regex=False forces a literal match, so the result does not depend
# on the installed pandas version's default for `regex`.
df['county'] = df['county'].str.replace(' County, California', '', regex=False)

In [20]:
# Drop the " County, Florida" suffix so only the county name is left for
# graphing. regex=False forces a literal match, so the result does not depend
# on the installed pandas version's default for `regex`.
df['county'] = df['county'].str.replace(' County, Florida', '', regex=False)

In [21]:
# Drop the " County, Illinois" suffix so only the county name is left for
# graphing. regex=False forces a literal match, so the result does not depend
# on the installed pandas version's default for `regex`.
df['county'] = df['county'].str.replace(' County, Illinois', '', regex=False)

In [22]:
# Drop the " County, New York" suffix so only the county name is left for
# graphing. regex=False forces a literal match, so the result does not depend
# on the installed pandas version's default for `regex`.
df['county'] = df['county'].str.replace(' County, New York', '', regex=False)

In [23]:
# Drop the " County, Texas" suffix so only the county name is left for
# graphing. regex=False forces a literal match, so the result does not depend
# on the installed pandas version's default for `regex`.
df['county'] = df['county'].str.replace(' County, Texas', '', regex=False)

### Ordinal COVID Severity

##### Create an ordinal COVID severity column for classification modeling.

In [24]:
# Create a covid_severity column, initialised to 0 for every row.
# A row keeps this 0 only if none of the severity masks defined in the
# following cells selects it.
df['covid_severity'] = 0

In [25]:
# Build boolean masks that place each county in one of three severity
# classes based on the number of cases per 100 people:
# below 2, from 2 up to (but not including) 3, and 3 or more.
mask1 = df['cases_per_100_people'].lt(2)
mask2 = df['cases_per_100_people'].ge(2) & df['cases_per_100_people'].lt(3)
mask3 = df['cases_per_100_people'].ge(3)

In [26]:
# Apply the mask for level 1 (low).
# Assign through a single .loc call: chained indexing (df[col][mask] = ...)
# may write to a temporary copy and leave df itself unchanged.
df.loc[mask1, 'covid_severity'] = 1

In [27]:
# Apply the mask for level 2 (medium).
# Assign through a single .loc call: chained indexing (df[col][mask] = ...)
# may write to a temporary copy and leave df itself unchanged.
df.loc[mask2, 'covid_severity'] = 2

In [28]:
# Apply the mask for level 3 (high).
# Assign through a single .loc call: chained indexing (df[col][mask] = ...)
# may write to a temporary copy and leave df itself unchanged.
df.loc[mask3, 'covid_severity'] = 3

In [29]:
# Display how many counties fall into each covid_severity value
# (1 = low, 2 = medium, 3 = high).
df['covid_severity'].value_counts()

1    228
3    158
2    157
Name: covid_severity, dtype: int64

### Demographic Recombinations

In [30]:
# Create different income demographic features for testing.
# Each new column is the sum of the household-count brackets it spans.
df['inc_hhlds_less_than_25_000'] = (
    df['inc_hhlds_less_than_10_000']
    + df['inc_hhlds_10_000_to_14_999']
    + df['inc_hhlds_15_000_to_24_999']
)

df['inc_hhlds_25_000_to_49_999'] = (
    df['inc_hhlds_25_000_to_34_999']
    + df['inc_hhlds_35_000_to_49_999']
)

# Despite the "less_than_99_999" name, this sums every bracket up to and
# including the 75_000_to_99_999 one.
df['inc_hhlds_less_than_99_999'] = (
    df['inc_hhlds_less_than_10_000']
    + df['inc_hhlds_10_000_to_14_999']
    + df['inc_hhlds_15_000_to_24_999']
    + df['inc_hhlds_25_000_to_34_999']
    + df['inc_hhlds_35_000_to_49_999']
    + df['inc_hhlds_50_000_to_74_999']
    + df['inc_hhlds_75_000_to_99_999']
)

# "100,000 or more" must include the open-ended top bracket; leaving out
# inc_hhlds_200_000_or_more would undercount these households. This column
# is assigned once here (it was previously computed twice, identically).
df['inc_hhlds_100_000_or_more'] = (
    df['inc_hhlds_100_000_to_149_999']
    + df['inc_hhlds_150_000_to_199_999']
    + df['inc_hhlds_200_000_or_more']
)

df['inc_hhlds_less_than_49_999'] = (
    df['inc_hhlds_less_than_10_000']
    + df['inc_hhlds_10_000_to_14_999']
    + df['inc_hhlds_15_000_to_24_999']
    + df['inc_hhlds_25_000_to_34_999']
    + df['inc_hhlds_35_000_to_49_999']
)

df['inc_hhlds_50_000_to_99_999'] = (
    df['inc_hhlds_50_000_to_74_999']
    + df['inc_hhlds_75_000_to_99_999']
)

In [31]:
# Create a combined "other" race feature for testing: the sum of the
# American Indian/Alaska Native, Native Hawaiian/Pacific Islander,
# some-other-race, and two-or-more-races counts.
df['race_pop_other'] = (
    df['race_pop_american_indian_and_alaska_native_alone']
    .add(df['race_pop_native_hawaiian_and_other_pacific_islander_alone'])
    .add(df['race_pop_some_other_race_alone'])
    .add(df['race_pop_two_or_more_races'])
)

In [32]:
# Create different age demographic features for testing.
# Each new column is the sum of the age-bracket counts it spans; three
# alternative groupings are built (0-44/45-74/75+, 0-64/65+, and
# 0-35/35-59/60-84).
df['sex_age_pop_0_to_44'] = (
    df['sex_age_pop_under_5']
    + df['sex_age_pop_5_to_9']
    + df['sex_age_pop_10_to_14']
    + df['sex_age_pop_15_to_19']
    + df['sex_age_pop_20_to_24']
    + df['sex_age_pop_25_to_34']
    + df['sex_age_pop_35_to_44']
)

df['sex_age_pop_45_to_74'] = (
    df['sex_age_pop_45_to_54']
    + df['sex_age_pop_55_to_59']
    + df['sex_age_pop_60_to_64']
    + df['sex_age_pop_65_to_74']
)

df['sex_age_pop_75_and_over'] = (
    df['sex_age_pop_75_to_84']
    + df['sex_age_pop_85_and_over']
)

df['sex_age_pop_0_to_64'] = (
    df['sex_age_pop_under_5']
    + df['sex_age_pop_5_to_9']
    + df['sex_age_pop_10_to_14']
    + df['sex_age_pop_15_to_19']
    + df['sex_age_pop_20_to_24']
    + df['sex_age_pop_25_to_34']
    + df['sex_age_pop_35_to_44']
    + df['sex_age_pop_45_to_54']
    + df['sex_age_pop_55_to_59']
    + df['sex_age_pop_60_to_64']
)

df['sex_age_pop_65_and_over'] = (
    df['sex_age_pop_65_to_74']
    + df['sex_age_pop_75_to_84']
    + df['sex_age_pop_85_and_over']
)

# NOTE: despite the "0_to_35" name, this sums the brackets from under_5
# through 25_to_34; the 35_to_44 bracket belongs to the next group.
df['sex_age_pop_0_to_35'] = (
    df['sex_age_pop_under_5']
    + df['sex_age_pop_5_to_9']
    + df['sex_age_pop_10_to_14']
    + df['sex_age_pop_15_to_19']
    + df['sex_age_pop_20_to_24']
    + df['sex_age_pop_25_to_34']
)

df['sex_age_pop_35_to_59'] = (
    df['sex_age_pop_35_to_44']
    + df['sex_age_pop_45_to_54']
    + df['sex_age_pop_55_to_59']
)

df['sex_age_pop_60_to_84'] = (
    df['sex_age_pop_60_to_64']
    + df['sex_age_pop_65_to_74']
    + df['sex_age_pop_75_to_84']
)

In [33]:
# Define a function to create new columns with percentages.
def to_percentage(df):
    """Add a ``percent_<column>`` share column for each demographic count column.

    A column is converted when its name starts with one of the known
    prefixes; its values are divided by the matching total column:

    * ``race_pop_*``                   / ``race_pop``
    * ``sex_age_pop_*``                / ``sex_age_pop``
    * ``health_ins_noninst_pop_cov*``  / ``health_ins_noninst_pop``
    * ``inc_hhlds_*``                  / ``inc_hhlds``

    The results are fractions (not multiplied by 100). The dataframe is
    modified in place and nothing is returned.
    """
    # (column-name prefix, total column to divide by), checked in this order.
    totals_by_prefix = (
        ('race_pop_', 'race_pop'),
        ('sex_age_pop_', 'sex_age_pop'),
        ('health_ins_noninst_pop_cov', 'health_ins_noninst_pop'),
        ('inc_hhlds_', 'inc_hhlds'),
    )
    for name in df.columns:
        total = next(
            (tot for prefix, tot in totals_by_prefix if name.startswith(prefix)),
            None,
        )
        if total is not None:
            df['percent_' + name] = df[name] / df[total]
    return

In [34]:
# Apply the function to the dataframe. It adds the percent_* columns to df
# in place and returns nothing, so there is no result to assign.
to_percentage(df)

# Break Out the States

In [35]:
# Select the rows whose county_state mentions California.
df_ca = df.loc[df['county_state'].str.contains('California')]

In [36]:
# Select the rows whose county_state mentions Florida.
df_fl = df.loc[df['county_state'].str.contains('Florida')]

In [37]:
# Select the rows whose county_state mentions Illinois.
df_il = df.loc[df['county_state'].str.contains('Illinois')]

In [38]:
# Select the rows whose county_state mentions New York.
df_ny = df.loc[df['county_state'].str.contains('New York')]

In [39]:
# Select the rows whose county_state mentions Texas.
df_tx = df.loc[df['county_state'].str.contains('Texas')]

# Export Data for Modeling

In [40]:
# Export the full five-state engineered dataframe to a csv, without
# writing the row index as a column.
df.to_csv('../data/cleaned_engineered_five_states.csv', index=False)

In [41]:
# Export the California dataframe to a csv, without writing the row index
# as a column.
df_ca.to_csv('../data/cleaned_engineered_ca.csv', index=False)

In [42]:
# Export the Florida dataframe to a csv, without writing the row index
# as a column.
df_fl.to_csv('../data/cleaned_engineered_fl.csv', index=False)

In [43]:
# Export the Illinois dataframe to a csv, without writing the row index
# as a column.
df_il.to_csv('../data/cleaned_engineered_il.csv', index=False)

In [44]:
# Export the New York dataframe to a csv, without writing the row index
# as a column.
df_ny.to_csv('../data/cleaned_engineered_ny.csv', index=False)

In [45]:
# Export the Texas dataframe to a csv, without writing the row index
# as a column.
df_tx.to_csv('../data/cleaned_engineered_tx.csv', index=False)