# Exploratory analysis

In [1]:
import intake
import pandas as pd

# Open the data catalog described by ../catalog.yml
catalog = intake.open_catalog("../catalog.yml")
# If we instead had a `catalogs` folder with lots of YAML files,
# a * wildcard would pick up every file ending in .yml:
# catalog = intake.open_catalog("../catalogs/*.yml")

## JHU cases data

Most of these columns are imported directly from JHU. The columns from `state_cases` through `new_state_deaths` are not part of the original JHU data; they were added when the data was imported and saved.

Let's poke around and see if we want to generate any new columns.

In [2]:
# Load the `jhu_cases` catalog entry and preview its first rows
jhu = catalog.jhu_cases.read()
jhu.head()

Unnamed: 0,county,state,fips,date,Lat,Lon,cases,deaths,incident_rate,people_tested,state_cases,state_deaths,new_cases,new_deaths,new_state_cases,new_state_deaths
62,Autauga,Alabama,1001,2020-03-24 07:00:00+00:00,32.539527,-86.644082,1,0,1.789901,,298,0,1,0,74,0
63,Autauga,Alabama,1001,2020-03-25 07:00:00+00:00,32.539527,-86.644082,5,0,8.949507,,472,1,4,0,174,1
64,Autauga,Alabama,1001,2020-03-26 07:00:00+00:00,32.539527,-86.644082,6,0,10.739408,,574,1,1,0,102,0
65,Autauga,Alabama,1001,2020-03-27 07:00:00+00:00,32.539527,-86.644082,6,0,10.739408,,684,4,0,0,110,3
66,Autauga,Alabama,1001,2020-03-28 07:00:00+00:00,32.539527,-86.644082,6,0,10.739408,,801,4,0,0,117,0


In [None]:
"""
Alternatively, can read it in using pandas.

jhu = pd.read_parquet("../data/us-county-time-series.parquet")
"""

In [None]:
# Inspect the data type of each column:
#   - strings show up as `object`
#   - numeric columns can be floats (with decimals) or integers
#   - dates/times show up as datetime
jhu.dtypes

In [None]:
# Subset the data: keep only the rows whose county is "Los Angeles"
la = jhu.loc[jhu["county"] == "Los Angeles"]
la.head()

In [4]:
# We might need this: `useful_dict.us_state_abbrev` maps full state/territory
# names (e.g. 'Alabama') to their two-letter abbreviations (e.g. 'AL').
import useful_dict

# Display the mapping
useful_dict.us_state_abbrev

{'Alabama': 'AL',
 'Alaska': 'AK',
 'American Samoa': 'AS',
 'Arizona': 'AZ',
 'Arkansas': 'AR',
 'California': 'CA',
 'Colorado': 'CO',
 'Connecticut': 'CT',
 'Delaware': 'DE',
 'District of Columbia': 'DC',
 'Florida': 'FL',
 'Georgia': 'GA',
 'Guam': 'GU',
 'Hawaii': 'HI',
 'Idaho': 'ID',
 'Illinois': 'IL',
 'Indiana': 'IN',
 'Iowa': 'IA',
 'Kansas': 'KS',
 'Kentucky': 'KY',
 'Louisiana': 'LA',
 'Maine': 'ME',
 'Maryland': 'MD',
 'Massachusetts': 'MA',
 'Michigan': 'MI',
 'Minnesota': 'MN',
 'Mississippi': 'MS',
 'Missouri': 'MO',
 'Montana': 'MT',
 'Nebraska': 'NE',
 'Nevada': 'NV',
 'New Hampshire': 'NH',
 'New Jersey': 'NJ',
 'New Mexico': 'NM',
 'New York': 'NY',
 'North Carolina': 'NC',
 'North Dakota': 'ND',
 'Northern Mariana Islands': 'MP',
 'Ohio': 'OH',
 'Oklahoma': 'OK',
 'Oregon': 'OR',
 'Pennsylvania': 'PA',
 'Puerto Rico': 'PR',
 'Rhode Island': 'RI',
 'South Carolina': 'SC',
 'South Dakota': 'SD',
 'Tennessee': 'TN',
 'Texas': 'TX',
 'Utah': 'UT',
 'Vermont': 'VT',
 '

In [5]:
# Make new columns

# .assign() can create (or overwrite) several columns in a single call
jhu = jhu.assign(
    # Keep only the calendar date and drop the time of day.
    # .dt.date yields Python date objects, so this column is no longer a datetime.
    date=pd.to_datetime(jhu["date"]).dt.date,
    county_state=jhu["county"] + ", " + jhu["state"],
)

# Alternatively, columns can be added one at a time.
# date2 is a datetime column, so comparisons against it recognize various forms of dates
jhu["date2"] = pd.to_datetime(jhu["date"])
jhu["state_abbrev"] = jhu["state"].map(useful_dict.us_state_abbrev)

In [None]:
# Filter on the datetime column using a YYYY-MM-DD date string
jhu.loc[(jhu["date2"] == "2020-12-23") & (jhu["county"] == "Riverside")]

In [None]:
# Filter on the datetime column using a slash-separated MM/DD/YY date string
jhu.loc[(jhu["date2"] == "12/23/20") & (jhu["county"] == "Riverside")]

In [None]:
# Filter on the datetime column using a dash-separated MM-DD-YY date string
jhu.loc[(jhu["date2"] == "12-23-20") & (jhu["county"] == "Riverside")]

In [None]:
# But the `date` column doesn't work like the datetime column `date2`
# when it is compared against a date string
jhu.loc[(jhu["date"] == "12-23-20") & (jhu["county"] == "Riverside")].head()

## Population crosswalk

In [18]:
# Load the `msa_county_crosswalk` catalog entry and preview its first rows
crosswalk = catalog.msa_county_crosswalk.read()
crosswalk.head()

Unnamed: 0,cbsacode,cbsatitle,metro_micro,county,state,county_fips,fips_state_code,fips_county_code,county_pop,msa_pop
0,10100,"Aberdeen, SD",Micropolitan Statistical Area,Brown County,South Dakota,46013,46,13,38839.0,42668.0
1,10100,"Aberdeen, SD",Micropolitan Statistical Area,Edmunds County,South Dakota,46045,46,45,3829.0,42668.0
2,10140,"Aberdeen, WA",Micropolitan Statistical Area,Grays Harbor County,Washington,53027,53,27,75061.0,75061.0
3,10180,"Abilene, TX",Metropolitan Statistical Area,Callahan County,Texas,48059,48,59,13943.0,172060.0
4,10180,"Abilene, TX",Metropolitan Statistical Area,Jones County,Texas,48253,48,253,20083.0,172060.0


In [7]:
# See catalog.yml for how this source is set up.
# CSVs are bad at keeping data types, so certain columns can be forced to be
# read in with certain data types.
# Here, cbsacode and county_fips are forced to be read in as strings.
# fips is also a string in the JHU data.
crosswalk.dtypes

cbsacode             object
cbsatitle            object
metro_micro          object
county               object
state                object
county_fips          object
fips_state_code       int64
fips_county_code      int64
county_pop          float64
msa_pop             float64
dtype: object

In [19]:
# Rename the county_fips column to fips
crosswalk = crosswalk.rename({"county_fips": "fips"}, axis="columns")

In [20]:
# Several operations can be chained together into one expression
crosswalk = (
    crosswalk
    .rename(columns={"cbsacode": "cbsa_code"})
    # NOTE(review): cbsa_code is not among the columns kept below,
    # so the rename above does not affect the resulting frame
    .loc[:, ["fips", "county_pop", "cbsatitle"]]
)

crosswalk.head()

Unnamed: 0,fips,county_pop,cbsatitle
0,46013,38839.0,"Aberdeen, SD"
1,46045,3829.0,"Aberdeen, SD"
2,53027,75061.0,"Aberdeen, WA"
3,48059,13943.0,"Abilene, TX"
4,48253,20083.0,"Abilene, TX"


## Merge cases with population

In [21]:
# Attach county population to the JHU rows by joining on fips
df = jhu.merge(
    # Subset the crosswalk right here: only the join key and population are needed
    crosswalk.loc[:, ["fips", "county_pop"]],
    on="fips",
    # Keep only fips values that appear in both frames
    how="inner",
    # Many JHU rows per fips; fips must be unique in the crosswalk subset
    validate="m:1",
)

df.head()

Unnamed: 0,county,state,fips,date,Lat,Lon,cases,deaths,incident_rate,people_tested,state_cases,state_deaths,new_cases,new_deaths,new_state_cases,new_state_deaths,county_state,date2,state_abbrev,county_pop
0,Autauga,Alabama,1001,2020-03-24,32.539527,-86.644082,1,0,1.789901,,298,0,1,0,74,0,"Autauga, Alabama",2020-03-24,AL,55869.0
1,Autauga,Alabama,1001,2020-03-25,32.539527,-86.644082,5,0,8.949507,,472,1,4,0,174,1,"Autauga, Alabama",2020-03-25,AL,55869.0
2,Autauga,Alabama,1001,2020-03-26,32.539527,-86.644082,6,0,10.739408,,574,1,1,0,102,0,"Autauga, Alabama",2020-03-26,AL,55869.0
3,Autauga,Alabama,1001,2020-03-27,32.539527,-86.644082,6,0,10.739408,,684,4,0,0,110,3,"Autauga, Alabama",2020-03-27,AL,55869.0
4,Autauga,Alabama,1001,2020-03-28,32.539527,-86.644082,6,0,10.739408,,801,4,0,0,117,0,"Autauga, Alabama",2020-03-28,AL,55869.0
