In [1]:
from datetime import date, datetime
from dateutil import parser
from io import StringIO
import pandas as pd
import requests, perspective, json

In [2]:
# standardize on abbreviations for states, and full names in 'stateName'
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Palau': 'PW',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP"
}
us_state_full = {key: value for (value, key) in us_state_abbrev.items()}

Data Attribution:

- [The COVID Tracking Project](https://covidtracking.com/)
- [USAFacts](https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/)

In [4]:
STATE_URL = "https://covidtracking.com/api/states/daily"
CONFIRMED_URL = "https://static.usafacts.org/public/data/covid-19/covid_confirmed_usafacts.csv"
DEATHS_URL = "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv"

### Clean Up State Level Data

In [4]:
def clean_state_data(data):
    for row in data:
        # dates are stored as integer with no separators, so add separators and parse
        d = list(str(row["date"]))
        if "/" not in d:
            d.insert(4, "/")
            d.insert(7, "/")
        row["date"] = "".join(d)
        row["date"] = parser.parse(row["date"])
        row["dateChecked"] = parser.parse(row["dateChecked"])
    return data

In [59]:
state_data = clean_state_data(requests.get(STATE_URL).json())

In [6]:
state_aggregates = {
    "positive": "high",
    "negative": "high",
    "pending": "high",
    "death": "high",
    "total": "high"
}

In [55]:
state_data = pd.DataFrame(state_data).rename(columns={
    "state": "State"
})

In [56]:
state_data["stateName"] = [us_state_full.get(x, None) for x in state_data["State"]]

In [15]:
state_data.dtypes

date                     datetime64[ns]
dateChecked     datetime64[ns, tzutc()]
death                           float64
hospitalized                    float64
negative                        float64
pending                         float64
positive                        float64
State                            object
total                             int64
stateName                        object
dtype: object

We can use `PerspectiveWidget` to visualize and transform the data:

In [57]:
state_schema = {"date": date, "dateChecked": datetime, "death": int, "hospitalized": int, "negative": int, "pending": int, "positive": int, "State": str, "total": int, "stateName": str}
w = perspective.PerspectiveWidget(state_schema, plugin="xy_line", columns=["date", "positive"], column_pivots=["stateName"])
w.update(state_data)
w

PerspectiveWidget(column_pivots=['stateName'], columns=['date', 'positive'], plugin='xy_line')

### Clean up and reorganize county level data

Dates are stored as column names in these CSVs, so we need to transform them into row values and rename the columns accordingly.

In [81]:
confirmed_df = pd.read_csv(StringIO(requests.get(CONFIRMED_URL).text)).melt(id_vars=["ï»¿countyFIPS", "County Name", "State", "stateFIPS"])
display(confirmed_df)
confirmed_df = confirmed_df[confirmed_df["variable"] != "Unnamed: 68"]
confirmed_df
#confirmed_df = confirmed_df.rename(columns={"variable": "Date", "value": "Confirmed", "County Name": "County"})
#confirmed_df["stateName"] = [us_state_full.get(x, None) for x in confirmed_df["State"]]

Unnamed: 0,ï»¿countyFIPS,County Name,State,stateFIPS,variable,value
0,0,Statewide Unallocated,AL,1,1/22/2020,0
1,1001,Autauga County,AL,1,1/22/2020,0
2,1003,Baldwin County,AL,1,1/22/2020,0
3,1009,Blount County,AL,1,1/22/2020,0
4,1013,Butler County,AL,1,1/22/2020,0
5,1015,Calhoun County,AL,1,1/22/2020,0
6,1017,Chambers County,AL,1,1/22/2020,0
7,1019,Cherokee County,AL,1,1/22/2020,0
8,1021,Chilton County,AL,1,1/22/2020,0
9,1027,Clay County,AL,1,1/22/2020,0


Unnamed: 0,ï»¿countyFIPS,County Name,State,stateFIPS,variable,value
0,0,Statewide Unallocated,AL,1,1/22/2020,0
1,1001,Autauga County,AL,1,1/22/2020,0
2,1003,Baldwin County,AL,1,1/22/2020,0
3,1009,Blount County,AL,1,1/22/2020,0
4,1013,Butler County,AL,1,1/22/2020,0
5,1015,Calhoun County,AL,1,1/22/2020,0
6,1017,Chambers County,AL,1,1/22/2020,0
7,1019,Cherokee County,AL,1,1/22/2020,0
8,1021,Chilton County,AL,1,1/22/2020,0
9,1027,Clay County,AL,1,1/22/2020,0


In [None]:
confirmed_df

In [76]:
deaths_df = pd.read_csv(StringIO(requests.get(DEATHS_URL).text))
deaths_df
#deaths_df = deaths_df.melt(id_vars=["ï»¿countyFIPS", "County Name", "State", "stateFIPS"])
#deaths_df = deaths_df.rename(columns={"ï»¿countyFIPS": "countyFIPS", "variable": "Date", "value": "Deaths", "County Name": "County"})
#deaths_df["stateName"] = [us_state_full.get(x, None) for x in deaths_df["State"]]

Unnamed: 0,ï»¿countyFIPS,County Name,State,stateFIPS,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,...,3/16/2020,3/17/2020,3/18/2020,3/19/2020,3/20/2020,3/21/2020,3/22/2020,3/23/2020,3/24/2020,3/25/2020
0,0.0,Statewide Unallocated,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1001.0,Autauga County,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1003.0,Baldwin County,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1009.0,Blount County,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1013.0,Butler County,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,1015.0,Calhoun County,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,1017.0,Chambers County,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,1019.0,Cherokee County,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,1021.0,Chilton County,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,1027.0,Clay County,AL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [23]:
deaths_df

Unnamed: 0,countyFIPS,County,State,stateFIPS,Date,Deaths,stateName
0,0,Statewide Unallocated,AL,1,1/22/2020,0,Alabama
1,1003,Baldwin County,AL,1,1/22/2020,0,Alabama
2,1015,Calhoun County,AL,1,1/22/2020,0,Alabama
3,1017,Chambers County,AL,1,1/22/2020,0,Alabama
4,1043,Cullman County,AL,1,1/22/2020,0,Alabama
5,1051,Elmore County,AL,1,1/22/2020,0,Alabama
6,1059,Franklin County,AL,1,1/22/2020,0,Alabama
7,1069,Houston County,AL,1,1/22/2020,0,Alabama
8,1071,Jackson County,AL,1,1/22/2020,0,Alabama
9,1073,Jefferson County,AL,1,1/22/2020,0,Alabama


### Get auxillary state & county-level data

We only need to do this once as the data doesn't update until 5/29.

In [24]:
# Closest estimates we have are from 2018
STATE_POPULATION_URL = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-popchg2010_2019.csv?#"
COUNTY_POPULATION_URL = "https://www.ers.usda.gov/webdocs/DataFiles/48747/PopulationEstimates.csv?v=3011.3"
COUNTY_UNEMPLOYMENT_URL = "https://www.ers.usda.gov/webdocs/DataFiles/48747/Unemployment.csv?v=2564.4"

In [25]:
state_population = StringIO(requests.get(STATE_POPULATION_URL).text)
state_population_df = pd.read_csv(
    state_population, usecols=["STATE", "NAME", "POPESTIMATE2019"]) \
        .rename(columns={"POPESTIMATE2019": "Population (2019 Estimate)", "NAME": "stateName", "STATE": "stateFIPS"}) \
        .set_index("stateFIPS")

In [26]:
state_population_df["State"] = [us_state_abbrev.get(x, None) for x in state_population_df["stateName"]]

In [27]:
state_population_df = state_population_df[state_population_df["State"].notnull()]

In [28]:
# cleaned and normalized
state_population_df

Unnamed: 0_level_0,stateName,Population (2019 Estimate),State
stateFIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Alabama,4903185,AL
2,Alaska,731545,AK
4,Arizona,7278717,AZ
5,Arkansas,3017804,AR
6,California,39512223,CA
8,Colorado,5758736,CO
9,Connecticut,3565287,CT
10,Delaware,973764,DE
11,District of Columbia,705749,DC
12,Florida,21477737,FL


In [29]:
county_population = StringIO(requests.get(COUNTY_POPULATION_URL).text)
county_population_df = pd.read_csv(
    county_population, usecols=["FIPS", "State", "Area_Name", "POP_ESTIMATE_2018"]) \
    .rename(columns={"POP_ESTIMATE_2018": "Population (2018 Estimate)", "Area_Name": "County", "FIPS": "countyFIPS"}) \
    .set_index("countyFIPS")

In [30]:
county_population_df["stateName"] = [us_state_full.get(x, None) for x in county_population_df["State"]]

In [31]:
county_population_df = county_population_df[county_population_df["stateName"].notnull()]

In [32]:
county_population_df["Population (2018 Estimate)"] = pd.to_numeric(county_population_df["Population (2018 Estimate)"].str.replace(",","").astype(float))

In [34]:
# Fold NYC's counties into one value under FIPS 36061, which belongs to Manhattan
nyc_counties = county_population_df[county_population_df["County"].isin(["Kings County", "Queens County", "New York County", "Bronx County", "Richmond County"])]

In [35]:
nyc_counties = nyc_counties[nyc_counties["State"] == "NY"]

In [36]:
nyc_pop = nyc_counties["Population (2018 Estimate)"].sum()

In [37]:
nyc_population = pd.DataFrame([{
    "countyFIPS": 36061,
    "State": "NY",
    "County": "New York City",
    "Population (2018 Estimate)": nyc_pop,
    "stateName": "New York"
}]).set_index("countyFIPS")

In [38]:
county_population_df = county_population_df.drop([36005, 36047, 36081, 36085])

In [39]:
county_population_df = county_population_df.append(nyc_population, sort=True)

In [40]:
county_population_df = county_population_df.reset_index()

In [41]:
county_unemployment = StringIO(requests.get(COUNTY_UNEMPLOYMENT_URL).text)
county_unemployment_df = pd.read_csv(
    county_unemployment, usecols=["FIPS", "State", "Area_name", "Unemployment_rate_2018", "Median_Household_Income_2018", "Civilian_labor_force_2018", "Employed_2018", "Unemployed_2018"]) \
    .rename(columns={
            "Unemployment_rate_2018": "Unemployment Rate (2018)",
            "Civilian_labor_force_2018": "Civilian Labor Force (2018)",
            "Employed_2018": "Employed (2018)",
            "Unemployed_2018": "Unemployed (2018)",
            "Median_Household_Income_2018": "Median Household Income (2018)",
            "Area_name": "County",
            "FIPS": "countyFIPS"
        }) \
    .set_index("countyFIPS")

In [42]:
for col in ["Civilian Labor Force (2018)", "Employed (2018)", "Unemployed (2018)"]:
    county_unemployment_df[col] =  pd.to_numeric(county_unemployment_df[col].str.replace(",","").astype(float))

In [43]:
county_unemployment_df["Median Household Income (2018)"] =  pd.to_numeric([None if str(x) == "nan" else str(x).replace(",","").replace("$","") for x in county_unemployment_df["Median Household Income (2018)"]])

In [45]:
# Fold NYC's counties into one value under FIPS 36061, which belongs to Manhattan
nyc_counties_meta = county_unemployment_df[county_unemployment_df["County"].isin(["Kings County, NY", "Queens County, NY", "New York County, NY", "Bronx County, NY", "Richmond County, NY"])]

In [46]:
nyc_counties_meta = nyc_counties_meta[nyc_counties_meta["State"] == "NY"]

In [48]:
nyc_meta_all = pd.DataFrame([{
    "countyFIPS": 36061,
    "State": "NY",
    "County": "New York City",
    "Civilian Labor Force (2018)": nyc_counties_meta["Civilian Labor Force (2018)"].sum(),
    "Employed (2018)": nyc_counties_meta["Employed (2018)"].sum(),
    "Unemployed (2018)": nyc_counties_meta["Unemployed (2018)"].sum(),
    "Unemployment Rate (2018)": nyc_counties_meta["Unemployment Rate (2018)"].sum(),
    "Median Household Income (2018)": nyc_counties_meta["Median Household Income (2018)"].sum(),
    "stateName": "New York"
}]).set_index("countyFIPS")

In [49]:
county_unemployment_df = county_unemployment_df.drop([36005, 36047, 36081, 36085])
county_unemployment_df = county_unemployment_df.append(nyc_meta_all, sort=True).reset_index()

### Join together all datasets

In [50]:
state_covid_with_population = state_data.merge(state_population_df[["Population (2019 Estimate)", "State"]], on="State")

In [63]:
isinstance(state_covid_with_population["date"].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype)

False

In [52]:
perspective.PerspectiveWidget(
    state_covid_with_population,
    plugin="xy_line",
    columns=["date", "positive % total tested"],
    row_pivots=["date"],
    column_pivots=["stateName"],
    filters=[["positive % total tested", "<", 100], ["total", ">", 1000]],
    computed_columns=['"positive" % "total" as "positive % total tested"']
)

PerspectiveWidget(column_pivots=['stateName'], columns=['positive % total tested', 'positive', 'total'], compu…

In [53]:
perspective.PerspectiveWidget(
    state_covid_with_population,
    columns=["positive % total tested"],
    plugin="y_line",
    row_pivots=["date"],
    column_pivots=["stateName"],
    filters=[["positive % total tested", "<", 100]],
    computed_columns=['"positive" % "total" as "positive % total tested"']
)

PerspectiveWidget(column_pivots=['stateName'], columns=['positive % total tested'], computed_columns=['"positi…

In [67]:
# Join together county deaths + confirmed dataset
county_covid_confirmed_with_deaths = confirmed_df.join(deaths_df["Deaths"])

In [68]:
county_covid_confirmed_with_deaths

Unnamed: 0,countyFIPS,County,State,stateFIPS,Date,Confirmed,stateName,Deaths
0,0,Statewide Unallocated,AL,1,1/22/2020,0,Alabama,0
1,1003,Baldwin County,AL,1,1/22/2020,0,Alabama,0
2,1015,Calhoun County,AL,1,1/22/2020,0,Alabama,0
3,1017,Chambers County,AL,1,1/22/2020,0,Alabama,0
4,1043,Cullman County,AL,1,1/22/2020,0,Alabama,0
5,1051,Elmore County,AL,1,1/22/2020,0,Alabama,0
6,1059,Franklin County,AL,1,1/22/2020,0,Alabama,0
7,1069,Houston County,AL,1,1/22/2020,0,Alabama,0
8,1071,Jackson County,AL,1,1/22/2020,0,Alabama,0
9,1073,Jefferson County,AL,1,1/22/2020,0,Alabama,0


In [69]:
county = perspective.Table(county_covid_confirmed_with_deaths)

In [53]:
perspective.PerspectiveWidget(
    county,
    aggregates={
        "Death Rate (%)": "last by index",
        "Deaths": "high",
        "Confirmed": "high",
        "stateName": "unique",
        "State": "unique",
        "County": "unique"
    },
    columns=["Death Rate (%)", "Deaths", "Confirmed", "stateName"],
    row_pivots=["County"],
    sort=[["Deaths", "desc"]],
    filters=[["County", "!=", "Statewide Unallocated"]],
    computed_columns=['"Deaths" % "Confirmed" as "Death Rate (%)"']
)

PerspectiveWidget(aggregates={'Death Rate (%)': 'last by index', 'Deaths': 'high', 'Confirmed': 'high', 'state…

In [70]:
county_covid_with_metadata = county_covid_confirmed_with_deaths \
    .merge(county_population_df[["countyFIPS", "Population (2018 Estimate)"]], on="countyFIPS") \
    .merge(county_unemployment_df[["countyFIPS", "Unemployment Rate (2018)", "Unemployed (2018)", "Employed (2018)", "Civilian Labor Force (2018)", "Median Household Income (2018)"]], on="countyFIPS") \

In [55]:
county_covid_with_metadata.dtypes

countyFIPS                          int64
County                             object
State                              object
stateFIPS                           int64
Date                               object
Confirmed                           int64
stateName                          object
Deaths                              int64
Population (2018 Estimate)        float64
Unemployment Rate (2018)          float64
Unemployed (2018)                 float64
Employed (2018)                   float64
Civilian Labor Force (2018)       float64
Median Household Income (2018)    float64
dtype: object

In [65]:
county_with_metadata = perspective.Table(county_covid_with_metadata)

NameError: name 'county_covid_with_metadata' is not defined

In [64]:
perspective.PerspectiveWidget(
    county_with_metadata,
    aggregates={
        "Death Rate (%)": "last by index",
        "Deaths": "high",
        "Confirmed": "high",
        "stateName": "unique",
        "State": "unique",
        "County": "unique",
        "Unemployment Rate (2018)": "high",
        "Unemployed (2018)": "high",
        "Employed (2018)": "high",
        "Civilian Labor Force (2018)": "high",
        "Population (2018 Estimate)": "high",
        "Median Household Income (2018)": "unique"
    },
    columns=["Death Rate (%)", "Deaths", "Confirmed", "stateName"],
    row_pivots=["County"],
    sort=[["Deaths", "desc"]],
    filters=[["County", "!=", "Statewide Unallocated"]],
    computed_columns=[
        '"Deaths" % "Confirmed" as "Death Rate (%)"',
        '"Confirmed" % "Population (2018 Estimate)" as "Confirmed % Population"'])

NameError: name 'county_with_metadata' is not defined

In [6]:
e

In [7]:
county = pd.read_csv(NYT_URL, error_bad_lines=False)

In [8]:
county

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0
5,2020-01-25,Orange,California,6059.0,1,0
6,2020-01-25,Cook,Illinois,17031.0,1,0
7,2020-01-25,Snohomish,Washington,53061.0,1,0
8,2020-01-26,Maricopa,Arizona,4013.0,1,0
9,2020-01-26,Los Angeles,California,6037.0,1,0


In [9]:
perspective.PerspectiveWidget(county)

PerspectiveWidget(columns=['index', 'date', 'county', 'state', 'fips', 'cases', 'deaths'])

In [12]:
county[county["state"] == "New York"]

Unnamed: 0,date,county,state,fips,cases,deaths
416,2020-03-01,New York City,New York,,1,0
448,2020-03-02,New York City,New York,,1,0
482,2020-03-03,New York City,New York,,2,0
518,2020-03-04,New York City,New York,,2,0
519,2020-03-04,Westchester,New York,36119.0,9,0
564,2020-03-05,Nassau,New York,36059.0,1,0
565,2020-03-05,New York City,New York,,4,0
566,2020-03-05,Westchester,New York,36119.0,17,0
626,2020-03-06,Nassau,New York,36059.0,4,0
627,2020-03-06,New York City,New York,,5,0
