In [10]:
import pandas as pd
import numpy as np

base_url   = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/'
global_url = base_url + 'csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
us_url     = base_url + 'csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'

world = pd.read_csv(global_url)
us    = pd.read_csv(us_url)

Let's look first at what's going on in Canada. The global cases files don't have separate national and province level totals, the national is calculated as the sum of the provinces, so something definitely weird going on there.

In [8]:
# find all columns that look like m/d/yy dates
days = list(filter(lambda x: len(x.split('/')) == 3, world.columns))
today = days[-1]
last_week = days[-7:]

world[world['Country/Region']=='Canada'][['Province/State'] + last_week]

Unnamed: 0,Province/State,4/16/20,4/17/20,4/18/20,4/19/20,4/20/20,4/21/20,4/22/20
35,Alberta,1996,2397,2562,2803,2908,3095,3401
36,British Columbia,1561,1575,1618,1647,1647,1724,1795
37,Grand Princess,13,13,13,13,13,13,-1
38,Manitoba,250,250,253,254,254,255,257
39,New Brunswick,117,117,117,118,118,118,118
40,Newfoundland and Labrador,252,256,257,257,257,257,256
41,Nova Scotia,579,606,649,675,721,737,772
42,Ontario,9840,10456,11013,11561,12063,12715,13718
43,Prince Edward Island,26,26,26,26,26,26,26
44,Quebec,15857,16798,17521,17950,19319,20126,20965


Maybe they're using "-1" for missing? There's nothing in the docs about it, but that would make sense. I can convert those to NaN easily enough.

### subnational US data ###

Here's a quick glance at the negative "new cases" data in the US and what that looks like. The subnational US case data come from different files than the global data, so it's entirely possible the subnational and national totals aren't consistent.

In [17]:
days = list(filter(lambda x: len(x.split('/')) == 3, us.columns))
today = days[-1]
last_2_weeks = days[-14:]

states = us.groupby('Province_State').sum()[last_2_weeks]

# calculate new cases as difference from previous day
nc = states.diff(axis=1)

# show states with negative case numbers in the last 2 weeks
nc[(nc<0).any(axis=1)]

Unnamed: 0_level_0,4/9/20,4/10/20,4/11/20,4/12/20,4/13/20,4/14/20,4/15/20,4/16/20,4/17/20,4/18/20,4/19/20,4/20/20,4/21/20,4/22/20
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Nevada,,266.0,-20.0,134.0,154.0,144.0,77.0,3.0,310.0,102.0,102.0,102.0,107.0,144.0
New Hampshire,,0.0,66.0,44.0,56.0,-63.0,217.0,0.0,148.0,55.0,48.0,57.0,43.0,98.0
Puerto Rico,,42.0,63.0,109.0,6.0,20.0,51.0,69.0,25.0,50.0,95.0,39.0,46.0,-46.0
South Carolina,,274.0,144.0,109.0,71.0,162.0,103.0,275.0,168.0,149.0,129.0,69.0,-7.0,322.0
Washington,,480.0,315.0,175.0,26.0,164.0,143.0,115.0,460.0,259.0,202.0,136.0,449.0,-512.0


In [19]:
# let's compare the new case totals from the global file to what we get
# if we aggregate the state-level data

days = list(filter(lambda x: len(x.split('/')) == 3, world.columns))
world[world['Country/Region']=='US'][days].diff(axis=1)

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,4/13/20,4/14/20,4/15/20,4/16/20,4/17/20,4/18/20,4/19/20,4/20/20,4/21/20,4/22/20
225,,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,...,25306.0,27051.0,28680.0,31242.0,32114.0,32491.0,26612.0,25517.0,27710.0,27639.0


In [22]:

days = list(filter(lambda x: len(x.split('/')) == 3, us.columns))
us.sum()[days].diff()

1/22/20      NaN
1/23/20        0
1/24/20        1
1/25/20        0
1/26/20        3
           ...  
4/18/20    32491
4/19/20    26612
4/20/20    25517
4/21/20    27710
4/22/20    27639
Length: 92, dtype: object

It looks like they match, which means if we treat the negative numbers as NaN then the sum of states will be more than the national total. Is this a problem for the tree map?