In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
import plotly.io
plotly.io.renderers.default = 'notebook_connected'

In [3]:
SHOW_SET = set(('New Zealand', 'China', 'Italy', 'Spain', 'Germany', 'US', 'Korea, South', 'Japan', 'Canada', 'Australia', 'Israel', 'Ireland', 'France', 'United Kingdom'))
SERIES = ['confirmed', 'deaths', 'recovered']
URL_FORMAT = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_%s_global.csv'

In [4]:
cdf =pd.concat({series: pd.read_csv(URL_FORMAT%series).groupby('Country/Region').sum().iloc[:,2:] for series in SERIES}, names=['Series'])
the_dates = pd.to_datetime(cdf.columns, dayfirst=False)
cdf.set_axis(the_dates, axis=1, inplace=True)
cdf.columns.rename('date', inplace=True)

In [5]:
c30_idx = ['New Zealand'] + list(cdf.loc['confirmed', cdf.columns[-1]].nlargest(30).index.values)

In [6]:
cdf.loc['confirmed'].loc[c30_idx].pct_change(1, axis=1).iloc[:,-10:]

date,2020-03-20,2020-03-21,2020-03-22,2020-03-23,2020-03-24,2020-03-25,2020-03-26,2020-03-27,2020-03-28,2020-03-29
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
New Zealand,0.392857,0.333333,0.961538,0.0,0.519608,0.322581,0.380488,0.300353,0.225543,0.13969
US,0.396505,0.334503,0.305504,0.317676,0.225625,0.224004,0.274529,0.21257,0.194979,0.159766
Italy,0.145875,0.139448,0.103774,0.08098,0.082109,0.075315,0.083389,0.073323,0.069065,0.056417
China,0.001158,0.000677,0.001599,0.000774,0.001141,0.000858,0.001482,0.001406,0.001245,0.0015
Spain,0.136224,0.243214,0.133759,0.221357,0.135161,0.241444,0.16704,0.137282,0.114366,0.093876
Germany,0.295561,0.119156,0.11975,0.168174,0.135256,0.13148,0.177237,0.157791,0.134143,0.076263
France,0.16299,0.133642,0.123073,0.238872,0.124186,0.131642,0.154336,0.130317,0.1408,0.068311
Iran,0.067203,0.049175,0.049879,0.065209,0.076446,0.088912,0.088426,0.099504,0.095138,0.081931
United Kingdom,0.477909,0.262332,0.133807,0.170757,0.213797,0.180794,0.225311,0.248307,0.174093,0.14256
Switzerland,0.299141,0.241972,0.13673,0.176746,0.123024,0.10327,0.083876,0.094573,0.0888,0.053495


In [7]:
cdf.loc[('confirmed','New Zealand'), '2020-03-23'] = 112

In [8]:
flat_df = cdf.stack().unstack('Series').loc[c30_idx]
flat_df = flat_df.assign(active=flat_df.confirmed-flat_df.deaths-flat_df.recovered)

In [9]:
def gen_rates(df, days_ago,names):
    today = df.iloc[:,-1]
    return pd.DataFrame({name:today/pair[1] for name,pair in zip(names, df.iloc[:, -1 - np.array(days_ago)].items())})

In [10]:
numdays_series = (cdf.loc['confirmed']>0).sum(1)
has10days = numdays_series.index[numdays_series>=10]


In [11]:
rates_all = gen_rates(cdf.loc['confirmed'],[1,3,7,14],['1 day', '3 day', '1 week', '2 week'])
rates = rates_all.loc[has10days]

In [12]:
def set_widths(figure, factor_series):
    for trace in figure.data:
        country = trace['name']
        factor = factor_series.loc[country]
        trace['hovertemplate'] = trace['hovertemplate'] + (''.join('<br>Scaling - %s=%.1fx' % kv for kv in factor.items()))
        if country not in SHOW_SET:
            trace['visible'] = 'legendonly'
        trace['line'].update(width=np.log2(factor['1 week'])+1)
    return figure

FIGURE_KW = dict(facet_col_wrap=2, height=900, labels={'0':'count'}, log_y=True, color='Country/Region', line_dash='Country/Region')
    
def coronavirus_figure(xdf, countries, today, factors, y=0, **kw):
    title='Coronavirus case data for NZ & the 30 countries with the most confirmed cases for %s' % today
    return set_widths(px.line(xdf, category_orders={'Country/Region':list(countries)}, x='date', y=y, title=title, **dict(FIGURE_KW, **kw)), factors)
    

In [13]:
rates.nlargest(20, '3 day')

Unnamed: 0_level_0,1 day,3 day,1 week,2 week
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Congo (Brazzaville),4.75,4.75,6.333333,19.0
Guinea,2.0,4.0,8.0,16.0
Saint Lucia,3.0,3.0,4.5,4.5
Kazakhstan,1.245614,2.558559,4.733333,31.555556
Turkey,1.245204,2.539818,7.45712,1536.166667
Ukraine,1.33427,2.423469,6.506849,158.333333
Zimbabwe,1.0,2.333333,2.333333,inf
Honduras,1.157895,2.115385,4.230769,36.666667
Cuba,1.168067,2.074627,3.971429,34.75
Philippines,1.31907,2.005658,3.731579,10.128571


In [14]:
px.bar(rates.loc[c30_idx].unstack().reset_index(), barmode='group', color='Country/Region', x='level_0', y=0, log_y=True)

In [15]:
xdf = flat_df.stack().reset_index()
fig= coronavirus_figure(xdf, c30_idx, the_dates[-1].strftime('%d %B'), rates, facet_col='Series')
fig

In [16]:
xdf = flat_df.reset_index()
fig=coronavirus_figure(xdf, c30_idx, the_dates[-1].strftime('%d %B'), rates, y='confirmed', hover_data=['deaths', 'recovered', 'active'])
fig.write_html('../Coronavirus_plot.html', include_plotlyjs='cdn')
fig

In [17]:
nz_df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vQCN9pL21lGy3XPBhKwMX7jT1_SG-Sb_4ZWZ1I0Ctd-0vNhtmH4gFKaLsV5jhz4vSjYlQ9NR_fXF_b6/pub?gid=0&single=true&output=csv', index_col='Date', parse_dates=True, header=1)
nz_df = nz_df[~nz_df.isna().all(1)]

In [18]:
nz_df['Total Cases']

Date
2020-03-13      7.0
2020-03-14      8.0
2020-03-15     10.0
2020-03-16     10.0
2020-03-17     13.0
2020-03-18     23.0
2020-03-19     31.0
2020-03-20     42.0
2020-03-21     56.0
2020-03-22     60.7
2020-03-23    112.0
2020-03-24    155.0
2020-03-25    205.0
2020-03-26    283.0
2020-03-27    368.0
2020-03-28    451.0
2020-03-29    514.0
2020-03-30    589.0
Name: Total Cases, dtype: float64

In [19]:
cdf.loc[('confirmed', 'New Zealand'), '2020-02-24':]

date
2020-02-24      0
2020-02-25      0
2020-02-26      0
2020-02-27      0
2020-02-28      1
2020-02-29      1
2020-03-01      1
2020-03-02      1
2020-03-03      1
2020-03-04      3
2020-03-05      3
2020-03-06      4
2020-03-07      5
2020-03-08      5
2020-03-09      5
2020-03-10      5
2020-03-11      5
2020-03-12      5
2020-03-13      5
2020-03-14      6
2020-03-15      8
2020-03-16      8
2020-03-17     12
2020-03-18     20
2020-03-19     28
2020-03-20     39
2020-03-21     52
2020-03-22    102
2020-03-23    112
2020-03-24    155
2020-03-25    205
2020-03-26    283
2020-03-27    368
2020-03-28    451
2020-03-29    514
Name: (confirmed, New Zealand), dtype: int64

In [20]:
df = pd.read_excel('https://www.health.govt.nz/system/files/documents/pages/covid-19-confirmed-cases-29mar20.xlsx', header=3)
df2 = pd.read_excel('https://www.health.govt.nz/system/files/documents/pages/covid-19-probable-cases-29mar20.xlsx', header=3)

HTTPError: HTTP Error 404: Not Found

In [None]:
df2.tail(15)

In [None]:
cdf.head()

In [None]:
df2.groupby(['Report Date', 'DHB']).agg(c=('DHB', 'count')).c

In [29]:
datestr='30_mar_2020'

In [28]:
'https://www.health.govt.nz/system/files/documents/pages/covid-cases-%s.xlsx' %datestr

'https://www.health.govt.nz/system/files/documents/pages/covid-cases-30_mar_20.xlsx'

In [None]:
https://www.health.govt.nz/system/files/documents/pages/covid-cases-30_mar_2020.xlsx

In [54]:
with pd.ExcelFile('https://www.health.govt.nz/system/files/documents/pages/covid-cases-%s.xlsx' %datestr) as excel_file:
    cases_dfs = {name: excel_file.parse(name, header=3) for name in ['confirmed', 'probable']}

In [63]:
df = cases_dfs['confirmed']
df2 = cases_dfs['probable'].rename(columns={'ReportDate': 'Report Date'})

In [64]:
df2

Unnamed: 0,Report Date,Sex,Age Group,DHB,Overseas,Last country before NZ,Flight no,Departure date,Arrival date
0,2020-03-30,Female,20 to 29,Counties Manukau,,,,NaT,NaT
1,2020-03-29,Male,60 to 69,Auckland,No,,,NaT,NaT
2,2020-03-29,Male,20 to 29,Capital and Coast,Yes,,,NaT,NaT
3,2020-03-29,Female,20 to 29,Waitemata,,,,NaT,NaT
4,2020-03-28,Female,40 to 49,Auckland,No,,,NaT,NaT
5,2020-03-28,Female,<1,Counties Manukau,,,,NaT,NaT
6,2020-03-27,Female,50 to 59,Auckland,No,,,NaT,NaT
7,2020-03-27,Female,30 to 39,Canterbury,Yes,United Kingdom,EK412,2020-03-18,2020-03-19
8,2020-03-27,Male,30 to 39,Canterbury,Yes,United Kingdom,EK412,2020-03-18,2020-03-19
9,2020-03-27,Male,20 to 29,Waitemata,,,,NaT,NaT


In [None]:
, names=['type'])

In [35]:
df = cases_df.loc['confirmed']
df2 = cases_df.loc['probable']

In [60]:
df2

Unnamed: 0,ReportDate,Sex,Age Group,DHB,Overseas,Last country before NZ,Flight no,Departure date,Arrival date
0,2020-03-30,Female,20 to 29,Counties Manukau,,,,NaT,NaT
1,2020-03-29,Male,60 to 69,Auckland,No,,,NaT,NaT
2,2020-03-29,Male,20 to 29,Capital and Coast,Yes,,,NaT,NaT
3,2020-03-29,Female,20 to 29,Waitemata,,,,NaT,NaT
4,2020-03-28,Female,40 to 49,Auckland,No,,,NaT,NaT
5,2020-03-28,Female,<1,Counties Manukau,,,,NaT,NaT
6,2020-03-27,Female,50 to 59,Auckland,No,,,NaT,NaT
7,2020-03-27,Female,30 to 39,Canterbury,Yes,United Kingdom,EK412,2020-03-18,2020-03-19
8,2020-03-27,Male,30 to 39,Canterbury,Yes,United Kingdom,EK412,2020-03-18,2020-03-19
9,2020-03-27,Male,20 to 29,Waitemata,,,,NaT,NaT


In [65]:
cdf = pd.concat({'confirmed':df.groupby(['Report Date', 'DHB']).agg(c=('DHB', 'count')).c, 'probable':df2.groupby(['Report Date', 'DHB']).agg(c=('DHB', 'count')).c}, axis=1, names=['type'])

In [44]:
cdf

Unnamed: 0_level_0,type,confirmed,probable
Report Date,DHB,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-26,Auckland,1,
2020-02-28,Southern,2,
2020-03-02,Waitemata,1,
2020-03-04,Counties Manukau,1,
2020-03-04,Waitemata,1,
...,...,...,...
2020-03-29,Nelson Marlborough,1,
2020-03-29,Northland,2,
2020-03-29,Southern,12,
2020-03-29,Waikato,13,


In [51]:
cdf[~cdf.probable.isna()]

Unnamed: 0_level_0,type,confirmed,probable
Report Date,DHB,Unnamed: 2_level_1,Unnamed: 3_level_1


In [37]:
totals_df = cdf.sum(1)

In [38]:
cdf.unstack('DHB')

type,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,...,probable,probable,probable,probable,probable,probable,probable,probable,probable,probable
DHB,Auckland,Bay of Plenty,Canterbury,Capital and Coast,Counties Manukau,Hawke's Bay,Hutt Valley,Lakes,MidCentral,Nelson Marlborough,...,Northland,South Canterbury,Southern,Tairawhiti,Taranaki,Waikato,Wairarapa,Waitemata,West Coast,Whanganui
Report Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-02-26,1.0,,,,,,,,,,...,,,,,,,,,,
2020-02-28,,,,,,,,,,,...,,,,,,,,,,
2020-03-02,,,,,,,,,,,...,,,,,,,,,,
2020-03-04,,,,,1.0,,,,,,...,,,,,,,,,,
2020-03-06,,,,,1.0,,,,,,...,,,,,,,,,,
2020-03-12,,,,,1.0,,,,,,...,,,,,,,,,,
2020-03-13,,,,,,,,,,,...,,,,,,,,,,
2020-03-14,,,,1.0,,,,,,,...,,,,,,,,,,
2020-03-15,,,,,,,,,,,...,,,,,,,,,,
2020-03-16,1.0,,,2.0,,,,,,,...,,,,,,,,,,


In [None]:
tdf = cdf.unstack('DHB').fillna(0).cumsum().stack('DHB')





In [None]:
tdf = tdf.assign(total = tdf.sum(1)).reset_index()
tdf = tdf[tdf.total>0]


In [None]:
tdf

In [None]:
df.tail(30)

In [None]:
df[df.DHB=='Auckland']

In [None]:
cdf['confirmed'].loc[('','Auckland')]

In [47]:
cdf.unstack('DHB').index

DatetimeIndex(['2020-02-26', '2020-02-28', '2020-03-02', '2020-03-04',
               '2020-03-06', '2020-03-12', '2020-03-13', '2020-03-14',
               '2020-03-15', '2020-03-16', '2020-03-17', '2020-03-18',
               '2020-03-19', '2020-03-20', '2020-03-21', '2020-03-22',
               '2020-03-23', '2020-03-24', '2020-03-25', '2020-03-26',
               '2020-03-27', '2020-03-28', '2020-03-29'],
              dtype='datetime64[ns]', name='Report Date', freq=None)

In [66]:
xdf = cdf.unstack('DHB').fillna(0).cumsum()
xdf = xdf.stack(['DHB', 'type'])
xdf = xdf[xdf>0].unstack('type')
sdf = xdf.sum(0, level='Report Date').assign(DHB='total')

In [67]:
xdf = xdf.append(sdf.set_index('DHB', append=True), sort=True)
xdf = xdf.assign(total=xdf.sum(1)).reset_index().fillna(0)

In [41]:
xdf[xdf.total==0]

type,Report Date,DHB,confirmed,total


In [50]:
xdf

type,Report Date,DHB,confirmed,total
0,2020-02-26,Auckland,1.0,1.0
1,2020-02-28,Auckland,1.0,1.0
2,2020-02-28,Southern,2.0,2.0
3,2020-03-02,Auckland,1.0,1.0
4,2020-03-02,Southern,2.0,2.0
...,...,...,...,...
272,2020-03-25,total,297.0,297.0
273,2020-03-26,total,366.0,366.0
274,2020-03-27,total,433.0,433.0
275,2020-03-28,total,495.0,495.0


In [None]:
cum_curves_df = cdf.unstack('DHB').cumsum().stack('DHB').reset_index()

In [None]:
cdf.reset_index('DHB').index

In [68]:
px.line(xdf, x='Report Date', y='total', color='DHB', hover_data=['confirmed', 'probable'], log_y=True)

In [None]:
DHBS = ['']

In [None]:
df.groupby('DHB').count()

In [None]:
cdf.groupby(['Report Date', 'DHB']).agg(count=('DHB', 'count')).cumsum()

In [None]:
px.bar(cdf.groupby(['Report Date', 'DHB']).agg(count=('DHB', 'count')).reset_index(), x='Report Date', y='count', color='DHB')

In [None]:
df2.groupby('Report Date').agg(count=('DHB', 'count'))

In [None]:
df2.groupby('DHB').count()