In [22]:
# full_country_list = list(recovered_df['Country/Region'].unique())
# recovered_df[recovered_df['Province/State'].notnull()]['Country/Region'].unique()

array(['Australia', 'China', 'Denmark', 'France', 'Netherlands',
       'United Kingdom'], dtype=object)

In [None]:
#Some useful functions
def check_latest_country_data(dataframe, country):
    """Return the 50 most recent rows recorded for one country.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that has at least a 'country' and a 'date' column.
    country : str
        Value to match exactly against the 'country' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows ordered by 'date', newest first, limited to the
        first 50 rows (fewer if the country has fewer rows).
    """
    country_rows = dataframe.loc[dataframe['country'] == country]
    newest_first = country_rows.sort_values(by='date', ascending=False)
    return newest_first[0:50]


## 1. Download time series of (1) daily deaths, (2) confirmed cases and (3) recovered cases per country (from the Johns Hopkins COVID-19 data GitHub repository)

In [4]:
import pandas as pd
import numpy as np
import bokeh

# Download the global deaths, confirmed and recovered time series from the raw CSV
# files of the GitHub repo and load each one into a Pandas dataframe
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
url2 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
url3 = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# Malformed lines are skipped instead of aborting the load. The keyword for that was
# renamed: error_bad_lines=False was deprecated in pandas 1.3 and removed in 2.0 in
# favour of on_bad_lines='skip', so pick whichever the installed pandas understands.
_pandas_major_minor = tuple(int(part) for part in pd.__version__.split('.')[:2])
if _pandas_major_minor >= (1, 3):
    _skip_bad_lines = {'on_bad_lines': 'skip'}
else:
    _skip_bad_lines = {'error_bad_lines': False}

deaths_df = pd.read_csv(url, **_skip_bad_lines)
confirmed_df = pd.read_csv(url2, **_skip_bad_lines)
recovered_df = pd.read_csv(url3, **_skip_bad_lines)

# Write daily deaths dataframe to CSV file (for reference)
# deaths_df.to_csv('time_series_covid19_deaths_global.csv', index = False, header=True)

## 2. Deal with deaths_df data

### 2. (a) Aggregate state-level rows for countries with negligible state data

In [2]:
# Countries that are kept at state/province level - China, Australia and Canada.
# Every other country is collapsed to a single row.
list_countries_with_state_info = ['China','Australia','Canada']

is_state_level_country = deaths_df['Country/Region'].isin(list_countries_with_state_info)

# (A) Aggregate (squash) the deaths of all remaining countries to one row per country.
# 'Province/State' is dropped before summing: it is a text column, and recent pandas
# versions no longer discard such columns silently inside sum(), which would make the
# insert() below fail because the column would still exist.
# 'Lat'/'Long' are summed along with the date columns (the sums are meaningless) and
# are removed again further down.
df_countries_without_state_info = (
    deaths_df[~is_state_level_country]
    .drop(columns=['Province/State'])
    .groupby('Country/Region')
    .sum()
    .reset_index()
)
df_countries_without_state_info.insert(1, 'Province/State', np.nan)

# (B) Rows of the countries that keep their state information
df_countries_with_state_info = deaths_df[is_state_level_country]

# Stack (B) on top of (A). pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0; columns are matched by name and keep the order of the first frame.
combined_df = pd.concat([df_countries_with_state_info, df_countries_without_state_info])
combined_df = combined_df.drop(["Lat", "Long"], axis=1)

# Every column that is not an identifier column is a date header
id_columns = ['Country/Region', 'Province/State']
date_headers = [column for column in combined_df.columns if column not in id_columns]

# Unpivot: one row per (country, state, date) with the cumulative count in 'value'
long_df = pd.melt(combined_df, id_vars=id_columns, value_vars=date_headers)

### 2. (b) Unpivot table

In [92]:
# Switch to short snake_case column names; errors="raise" makes a missing
# source column fail loudly instead of being skipped
new_column_names = {
    "Country/Region": "country",
    "Province/State": "state",
    "variable": "date",
    "value": "total_deaths",
}
long_df = long_df.rename(columns=new_column_names, errors="raise")

# Parse the date strings into datetime values
long_df['date'] = pd.to_datetime(long_df['date'])

# Add the daily-change column at position 4, initialised to 0
long_df.insert(loc=4, column='deaths_change', value=0)

# Distinct country names, in order of first appearance
full_country_list = long_df['country'].unique().tolist()

### 2. (c) Calculate daily change for each country

In [139]:
# Calculate deaths_change: the row-to-row difference of total_deaths within each
# time series. A series is one (country, state) pair; countries that were aggregated
# to a single row per date have a missing state, so the missing value is replaced by
# a placeholder key to keep those rows together in one group per country.
#
# This replaces a per-country / per-state Python loop that wrote results back with
# long_df.iloc[diff.index, 4]. That form used index labels as row positions, relied on
# deaths_change sitting at column position 4, wrote float/NaN values into an integer
# column, and rescanned the whole frame once per country.
# NOTE(review): rows of a state-level country whose state is missing were left at 0 by
# the loop; here they are differenced like any other series.
series_keys = [long_df['country'], long_df['state'].fillna('')]

# diff() follows the existing row order inside each group - this assumes each series
# is already in chronological order (TODO confirm if long_df is ever re-sorted).
daily_change = long_df.groupby(series_keys, sort=False)['total_deaths'].diff()

# Assign by column name; the first row of every series has no predecessor and is NaN
long_df['deaths_change'] = daily_change

# Remove NaN values from dataframe
# NOTE(review): this fills every column, so a missing 'state' also becomes 0.
long_df = long_df.fillna(0)

In [None]:
# Write long_df to a CSV file (relative path), with a header row and without the index column
long_df.to_csv('time_series_covid19_deaths_global.csv', index = False, header=True)

## 3. Select the Countries with the Highest Covid-19 Deaths (plus Australia) for Plotting

In [8]:
# Find latest date
latest_date = long_df['date'].max()

# Rank COUNTRIES by their total deaths on the latest date. Some countries are stored as
# one row per state, so the rows are summed per country first; ranking the raw rows
# could list the same country several times (once per state).
country_totals = (
    long_df[long_df['date'] == latest_date]
    .groupby('country', as_index=False)['total_deaths']
    .sum()
    .sort_values(by='total_deaths', ascending=False)
)

# Keep the 19 highest countries other than Australia ...
latest_deaths_df = country_totals[country_totals['country'] != 'Australia'][0:19]

# ... and always add Australia, so the list holds 20 distinct names (19 + Australia)
# and Australia can never be listed twice.
grp_list = list(latest_deaths_df['country'])
grp_list.append('Australia')

In [10]:
# Keep only the rows whose country is one of the names in grp_list
in_selected_countries = long_df['country'].isin(grp_list)
grp_df = long_df[in_selected_countries]

## 5. Plot US, Australia and Malaysia Graphs of Total Deaths

In [7]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.palettes import Category20

def _rows_with_deaths(country_name):
    """Return the long_df rows of one country that have total_deaths above zero."""
    country_rows = long_df[long_df['country'] == country_name]
    return country_rows[country_rows['total_deaths'] > 0]


# One frame per plotted country, starting from its first recorded death
us_df = _rows_with_deaths('US')
aus_df = _rows_with_deaths('Australia')
my_df = _rows_with_deaths('Malaysia')


In [8]:
# US cumulative deaths: pull the two plotted columns out as numpy arrays
dates_array = np.array(us_df['date'], dtype=np.datetime64)
deaths_array = np.array(us_df['total_deaths'])

window_size = 30

# Render into a standalone HTML page
output_file("total_US_deaths.html", title="Timeline of US Covid-19 Deaths")

# Figure with a datetime x axis; title and axis labels are given at construction
p = figure(
    title="Timeline of US Covid-19 Deaths",
    x_axis_label='Date',
    y_axis_label='Total Deaths',
    x_axis_type="datetime",
    plot_width=700,
    plot_height=400,
    sizing_mode="scale_both",
)

# Single line renderer for the cumulative total
p.line(dates_array, deaths_array, color='navy', legend_label='total deaths', line_width=2)

# Legend placement and grid styling (light grey horizontal bands)
p.legend.location = "top_left"
p.grid.grid_line_alpha = 1
p.ygrid.band_fill_color = "grey"
p.ygrid.band_fill_alpha = 0.05

# Write the HTML file and open the plot
show(p)

In [9]:
# Australia cumulative deaths: pull the two plotted columns out as numpy arrays
dates_array = np.array(aus_df['date'], dtype=np.datetime64)
deaths_array = np.array(aus_df['total_deaths'])

window_size = 30

# Render into a standalone HTML page
output_file("total_Australia_deaths.html", title="Timeline of Australia Covid-19 Deaths")

# Figure with a datetime x axis; title and axis labels are given at construction
p = figure(
    title="Timeline of Australia Covid-19 Deaths",
    x_axis_label='Date',
    y_axis_label='Total Deaths',
    x_axis_type="datetime",
    plot_width=700,
    plot_height=400,
    sizing_mode="scale_both",
)

# Single line renderer for the cumulative total
p.line(dates_array, deaths_array, color='navy', legend_label='total deaths', line_width=2)

# Legend placement and grid styling (light grey horizontal bands)
p.legend.location = "top_left"
p.grid.grid_line_alpha = 1
p.ygrid.band_fill_color = "grey"
p.ygrid.band_fill_alpha = 0.05

# Write the HTML file and open the plot
show(p)

In [10]:
# Malaysia cumulative deaths: pull the two plotted columns out as numpy arrays
dates_array = np.array(my_df['date'], dtype=np.datetime64)
deaths_array = np.array(my_df['total_deaths'])

window_size = 30

# Render into a standalone HTML page
output_file("total_Malaysia_deaths.html", title="Timeline of Malaysia Covid-19 Deaths")

# Figure with a datetime x axis; title and axis labels are given at construction
p = figure(
    title="Timeline of Malaysia Covid-19 Deaths",
    x_axis_label='Date',
    y_axis_label='Total Deaths',
    x_axis_type="datetime",
    plot_width=700,
    plot_height=400,
    sizing_mode="scale_both",
)

# Single line renderer for the cumulative total
p.line(dates_array, deaths_array, color='navy', legend_label='total deaths', line_width=2)

# Legend placement and grid styling (light grey horizontal bands)
p.legend.location = "top_left"
p.grid.grid_line_alpha = 1
p.ygrid.band_fill_color = "grey"
p.ygrid.band_fill_alpha = 0.05

# Write the HTML file and open the plot
show(p)

## 4. Plot Top 20 Countries Total Deaths 

In [11]:
# Bokeh multi_line chart: one cumulative-deaths line per country in grp_list.
# The rows of each country are summed per date first, so a country that is stored as
# several state rows still gives exactly one point per date instead of a zig-zag line
# through all of its states.
country_series = [
    grp_df.loc[grp_df.country == name].groupby('date')['total_deaths'].sum()
    for name in grp_list
]
xs = [series.index.values for series in country_series]
ys = [series.values for series in country_series]

# Every column of a ColumnDataSource must have the same length, so the colours are
# taken from the 20-colour palette cyclically, one per entry of grp_list.
palette = Category20[20]
line_colors = [palette[position % len(palette)] for position in range(len(grp_list))]

source = ColumnDataSource(data=dict(
     x = xs,
     y = ys,
     color = line_colors,
     group = grp_list))
p3 = figure(plot_width=700, plot_height=500, x_axis_type="datetime", sizing_mode="scale_both")

# legend_field is the current keyword for "use this source column for the legend";
# the older legend= keyword is deprecated.
p3.multi_line(
     xs='x',
     ys='y',
     legend_field='group',
     source=source,
     line_color='color',
     line_width=3)

p3.title.text = "Top 20 Countries + Australia Covid-19 Total Deaths (cumulative) "
p3.legend.location = "top_left"
p3.xaxis.axis_label = 'Date'
p3.yaxis.axis_label = 'Total Deaths'

# Write the chart to a standalone HTML file and open it
output_file("top20_total_covid_deaths.html", title="Top 20 Countries + Australia Covid-19 Total Deaths (cumulative) ")
show(p3)



## 6. Plot Top 20 Countries Daily Deaths

In [12]:
# Bokeh multi_line chart: one daily-deaths line per country in grp_list.
# The rows of each country are summed per date first, so a country that is stored as
# several state rows still gives exactly one point per date instead of a zig-zag line
# through all of its states.
country_series = [
    grp_df.loc[grp_df.country == name].groupby('date')['deaths_change'].sum()
    for name in grp_list
]
xs = [series.index.values for series in country_series]
ys = [series.values for series in country_series]

# Every column of a ColumnDataSource must have the same length, so the colours are
# taken from the 20-colour palette cyclically, one per entry of grp_list.
palette = Category20[20]
line_colors = [palette[position % len(palette)] for position in range(len(grp_list))]

source = ColumnDataSource(data=dict(
     x = xs,
     y = ys,
     color = line_colors,
     group = grp_list))
p3 = figure(plot_width=700, plot_height=500, x_axis_type="datetime", sizing_mode="scale_both")

# legend_field is the current keyword for "use this source column for the legend";
# the older legend= keyword is deprecated.
p3.multi_line(
     xs='x',
     ys='y',
     legend_field='group',
     source=source,
     line_color='color',
     line_width=3)

p3.title.text = "Top 20 Countries + Australia Covid-19 Daily Deaths (rate of change) "
p3.legend.location = "top_left"
p3.xaxis.axis_label = 'Date'
p3.yaxis.axis_label = 'Daily Deaths'

# Write the chart to a standalone HTML file and open it
output_file("top20_daily_covid_deaths.html", title="Top 20 Countries + Australia Covid-19 Daily Deaths (rate of change)")
show(p3)



In [13]:
# Display grp_df (the rows used for the top-countries plots) as the cell output
grp_df

Unnamed: 0,country,date,total_deaths,deaths_change
8,Australia,2020-01-22,0,0.0
16,Belgium,2020-01-22,0,0.0
23,Brazil,2020-01-22,0,0.0
32,Canada,2020-01-22,0,0.0
35,Chile,2020-01-22,0,0.0
...,...,...,...,...
36236,Russia,2020-08-01,14034,95.0
36252,South Africa,2020-08-01,8153,148.0
36254,Spain,2020-08-01,28445,0.0
36270,US,2020-08-01,154447,1133.0
