In [1]:
# --- Run this cell to initialize data after doing a git pull on the COVID-19 subdirectory ---

import csv
import pandas as pd
import altair as alt

# show charts inline
alt.renderers.enable('default') #note: you may have to change 'default' to 'notebook' depending on your configuration


# --- Data-handling functions and classes ---
    
# Countries/regions will be defined using this class
# Examples of how to call:
#    singapore = Region('Singapore', 5639000, 'Singapore')
#    hongkong = Region('Hong Kong', 7451000, 'China', ['Hong Kong'])
#    china = Region('China (Mainland)', 1427647786, 'China', ['Hong Kong'], True)  # mainland
#    spain = Region('Spain', 46940000, 'Spain')
#    uk = Region('UK', 66650000, 'United Kingdom', [''])  # UK main islands only
class Region:
    # name: The name that we want to display in charts: 'China (Mainland)'
    # population: The region's population: 1427647786
    # csse_col2: The country name as specified in the CSSE tables, column 2: 'China'
    # csse_col1: (optional, must specify as list) The region name as specified in the CSSE tables, column 1: ['Hong Kong']
    # csse_col1_excl: (optional) If set to True, this will exclude the column 1 region(s), for countries that
    #     have irrelevant regions, e.g. Hong Kong for Mainland China and Virgin Islands for UK: True
    def __init__(self, name: str, population: int, csse_col2: str, csse_col1=None, csse_col1_excl=False):
        self.name = name
        self.population = population
        self.csse_col2 = csse_col2
        self.csse_col1 = csse_col1
        self.csse_col1_excl = csse_col1_excl
        self.confirmed = []
        self.deaths = []
        
    # Latest confirmed count
    def get_confirmed(self):
        return self.confirmed[-1]
    
    # Latest death count
    def get_deaths(self):
        return self.deaths[-1]
        
    # Confirmed over population
    def get_cop(self): 
        return self.get_confirmed()/self.population
    
    # Deaths over population
    def get_dop(self):
        return self.get_deaths()/self.population
    
    # Deaths over confirmed
    def get_doc(self):
        return self.get_deaths()/self.get_confirmed()
    
    # Summary
    # Format: Name: Confirmed, Deaths, Confirmed over pop., Deaths over pop., Deaths over confirmed
    def summary(self):
        print(self.name+': '+str(self.get_confirmed())+', '+str(self.get_deaths())+', '+str(self.get_cop())+ \
              ', '+str(self.get_dop())+', '+str(self.get_doc()))
        

# Contains lists of cases, deaths, percentages, etc. for a set of Region objects
# Used to define sets of Region objects that will be used in charts
# Called like this: some_regions = Places([taiwan, usa, southkorea, australia]) after initializing 
#    the corresponding Region objects
class Places:
    # regions: a list of Region objects: [taiwan, germany]
    def __init__(self, regions: list):
        self.regions = regions
        self.dates = read_csvs(self.regions)  # load the data from the CSV files
        self.dates_str = [d.strftime("%Y-%m-%d") for d in self.dates]
        self.as_of_date = self.dates[-1]  # the last date that was recorded 
        
        # Build the lists for use as chart data
        self.names = []  # region name
        self.cases = []  # number of confirmed cases
        self.deaths = []  # number of deaths
        self.doc = []  # deaths over confirmed cases
        self.cop = []  # confirmed cases over population
        self.dop = []  # deaths over population
        for r in self.regions:
            self.names.append(r.name)
            self.cases.append(r.get_confirmed())
            self.deaths.append(r.get_deaths())
            self.doc.append(r.get_doc())
            self.cop.append(r.get_cop())
            self.dop.append(r.get_dop())
            
        print_summary(self.regions, self.as_of_date)
            

# Print summaries of values for all specified regions
# regions: a list of region objects: [taiwan, usa, uk]
# as_of_date: the most recent date in the data
def print_summary(regions, as_of_date):
    print('As of', as_of_date.strftime("%Y-%m-%d"))
    print('Region: Confirmed, Deaths, Confirmed over pop., Deaths over pop., Deaths over confirmed')
    for r in regions:
        r.summary()
        
        
# --- Plot Functions ---

# Plots quantity data (cases, deaths)
# places: a Places object
# qtys: the corresponding quantities (cases, deaths): [1017, 98476]
# title: the plot title: 'COVID-19: Confirmed Cases'
# xlabel: The X axis label: 'Cases'
# color (optional): the color of the bars: 'darkred'
def plot_case_qty(places: Places, qtys: list, title: str, xlabel: str, color='steelblue'):
    data = pd.DataFrame(list(zip(places.names, qtys)), columns=['Region', xlabel])

    bars = alt.Chart(data).mark_bar().encode(
        x=xlabel+":Q",
        y=alt.Y("Region:O", sort='x')
    ).properties(
        title={
            "text":title,
            "subtitle":'As of ' + places.as_of_date.strftime("%B %d, %Y")
        }
    )

    text = bars.mark_text(
        align='left',
        baseline='middle',
        dx=3
    ).encode(
        text=(
            alt.Text(xlabel+":Q", format=",")
        )
    )

    return alt.layer(bars, text).configure_axisX(
        labelAngle=-45
    ).configure_axisY(
        title=None
    ).configure_bar(
        color=color
    )


# Plots percentage data (confirmed over pop., deaths over pop., deaths over confirmed)
# places: a Places object
# pcts: the corresponding percentages (cop, dop, doc): [0.0001475698, 0.001489540]
# title: the plot title: 'COVID-19: Confirmed Cases as % of Population'
# annotation_decimals (optional): The number of decimal places to show on bar annotations: 2
# color (optional): the color of the bars: 'darkred'
def plot_case_pct(places: Places, pcts: list, title: str, annotation_decimals=3, color='steelblue'):
    data = pd.DataFrame(list(zip(places.names, pcts)), columns=['Region', '%'])

    bars = alt.Chart(data).mark_bar().encode(
        x=alt.X("%:Q", axis=alt.Axis(format='%')),
        y=alt.Y("Region:O", sort='x')
    ).properties(
        title={
            "text":title,
            "subtitle":'As of ' + places.as_of_date.strftime("%B %d, %Y")
        }
    )

    text = bars.mark_text(
        align='left',
        baseline='middle',
        dx=3
    ).encode(
        text=(
            alt.Text("%:Q", format="."+str(annotation_decimals)+"%")
        )
    )

    return alt.layer(bars, text).configure_axisX(
        labelAngle=-45,
        title=None
    ).configure_axisY(
        title=None
    ).configure_bar(
        color=color
    )


# Plots confirmed and death data stacked
# places: a Places object
# title: the plot title: 'COVID-19: Regions with Few Confirmed Cases'
# color_conf (optional): the color of the confirmed bars: 'steelblue'
# color_death (optional): the color of the deaths bars: 'darkred'
def plot_group_qtys(places: Places, title: str, color_conf='steelblue', color_death='darkred'):
    # structure data into confirmed cases and deaths by region
    data = []
    for r in places.regions:
        data.append([r.name, r.get_confirmed(), 'Confirmed'])
        data.append([r.name, r.get_deaths(), 'Deaths'])
    data = pd.DataFrame(data, columns=['Region', 'Count', 'Case Type'])

    bars = alt.Chart(data).mark_bar().encode(
        x="Count:Q",
        y="Case Type:N",
        color=alt.Color("Case Type:N", legend=None, scale=alt.Scale(range=[color_conf,color_death]))
    )

    text = bars.mark_text(
        align='left',
        baseline='middle',
        dx=3
    ).encode(
        text=(
            alt.Text("Count:Q", format=",")
        )
    )

    return alt.layer(bars, text).facet(
        row="Region:N"
    ).configure_axisX(
        labelAngle=-45,
        title=None
    ).configure_axisY(
        title=None
    ).properties(
        title={
            "text":title,
            "subtitle":'As of ' + places.as_of_date.strftime("%B %d, %Y")
        }
    )


# Plots case trajectories
# places: a Places object
# title: the plot title: 'COVID-19: Confirmed Cases'
# cases: The data to plot, either 'Confirmed' or 'Deaths'
def plot_trajectory_qty(places: Places, title: str, cases='Confirmed'):
    names = []
    dates = []
    count = []
    length = len(places.dates_str)
    for r in places.regions:
        names += [r.name]*length
        dates += places.dates_str
        if cases == 'Confirmed':
            count += r.confirmed
            ylabel = 'Cases'
        else:
            count += r.deaths
            ylabel = 'Deaths'
        
    data = pd.DataFrame(list(zip(names, dates, count)), columns=['Region', 'Dates', 'Count'])
    
    return alt.Chart(data).properties(width=550).mark_line().encode(
        x = 'Dates:O',
        y = 'Count:Q',
        color='Region',
        strokeDash='Region'
    ).configure_axisX(
        labelAngle=-45,
        title=None
    ).properties(
        title={
            "text":title,
            "subtitle":'As of ' + places.as_of_date.strftime("%B %d, %Y")
        }
    )


# --- Functions to read CSV data files ---

# Converts a list of strings into a list of ints
def to_int(row: list) -> list:
    return [int(r) for r in row]


# Adds values corresponding to the same position in multiple lists 
# (for countries that have multiple regions in the data)
def add_row(old_row: list, new_row: list) -> list:
    if old_row == []:
        return to_int(new_row)
    else:
        return[o + int(n) for o, n in zip(old_row, new_row)]
    

# Reads the CSSE time-series data files and updates the Region objects
# regions: the list of Region objects to load data for: [taiwan, usa, southkorea]
def read_csvs(regions: list):
    # Get confirmed cases
    with open('./COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            if row[1] == "Country/Region":
                dates = pd.to_datetime(row[4:], format="%m/%d/%y")
            else:
                for region in regions:
                    if row[1] == region.csse_col2:  # Country matches
                        if region.csse_col1 is None:  # No spec for region (column 1 of csv)
                            region.confirmed = to_int(row[4:])
                        elif region.csse_col1_excl == False and row[0] in region.csse_col1:  # Found the specific region we want data for
                            region.confirmed = to_int(row[4:])
                        elif region.csse_col1_excl and row[0] not in region.csse_col1:  # We can add the data for the region as it isn't excluded (HK)
                            region.confirmed = add_row(region.confirmed, row[4:])

    # Get deaths
    with open('./COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            for region in regions:
                if row[1] == region.csse_col2:  # Country matches
                    if region.csse_col1 is None:  # No spec for region (column 1 of csv)
                        region.deaths = to_int(row[4:])
                    elif region.csse_col1_excl == False and row[0] in region.csse_col1:  # Found the specific region we want data for
                        region.deaths = to_int(row[4:])
                    elif region.csse_col1_excl and row[0] not in region.csse_col1:  # We can add the data for the region as it isn't excluded (HK)
                        region.deaths = add_row(region.deaths, row[4:])
                        
    return dates

In [2]:
# --- Set the regions to analyze in this cell ---

# Initialize the regions and set populations
# Refer to the comments for the Region class __init__ above for guidance on how to do this properly
taiwan = Region('Taiwan', 23780000, 'Taiwan*')
singapore = Region('Singapore', 5639000, 'Singapore')
hongkong = Region('Hong Kong', 7451000, 'China', ['Hong Kong'])
china = Region('China (Mainland)', 1427647786, 'China', ['Hong Kong'], True)  # mainland
usa = Region('USA', 329515103, 'US')
italy = Region('Italy', 60360000, 'Italy')
southkorea = Region('South Korea', 51640000, 'Korea, South')
japan = Region('Japan', 126500000, 'Japan')
germany = Region('Germany', 83020000, 'Germany')
spain = Region('Spain', 46940000, 'Spain')
uk = Region('UK', 66650000, 'United Kingdom', [''])  # UK main islands only


# The regions we care about, so we can pull in the data and automatically populate the charts
# If you want to exclude any region, just remove it from this list
# If you want to add a country, initialize it above then add it to this list
regions = Places([taiwan, singapore, hongkong, southkorea, japan, china, usa, italy, germany, spain, uk])

As of 2020-04-17
Region: Confirmed, Deaths, Confirmed over pop., Deaths over pop., Deaths over confirmed
Taiwan: 395, 6, 1.6610597140454165e-05, 2.5231286795626577e-07, 0.015189873417721518
Singapore: 5050, 11, 0.0008955488561801738, 1.9507004788082995e-06, 0.0021782178217821784
Hong Kong: 1021, 4, 0.00013702858676687692, 5.368406925244934e-07, 0.0039177277179236044
South Korea: 10635, 230, 0.0002059450038729667, 4.453911696359411e-06, 0.021626704278326282
Japan: 9787, 190, 7.736758893280632e-05, 1.5019762845849801e-06, 0.019413507714314906
China (Mainland): 82739, 4632, 5.795477064536981e-05, 3.244497729358017e-06, 0.055983272700902836
USA: 699706, 36773, 0.0021234413646891324, 0.000111597312733796, 0.05255493021354683
Italy: 172434, 22745, 0.00285675944333996, 0.00037682239893969515, 0.1319055406706334
Germany: 141397, 4352, 0.0017031679113466635, 5.2421103348590704e-05, 0.030778587947410483
Spain: 190839, 20002, 0.0040655943757988925, 0.00042611844908393694, 0.1048108615115359
UK: 1

In [3]:
# Total Cases
plot_case_qty(regions, regions.cases, 'COVID-19: Confirmed Cases', 'Cases')

In [4]:
# Total Deaths
plot_case_qty(regions, regions.deaths, 'COVID-19: Deaths', 'Deaths', 'darkred')

In [5]:
# Confirmed over population
plot_case_pct(regions, regions.cop, 'COVID-19: Confirmed Cases as % of Population')

In [6]:
# Deaths over confirmed
plot_case_pct(regions, regions.doc, 'COVID-19: Deaths as % of Confirmed Cases', 2, 'darkred')

In [7]:
# Deaths over population
plot_case_pct(regions, regions.dop, 'COVID-19: Deaths as % of Population', 5, 'darkred')

In [8]:
# Confirmed and Deaths, Small Regions
regions_small = Places([taiwan, singapore, southkorea, japan, hongkong])
plot_group_qtys(regions_small, 'COVID-19: Regions with Few Confirmed Cases')

As of 2020-04-17
Region: Confirmed, Deaths, Confirmed over pop., Deaths over pop., Deaths over confirmed
Taiwan: 395, 6, 1.6610597140454165e-05, 2.5231286795626577e-07, 0.015189873417721518
Singapore: 5050, 11, 0.0008955488561801738, 1.9507004788082995e-06, 0.0021782178217821784
South Korea: 10635, 230, 0.0002059450038729667, 4.453911696359411e-06, 0.021626704278326282
Japan: 9787, 190, 7.736758893280632e-05, 1.5019762845849801e-06, 0.019413507714314906
Hong Kong: 1021, 4, 0.00013702858676687692, 5.368406925244934e-07, 0.0039177277179236044


In [9]:
# Confirmed and Deaths, Large Regions
regions_large = Places([china, usa, italy, germany, spain, uk])
plot_group_qtys(regions_large, 'COVID-19: Regions with Many Confirmed Cases')

As of 2020-04-17
Region: Confirmed, Deaths, Confirmed over pop., Deaths over pop., Deaths over confirmed
China (Mainland): 165478, 9264, 0.00011590954129073963, 6.488995458716034e-06, 0.055983272700902836
USA: 699706, 36773, 0.0021234413646891324, 0.000111597312733796, 0.05255493021354683
Italy: 172434, 22745, 0.00285675944333996, 0.00037682239893969515, 0.1319055406706334
Germany: 141397, 4352, 0.0017031679113466635, 5.2421103348590704e-05, 0.030778587947410483
Spain: 190839, 20002, 0.0040655943757988925, 0.00042611844908393694, 0.1048108615115359
UK: 108692, 14576, 0.001630787696924231, 0.0002186946736684171, 0.13410370588451773


In [10]:
# Alternate (incomplete) version just for China regions. 
hubei = Region('Hubei', 58500000, 'China', ['Hubei'])
guangdong = Region('Guangdong', 113460000, 'China', ['Guangdong'])
shanghai = Region('Shanghai', 24281400, 'China', ['Shanghai'])
beijing = Region('Beijing', 21542000, 'China', ['Beijing'])

regions_china = Places([hongkong, hubei, shanghai, beijing, guangdong])

As of 2020-04-17
Region: Confirmed, Deaths, Confirmed over pop., Deaths over pop., Deaths over confirmed
Hong Kong: 1021, 4, 0.00013702858676687692, 5.368406925244934e-07, 0.0039177277179236044
Hubei: 68128, 4512, 0.0011645811965811967, 7.712820512820512e-05, 0.06622827618600281
Shanghai: 628, 7, 2.58634180895665e-05, 2.8828650736777946e-07, 0.011146496815286623
Beijing: 593, 8, 2.7527620462352614e-05, 3.7136756104354286e-07, 0.013490725126475547
Guangdong: 1577, 8, 1.3899171514190022e-05, 7.050943063634761e-08, 0.0050729232720355105


In [11]:
plot_case_qty(regions_china, regions_china.cases, 'COVID-19: China Confirmed Cases', 'Cases')

In [12]:
plot_case_qty(regions_china, regions_china.deaths, 'COVID-19: China Deaths', 'Deaths', 'darkred')

In [13]:
# Trajectory

plot_trajectory_qty(regions, 'COVID-19: Confirmed Cases', 'Confirmed')