# DIY COVID-19 Dashboard - Yuzo Makitani

This is a COVID-19 dashboard based on data published by [Public Health England](https://www.gov.uk/government/organisations/public-health-england).

In [1]:
# Ideas for data visualisation
# 1. Cumulative time-series plot with vaccinations included
# 2. Student's t-test on death rate by gender
# 3. Student's t-test on death rate by age
# 4. Tukey-Kramer test on death rate by gender and age
# 5. Time-series plot by UK region (re-use existing code with multiple filters) - needs regional population estimates

In [2]:
# import all libraries
import json
import time
import numpy as np
import pandas as pd
import ipywidgets as wdg
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from uk_covid19 import Cov19API
from IPython.display import clear_output

# local development only: switch to the author's project folder when it exists.
# On Binder (or any other machine) the folder is absent, so the notebook keeps
# its own working directory instead of failing with FileNotFoundError - this
# means the cell no longer has to be deleted before deploying.
import os
LOCAL_PROJECT_DIR = '/Users/yuzomakitani/Desktop/Programming ECS780P/diy-covid19dash-main/submission'
if os.path.isdir(LOCAL_PROJECT_DIR):
    os.chdir(LOCAL_PROJECT_DIR)

# render matplotlib figures inline in the notebook and set the figure resolution to 100 dpi
%matplotlib inline
plt.rcParams['figure.dpi'] = 100

In [3]:
# helper functions

# function to convert date strings into pandas datetime objects
def parse_date(datestring):
    """ Parse a 'YYYY-MM-DD' date string into a pandas datetime object """
    iso_day_format = "%Y-%m-%d"
    parsed = pd.to_datetime(datestring, format=iso_day_format)
    return parsed

# magnitude formatter
def format_magnitude(x, pos=None):
    """ Format an axis tick value with a K / M magnitude suffix.

    Written for matplotlib.ticker.FuncFormatter, which calls it as
    formatter(value, position).

    Args:
        x: the tick value.
        pos: the tick position supplied by matplotlib; unused.

    Returns:
        str: the value in millions ('1.5M') or thousands ('2.0K'), each with
        one decimal, or the plain number when its magnitude is below 1000.
    """
    # compare on the magnitude so negative values get a suffix as well
    magnitude = abs(x)
    if magnitude >= 1_000_000:
        # one decimal (same as the K branch) so ticks such as 1.5M and 2.5M
        # do not both collapse to '2M'
        return f'{x / 1_000_000:.1f}M'
    if magnitude >= 1_000:
        return f'{x / 1_000:.1f}K'
    if float(x).is_integer():
        return f'{x:.0f}'
    # keep the fraction for small values so ticks like 0.5 and 1.5 stay distinct
    return f'{x:g}'

In [4]:
# --- a1. import data: cases, deaths, cum vaccinations, cum boosters by date ---

def import_timeseries():
    """ Request the England timeseries metrics from the API and store the
    JSON response in timeseries.json in the current working directory """

    # "metric display name" : "metric name (see website)"
    metrics = {
        "date": "date",
        "cases": "newCasesBySpecimenDateRollingRate",
        "deaths": "newDailyNsoDeathsByDeathDate",
        # new deaths within 60 days of a positive test rolling rate by death date
        "death rate": "newDeaths60DaysByDeathDateRollingRate",
        # cumulative people fully vaccinated by vaccination date
        "cum vaccinations": "cumPeopleVaccinatedCompleteByVaccinationDate",
        # cumulative people vaccinated booster or third dose by vaccination date
        "cum boosters": "cumPeopleVaccinatedThirdInjectionByVaccinationDate",
        # new people vaccinated complete by publish date
        "vaccinations": "newPeopleVaccinatedCompleteByPublishDate",
        # new people vaccinated with a booster or third dose by vaccination date
        "boosters": "newPeopleVaccinatedThirdInjectionByVaccinationDate"
    }

    # area to retrieve; viable filters: areaType (mandatory), areaName, areaCode, date
    area_filters = ['areaType=nation', 'areaName=England']

    # query the server and receive the response as a json-style dictionary
    client = Cov19API(filters=area_filters, structure=metrics)
    payload = client.get_json()

    # keep a local copy so the dashboard can be drawn without calling the API
    with open("timeseries.json", "wt") as outfile:
        json.dump(payload, outfile)

In [5]:
# --- a2. wrangle timeseries data ---

def wrangle_timeseries():
    """ Load timeseries.json and reshape it into a gap-free daily dataframe.

    Reads the 'data' list of the saved API response, places every metric on a
    continuous daily index running from the earliest to the latest date found,
    and fills missing dates and missing values with 0.0. When a date occurs
    more than once, the first row in the file is the one kept.

    Returns:
        pandas.DataFrame: float columns 'cases', 'deaths', 'death rate',
        'fatality rate', 'cum vaccinations', 'cum boosters', 'vaccinations'
        and 'boosters', indexed by day.

    Raises:
        ValueError: if the file contains no data rows.

    Side effects:
        Writes the dataframe to timeseriesdf.pkl in the working directory.
    """
    all_columns = ['cases', 'deaths', 'death rate', 'fatality rate',
                   'cum vaccinations', 'cum boosters', 'vaccinations', 'boosters']

    with open("timeseries.json", "rt") as INFILE:
        data = json.load(INFILE)

    # list with one dictionary per row
    datalist = data['data']
    if not datalist:
        raise ValueError("timeseries.json contains no data rows - refresh the data first")

    # one row per record, indexed by its parsed date
    records = pd.DataFrame(datalist)
    records.index = parse_date(records['date'].tolist())

    # a repeated date keeps its first row
    records = records[~records.index.duplicated(keep='first')]

    # continuous daily index from the first to the last date in the data
    index = pd.date_range(records.index.min(), records.index.max(), freq='D')

    # explicit float dtype: do not rely on fillna to downcast object columns;
    # missing dates and missing (None) values both become 0.0
    timeseriesdf = (
        records
        .reindex(index=index, columns=all_columns)
        .astype(float)
        .fillna(0.0)
    )

    # calculated column; left as NaN where 'cases' is 0 so the division can
    # neither produce inf nor raise ZeroDivisionError
    # NOTE(review): import_timeseries requests 'cases' as a rolling rate and
    # 'deaths' as a daily count - confirm this ratio is the intended measure
    cases = timeseriesdf['cases']
    timeseriesdf['fatality rate'] = timeseriesdf['deaths'].div(cases.where(cases != 0))

    # save pickle
    timeseriesdf.to_pickle("timeseriesdf.pkl")

    return timeseriesdf

In [6]:
# --- a3. display timeseries data ---

def display_timeseries(timeseriesdf):
    """ Show an interactive twin-axis line plot of two columns of timeseriesdf.

    Two selection boxes pick the column drawn against each y axis and a radio
    button switches both axes between linear and log scale.
    """
    metric_choices = ['cases', 'deaths', 'cum vaccinations', 'cum boosters', 'death rate']

    # column plotted against the left-hand axis
    series1 = wdg.Select(
        description='Axis 1:',
        options=list(metric_choices),
        value='death rate',
        rows=5,
        disabled=False
    )

    # column plotted against the right-hand axis
    series2 = wdg.Select(
        description='Axis 2:',
        options=list(metric_choices),
        value='cum boosters',
        rows=5,
        disabled=False
    )

    scale = wdg.RadioButtons(
        description='Scale:',
        options=['linear', 'log'],
        disabled=False
    )

    # lay the controls out side by side, with a gap before the scale buttons
    controls = wdg.HBox([series1, series2, scale])
    scale.layout.margin = '0 0 0 50px'

    def timeseries_graph(gcol1, gcol2, gscale):
        """ Callback: redraw the figure for the current widget values """
        use_log = gscale != 'linear'

        fig, left_axis = plt.subplots(figsize=(8, 6))
        timeseriesdf[gcol1].plot(ax=left_axis, logy=use_log, label=gcol1)
        left_axis.set_ylabel(gcol1)

        # second series shares the x axis but has its own y axis
        right_axis = left_axis.twinx()
        timeseriesdf[gcol2].plot(ax=right_axis, logy=use_log, label=gcol2, color='orange')
        right_axis.set_ylabel(gcol2)

        # tick labels go through format_magnitude on both y axes
        for axis in (left_axis, right_axis):
            axis.yaxis.set_major_formatter(ticker.FuncFormatter(format_magnitude))

        left_axis.legend(loc='upper left')
        right_axis.legend(loc='upper right')
        plt.tight_layout()
        plt.show()

    # map each callback argument to the widget that supplies its value
    widget_map = {'gcol1': series1, 'gcol2': series2, 'gscale': scale}
    graph = wdg.interactive_output(timeseries_graph, widget_map)

    display(controls, graph)

In [7]:
# b2. --- wrangle age data ---

def wrangle_agedist(regions):
    """ Build the age-distribution dataframe for every region plus a total.

    Reads agedistribution.json and, for each requested region, takes the first
    row of that region's 'data' list (presumably the most recent date - verify
    against the import step) and turns its 'male cases' and 'female cases'
    lists of {'age', 'value'} entries into one dataframe indexed by age band.
    An extra 'All Regions' frame holds the per-band sum over all regions.

    Args:
        regions: iterable of region names; each must be a key of the JSON
            file. Repeated names are used once.

    Returns:
        pandas.DataFrame: the 'All Regions' rows followed by each region's
        rows, with columns 'region', 'male cases', 'female cases' and
        'total cases' (case columns are float), indexed by age band sorted by
        the band's lower bound.

    Raises:
        ValueError: if regions is empty.

    Side effects:
        Writes the dataframe to agedfs.pkl in the working directory.
    """
    # defined locally so the function does not depend on a global created in a
    # later notebook cell
    aggregate_label = 'All Regions'
    sex_columns = ['male cases', 'female cases']

    # function to return the minimum age in an age range
    def min_age(agerange):
        agerange = agerange.replace('+', '')  # remove the + from 90+
        return int(agerange.split('_')[0])

    regions = list(dict.fromkeys(regions))
    if not regions:
        raise ValueError("regions must contain at least one region name")

    # open age distribution
    with open("agedistribution.json", "rt") as INFILE:
        data = json.load(INFILE)

    # first row of each region's data list
    latest = {region: data[region]['data'][0] for region in regions}

    # age bands come from every region and both sexes (not just one hard-coded
    # region), in first-seen order, then sorted by minimum age
    seen_bands = {}
    for region in regions:
        for column in sex_columns:
            for entry in latest[region][column]:
                seen_bands.setdefault(entry['age'], None)
    ageranges = sorted(seen_bands, key=min_age)

    def labelled(frame, label):
        """ Add the total column and put the region label first """
        frame['total cases'] = frame['male cases'] + frame['female cases']
        frame.insert(0, 'region', label)
        return frame

    def region_frame(region):
        """ Case counts of one region, one row per age band """
        counts = {
            column: pd.Series(
                {entry['age']: entry['value'] for entry in latest[region][column]},
                dtype=float,
            )
            for column in sex_columns
        }
        # bands a region does not report stay NaN
        frame = pd.DataFrame(counts).reindex(index=ageranges, columns=sex_columns)
        return labelled(frame, region)

    frames = [region_frame(region) for region in regions]

    # per-band sum over all regions; a band missing from a region adds nothing
    totals = pd.DataFrame(0.0, index=ageranges, columns=sex_columns)
    for frame in frames:
        totals = totals.add(frame[sex_columns].fillna(0.0))
    totals = labelled(totals, aggregate_label)

    age_dfs = pd.concat([totals] + frames, axis=0)

    # save pickle
    age_dfs.to_pickle("agedfs.pkl")

    return age_dfs

In [8]:
# b3. --- display age data ---

def display_agedist(regions, age_dfs):
    """ Show an interactive bar chart of cases by age band.

    Args:
        regions: sequence of region names offered by the region filter; an
            extra 'All Regions' option is appended.
        age_dfs: dataframe with a 'region' column and the columns
            'male cases', 'female cases' and 'total cases'; its index is used
            as the bar labels.
    """
    all_regions = 'All Regions'
    # build a new list so the caller's sequence (list or tuple) is untouched
    region_options = list(regions) + [all_regions]

    # start on London when it is offered, otherwise on the first option, so
    # the widget can be built for any list of regions
    default_region = 'London' if 'London' in region_options else region_options[0]

    agecols = wdg.SelectMultiple(
        options = ['male cases', 'female cases', 'total cases'], # options available
        value = ['male cases', 'female cases'], # initial value
        rows = 3, # rows of the selection box
        description = 'Sex',
        disabled = False
    )

    regionfilter = wdg.RadioButtons(
        options = region_options, # options available
        value = default_region, # initial value
        description = 'Region',
        disabled = False
    )

    def age_graph(graphcolumns, regfilt):
        """ Callback: plot the selected columns for the selected region """
        filtered_df = age_dfs[age_dfs['region'] == regfilt]

        if len(graphcolumns) > 0:
            # graphcolumns is a tuple - we need a list
            ax = filtered_df.plot(kind='bar', y=list(graphcolumns))
            ax.set_xlabel('Age band')
            ax.set_ylabel('Cases')
            ax.set_title(regfilt)
            plt.show() # important - graphs won't update properly if this is missing
        else:
            # if the user has not selected any column, print a message instead
            print("Click to select data for graph")
            print("(CTRL-Click to select more than one category)")

    # call age_graph again whenever a widget value changes; capture output in widget output
    output = wdg.interactive_output(age_graph, {'graphcolumns': agecols, 'regfilt': regionfilter})

    controls = wdg.HBox([agecols, regionfilter])
    regionfilter.layout.margin = '0 0 0 50px'
    display(controls, output)

In [9]:
# --- API access function ---
def access_api(button, plots, regions):
    """ Button callback: download fresh data and redraw the requested plots.

    Args:
        button: the button that was clicked. After a successful refresh its
            icon is set to "check" and it is disabled. May be None.
        plots: collection of plot names to refresh; 'timeseries' and
            'agedist' are recognised.
        regions: region names passed on to the age-distribution functions.
    """

    def mark_done():
        # update the clicked button itself rather than a global looked up by
        # name, so the callback does not depend on which cells have been run
        if button is not None:
            button.icon = "check"
            button.disabled = True

    if 'timeseries' in plots:

        # import, wrangle, and display new timeseries data
        import_timeseries()
        timeseries_df = wrangle_timeseries()
        display_timeseries(timeseries_df)

        mark_done()

    if 'agedist' in plots:

        # import, wrangle, and display new age distribution data
        # NOTE(review): import_agedist is not defined in the part of the
        # notebook visible here - presumably in another cell; verify
        import_agedist(regions)
        age_dfs = wrangle_agedist(regions)
        display_agedist(regions, age_dfs)

        mark_done()

## Two Variables on Timeseries
* Choose one variable for each axis to explore their relationship
* For example, take a look at the death rate after vaccinations are administered
* Warning: clicking "Refresh data" may take a while, because this dashboard queries the PHE database for each region

In [11]:
# --- main ---

# regions offered by the dashboard, plus the label used for their combined total
regions = [
    'London',
    'East Midlands',
    'East of England',
    'North East',
    'North West',
    'South East',
    'South West',
    'West Midlands',
    'Yorkshire and The Humber',
]
all_regions = 'All Regions'

# --- executed only when button is clicked ---

def _refresh_timeseries(button):
    """ Click handler: re-download and redraw the timeseries plot """
    access_api(button, ['timeseries'], regions)

# refresh button for the timeseries section
apibutton1 = wdg.Button(
    icon='download', # (FontAwesome names without the `fa-` prefix)
    description='Refresh data',
    tooltip='Click to download current Public Health England data',
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    disabled=False
)

# register the click handler with the button and show the button
apibutton1.on_click(_refresh_timeseries)
display(apibutton1)

Button(description='Refresh data', icon='download', style=ButtonStyle(), tooltip='Click to download current Pu…

In [None]:
# --- main ---

# --- first time pass (skip import functions) - executed when page loads for the first time ---

# build the dataframe from the timeseries.json already saved on disk (no API
# call is made here) and show the interactive plot
timeseries_df = wrangle_timeseries()
display_timeseries(timeseries_df)

## Cases by Region
* See the age distribution of cases, broken down by sex and region
* Select "All Regions" to see the total number of cases across all regions

In [12]:
# --- main ---

# --- executed only when button is clicked ---

# create API button object 
def _refresh_agedist(button):
    """ Click handler: re-download and redraw the age distribution plot """
    access_api(button, ['agedist'], regions)

# refresh button for the age distribution section
apibutton2 = wdg.Button(
    icon='download', # (FontAwesome names without the `fa-` prefix)
    description='Refresh data',
    tooltip='Click to download current Public Health England data',
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    disabled=False
)

# register the click handler with the button and show the button
apibutton2.on_click(_refresh_agedist)
display(apibutton2)

# first pass: draw the chart from the age distribution data already on disk
age_dfs = wrangle_agedist(regions)
display_agedist(regions, age_dfs)

Button(description='Refresh data', icon='download', style=ButtonStyle(), tooltip='Click to download current Pu…

HBox(children=(SelectMultiple(description='Sex', index=(0, 1), options=('male cases', 'female cases', 'total c…

Output()