# Process JHU Data

This notebook downloads the latest JHU data from GitHub, processes it for dashboard visualization, and places it in the published folder.

### Papermill

In [None]:
# parameters
# Root data directory: input files (e.g. continents.geojson) are read from
# here and the processed JSON is written beneath it. When the notebook is
# executed through papermill this value is expected to be overridden.
data_dir = '/opt/src/data'

For papermill execution, the parameters are:
- data_dir: The data directory to read data from and publish data to.

In [None]:
import json
import io
import os
from datetime import datetime

import requests
import numpy as np
import pandas as pd
from shapely.geometry import Point, shape
from slugify import slugify

In [None]:
# constants

# Countries whose rows are additionally rolled up into the combined 'EAC'
# entry and whose latest counts are printed for a sanity check.
eac_countries = [
    'Burundi', 'Kenya', 'Rwanda',
    'South Sudan', 'Tanzania', 'Uganda',
]

In [None]:
def fetch_df(url, timeout=60):
    """Fetches a Pandas DataFrame from a remote CSV source.

    Parameters
    ----------
    url : str
        Address of the CSV file to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the notebook run indefinitely.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing here prevents
        an HTML error page from being parsed as if it were CSV data.
    """
    r = requests.get(url, timeout=timeout)
    # Stop early on error responses instead of handing their body to read_csv.
    r.raise_for_status()
    return pd.read_csv(io.BytesIO(r.content))

Fetch the JHU data from its source on GitHub.

In [None]:
# Download the three global time series (confirmed cases, deaths and
# recovered) from the JHU CSSE repository, one DataFrame per series.
cases_df = fetch_df(
    'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/'
    'csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
)
deaths_df = fetch_df(
    'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/'
    'csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
)
recovered_df = fetch_df(
    'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/'
    'csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
)

Fix the lat/lng of Congo, which is the same as the Democratic Republic of Congo in the JHU data.

In [None]:
# Latitude / longitude to use for Congo (Brazzaville).
congo_ll = [-1.402385, 15.405892]


def fix_congo(df):
    """Overwrite the coordinates of the 'Congo (Brazzaville)' rows in place.

    Rows whose 'Country/Region' equals 'Congo (Brazzaville)' get their
    'Lat' and 'Long' values replaced with the pair stored in ``congo_ll``.
    The frame is modified directly and nothing is returned.
    """
    is_brazzaville = df['Country/Region'] == 'Congo (Brazzaville)'
    latitude, longitude = congo_ll[0], congo_ll[1]
    df.loc[is_brazzaville, 'Lat'] = latitude
    df.loc[is_brazzaville, 'Long'] = longitude
# Apply the coordinate correction to every global time series.
for jhu_df in (cases_df, deaths_df, recovered_df):
    fix_congo(jhu_df)

Merge in the US county data. This data only has cases and deaths.

In [None]:
# Download the US time series (confirmed cases and deaths) from the same
# JHU CSSE repository.
us_cases_df = fetch_df(
    'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/'
    'csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
)
us_deaths_df = fetch_df(
    'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/'
    'csse_covid_19_time_series/time_series_covid19_deaths_US.csv'
)

In [None]:
# Show the US rows whose last column (the most recent value) is zero.
us_deaths_df.loc[us_deaths_df.iloc[:, -1].eq(0)]

In [None]:
# Province_State values that are kept even without a usable Lat/FIPS.
us_territories = [
    'American Samoa',
    'Guam',
    'Northern Mariana Islands',
    'Puerto Rico',
    'Virgin Islands'
]

def filter_and_reformat_us(df):
    """Filter a JHU US time-series frame and rename it to the global layout.

    Rows are kept when their last column (the latest value) is non-zero and
    they are either one of ``us_territories`` or have a non-null, non-zero
    'Lat' together with a non-null 'FIPS'.

    The result gains 'Province/State' ("<Admin2> - <Province_State>" when
    Admin2 is present, otherwise just Province_State), 'Country/Region' and
    'Long' columns, and loses the US-specific identifier columns.

    Parameters
    ----------
    df : pandas.DataFrame
        A frame with the columns of the JHU ``*_US.csv`` time series.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. An input in which no
        row survives the filter yields an empty frame with the same columns.
    """
    # Filter out counties that have 0 latest data.
    latest_counts = df.iloc[:, -1]
    is_territory = df['Province_State'].isin(us_territories)
    has_location = (
        df['Lat'].notna() &
        (df['Lat'] != 0.0) &
        df['FIPS'].notna()
    )
    filtered_df = df[(latest_counts != 0) & (is_territory | has_location)]

    columns_to_drop = [
        'UID',
        'iso2',
        'iso3',
        'code3',
        'FIPS',
        'Admin2',
        'Province_State',
        'Country_Region',
        'Combined_Key',
        'Long_'
    ]

    # Build the display name column-wise rather than with a row-wise apply:
    # it is faster and, unlike apply(axis=1), also works on an empty frame.
    admin2 = filtered_df['Admin2']
    province = filtered_df['Province_State']
    has_admin2 = admin2.notna() & (admin2 != '')
    county_and_state = admin2.astype(str) + ' - ' + province.astype(str)

    return filtered_df.assign(**{
        'Province/State': province.where(~has_admin2, county_and_state),
        'Country/Region': filtered_df['Country_Region'],
        'Long': filtered_df['Long_'],
    }).drop(columns=columns_to_drop)
    
# Run the same filtering/reshaping over both US tables.
formatted_us_cases_df, formatted_us_deaths_df = (
    filter_and_reformat_us(us_df) for us_df in (us_cases_df, us_deaths_df)
)

Map out the formatted dates for each date column.

In [None]:
# Every column of the cases table that is not one of these is treated as a
# date column whose header is formatted like '1/22/20' (month/day/2-digit year).
non_date_columns = ['Province/State', 'Country/Region', 'Lat', 'Long']

# Keep the columns in the order they appear in the table. A set difference
# would also work but yields them in arbitrary order.
date_columns = [c for c in cases_df.columns if c not in non_date_columns]

# Map each original column header to its ISO 'YYYY-MM-DD' form.
# strptime raises ValueError if a header does not match the expected format.
dates_to_format = {
    d: datetime.strptime(d, '%m/%d/%y').strftime('%Y-%m-%d')
    for d in date_columns
}

This code gathers the data in an intermediate dictionary, keyed by a region ID that is made out of a tuple of the admin0 and admin1 column values. The values represent the total confirmed cases, deaths, and recovered patients for each date.

In [None]:
# Maps (admin0, admin1) -> {'point': [lng, lat] or None,
#                           'dates': {date: [cases, deaths, recovered]}}
gathered = {}

# Setup multi-national entries
# Data points with the 'point' property set to None don't display on the map.
global_key = ('global', None)
eac_key = ('EAC', None)
gathered[global_key] = { 'point': None, 'dates': {} }
gathered[eac_key] = { 'point': None, 'dates': {} }

# Positions of each metric inside a per-date [cases, deaths, recovered] list.
_CASES, _DEATHS, _RECOVERED = 0, 1, 2


def _add_multinational(key, dt, slot, amount):
    """Add ``amount`` to one metric of the ``key`` entry for date ``dt``.

    The per-date list is created as [0, 0, 0] on first use. Only the
    requested slot is changed, so the outcome does not depend on the order
    in which cases, deaths and recovered values are accumulated.
    """
    totals = gathered[key]['dates'].setdefault(dt, [0, 0, 0])
    totals[slot] = (totals[slot] or 0) + amount


def add_multinational_cases(key, dt, cases):
    """Accumulate confirmed ``cases`` into the ``key`` entry for date ``dt``."""
    _add_multinational(key, dt, _CASES, cases)


def add_multinational_deaths(key, dt, deaths):
    """Accumulate ``deaths`` into the ``key`` entry for date ``dt``."""
    _add_multinational(key, dt, _DEATHS, deaths)


def add_multinational_recovered(key, dt, recovered):
    """Accumulate ``recovered`` into the ``key`` entry for date ``dt``."""
    _add_multinational(key, dt, _RECOVERED, recovered)

## Note:
# It was requested to put in US county data. This caused a general
# site slowdown. Commenting out the inclusion of US county data
# until we rework how we're loading things - e.g. shifting to 
# a vector tile approach.

# Create one entry per row of the cases table, seeded with the confirmed
# count for every date, and add each count to the multinational totals.
#for _, row in pd.concat([cases_df, formatted_us_cases_df]).iterrows():
for _, row in cases_df.iterrows():
    country = row['Country/Region']
    province = row['Province/State']
    # A missing province arrives as a float NaN; store it as None instead.
    region_key = (
        country,
        None if type(province) is float and np.isnan(province) else province
    )
    in_eac = country in eac_countries

    region_dates = {}
    gathered[region_key] = { 'point': [row['Long'], row['Lat']], 'dates': region_dates }
    for column in date_columns:
        count = row[column]
        iso_date = dates_to_format[column]
        region_dates[iso_date] = [count, 0, 0]

        add_multinational_cases(global_key, iso_date, count)
        if in_eac:
            add_multinational_cases(eac_key, iso_date, count)

# Fill in the deaths slot of the entries created from the cases table.
# Regions or dates that have no matching entry are skipped entirely,
# including for the multinational totals.
#for _, row in pd.concat([deaths_df, formatted_us_deaths_df]).iterrows():
for _, row in deaths_df.iterrows():
    country = row['Country/Region']
    province = row['Province/State']
    # A missing province arrives as a float NaN; store it as None instead.
    region_key = (
        country,
        None if type(province) is float and np.isnan(province) else province
    )
    in_eac = country in eac_countries
    # An unknown region behaves like one without any dates.
    region_dates = gathered[region_key]['dates'] if region_key in gathered else {}

    for column in date_columns:
        count = row[column]
        iso_date = dates_to_format[column]
        if iso_date not in region_dates:
            continue

        region_dates[iso_date][1] = count

        add_multinational_deaths(global_key, iso_date, count)
        if in_eac:
            add_multinational_deaths(eac_key, iso_date, count)

# Fill in the recovered slot of the entries created from the cases table and
# add every recovered count to the multinational totals.
for _, row in recovered_df.iterrows():
    admin0 = row['Country/Region']
    admin1 = row['Province/State']
    # A missing province arrives as a float NaN; store it as None instead.
    if type(admin1) is float and np.isnan(admin1):
        admin1 = None

    region = gathered.get((admin0, admin1))
    # Skip canada as it doesn't match up with the other datasets
    if (admin0, admin1) == ('Canada', None):
        region = None
    # Any other region (or date) that is absent from the cases table is
    # treated the same way instead of aborting the run with a KeyError: its
    # per-region value is skipped, but it still counts toward the totals.
    region_dates = region['dates'] if region is not None else {}

    for d in date_columns:
        recovered = row[d]
        dt = dates_to_format[d]
        if dt in region_dates:
            region_dates[dt][2] = recovered

        add_multinational_recovered(global_key, dt, recovered)

        if admin0 in eac_countries:
            add_multinational_recovered(eac_key, dt, recovered)

The max date and latest counts for EAC countries:

In [None]:
# The date keys are 'YYYY-MM-DD' strings, so the last one in sorted order
# is the most recent date.
max_date = sorted(gathered[eac_key]['dates'])[-1]
print('MAX DATE: {}'.format(max_date))
print('\nCountry counts:')
for country in eac_countries:
    latest_counts = gathered[(country, None)]['dates'][max_date]
    print('  {}: {}'.format(country, latest_counts))


Create a map of continent names to continent geometries so that we can find the continent for each point geometry.

In [None]:
# Maps the CONTINENT property of each GeoJSON feature to its geometry.
continents = {}

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(os.path.join(data_dir, 'continents.geojson'), encoding='utf-8') as geojson_file:
    continents_js = json.load(geojson_file)

# The file is closed by now; a distinct loop variable name avoids shadowing
# the file handle.
for feature in continents_js['features']:
    continents[feature['properties']['CONTINENT']] = shape(feature['geometry'])


Loop through the gathered data and construct the JSON file that is used to display the data in the visualization.

In [None]:
def get_continent(admin0, admin1, info):
    """Return the continent name for a region, or None when it has none.

    None is returned for the two cruise ships and for entries whose
    ``info['point']`` is None. A fixed set of admin0 / admin1 names is
    mapped to a hard-coded continent; every other region is located by
    testing its point against the geometries in ``continents``.

    Raises
    ------
    Exception
        If no geometry in ``continents`` intersects the region's point.
    """
    # Skip the cruise ships and the multinational regions.
    if admin0 in ('Diamond Princess', 'MS Zaandam') or info['point'] is None:
        return None

    # Hard-coded answers, checked by admin0 first and by admin1 second.
    continent_by_admin0 = {
        'Cabo Verde': 'Africa',
        'EAC': 'Africa',
        'US': 'North America',
        'Canada': 'North America',
        'Saint Vincent and the Grenadines': 'North America',
        'China': 'Asia',
        'Maldives': 'Asia',
        'Philippines': 'Asia',
        'Denmark': 'Europe',
        'France': 'Europe',
        'Monaco': 'Europe',
        'United Kingdom': 'Europe',
        'New Zealand': 'Oceania',
    }
    continent_by_admin1 = {
        'Aruba': 'South America',
        'Sint Maarten': 'North America',
    }

    if admin0 in continent_by_admin0:
        return continent_by_admin0[admin0]
    if admin1 in continent_by_admin1:
        return continent_by_admin1[admin1]

    location = Point(*info['point'])
    found = None
    # No early exit: when several geometries intersect, the last one wins.
    for continent_name, geometry in continents.items():
        if geometry.intersects(location):
            found = continent_name

    if found is None:
        print(info)
        raise Exception('Continent not found for "{}" - "{}"'.format(admin0, admin1))
    return found

def process_area(admin0, admin1, info):
    """Build the per-date output records for a single region.

    Parameters
    ----------
    admin0 : str
        Country / top-level region name; emitted as 'name'.
    admin1 : str or None
        Province / state name, or None when the region has none.
    info : dict
        The region's entry: 'point' holds its coordinates (or None) and
        'dates' maps each date string to [cases, deaths, recovered].

    Returns
    -------
    list of dict
        One record per date, in ascending date order. Each record carries
        the raw counts, the derived 'active' count (cases - deaths -
        recovered) and the change of each value since the previous date.
        The change fields are None for the first date.
    """
    continent = get_continent(admin0, admin1, info)

    slug_txt = admin0 if admin1 is None else "{} {}".format(admin1, admin0)
    code = slugify(slug_txt)

    # Work from the entry that was passed in rather than looking the region
    # up again in notebook-level state.
    dates = info['dates']

    result = []
    previous = None  # (cases, deaths, recovered, active) of the prior date
    for date in sorted(dates):
        cases, deaths, recovered = dates[date]
        active = cases - deaths - recovered
        current = (cases, deaths, recovered, active)

        if previous is None:
            changes = (None, None, None, None)
        else:
            changes = tuple(now - before for now, before in zip(current, previous))
        cases_change, deaths_change, recovered_change, active_change = changes

        result.append({
            'name': admin0,
            'admin1': admin1,
            'continent': continent,
            'date': date,
            'cases': cases,
            'cases_change': cases_change,
            'deaths': deaths,
            'deaths_change': deaths_change,
            'recovered': recovered,
            'recovered_change': recovered_change,
            'active': active,
            'active_change': active_change,
            'code': code,
            'coordinates': info['point']
        })

        previous = current
    return result

In [None]:
# Flatten every region's records into one list, visiting the regions in the
# order of their "<admin0>_<admin1>" string.
output_js = []
ordered_keys = sorted(gathered, key=lambda region: '{}_{}'.format(*region))
for region_key in ordered_keys:
    output_js += process_area(region_key[0], region_key[1], gathered[region_key])

In [None]:
# Pretty-print the records of the EAC countries for the most recent date.
latest_eac_entries = [
    json.dumps(record, indent=4)
    for record in output_js
    if record['name'] in eac_countries and record['date'] == max_date
]
for latest_eac_entry in latest_eac_entries:
    print(latest_eac_entry)

Write the output

In [None]:
output_file = os.path.join(data_dir, 'published/jhu-case-data.json')

# Use a context manager so the file is flushed and closed as soon as the
# write finishes, rather than whenever the handle happens to be collected.
with open(output_file, 'w') as output_fh:
    output_fh.write(json.dumps(output_js, sort_keys=True))