In [2]:
import pandas as pd
from datetime import date
from glob import glob

In [3]:
today = date.today()
dates = [date.strftime('%m-%d-%Y') for date in pd.date_range(start='2020-01-22', end=today)][:-1]
dates_as_date = pd.date_range(start='2020-01-22', end=today)[:-1]


# standardize column names for all entries
def rename_columns(column):
    column_map = {
        'Lat': 'Latitude',
        'Long_': 'Longitude',
        'Incidence_Rate': 'Incident_Rate'
    }
    if column in column_map:
        return column_map[column]
    return column.replace('/', '_').replace('-', '_').replace(' ', '_')

all_cases = []
for d in range(len(dates)):
    path = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{}.csv".format(dates[d])
    data = pd.read_csv(path)
    data.rename(columns=rename_columns, inplace=True)
    data['Date'] = dates_as_date[d]
    all_cases.append(data)

df = pd.concat(all_cases)

In [4]:
# all_cases.Country_Region.unique()

# standardize country names for all entries
country_mapping = {
    'MS Zaandam|Diamond Princess|Cruise Ship': 'Others', # move cruise ships to others
    'Hong Kong.+': 'Hong Kong',
    'Iran.+': 'Iran',
    '.*Congo.*': 'Congo',
    'Mainland China': 'China',
    '.*Bahamas.*': 'The Bahamas',
    '.*Gambia.*': 'The Gambia',
    'Viet Nam': 'Vietnam',
    'Taiwan\*': 'Taiwan',
    'Cote d\'Ivoire': 'Ivory Coast',
    'Cabo Verde': 'Cape Verde',
    'Russian Federation': 'Russia',
    ' Azerbaijan': 'Azerbaijan',
    'Holy See': 'Vatican City',
    'Republic of Ireland': 'Ireland',
    'Republic of Moldova': 'Moldova',
    'Czechia': 'Czech Republic',
    'Republic of Korea|Korea, South': 'South Korea',
    'Timor-Leste': 'East Timor',
    'Macao SAR|Macau': 'Macao',
    'UK': 'United Kingdom',
    'Jersey|Guernsey': 'Channel Islands',
    'Dominican Republicn Republic|Dominica': 'Dominican Republic'
}

df['Country_Region'].replace(
    to_replace=country_mapping.keys(),
    value=country_mapping.values(),
    regex=True,
    inplace=True
)

In [5]:
# group data by country
daily_updates = df.groupby(['Country_Region', 'Date']).agg(
     Confirmed = ('Confirmed','sum'),
     Deaths = ('Deaths','sum'),
 ).reset_index()

In [10]:
# get changes in data
updates_per_country = daily_updates.groupby('Country_Region')
# diff takes difference to point in group before it
daily_updates['New_Confirmed'] = updates_per_country['Confirmed'].diff().fillna(0)
daily_updates['New_Deaths'] = updates_per_country['Deaths'].diff().fillna(0)

In [12]:
# reorder columns
worldwide_pretty = daily_updates.loc[:, ['Date', 'Country_Region', 'New_Confirmed', 'Confirmed', 'New_Deaths', 'Deaths']]

In [18]:
worldwide_pretty.head(3)

Unnamed: 0,Date,Country_Region,New_Confirmed,Confirmed,New_Deaths,Deaths
0,2020-02-24,Afghanistan,0.0,1.0,0.0,0.0
1,2020-02-25,Afghanistan,0.0,1.0,0.0,0.0
2,2020-02-26,Afghanistan,0.0,1.0,0.0,0.0
