<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-the-COVID-daily-cases-data" data-toc-modified-id="Import-pandas-and-the-COVID-daily-cases-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and the COVID daily cases data</a></span></li><li><span><a href="#Create-lists-for-the-daily-cases-and-deaths-columns,-the-case-total-columns,-and-the-demographic-columns" data-toc-modified-id="Create-lists-for-the-daily-cases-and-deaths-columns,-the-case-total-columns,-and-the-demographic-columns-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Create lists for the daily cases and deaths columns, the case total columns, and the demographic columns</a></span></li><li><span><a href="#Create-a-DataFrame-with-just-the-daily-data" data-toc-modified-id="Create-a-DataFrame-with-just-the-daily-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Create a DataFrame with just the daily data</a></span></li><li><span><a href="#Select-one-row-per-country" data-toc-modified-id="Select-one-row-per-country-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Select one row per country</a></span></li><li><span><a href="#Sum-the-values-for-each-group" data-toc-modified-id="Sum-the-values-for-each-group-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Sum the values for each group</a></span></li></ul></div>

# Import pandas and the COVID daily cases data

In [1]:
import pandas as pd

In [2]:
# Display configuration for the notebook.
# Optional tweaks, left disabled (uncomment to apply):
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 7)
# pd.set_option('display.max_rows', 200)
# Render floats with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# Importing watermark here appears to be why it is listed in the version
# report printed by this cell.
import watermark
%load_ext watermark

# Record the environment for reproducibility: -iv reports the versions of
# imported modules; -n and -i presumably add date/time stamps — TODO confirm
# against the watermark docs.
%watermark -n -i -iv

json     : 2.0.9
watermark: 2.1.0
pandas   : 1.2.1



In [4]:
# Load the COVID case records from the CSV in the relative data/ folder.
covidcases = pd.read_csv(
    'data/covidcases720.csv',
)

# Create lists for the daily cases and deaths columns, the case total columns, and the demographic columns

In [5]:
# Column groups reused by later cells to slice the COVID frame.

# Reporting date plus the per-day counts.
dailyvars = [
    'casedate',
    'new_cases',
    'new_deaths',
]

# Location name plus the total-count columns.
totvars = [
    'location',
    'total_cases',
    'total_deaths',
]

# Demographic and economic attributes of each location.
demovars = [
    'population',
    'population_density',
    'median_age',
    'gdp_per_capita',
    'hospital_beds_per_thousand',
    'region',
]

In [6]:
# Peek at the first three rows across all three column groups, transposed so
# that each selected column is shown as a row.
covidcases[[*dailyvars, *totvars, *demovars]].head(3).transpose()

Unnamed: 0,0,1,2
casedate,2019-12-31,2020-01-01,2020-01-02
new_cases,0.0,0.0,0.0
new_deaths,0.0,0.0,0.0
location,Afghanistan,Afghanistan,Afghanistan
total_cases,0.0,0.0,0.0
total_deaths,0.0,0.0,0.0
population,38928341.0,38928341.0,38928341.0
population_density,54.42,54.42,54.42
median_age,18.6,18.6,18.6
gdp_per_capita,1803.99,1803.99,1803.99


# Create a DataFrame with just the daily data

In [7]:
# Keep only the location key and the daily columns.
coviddaily = covidcases[['location', *dailyvars]]

In [8]:
# Row and column counts of the daily-only frame.
coviddaily.shape

(29529, 4)

In [9]:
# Show the first rows of the daily-only frame (head() defaults to five).
coviddaily.head()

Unnamed: 0,location,casedate,new_cases,new_deaths
0,Afghanistan,2019-12-31,0.0,0.0
1,Afghanistan,2020-01-01,0.0,0.0
2,Afghanistan,2020-01-02,0.0,0.0
3,Afghanistan,2020-01-03,0.0,0.0
4,Afghanistan,2020-01-04,0.0,0.0


# Select one row per country

In [10]:
# Count the distinct location values in the full data set.
covidcases['location'].nunique()

209

In [11]:
# Build a one-row-per-location frame: order the rows by location and date,
# keep only the final row for each location, and relabel its date column
# as 'lastdate'.
coviddemo = (
    covidcases[['casedate', *totvars, *demovars]]
    .sort_values(['location', 'casedate'])
    .drop_duplicates(subset=['location'], keep='last')
    .rename(columns={'casedate': 'lastdate'})
)

In [12]:
# Row and column counts of the one-row-per-location frame.
coviddemo.shape

(209, 10)

In [13]:
# First three locations, transposed so each column reads as a row.
coviddemo.head(3).T

Unnamed: 0,184,310,500
lastdate,2020-07-12,2020-07-12,2020-07-12
location,Afghanistan,Albania,Algeria
total_cases,34451.0,3371.0,18712.0
total_deaths,1010.0,89.0,1004.0
population,38928341.0,2877800.0,43851043.0
population_density,54.42,104.87,17.35
median_age,18.6,38.0,29.1
gdp_per_capita,1803.99,11803.43,13913.84
hospital_beds_per_thousand,0.5,2.89,1.9
region,South Asia,Eastern Europe,North Africa


# Sum the values for each group

In [15]:
# Collapse the daily rows into one summary row per location: the daily
# case and death counts are summed, and the remaining columns take each
# group's last value.
#
# Sort by location and date first so that "last" means the most recent row
# for each location regardless of the row order in the loaded CSV (the same
# ordering the one-row-per-country selection applies before de-duplicating).
#
# Note: the groupby 'last' aggregation returns the last non-null value per
# column, so a missing value on the final date falls back to the most
# recent reported one.
covidtotals = (
    covidcases
    .sort_values(['location', 'casedate'])
    .groupby(['location'], as_index=False)
    .agg({
        'new_cases': 'sum',
        'new_deaths': 'sum',
        'median_age': 'last',
        'gdp_per_capita': 'last',
        'region': 'last',
        'casedate': 'last',
        'population': 'last',
    })
    # Summed daily counts are totals; the last date is the reporting cutoff.
    .rename(columns={
        'new_cases': 'total_cases',
        'new_deaths': 'total_deaths',
        'casedate': 'lastdate',
    })
)

In [16]:
# First three locations of the aggregated frame, transposed so each column
# reads as a row.
covidtotals.head(3).T

Unnamed: 0,0,1,2
location,Afghanistan,Albania,Algeria
total_cases,34451.0,3371.0,18712.0
total_deaths,1010.0,89.0,1004.0
median_age,18.6,38.0,29.1
gdp_per_capita,1803.99,11803.43,13913.84
region,South Asia,Eastern Europe,North Africa
lastdate,2020-07-12,2020-07-12,2020-07-12
population,38928341.0,2877800.0,43851043.0
