# Data Understanding

- What kind of data is available?
- What is the quality of the available data?
- Is a restructuring of the data necessary?

## Get the data

In [63]:
##
# IMPORTS
#
import os
import urllib.request
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

##
# SETTINGS
#
# Render all plots inline (instead of popup)
# NOTE: `%matplotlib` is an IPython line magic, so this cell only runs inside
# an IPython/Jupyter kernel, not as a plain Python script.
%matplotlib inline

# Source URL and local destination of the raw dataset. The directory is defined
# once here and reused below, so folder creation and download cannot drift apart.
link = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
path = './data/00-raw/'
filename = 'owid-covid-data.csv'

# create data folder if not existing already
os.makedirs(path, exist_ok=True)

##
# DOWNLOADS
#
# download covid dataset
# (fetched again on every run; the local copy is overwritten)
# os.path.join builds the target path without relying on `path` ending in a slash.
raw_csv_path = os.path.join(path, filename)
urllib.request.urlretrieve(link, raw_csv_path)

# load dataframe; the 'date' column is parsed into datetime values
df_src = pd.read_csv(raw_csv_path, parse_dates=['date'])

## What kind of data is available?

In [64]:
# Bare last expression: the notebook renders the frame as a rich table
# (pandas truncates long frames to the first and last rows).
df_src

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70439,ZWE,Africa,Zimbabwe,2021-02-17,35423.0,108.0,79.857,1418.0,4.0,7.714,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
70440,ZWE,Africa,Zimbabwe,2021-02-18,35543.0,120.0,97.000,1420.0,2.0,8.000,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
70441,ZWE,Africa,Zimbabwe,2021-02-19,35710.0,167.0,95.000,1430.0,10.0,5.286,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
70442,ZWE,Africa,Zimbabwe,2021-02-20,35768.0,58.0,94.857,1432.0,2.0,4.857,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571


In [65]:
# Divider printed between the sections of this overview.
report_divider = '===============================================================================\n'

# DataFrame.shape is a (rows, columns) tuple.
n_rows, n_cols = df_src.shape
print('The dataset contains {} rows and {} columns.\n'.format(n_rows, n_cols))
print(report_divider)
print('The columns contain data about: \n')

print(df_src.columns)

print(report_divider)
print('The arithmetic description: \n')

# Summary statistics per column as computed by pandas.
print(df_src.describe())

The dataset contains 70444 rows and 59 columns.


The columns contain data about: 

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated',

## What is the quality of the available data?

In [66]:
print('What datatypes are available?\n')
# DataFrame.info() writes its report directly to stdout and returns None,
# so it must not be wrapped in print() (that would append a stray "None" line).
df_src.info()

print('===============================================================================\n')

What datatypes are available?

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70444 entries, 0 to 70443
Data columns (total 59 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   iso_code                               70444 non-null  object        
 1   continent                              66936 non-null  object        
 2   location                               70444 non-null  object        
 3   date                                   70444 non-null  datetime64[ns]
 4   total_cases                            69540 non-null  float64       
 5   new_cases                              69538 non-null  float64       
 6   new_cases_smoothed                     68537 non-null  float64       
 7   total_deaths                           60538 non-null  float64       
 8   new_deaths                             60696 non-null  float64       
 9   new_deaths_smoothed           

**Note:** *icu_patients* holds the number of COVID-19 patients in intensive care units (ICUs) on a given day

In [67]:
# Columns whose dtype is `object`; list their distinct values to see why.
object_columns = ['iso_code', 'continent', 'tests_units']
object_divider = '==============================================='

print('Check why some columns contain objects:')
for position, column in enumerate(object_columns):
    # Every divider except the very first one is preceded by an empty line.
    leading_gap = '' if position == 0 else '\n'
    print(leading_gap + object_divider + '\n')
    print('column: {}\n'.format(column))
    print(df_src[column].unique())

Check why some columns contain objects:

column: iso_code

['AFG' 'OWID_AFR' 'ALB' 'DZA' 'AND' 'AGO' 'AIA' 'ATG' 'ARG' 'ARM'
 'OWID_ASI' 'AUS' 'AUT' 'AZE' 'BHS' 'BHR' 'BGD' 'BRB' 'BLR' 'BEL' 'BLZ'
 'BEN' 'BMU' 'BTN' 'BOL' 'BIH' 'BWA' 'BRA' 'BRN' 'BGR' 'BFA' 'BDI' 'KHM'
 'CMR' 'CAN' 'CPV' 'CYM' 'CAF' 'TCD' 'CHL' 'CHN' 'COL' 'COM' 'COG' 'CRI'
 'CIV' 'HRV' 'CUB' 'CYP' 'CZE' 'COD' 'DNK' 'DJI' 'DMA' 'DOM' 'ECU' 'EGY'
 'SLV' 'GNQ' 'ERI' 'EST' 'SWZ' 'ETH' 'OWID_EUR' 'OWID_EUN' 'FRO' 'FLK'
 'FJI' 'FIN' 'FRA' 'GAB' 'GMB' 'GEO' 'DEU' 'GHA' 'GIB' 'GRC' 'GRL' 'GRD'
 'GTM' 'GGY' 'GIN' 'GNB' 'GUY' 'HTI' 'HND' 'HKG' 'HUN' 'ISL' 'IND' 'IDN'
 'OWID_INT' 'IRN' 'IRQ' 'IRL' 'IMN' 'ISR' 'ITA' 'JAM' 'JPN' 'JEY' 'JOR'
 'KAZ' 'KEN' 'OWID_KOS' 'KWT' 'KGZ' 'LAO' 'LVA' 'LBN' 'LSO' 'LBR' 'LBY'
 'LIE' 'LTU' 'LUX' 'MAC' 'MDG' 'MWI' 'MYS' 'MDV' 'MLI' 'MLT' 'MHL' 'MRT'
 'MUS' 'MEX' 'FSM' 'MDA' 'MCO' 'MNG' 'MNE' 'MAR' 'MOZ' 'MMR' 'NAM' 'NPL'
 'NLD' 'NZL' 'NIC' 'NER' 'NGA' 'OWID_NAM' 'MKD' 'OWID_NCY' 'NOR'
 'OWID_OCE' 

## Is a restructuring of the data necessary?

Short answer: **YES**

- New grouping by year, month, and week (and possibly by continent as well)
- Subsets for each location and for the *special* OWID datasets (aggregated data, e.g. `OWID_ASI` for the continent Asia)