<a href="https://colab.research.google.com/github/w-oke/covid_reproduction/blob/main/owid_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import pickle

The Our World In Data (OWID) project publishes its COVID-19 data on its website at https://ourworldindata.org/coronavirus and in its GitHub repository at https://github.com/owid/covid-19-data/tree/master/public/data.


In [2]:
# Column descriptions: load the OWID codebook CSV, which lists each dataset
# column together with its source, category and a short description.
owid_col_desc_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-codebook.csv'
owid_col_desc = pd.read_csv(owid_col_desc_link)
# Preview the first rows of the codebook.
owid_col_desc.head()

Unnamed: 0,column,source,category,description
0,iso_code,International Organization for Standardization,Others,ISO 3166-1 alpha-3 – three-letter country codes
1,continent,Our World in Data,Others,Continent of the geographical location
2,location,Our World in Data,Others,Geographical location
3,date,Our World in Data,Others,Date of observation
4,total_cases,COVID-19 Data Repository by the Center for Sys...,Confirmed cases,Total confirmed cases of COVID-19


In [3]:
# Collect the name of every column listed in the codebook.
cols = owid_col_desc['column'].tolist()

In [4]:
# Candidate variables, grouped by role. Commented-out names are alternatives
# that are currently not selected.
var = {
    # 'y': target (dependent) variable -- presumably what gets modelled downstream
    'y': [
        # 'new_cases_smoothed_per_million',
        'reproduction_rate',
        # 'positive_rate',
        # 'tests_per_case',  # inverse of positivity rate
    ],

    # 'meta': metadata - not intended for training
    'meta': [
        'date',
        'iso_code',
        'location',
        # 'population',
    ],

    # 'number': numeric columns
    'number': [
        # 'tests_units',
        'new_tests_smoothed_per_thousand',
        # 'total_vaccinations',
        # 'total_vaccinations_per_hundred',  # use people_vaccinated & people_fully_vaccinated instead
        'people_vaccinated_per_hundred',
        'people_fully_vaccinated_per_hundred',
        'total_boosters_per_hundred',
        'stringency_index',
        'population_density',
        'median_age',
        'human_development_index',
        # 'aged_65_older',
        # 'aged_70_older',
        'gdp_per_capita',
        'extreme_poverty',
        # 'cardiovasc_death_rate',
        # 'diabetes_prevalence',
        # 'female_smokers',
        # 'male_smokers',
        'handwashing_facilities',  # Share of the population with basic handwashing facilities on premises
        'hospital_beds_per_thousand',  # Hospital beds per 1,000 people, most recent year available since 2010
        'life_expectancy',
    ],
}

# Save the variable groups to a pickle file in the working directory.
with open('covid_owid_var_dictionary.pkl', 'wb') as f:
    pickle.dump(var, f)

In [5]:
# Flatten every group in `var` into one list of column names,
# keeping the group order of the dictionary.
var_all = [name for group in var.values() for name in group]
print('The first 4 items in "var_all" are: ', var_all[:4])
print('There are {} variables in var_all'.format(len(var_all)))

# The same names joined into one comma-separated string.
var_all2 = ', '.join(var_all)
print('var_all2: ', var_all2)

The first 4 items in "var_all" are:  ['reproduction_rate', 'date', 'iso_code', 'location']
There are 17 variables in var_all
var_all2:  reproduction_rate, date, iso_code, location, new_tests_smoothed_per_thousand, people_vaccinated_per_hundred, people_fully_vaccinated_per_hundred, total_boosters_per_hundred, stringency_index, population_density, median_age, human_development_index, gdp_per_capita, extreme_poverty, handwashing_facilities, hospital_beds_per_thousand, life_expectancy


In [6]:
# OWID dataset: download the full CSV but keep only the selected columns.
owid_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-data.csv'
owid = (
    pd.read_csv(owid_link, usecols=var_all)
    # Drop rows that have no reproduction_rate value.
    .dropna(subset=['reproduction_rate'])
    # Convert the 'date' column to a Datetime format. `assign` returns a new
    # frame, so nothing is mutated in place and the cell can be re-run safely.
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
# Summarise dtypes and non-null counts of the resulting frame.
owid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107570 entries, 34 to 136038
Data columns (total 17 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   iso_code                             107570 non-null  object        
 1   location                             107570 non-null  object        
 2   date                                 107570 non-null  datetime64[ns]
 3   reproduction_rate                    107570 non-null  float64       
 4   new_tests_smoothed_per_thousand      65805 non-null   float64       
 5   people_vaccinated_per_hundred        27640 non-null   float64       
 6   people_fully_vaccinated_per_hundred  24946 non-null   float64       
 7   total_boosters_per_hundred           5377 non-null    float64       
 8   stringency_index                     99406 non-null   float64       
 9   population_density                   105841 non-null  float64       


In [7]:
# (rows, columns) of the frame after rows without a reproduction_rate were dropped.
owid.shape

(107570, 17)

In [9]:
# Preview the first rows of the filtered frame.
owid.head()

Unnamed: 0,iso_code,location,date,reproduction_rate,new_tests_smoothed_per_thousand,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,stringency_index,population_density,median_age,gdp_per_capita,extreme_poverty,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
34,AFG,Afghanistan,2020-03-29,1.5,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
35,AFG,Afghanistan,2020-03-30,1.5,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
36,AFG,Afghanistan,2020-03-31,1.51,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
37,AFG,Afghanistan,2020-04-01,1.51,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
38,AFG,Afghanistan,2020-04-02,1.5,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511


In [10]:
# Write the filtered frame to a Parquet file; the path is relative, so the
# file is created in the current working directory.
owid.to_parquet('covid_owid_df.parquet') # output to a parquet file