<a href="https://colab.research.google.com/github/w-oke/covid_reproduction/blob/main/owid_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import pickle

The Our World In Data (OWID) project provides COVID-19 data on its websites at https://ourworldindata.org/coronavirus and https://github.com/owid/covid-19-data/tree/master/public/data


In [3]:
# OWID codebook: one row per dataset column (name, source, category, description)
owid_col_desc_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-codebook.csv'
owid_col_desc = pd.read_csv(filepath_or_buffer=owid_col_desc_link)
owid_col_desc.head(5)

Unnamed: 0,column,source,category,description
0,iso_code,International Organization for Standardization,Others,ISO 3166-1 alpha-3 – three-letter country codes
1,continent,Our World in Data,Others,Continent of the geographical location
2,location,Our World in Data,Others,Geographical location
3,date,Our World in Data,Others,Date of observation
4,total_cases,COVID-19 Data Repository by the Center for Sys...,Confirmed cases,Total confirmed cases of COVID-19


In [4]:
# names of every column documented in the codebook
cols = owid_col_desc['column'].tolist()

In [5]:
# Column selection, grouped by role. Commented-out names are alternatives that
# were considered and are currently switched off.
var = {
    # candidate target (dependent) variables
    'y': [
        # 'new_cases_smoothed_per_million',
        'reproduction_rate',
        # 'positive_rate',
        # 'tests_per_case', # inverse of positivity rate
    ],
    # metadata - not intended for training
    'meta': [
        'date',
        'iso_code',
        'location',
        # 'population',
    ],
    # numeric candidate features
    'number': [
        # 'tests_units',
        'new_tests_smoothed_per_thousand',
        # 'total_vaccinations',
        # 'total_vaccinations_per_hundred', # use people_vaccinated & people_fully_vaccinated instead
        'people_vaccinated_per_hundred',
        'people_fully_vaccinated_per_hundred',
        'total_boosters_per_hundred',
        'stringency_index',
        'population_density',
        'median_age',
        'human_development_index',
        # 'aged_65_older',
        # 'aged_70_older',
        'gdp_per_capita',
        'extreme_poverty',
        # 'cardiovasc_death_rate',
        # 'diabetes_prevalence',
        # 'female_smokers',
        # 'male_smokers',
        'handwashing_facilities',  # Share of the population with basic handwashing facilities on premises
        'hospital_beds_per_thousand',  # Hospital beds per 1,000 people, most recent year available since 2010
        'life_expectancy',
    ],
}

# persist the selection so it can be reloaded from file later
with open('covid_owid_var_dictionary.pkl', mode='wb') as pickle_file:
    pickle.dump(var, pickle_file)

In [6]:
# flatten the per-role lists (target, metadata, numeric) into one list of column names
var_all = []
for names in var.values():
    var_all.extend(names)
print('The first 4 items in "var_all" are: ', var_all[:4])
print('There are {} variables in var_all'.format(len(var_all)))

# the same column names as one comma-separated string
separator = ', '
var_all2 = separator.join(var_all)
print('var_all2: ', var_all2)

The first 4 items in "var_all" are:  ['reproduction_rate', 'date', 'iso_code', 'location']
There are 17 variables in var_all
var_all2:  reproduction_rate, date, iso_code, location, new_tests_smoothed_per_thousand, people_vaccinated_per_hundred, people_fully_vaccinated_per_hundred, total_boosters_per_hundred, stringency_index, population_density, median_age, human_development_index, gdp_per_capita, extreme_poverty, handwashing_facilities, hospital_beds_per_thousand, life_expectancy


In [7]:
# OWID dataset, restricted to the selected columns
owid_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-data.csv'
owid = (
    pd.read_csv(owid_link, usecols=var_all)
    .dropna(subset=['reproduction_rate'])  # keep only rows that have a reproduction_rate value
    .assign(date=lambda frame: pd.to_datetime(frame['date']))  # convert 'date' to a datetime dtype
)
owid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107755 entries, 34 to 136247
Data columns (total 17 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   iso_code                             107755 non-null  object        
 1   location                             107755 non-null  object        
 2   date                                 107755 non-null  datetime64[ns]
 3   reproduction_rate                    107755 non-null  float64       
 4   new_tests_smoothed_per_thousand      65855 non-null   float64       
 5   people_vaccinated_per_hundred        27707 non-null   float64       
 6   people_fully_vaccinated_per_hundred  25016 non-null   float64       
 7   total_boosters_per_hundred           5413 non-null    float64       
 8   stringency_index                     99508 non-null   float64       
 9   population_density                   106023 non-null  float64       


Note that:
* about half the data has 'handwashing_facilities' data
* about a quarter of the data has vaccination data, but this might be due to no vaccinations being available for large portions of the time


In [9]:
# preview the first rows of the filtered OWID frame
owid.head()

Unnamed: 0,iso_code,location,date,reproduction_rate,new_tests_smoothed_per_thousand,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,stringency_index,population_density,median_age,gdp_per_capita,extreme_poverty,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
34,AFG,Afghanistan,2020-03-29,1.51,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
35,AFG,Afghanistan,2020-03-30,1.51,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
36,AFG,Afghanistan,2020-03-31,1.52,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
37,AFG,Afghanistan,2020-04-01,1.51,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
38,AFG,Afghanistan,2020-04-02,1.51,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511


## Variant Data

The OWID variant data is provided by country at a frequency of every two weeks.

In [23]:

# per-location variant sequencing shares published by OWID
variant_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/variants/covid-variants.csv'
variants = (
    pd.read_csv(variant_link)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))  # convert 'date' to a datetime dtype
)
variants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54096 entries, 0 to 54095
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   location             54096 non-null  object        
 1   date                 54096 non-null  datetime64[ns]
 2   variant              54096 non-null  object        
 3   num_sequences        54096 non-null  int64         
 4   perc_sequences       54096 non-null  float64       
 5   num_sequences_total  54096 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 2.5+ MB


In [24]:
# preview the raw variant records (one row per location, date and variant)
variants.head()

Unnamed: 0,location,date,variant,num_sequences,perc_sequences,num_sequences_total
0,Angola,2020-12-21,Alpha,0,0.0,93
1,Angola,2020-12-21,B.1.1.277,0,0.0,93
2,Angola,2020-12-21,B.1.1.302,0,0.0,93
3,Angola,2020-12-21,B.1.1.519,0,0.0,93
4,Angola,2020-12-21,B.1.160,0,0.0,93


In [25]:
# Only the percentage share per variant is used below, so drop the raw sequence counts.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell re-runnable:
# a second run no longer raises KeyError because the columns are already gone.
variants = variants.drop(columns=['num_sequences', 'num_sequences_total'], errors='ignore')
variants.variant.unique()

array(['Alpha', 'B.1.1.277', 'B.1.1.302', 'B.1.1.519', 'B.1.160',
       'B.1.177', 'B.1.221', 'B.1.258', 'B.1.367', 'B.1.620', 'Beta',
       'Delta', 'Epsilon', 'Eta', 'Gamma', 'Iota', 'Kappa', 'Lambda',
       'Mu', 'Omicron', 'S:677H.Robin1', 'S:677P.Pelican', 'others',
       'non_who'], dtype=object)

In [29]:
# variants of interest: the WHO-labelled variants plus the 'non_who' bucket
who_variants = [
    'Alpha',
    'Beta',
    'Delta',
    'Epsilon',
    'Eta',
    'Gamma',
    'Iota',
    'Kappa',
    'Lambda',
    'Mu',
    'Omicron',
    'non_who',
]
# note: with 'non_who' included, the shares add to 100% for any given day/location

In [28]:
# keep only the records that belong to the variants of interest
is_selected = variants['variant'].isin(who_variants)
variants2 = variants.loc[is_selected]
variants2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27048 entries, 0 to 54095
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   location        27048 non-null  object        
 1   date            27048 non-null  datetime64[ns]
 2   variant         27048 non-null  object        
 3   perc_sequences  27048 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 1.0+ MB


In [40]:
# reshape from long to wide: one row per (location, date), one column per variant share
variants3 = (
    variants2
    .pivot(index=['location', 'date'], columns='variant', values='perc_sequences')
    .reset_index()
)
variants3

variant,location,date,Alpha,Beta,Delta,Epsilon,Eta,Gamma,Iota,Kappa,Lambda,Mu,Omicron,non_who
0,Angola,2020-12-21,0.00,74.19,0.0,0.0,1.08,0.0,0.0,0.0,0.0,0.0,0.0,24.73
1,Angola,2021-01-25,5.77,3.85,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,90.38
2,Angola,2021-02-08,9.52,28.57,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,61.91
3,Angola,2021-02-22,2.56,41.03,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,56.41
4,Angola,2021-03-08,6.19,61.06,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,32.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249,Zimbabwe,2021-01-11,0.00,95.61,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,4.39
2250,Zimbabwe,2021-01-25,0.00,94.74,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,5.26
2251,Zimbabwe,2021-02-08,0.00,100.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00
2252,Zimbabwe,2021-07-12,0.00,0.00,100.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00


In [41]:
# column dtypes and non-null counts of the pivoted frame
variants3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2254 entries, 0 to 2253
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   location  2254 non-null   object        
 1   date      2254 non-null   datetime64[ns]
 2   Alpha     2254 non-null   float64       
 3   Beta      2254 non-null   float64       
 4   Delta     2254 non-null   float64       
 5   Epsilon   2254 non-null   float64       
 6   Eta       2254 non-null   float64       
 7   Gamma     2254 non-null   float64       
 8   Iota      2254 non-null   float64       
 9   Kappa     2254 non-null   float64       
 10  Lambda    2254 non-null   float64       
 11  Mu        2254 non-null   float64       
 12  Omicron   2254 non-null   float64       
 13  non_who   2254 non-null   float64       
dtypes: datetime64[ns](1), float64(12), object(1)
memory usage: 246.7+ KB


We want to increase the amount of data available.  We will do this by:
1. Populating the date halfway between each pair of consecutive 2-week date stamps.  The values will be the average of the data from the date stamps one week before and one week after.
2. Creating a 'Global' location for each date, whose values will be the average of all available locations for that date.

In [52]:
# every date stamp except the last, shifted forward by one week
dates_new = variants3['date'].iloc[:-1] + pd.Timedelta('1W')
dates_new

0      2020-12-28
1      2021-02-01
2      2021-02-15
3      2021-03-01
4      2021-03-15
          ...    
2248   2021-01-11
2249   2021-01-18
2250   2021-02-01
2251   2021-02-15
2252   2021-07-19
Name: date, Length: 2253, dtype: datetime64[ns]

In [54]:
# sanity check: the first date stamp shifted forward by one week
variants3.loc[0, 'date'] + pd.Timedelta('1W')

Timestamp('2020-12-28 00:00:00')

In [94]:
# Helper used to build the in-between rows: for two consecutive two-week
# date stamps of one location, the first row is turned into the midpoint row.
def edit_week(df, row1, row2, cols):
    ''' Turn row1 into the midpoint between row1 and row2, or drop it.

    If row2 has the same location as row1 and is dated exactly two weeks
    after it, row1's date is moved forward by one week and its values in
    `cols` are replaced by the mean of the two rows. Otherwise row1 is
    removed from the dataframe.
    The dataframe is modified in place and is also returned.
    Note: The df must be sorted by location and date, so that row2 is the
    date stamp that follows row1!

    df: the dataframe that will be modified ('location' and 'date' columns are read)
    row1: the index label of the row to be modified (or dropped)
    row2: the index label of the 2nd row to be checked/averaged
    cols: a list of the columns to be averaged'''
    is_following_stamp = (
        df.loc[row1, 'location'] == df.loc[row2, 'location']
        and df.loc[row2, 'date'] == df.loc[row1, 'date'] + pd.Timedelta('2W')
    )
    if not is_following_stamp:
        # no usable partner row: nothing to interpolate, so discard row1
        df.drop(row1, inplace=True)
        return df
    midpoint_values = (df.loc[row1, cols] + df.loc[row2, cols]) / 2
    df.loc[row1, 'date'] = df.loc[row1, 'date'] + pd.Timedelta('1W')
    df.loc[row1, cols] = midpoint_values
    return df

In [91]:
# work on a copy so that variants3 keeps the original two-week rows
variants3W = variants3.copy()
cols = variants3.columns[2:].tolist()  # the variant columns (everything after location and date)
idx = variants3.index
# walk over adjacent row pairs; each first row becomes a midpoint row or is dropped
for current_row, following_row in zip(idx[:-1], idx[1:]):
    variants3W = edit_week(variants3W, current_row, following_row, cols)
# the last row has no following row to average with, so it is dropped
variants3W.drop(idx[-1], inplace=True)

In [92]:
# column dtypes and non-null counts of the interpolated (in-between) rows
variants3W.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1902 entries, 1 to 2252
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   location  1902 non-null   object        
 1   date      1902 non-null   datetime64[ns]
 2   Alpha     1902 non-null   float64       
 3   Beta      1902 non-null   float64       
 4   Delta     1902 non-null   float64       
 5   Epsilon   1902 non-null   float64       
 6   Eta       1902 non-null   float64       
 7   Gamma     1902 non-null   float64       
 8   Iota      1902 non-null   float64       
 9   Kappa     1902 non-null   float64       
 10  Lambda    1902 non-null   float64       
 11  Mu        1902 non-null   float64       
 12  Omicron   1902 non-null   float64       
 13  non_who   1902 non-null   float64       
dtypes: datetime64[ns](1), float64(12), object(1)
memory usage: 222.9+ KB


In [102]:
# Combine the original two-week rows with the interpolated in-between rows and
# order them by location and date.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
variants4 = (
    pd.concat([variants3, variants3W])
    .sort_values(by=['location', 'date'])
    .reset_index(drop=True)
)
variants4.head()

variant,location,date,Alpha,Beta,Delta,Epsilon,Eta,Gamma,Iota,Kappa,Lambda,Mu,Omicron,non_who
0,Angola,2020-12-21,0.0,74.19,0.0,0.0,1.08,0.0,0.0,0.0,0.0,0.0,0.0,24.73
1,Angola,2021-01-25,5.77,3.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.38
2,Angola,2021-02-01,7.645,16.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76.145
3,Angola,2021-02-08,9.52,28.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.91
4,Angola,2021-02-15,6.04,34.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.16


In [113]:
# (rows, columns) after combining the original and interpolated rows
variants4.shape

(4156, 14)

### Create a 'Global' location
Its values are the average of all the countries' data for each date.

In [112]:
# One 'Global' row per date: the unweighted mean of every location's variant shares.
# numeric_only=True is needed because 'location' is a string column; pandas 2.x
# raises a TypeError for it instead of silently leaving it out of the mean.
variants_glob = (
    variants4
    .groupby('date')
    .mean(numeric_only=True)
    .reset_index()  # bring 'date' back as a regular column
    .assign(location='Global')
)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
variants5 = pd.concat([variants4, variants_glob], ignore_index=True)
variants5.tail()

Unnamed: 0,location,date,Alpha,Beta,Delta,Epsilon,Eta,Gamma,Iota,Kappa,Lambda,Mu,Omicron,non_who
4232,Global,2021-10-25,0.309245,0.028868,98.957453,0.0,9.4e-05,0.150566,0.0,0.0,0.050189,0.340943,0.0,0.162642
4233,Global,2021-11-01,0.082982,0.024386,98.388246,0.0,0.000175,0.42193,0.0,0.0,0.711053,0.261228,0.0,0.11
4234,Global,2021-11-08,0.103571,0.0,99.593714,0.0,0.000143,0.005,0.0,0.000429,0.004,0.244714,0.0,0.048429
4235,Global,2021-11-15,0.069487,0.0,99.452051,0.0,0.0,0.0,0.0,0.000769,0.007179,0.182564,0.126154,0.161795
4236,Global,2021-11-26,0.0,0.0,98.912857,0.0,0.0,0.155714,0.0,0.0,0.0,0.931429,0.0,0.0


## Merge the OWID and Variant data:

In [98]:
# Attach the variant shares to the OWID rows; the inner join keeps only the
# (location, date) pairs that are present in both frames.
# NOTE(review): this joins variants4, so the 'Global' rows added in variants5
# are not part of the merge - confirm that is intended.
owid2 = owid.merge(variants4, how="inner", on=['location', 'date'])

In [99]:
# column dtypes and non-null counts of the merged frame
owid2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4027 entries, 0 to 4026
Data columns (total 30 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   iso_code                             4027 non-null   object        
 1   location                             4027 non-null   object        
 2   date                                 4027 non-null   datetime64[ns]
 3   reproduction_rate                    4027 non-null   float64       
 4   new_tests_smoothed_per_thousand      3782 non-null   float64       
 5   people_vaccinated_per_hundred        2242 non-null   float64       
 6   people_fully_vaccinated_per_hundred  2109 non-null   float64       
 7   total_boosters_per_hundred           552 non-null    float64       
 8   stringency_index                     3994 non-null   float64       
 9   population_density                   4027 non-null   float64       
 10  median_age  

In [44]:
# preview the first rows of the merged frame
owid2.head()

Unnamed: 0,iso_code,location,date,reproduction_rate,new_tests_smoothed_per_thousand,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,stringency_index,population_density,median_age,gdp_per_capita,extreme_poverty,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,Alpha,Beta,Delta,Epsilon,Eta,Gamma,Iota,Kappa,Lambda,Mu,Omicron,non_who
0,AGO,Angola,2020-12-21,0.96,,,,,65.74,23.89,16.8,5819.495,,26.664,,61.15,0.581,0.0,74.19,0.0,0.0,1.08,0.0,0.0,0.0,0.0,0.0,0.0,24.73
1,AGO,Angola,2021-01-25,0.91,,,,,62.96,23.89,16.8,5819.495,,26.664,,61.15,0.581,5.77,3.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.38
2,AGO,Angola,2021-02-08,0.88,,,,,61.11,23.89,16.8,5819.495,,26.664,,61.15,0.581,9.52,28.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.91
3,AGO,Angola,2021-02-22,0.99,,,,,58.33,23.89,16.8,5819.495,,26.664,,61.15,0.581,2.56,41.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.41
4,AGO,Angola,2021-03-08,1.07,,0.02,,,58.33,23.89,16.8,5819.495,,26.664,,61.15,0.581,6.19,61.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.75


In [None]:
# NOTE(review): this writes `owid` (the OWID columns only), not the merged `owid2`
# that also holds the variant shares - confirm which frame is meant to be saved.
owid.to_parquet('covid_owid_df.parquet') # output to a parquet file