<a href="https://colab.research.google.com/github/w-oke/covid_reproduction/blob/main/covid_owid_1_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import urllib.request
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

The Our World In Data (OWID) project provides COVID-19 data on its websites at https://ourworldindata.org/coronavirus and https://github.com/owid/covid-19-data/tree/master/public/data


In [2]:
# Load OWID's codebook: one row per dataset column, giving its name, source,
# category and description.
owid_col_desc_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-codebook.csv'
owid_col_desc = pd.read_csv(owid_col_desc_link)
owid_col_desc.head()
# Note: the variant data is not described in this codebook. It is the percentage
# of sequences per variant by country, or a global average calculated for that
# date when the country's data is not available.

Unnamed: 0,column,source,category,description
0,iso_code,International Organization for Standardization,Others,ISO 3166-1 alpha-3 – three-letter country codes
1,continent,Our World in Data,Others,Continent of the geographical location
2,location,Our World in Data,Others,Geographical location
3,date,Our World in Data,Others,Date of observation
4,total_cases,COVID-19 Data Repository by the Center for Sys...,Confirmed cases,Total confirmed cases of COVID-19


In [3]:
# Names of every column documented in the OWID codebook.
# NOTE(review): `cols` is rebound to the variant column names further down the
# notebook, so this value is only meaningful up to that point.
cols = owid_col_desc['column'].tolist()

In [4]:
# Column names used by this notebook, grouped by role.
# Commented-out names are OWID columns that are deliberately excluded for now.
var = {
    # dependent (target) variable
    'y': [
        # 'new_cases_smoothed_per_million',
        'reproduction_rate',
        # 'positive_rate',
        # 'tests_per_case',  # inverse of positivity rate
    ],

    # metadata - not intended for training
    'meta': [
        'date',
        'iso_code',
        'location',
        # 'population',
    ],

    # numeric features
    'number': [
        # 'tests_units',
        'new_tests_smoothed_per_thousand',
        # 'total_vaccinations',
        # 'total_vaccinations_per_hundred',  # use people_vaccinated & people_fully_vaccinated instead
        'people_vaccinated_per_hundred',
        'people_fully_vaccinated_per_hundred',
        'total_boosters_per_hundred',
        'stringency_index',
        'population_density',
        'median_age',
        'human_development_index',
        # 'aged_65_older',
        # 'aged_70_older',
        'gdp_per_capita',
        'extreme_poverty',
        # 'cardiovasc_death_rate',
        # 'diabetes_prevalence',
        # 'female_smokers',
        # 'male_smokers',
        'handwashing_facilities',  # Share of the population with basic handwashing facilities on premises
        'hospital_beds_per_thousand',  # Hospital beds per 1,000 people, most recent year available since 2010
        'life_expectancy',
    ],
}

In [5]:
# Flatten the per-role lists in `var` into one list of column names
# (dict insertion order is kept, so the target column comes first).
var_all = [name for names in var.values() for name in names]
print('The first 4 items in "var_all" are: ', var_all[:4])
print('There are {} variables in var_all'.format(len(var_all)))

# The same names joined into a single comma-separated string
var_all2 = ', '.join(var_all)
print('var_all2: ', var_all2)

The first 4 items in "var_all" are:  ['reproduction_rate', 'date', 'iso_code', 'location']
There are 17 variables in var_all
var_all2:  reproduction_rate, date, iso_code, location, new_tests_smoothed_per_thousand, people_vaccinated_per_hundred, people_fully_vaccinated_per_hundred, total_boosters_per_hundred, stringency_index, population_density, median_age, human_development_index, gdp_per_capita, extreme_poverty, handwashing_facilities, hospital_beds_per_thousand, life_expectancy


In [6]:
# Main OWID dataset: read only the selected columns, keep the rows that have a
# value for the target (reproduction_rate), and parse 'date' as datetime.
owid_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-data.csv'
owid = (
    pd.read_csv(owid_link, usecols=var_all)
    .dropna(subset=['reproduction_rate'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
owid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108865 entries, 34 to 138043
Data columns (total 17 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   iso_code                             108865 non-null  object        
 1   location                             108865 non-null  object        
 2   date                                 108865 non-null  datetime64[ns]
 3   reproduction_rate                    108865 non-null  float64       
 4   new_tests_smoothed_per_thousand      66535 non-null   float64       
 5   people_vaccinated_per_hundred        28477 non-null   float64       
 6   people_fully_vaccinated_per_hundred  25779 non-null   float64       
 7   total_boosters_per_hundred           6304 non-null    float64       
 8   stringency_index                     100827 non-null  float64       
 9   population_density                   107115 non-null  float64       


Note that:
* about half the data has 'handwashing_facilities' data
* about a quarter of the data has vaccination data, but this might be due to no vaccinations being available for large portions of the time

Many of the vaccination and booster values are probably null because they fall in periods (2019-2020) or countries in which no vaccinations were available or provided.  Most of these values could therefore be filled with 0 (zero).

In [7]:
# Preview the first rows of the filtered OWID data.
owid.head()

Unnamed: 0,iso_code,location,date,reproduction_rate,new_tests_smoothed_per_thousand,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,stringency_index,population_density,median_age,gdp_per_capita,extreme_poverty,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
34,AFG,Afghanistan,2020-03-29,1.5,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
35,AFG,Afghanistan,2020-03-30,1.5,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
36,AFG,Afghanistan,2020-03-31,1.51,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
37,AFG,Afghanistan,2020-04-01,1.51,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511
38,AFG,Afghanistan,2020-04-02,1.5,,,,,67.59,54.422,18.6,1803.987,,37.746,0.5,64.83,0.511


## Variant Data

The OWID variant data is provided by country at a frequency of every two weeks.

In [8]:
# Per-country variant sequencing data, with 'date' parsed as datetime.
variant_link = 'https://github.com/owid/covid-19-data/raw/master/public/data/variants/covid-variants.csv'
variants = pd.read_csv(variant_link).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
variants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92256 entries, 0 to 92255
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   location             92256 non-null  object        
 1   date                 92256 non-null  datetime64[ns]
 2   variant              92256 non-null  object        
 3   num_sequences        92256 non-null  int64         
 4   perc_sequences       92256 non-null  float64       
 5   num_sequences_total  92256 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 4.2+ MB


In [9]:
# Preview the long-format variant data (one row per location/date/variant).
variants.head()

Unnamed: 0,location,date,variant,num_sequences,perc_sequences,num_sequences_total
0,Angola,2020-07-06,Alpha,0,0.0,3
1,Angola,2020-07-06,B.1.1.277,0,0.0,3
2,Angola,2020-07-06,B.1.1.302,0,0.0,3
3,Angola,2020-07-06,B.1.1.519,0,0.0,3
4,Angola,2020-07-06,B.1.160,0,0.0,3


In [10]:
# Only the percentage of sequences is used below; drop the raw sequence counts,
# then list the variant labels present in the data.
variants = variants.drop(columns=['num_sequences', 'num_sequences_total'])
variants['variant'].unique()

array(['Alpha', 'B.1.1.277', 'B.1.1.302', 'B.1.1.519', 'B.1.160',
       'B.1.177', 'B.1.221', 'B.1.258', 'B.1.367', 'B.1.620', 'Beta',
       'Delta', 'Epsilon', 'Eta', 'Gamma', 'Iota', 'Kappa', 'Lambda',
       'Mu', 'Omicron', 'S:677H.Robin1', 'S:677P.Pelican', 'others',
       'non_who'], dtype=object)

In [11]:
# Variants of interest: the Greek-letter variants plus the 'non_who' bucket.
# Original author's note: the percentages sum to 100% for any given day/location.
var['variants'] = [
    'Alpha',
    'Beta',
    'Delta',
    'Epsilon',
    'Eta',
    'Gamma',
    'Iota',
    'Kappa',
    'Lambda',
    'Mu',
    'Omicron',
    'non_who',
]

In [12]:
# Keep only the rows that belong to one of the variants of interest.
variants2 = variants.loc[variants['variant'].isin(var['variants'])]
variants2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46128 entries, 0 to 92255
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   location        46128 non-null  object        
 1   date            46128 non-null  datetime64[ns]
 2   variant         46128 non-null  object        
 3   perc_sequences  46128 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 1.8+ MB


In [13]:
# Reshape from long to wide: one row per (location, date) and one column per
# variant holding its percentage of sequences.
variants3 = (
    variants2
    .pivot(index=['location', 'date'], columns='variant', values='perc_sequences')
    .reset_index()
)
variants3

variant,location,date,Alpha,Beta,Delta,Epsilon,Eta,Gamma,Iota,Kappa,Lambda,Mu,Omicron,non_who
0,Angola,2020-07-06,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.00
1,Angola,2020-08-31,0.0,100.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
2,Angola,2020-09-28,0.0,60.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.00
3,Angola,2020-10-12,0.0,48.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.72
4,Angola,2020-10-26,0.0,100.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3839,Zimbabwe,2021-09-06,0.0,0.00,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
3840,Zimbabwe,2021-09-20,0.0,0.00,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
3841,Zimbabwe,2021-10-04,0.0,0.00,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
3842,Zimbabwe,2021-10-18,0.0,0.00,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00


In [14]:
# Confirm the wide table has no missing values after the pivot.
variants3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3844 entries, 0 to 3843
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   location  3844 non-null   object        
 1   date      3844 non-null   datetime64[ns]
 2   Alpha     3844 non-null   float64       
 3   Beta      3844 non-null   float64       
 4   Delta     3844 non-null   float64       
 5   Epsilon   3844 non-null   float64       
 6   Eta       3844 non-null   float64       
 7   Gamma     3844 non-null   float64       
 8   Iota      3844 non-null   float64       
 9   Kappa     3844 non-null   float64       
 10  Lambda    3844 non-null   float64       
 11  Mu        3844 non-null   float64       
 12  Omicron   3844 non-null   float64       
 13  non_who   3844 non-null   float64       
dtypes: datetime64[ns](1), float64(12), object(1)
memory usage: 420.6+ KB


We want to increase the amount of data available.  We will do this by:
1. Populating the date midway between each pair of date stamps that are two weeks apart.  The values will be the average of the data from the date stamps immediately before and after it.
2. Creating a 'Global' location for each week whose values will be the average of all available locations for that week.

In [15]:
# Exploratory: every date stamp except the last one, shifted forward by one week.
# NOTE(review): `dates_new` is not referenced again in the visible part of the notebook.
dates_new = variants3.date[:-1] + pd.Timedelta('1W')
dates_new

0      2020-07-13
1      2020-09-07
2      2020-10-05
3      2020-10-19
4      2020-11-02
          ...    
3838   2021-08-30
3839   2021-09-13
3840   2021-09-27
3841   2021-10-11
3842   2021-10-25
Name: date, Length: 3843, dtype: datetime64[ns]

In [16]:
# Exploratory check: adding a one-week Timedelta to a single timestamp.
variants3.date[0]+pd.Timedelta('1W')

Timestamp('2020-07-13 00:00:00')

In [17]:
# Helper used to build the in-between weeks: it turns one row into the midpoint
# between itself and the following row, or removes it when no midpoint applies.
def edit_week(df, row1, row2, cols):
    '''Turn row1 into the midpoint of row1 and row2, or drop it.

    If row2 is in the same location as row1 and dated exactly two weeks later,
    row1 is moved one week forward and its values in `cols` are replaced by the
    mean of the two rows. Otherwise row1 is removed from the dataframe.
    The dataframe is modified in place and also returned.
    Note: the df must be sorted by location and date!

    df: the dataframe that will be modified
    row1: the index label of the row to be modified (or dropped)
    row2: the index label of the following row to be checked/averaged
    cols: a list of the columns to be averaged
    '''
    same_location = df.loc[row1, 'location'] == df.loc[row2, 'location']
    two_weeks_later = df.loc[row2, 'date'] == df.loc[row1, 'date'] + pd.Timedelta('2W')
    if not (same_location and two_weeks_later):
        # no valid successor: there is no in-between week to create for this row
        df.drop(row1, inplace=True)
        return df

    midpoint_values = (df.loc[row1, cols] + df.loc[row2, cols]) / 2
    df.loc[row1, 'date'] = df.loc[row1, 'date'] + pd.Timedelta('1W')
    df.loc[row1, cols] = midpoint_values
    return df

In [18]:
# Build the in-between weeks. For every row whose successor is the same location
# dated exactly two weeks later, create a row dated one week later that holds the
# mean of the two rows' variant percentages. All other rows (including the last
# one) produce nothing.
# This gives the same result as calling edit_week on each consecutive pair of
# rows, but is vectorised: no per-row Python loop and no repeated in-place drop.
# Relies on variants3 being sorted by location and date.
cols = variants3.columns[2:].to_list() # identify the columns that have the variant data
following = variants3.shift(-1)  # row n now holds the values of row n+1 (missing for the last row)
has_successor = (
    (variants3['location'] == following['location'])
    & (following['date'] == variants3['date'] + pd.Timedelta('2W'))
)
variants3W = variants3.loc[has_successor].copy()  # keeps the original row labels
variants3W['date'] = variants3W['date'] + pd.Timedelta('1W')
variants3W[cols] = (variants3.loc[has_successor, cols] + following.loc[has_successor, cols]) / 2

In [19]:
# Inspect the interpolated (in-between week) rows.
variants3W.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3410 entries, 2 to 3842
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   location  3410 non-null   object        
 1   date      3410 non-null   datetime64[ns]
 2   Alpha     3410 non-null   float64       
 3   Beta      3410 non-null   float64       
 4   Delta     3410 non-null   float64       
 5   Epsilon   3410 non-null   float64       
 6   Eta       3410 non-null   float64       
 7   Gamma     3410 non-null   float64       
 8   Iota      3410 non-null   float64       
 9   Kappa     3410 non-null   float64       
 10  Lambda    3410 non-null   float64       
 11  Mu        3410 non-null   float64       
 12  Omicron   3410 non-null   float64       
 13  non_who   3410 non-null   float64       
dtypes: datetime64[ns](1), float64(12), object(1)
memory usage: 399.6+ KB


In [20]:
# Combine the original two-weekly rows with the interpolated in-between weeks,
# then restore the location/date ordering and a clean 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
variants4 = (
    pd.concat([variants3, variants3W])
    .sort_values(by=['location', 'date'])
    .reset_index(drop=True)
)
variants4.head()

variant,location,date,Alpha,Beta,Delta,Epsilon,Eta,Gamma,Iota,Kappa,Lambda,Mu,Omicron,non_who
0,Angola,2020-07-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
1,Angola,2020-08-31,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Angola,2020-09-28,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0
3,Angola,2020-10-05,0.0,54.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.86
4,Angola,2020-10-12,0.0,48.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.72


In [21]:
# Row/column count after adding the in-between weeks.
variants4.shape

(7254, 14)

### Create a 'Global' location
Its values are the unweighted average of all the countries' data available for each date.

In [22]:
# Unweighted mean of each variant's share across all locations, per date.
# numeric_only=True keeps the string 'location' column out of the mean; without
# it pandas >= 2.0 raises a TypeError instead of silently dropping the column.
variants_glob = variants4.groupby('date').mean(numeric_only=True)
variants_glob['date'] = variants_glob.index  # bring the group key back as a regular column
variants_glob['location'] = 'Global'
variants_glob = variants_glob.reset_index(drop=True)
variants_glob.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Alpha     83 non-null     float64       
 1   Beta      83 non-null     float64       
 2   Delta     83 non-null     float64       
 3   Epsilon   83 non-null     float64       
 4   Eta       83 non-null     float64       
 5   Gamma     83 non-null     float64       
 6   Iota      83 non-null     float64       
 7   Kappa     83 non-null     float64       
 8   Lambda    83 non-null     float64       
 9   Mu        83 non-null     float64       
 10  Omicron   83 non-null     float64       
 11  non_who   83 non-null     float64       
 12  date      83 non-null     datetime64[ns]
 13  location  83 non-null     object        
dtypes: datetime64[ns](1), float64(12), object(1)
memory usage: 9.2+ KB


## Merge the OWID and Variant data:

In [23]:
# Attach the date-level 'Global' variant shares to every OWID row with a matching
# date. The right-hand 'location' column (always 'Global') gets the "_y" suffix
# and is dropped so that the OWID location is the one that is kept.
owid2 = (
    owid
    .merge(variants_glob, how="inner", on=['date'], suffixes=[None, "_y"])
    .drop(columns='location_y')
)

In [24]:
# Inspect the OWID rows joined with the global variant averages.
owid2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14671 entries, 0 to 14670
Data columns (total 29 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   iso_code                             14671 non-null  object        
 1   location                             14671 non-null  object        
 2   date                                 14671 non-null  datetime64[ns]
 3   reproduction_rate                    14671 non-null  float64       
 4   new_tests_smoothed_per_thousand      9006 non-null   float64       
 5   people_vaccinated_per_hundred        4313 non-null   float64       
 6   people_fully_vaccinated_per_hundred  3918 non-null   float64       
 7   total_boosters_per_hundred           913 non-null    float64       
 8   stringency_index                     13562 non-null  float64       
 9   population_density                   14428 non-null  float64       
 10  median_age

In [25]:
# OWID rows that have country-specific variant data for the same location and date.
owid_countries = owid.merge(variants4, how="inner", on=['location', 'date'])
owid_countries.shape

(6995, 29)

In [34]:
# We want to replace the Global data with the Countries data where available.
# To do so, stack the two datasets with the Country rows first; a later cell
# drops the duplicate (location, date) entries and keeps the first occurrence,
# i.e. the Country data.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
owid3 = pd.concat([owid_countries, owid2])
owid3.shape

(21666, 29)

In [35]:
# Integer number of days since the earliest date in the data; used as a numeric
# time feature to help with imputation. Assigned positionally (as an array),
# because the index still contains duplicate labels at this point.
owid3['date_diff'] = (owid3['date'] - owid3['date'].min()).dt.days.to_numpy()
owid3['date_diff'].head()

0     56
1    112
2    140
3    147
4    154
Name: date_diff, dtype: int64

In [36]:
# Keep one row per (location, date). The Country rows were stacked first, so
# keep='first' prefers them over the Global fallback. The index is renumbered 0..n-1.
owid3 = owid3.drop_duplicates(subset=['location', 'date'], keep='first', ignore_index=True)
owid3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14671 entries, 0 to 14670
Data columns (total 30 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   iso_code                             14671 non-null  object        
 1   location                             14671 non-null  object        
 2   date                                 14671 non-null  datetime64[ns]
 3   reproduction_rate                    14671 non-null  float64       
 4   new_tests_smoothed_per_thousand      9006 non-null   float64       
 5   people_vaccinated_per_hundred        4313 non-null   float64       
 6   people_fully_vaccinated_per_hundred  3918 non-null   float64       
 7   total_boosters_per_hundred           913 non-null    float64       
 8   stringency_index                     13562 non-null  float64       
 9   population_density                   14428 non-null  float64       
 10  median_age

## Scale Data

In [37]:
# Use the MinMaxScaler to scale the Number, Variant, and date_diff columns to [0, 1].
var['scale'] = var['number'] + var['variants'] + ['date_diff']
scaler = MinMaxScaler()
# The scaler returns a bare array. Rebuild the frame on owid3's own index so the
# assignment lines up row-for-row whatever index owid3 carries; a default
# RangeIndex would be aligned by label and could silently introduce NaNs.
owid3[var['scale']] = pd.DataFrame(
    scaler.fit_transform(owid3[var['scale']]),
    columns=var['scale'],
    index=owid3.index,
)
owid3.head()

Unnamed: 0,iso_code,location,date,reproduction_rate,new_tests_smoothed_per_thousand,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,stringency_index,population_density,median_age,gdp_per_capita,extreme_poverty,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,Alpha,Beta,Delta,Epsilon,Eta,Gamma,Iota,Kappa,Lambda,Mu,Omicron,non_who,date_diff
0,AGO,Angola,2020-07-06,1.3,,,,,0.7593,0.001133,0.05136,0.044363,,0.260462,,0.235136,0.332149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.098765
1,AGO,Angola,2020-08-31,1.13,,,,,0.7639,0.001133,0.05136,0.044363,,0.260462,,0.235136,0.332149,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.197531
2,AGO,Angola,2020-09-28,1.16,,,,,0.75,0.001133,0.05136,0.044363,,0.260462,,0.235136,0.332149,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.40006,0.246914
3,AGO,Angola,2020-10-05,1.18,,,,,0.713,0.001133,0.05136,0.044363,,0.260462,,0.235136,0.332149,0.0,0.5414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.458654,0.259259
4,AGO,Angola,2020-10-12,1.2,,,,,0.713,0.001133,0.05136,0.044363,,0.260462,,0.235136,0.332149,0.0,0.4828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.517248,0.271605


In [38]:
# Check that the values were properly scaled: every column should have min 0 and max 1.
owid3[var['scale']].describe()

Unnamed: 0,new_tests_smoothed_per_thousand,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,stringency_index,population_density,median_age,human_development_index,gdp_per_capita,extreme_poverty,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,Alpha,Beta,Delta,Epsilon,Eta,Gamma,Iota,Kappa,Lambda,Mu,Omicron,non_who,date_diff
count,9006.0,4313.0,3918.0,913.0,13562.0,14428.0,14210.0,14267.0,14121.0,9793.0,7335.0,13066.0,14589.0,14671.0,14671.0,14671.0,14671.0,14671.0,14671.0,14671.0,14671.0,14671.0,14671.0,14671.0,14671.0,14671.0
mean,0.031861,0.309481,0.274807,0.089975,0.558454,0.015732,0.462314,0.586523,0.156674,0.175673,0.503022,0.212789,0.58565,0.135999,0.031464,0.278307,0.001583,0.005016,0.021805,0.002054,0.002174,0.006041,0.005961,0.001984,0.510628,0.510624
std,0.073003,0.26925,0.273094,0.179356,0.188171,0.078814,0.276773,0.270509,0.169345,0.263146,0.327255,0.18014,0.229822,0.240658,0.101743,0.407454,0.013383,0.031222,0.079873,0.017536,0.019252,0.039937,0.040701,0.019252,0.443168,0.289618
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.00344,0.047299,0.030798,0.0,0.4259,0.001772,0.199396,0.35524,0.030672,0.005161,0.184918,0.087591,0.413206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026047,0.259259
50%,0.01176,0.249541,0.173077,0.005213,0.5648,0.004167,0.441088,0.62167,0.100053,0.027097,0.494362,0.167664,0.633104,0.002397,0.000156,0.000434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50005,0.518519
75%,0.031952,0.55158,0.484729,0.094062,0.7037,0.010539,0.712991,0.806394,0.224873,0.272258,0.838893,0.284672,0.755602,0.177391,0.023175,0.732812,0.000137,0.000303,0.012787,0.000101,4.2e-05,0.003552,0.001837,0.0,0.999999,0.765432
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Impute Data

In [39]:
var['vaccine'] = [
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'total_boosters_per_hundred',
]
# Fill missing vaccine values with 0 for every row dated on or before 31 Dec 2020;
# later rows are left untouched.
# NOTE(review): these columns were already min-max scaled above, so 0.0 here means
# "the column minimum" - confirm that the raw minimum is 0.
pre_2021 = owid3.date <= pd.Timestamp(2020,12,31)
owid3.loc[pre_2021, var['vaccine']] = owid3.loc[pre_2021, var['vaccine']].fillna(0.0)

In [40]:
# Count the vaccine values that are still missing after the zero-fill.
owid3[var['vaccine']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14671 entries, 0 to 14670
Data columns (total 3 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   people_vaccinated_per_hundred        10122 non-null  float64
 1   people_fully_vaccinated_per_hundred  9759 non-null   float64
 2   total_boosters_per_hundred           6767 non-null   float64
dtypes: float64(3)
memory usage: 344.0 KB


In [41]:
# The KNN imputer finds the rows with the most similar features and fills each
# missing value with the average of the n nearest neighbours.
imputer = KNNImputer(n_neighbors=10, add_indicator=False)

# First impute values that won't vary significantly with time:
var['imputer_no_date'] = ['population_density', 'median_age',
       'gdp_per_capita', 'extreme_poverty', 'handwashing_facilities',
       'hospital_beds_per_thousand', 'life_expectancy',
       'human_development_index']
# The imputer returns a bare array. Rebuild the frame on owid3's own index so the
# assignment lines up row-for-row whatever index owid3 carries.
owid3[var['imputer_no_date']] = pd.DataFrame(
    imputer.fit_transform(owid3[var['imputer_no_date']]),
    columns=var['imputer_no_date'],
    index=owid3.index,
)

# Next, impute the time-sensitive health-related values. date_diff is included as
# a feature, as are two columns already completed above (gdp_per_capita and
# hospital_beds_per_thousand); the imputer is refitted on this column set.
var['imputer_date'] = ['new_tests_smoothed_per_thousand', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred',
       'gdp_per_capita', 'hospital_beds_per_thousand',
       'stringency_index', 'date_diff']
owid3[var['imputer_date']] = pd.DataFrame(
    imputer.fit_transform(owid3[var['imputer_date']]),
    columns=var['imputer_date'],
    index=owid3.index,
)

In [42]:
owid3.info() # after imputation every column should be fully populated (no nulls)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14671 entries, 0 to 14670
Data columns (total 30 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   iso_code                             14671 non-null  object        
 1   location                             14671 non-null  object        
 2   date                                 14671 non-null  datetime64[ns]
 3   reproduction_rate                    14671 non-null  float64       
 4   new_tests_smoothed_per_thousand      14671 non-null  float64       
 5   people_vaccinated_per_hundred        14671 non-null  float64       
 6   people_fully_vaccinated_per_hundred  14671 non-null  float64       
 7   total_boosters_per_hundred           14671 non-null  float64       
 8   stringency_index                     14671 non-null  float64       
 9   population_density                   14671 non-null  float64       
 10  median_age

## Save files externally for the next notebook

In [45]:
# Write the prepared (scaled and imputed) dataframe to the working directory.
owid3.to_parquet('covid_owid_df.parquet') # output to a parquet file

In [44]:
# Display the full variable dictionary that is pickled below.
var

{'imputer_date': ['new_tests_smoothed_per_thousand',
  'people_vaccinated_per_hundred',
  'people_fully_vaccinated_per_hundred',
  'total_boosters_per_hundred',
  'gdp_per_capita',
  'hospital_beds_per_thousand',
  'stringency_index',
  'date_diff'],
 'imputer_no_date': ['population_density',
  'median_age',
  'gdp_per_capita',
  'extreme_poverty',
  'handwashing_facilities',
  'hospital_beds_per_thousand',
  'life_expectancy',
  'human_development_index'],
 'meta': ['date', 'iso_code', 'location'],
 'number': ['new_tests_smoothed_per_thousand',
  'people_vaccinated_per_hundred',
  'people_fully_vaccinated_per_hundred',
  'total_boosters_per_hundred',
  'stringency_index',
  'population_density',
  'median_age',
  'human_development_index',
  'gdp_per_capita',
  'extreme_poverty',
  'handwashing_facilities',
  'hospital_beds_per_thousand',
  'life_expectancy'],
 'scale': ['new_tests_smoothed_per_thousand',
  'people_vaccinated_per_hundred',
  'people_fully_vaccinated_per_hundred',
  't

In [46]:
# Persist the variable-group dictionary alongside the parquet file.
with open('covid_owid_var_dictionary.pkl', 'wb') as var_file:
    pickle.dump(var, var_file)