In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn.feature_selection import VarianceThreshold

In [45]:
data = pd.read_csv('new_dataset/owid-covid-data-most-recent.csv')
print(data.shape)
data = data.dropna(subset = ['new_deaths'])
print(data.shape)

# converting date to date object
data['date'] = pd.to_datetime(data['date'])

# dropping observations before COVID-19 was detected in the country
data = data[~data['total_cases'].isna()]

# dropping redundant columns like smoothed and similar predictors
# maybe someone else go through this and check just in case
columns_to_drop = ['new_cases_smoothed',
                   'new_cases_smoothed_per_million',
                   'total_deaths',
                   'total_deaths_per_million',
                   'new_deaths_per_million',
                   'new_deaths_smoothed',
                   'new_deaths_smoothed_per_million',
                   'excess_mortality_cumulative',
                   'excess_mortality_cumulative_absolute',
                   'excess_mortality_cumulative_per_million',
                   'icu_patients',
                   'hosp_patients',
                   'weekly_icu_admissions',
                   'weekly_hosp_admissions',
                   'total_tests',
                   'new_tests',
                   'new_tests_smoothed',
                   'new_tests_smoothed_per_thousand',
                   'tests_per_case', #this is the inverse of positive_rate
                   'new_vaccinations_smoothed',
                   'total_vaccinations',
                   'people_vaccinated',
                   'people_fully_vaccinated',
                   'total_boosters',
                   'new_vaccinations_smoothed_per_million',
                   'new_people_vaccinated_smoothed']
data = data.drop(columns_to_drop,axis=1)

# filtering countries by null values in other columns
null_counts = data.groupby('location').agg(lambda x: x.isnull().sum(axis=0))
# missing_counts = data.groupby('location').apply(lambda x: x.isnull().sum())
total_null = null_counts.sum(axis=1)
null_counts['total_null'] = null_counts.sum(axis=1)
#filtering top 10 countries 
sorted_countries = null_counts[['total_null']].sort_values(by = 'total_null',ascending=True)
top_df = sorted_countries.iloc[:10]
filtered_countries_list = top_df.index.to_list()
data = data[data['location'].isin(filtered_countries_list)]
print(data.shape)
total_missing_per_country = total_null
sorted_countries = total_missing_per_country.sort_values(ascending=True)
sorted_countries.head(10)

(276420, 67)
(51621, 67)
(2007, 41)


location
United States    1008
Estonia          1104
Italy            1191
Ireland          1285
Israel           1367
Czechia          1408
Malaysia         1465
Belgium          1514
Chile            1559
France           1583
dtype: int64

In [46]:
missingness_prop = sorted_countries / len(data)
sorted_countries = sorted_countries.to_frame(name='total_missing').join(missingness_prop.rename('missingness_prop'))
sorted_countries.head(10)

Unnamed: 0_level_0,total_missing,missingness_prop
location,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,1008,0.502242
Estonia,1104,0.550075
Italy,1191,0.593423
Ireland,1285,0.640259
Israel,1367,0.681116
Czechia,1408,0.701545
Malaysia,1465,0.729945
Belgium,1514,0.75436
Chile,1559,0.776781
France,1583,0.788739


In [47]:
#missingness by variable
def highlight(val):
    color = 'red' if val > 0.5 else 'default'
    return f'color: {color}'

missing_values = data.isnull().sum()
missingness_prop = missing_values / len(data)
column_missingness = missing_values.to_frame(name='total_missing').join(missingness_prop.rename('missingness_prop')).sort_values(by='missingness_prop',ascending=False)
column_missingness = column_missingness.style.applymap(highlight, subset=['missingness_prop'])
column_missingness

Unnamed: 0,total_missing,missingness_prop
handwashing_facilities,2007,1.0
total_boosters_per_hundred,996,0.496263
positive_rate,832,0.414549
new_tests_per_thousand,825,0.411061
total_tests_per_thousand,825,0.411061
tests_units,825,0.411061
weekly_icu_admissions_per_million,781,0.389138
new_vaccinations,764,0.380668
people_fully_vaccinated_per_hundred,735,0.366218
people_vaccinated_per_hundred,733,0.365222


In [48]:
#correlation matrix
corr = pd.DataFrame(np.abs(data.corrwith(data["new_deaths"])).sort_values(ascending=False))
corr = corr.rename(columns={0: 'Correlation with new_deaths'})
corr.style.applymap(highlight)

  corr = pd.DataFrame(np.abs(data.corrwith(data["new_deaths"])).sort_values(ascending=False))


Unnamed: 0,Correlation with new_deaths
new_deaths,1.0
population,0.711513
new_cases,0.642166
new_vaccinations,0.548846
weekly_icu_admissions_per_million,0.461244
male_smokers,0.372662
total_cases,0.350927
icu_patients_per_million,0.338267
weekly_hosp_admissions_per_million,0.30529
excess_mortality,0.300388


In [49]:
num_col = data.select_dtypes(include=['number']).columns
data_num = data[num_col]
threshold = 0.95
selector = VarianceThreshold(threshold)
selector.fit(data_num)
quasi_constant_indices = data_num.columns[~selector.get_support()]
quasi_constant_indices

  self.variances_ = np.nanvar(X, axis=0)


Index(['reproduction_rate', 'positive_rate',
       'new_people_vaccinated_smoothed_per_hundred', 'extreme_poverty',
       'handwashing_facilities', 'human_development_index'],
      dtype='object')

In [50]:
corr['missingness_prop'] = missingness_prop
corr['low correlation/high missingness'] = (1-corr['Correlation with new_deaths'])*missingness_prop
corr.sort_values(by='low correlation/high missingness', ascending=False).style.applymap(highlight)

Unnamed: 0,Correlation with new_deaths,missingness_prop,low correlation/high missingness
new_tests_per_thousand,0.03182,0.411061,0.397981
total_tests_per_thousand,0.042468,0.411061,0.393604
positive_rate,0.068776,0.414549,0.386038
total_boosters_per_hundred,0.250674,0.496263,0.371863
people_vaccinated_per_hundred,0.198599,0.365222,0.292689
people_fully_vaccinated_per_hundred,0.220114,0.366218,0.285609
total_vaccinations_per_hundred,0.217326,0.363229,0.28429
new_people_vaccinated_smoothed_per_hundred,0.115379,0.315894,0.279447
reproduction_rate,0.104465,0.265072,0.237382
weekly_icu_admissions_per_million,0.461244,0.389138,0.20965


In [51]:
# Variables to remove:
#       quasi-constant variables: 'reproduction_rate', 'positive_rate', 'extreme_poverty','handwashing_facilities', 'human_development_index']
#       low correlation and high missigness: 'new_tests_per_thousand','total_tests_per_thousand','positive_rate','total_boosters_per_hundred'
#       handwashing_facilities is 100% missing in the smaller data set

In [52]:
# Feature reduction 

features_to_remove = quasi_constant_indices.to_list()
features_to_remove.extend(['new_tests_per_thousand','total_tests_per_thousand','positive_rate','total_boosters_per_hundred'])
data = data.drop(features_to_remove, axis=1, errors='ignore')
data.shape

(2007, 32)