In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
# Question: Between different countries,
# what is the relationship between Case Fatality Rate and hospital beds per capita?

In [3]:
# Source CSV hosted at covid.ourworldindata.org. According to the authors, this
# URL is refreshed more frequently than the one Dr. Mick gave.
covid_csv_url = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
raw_covid = pd.read_csv(covid_csv_url)

# Summarise columns, non-null counts and dtypes of the freshly loaded frame.
raw_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49878 entries, 0 to 49877
Data columns (total 41 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   iso_code                         49590 non-null  object 
 1   continent                        49302 non-null  object 
 2   location                         49878 non-null  object 
 3   date                             49878 non-null  object 
 4   total_cases                      49242 non-null  float64
 5   new_cases                        49018 non-null  float64
 6   new_cases_smoothed               48236 non-null  float64
 7   total_deaths                     49242 non-null  float64
 8   new_deaths                       49018 non-null  float64
 9   new_deaths_smoothed              48236 non-null  float64
 10  total_cases_per_million          48954 non-null  float64
 11  new_cases_per_million            48954 non-null  float64
 12  new_cases_smoothed

In [4]:
# Clean the data: drop non-country rows, rows without a hospital-bed figure,
# and one extreme outlier.
excluded_locations = [
    'International',  # not a country
    'World',          # not a country
    'Yemen',          # extreme outlier
]
keep_rows = (
    ~raw_covid['location'].isin(excluded_locations)
    & raw_covid['hospital_beds_per_thousand'].notna()  # excluding nulls
)
country_covid = raw_covid[keep_rows]


In [5]:
# Sanity check: list the distinct locations that survived the filtering step.
country_covid['location'].unique()

array(['Afghanistan', 'Albania', 'United Arab Emirates', 'Argentina',
       'Armenia', 'Antigua and Barbuda', 'Australia', 'Austria',
       'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso',
       'Bangladesh', 'Bulgaria', 'Bahrain', 'Bahamas',
       'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bolivia', 'Brazil',
       'Barbados', 'Brunei', 'Bhutan', 'Botswana',
       'Central African Republic', 'Canada', 'Switzerland', 'Chile',
       'China', 'Cameroon', 'Colombia', 'Comoros', 'Cape Verde',
       'Costa Rica', 'Cuba', 'Cyprus', 'Czech Republic', 'Germany',
       'Djibouti', 'Dominica', 'Denmark', 'Dominican Republic', 'Algeria',
       'Ecuador', 'Egypt', 'Eritrea', 'Spain', 'Estonia', 'Ethiopia',
       'Finland', 'Fiji', 'France', 'Gabon', 'United Kingdom', 'Georgia',
       'Ghana', 'Guinea', 'Gambia', 'Equatorial Guinea', 'Greece',
       'Grenada', 'Guatemala', 'Guyana', 'Honduras', 'Croatia', 'Haiti',
       'Hungary', 'Indonesia', 'India', 'Ireland', 'Ira

In [6]:
# Overall case fatality rate (in percent) for each location:
#   largest total_deaths / largest total_cases * 100
by_location = country_covid.groupby('location')
peak_deaths = by_location['total_deaths'].max()
peak_cases = by_location['total_cases'].max()
cfr_by_country = peak_deaths / peak_cases * 100

cfr_by_country

location
Afghanistan            3.706672
Albania                2.723186
Algeria                3.398779
Antigua and Barbuda    2.702703
Argentina              2.676280
                         ...   
Uzbekistan             0.828810
Venezuela              0.840537
Vietnam                3.153153
Zambia                 2.218792
Zimbabwe               2.867473
Length: 163, dtype: float64

In [7]:
# The predictor is hospital beds per thousand people. That field is constant
# within each country, so the per-country mean just stands in for that single
# value. If the field varied, averaging it would be a questionable shortcut.
predictor = (
    country_covid
    .groupby('location')['hospital_beds_per_thousand']
    .mean()
)

predictor

location
Afghanistan            0.50
Albania                2.89
Algeria                1.90
Antigua and Barbuda    3.80
Argentina              5.00
                       ... 
Uzbekistan             4.00
Venezuela              0.80
Vietnam                2.60
Zambia                 2.00
Zimbabwe               1.70
Name: hospital_beds_per_thousand, Length: 163, dtype: float64

In [8]:
# r^2 is the square of the Pearson correlation between the predictor and the
# case fatality rate. It comes out at barely 1%.
pearson_r = predictor.corr(cfr_by_country, method='pearson')
pearson_r ** 2

0.010396215930641458

In [9]:
# Why is there almost no linear correlation (r^2 is only about 0.01)?