Begin by importing all necessary libraries.

In [1]:
import math
import pandas as pd
import numpy as np
%matplotlib inline

Then import all data files.

In [2]:
# Reference tables: demographic and economic context.
demo_ref = pd.read_csv('demographicref.csv')
econ_ref = pd.read_csv('econref.csv')

# Healthcare tables: expenditure, quality, resources and utilization.
health_expen = pd.read_csv('healthexpenditure.csv')
health_qual = pd.read_csv('healthquality.csv')
health_resor = pd.read_csv('healthresources.csv')
health_util = pd.read_csv('healthutil.csv')

# Social healthcare protection and healthcare worker migration.
social_proc = pd.read_csv('socialprotection.csv')
worker_migr = pd.read_csv('healthworkmigration.csv')

The final dataset (health status) cannot be loaded past the 284,614th line. To avoid keeping partial data for one variable, we read only the first 284,583 rows, which cuts the last 40,000 or so entries of the file.

In [3]:
# Row cap for the health status file. Loading fails further into the file, so
# reading stops at this row count; the cut-off is chosen so that no variable is
# left with only part of its observations.
HEALTH_STAT_NROWS = 284583

health_stat = pd.read_csv('healthstatus.csv', nrows=HEALTH_STAT_NROWS)  # health status data

Remove unwanted columns of data.

In [4]:
# Six of the tables share the same long-format layout, so that column list is
# written once and reused.
long_format_cols = ['Variable', 'Measure', 'Country', 'Year', 'Value']

demo_ref = demo_ref[long_format_cols]
econ_ref = econ_ref[long_format_cols]
health_resor = health_resor[long_format_cols]
health_util = health_util[long_format_cols]
social_proc = social_proc[long_format_cols]
health_stat = health_stat[long_format_cols]

# The remaining tables each have their own layout.
health_expen = health_expen[
    ['Financing scheme', 'Function', 'Provider', 'Measure', 'Country', 'Year', 'Unit', 'Value']
]
# In health_qual, 'Value' is the column later compared against unit descriptions;
# 'Value.1' presumably carries the numeric reading -- TODO confirm against the CSV.
health_qual = health_qual[
    ['Country', 'Periods', 'Indicator', 'Gender', 'Age Group', 'Value', 'Value.1']
]
worker_migr = worker_migr[['Country', 'Variable', 'Country of origin', 'Year', 'Value']]

Remove unwanted rows of data. These are typically rows containing data measured in units that cannot be compared between countries (e.g. in local currency or raw numbers of incidents). We also remove rows containing variables that have few observations or are unlikely to be helpful for our purposes. Not all data sets required this.

In [47]:
# Keep only the rows whose unit of measure appears in an explicit whitelist.
# Each whitelist is applied with Series.isin, which selects exactly the rows
# that a chain of `==` comparisons joined with `|` would select.

# Healthcare expenditure: a single accepted unit.
health_expen = health_expen.loc[health_expen['Measure'] == 'Share of gross domestic product']

# Healthcare quality: this table is filtered on its 'Value' column, not 'Measure'.
# The trailing space in the last entry is kept on purpose -- presumably it
# matches the raw data exactly; confirm before "tidying" it away.
health_qual_units = [
    'Age-sex standardised rate per 100 000 population',
    'Age-sex standardised rate per 100 patients',
    'Age-standardised survival (%) ',
]
health_qual = health_qual.loc[health_qual['Value'].isin(health_qual_units)]

# Healthcare resources.
health_resor_measures = [
    'Density per 1 000 population (head counts)',
    'Per million population',
    '% of total physicians (head counts)',
    'Per 1 000 population',
    '% of physicians (head counts)',
    'Per 100 000 population',
    '% of total hospital employment (head counts)',
    'Per 1 000 live births',
    'Salaried, income, US$ exchange rate',
]
health_resor = health_resor.loc[health_resor['Measure'].isin(health_resor_measures)]

# Healthcare utilization.
# 'Inpatient cases per 100 000 population ' also keeps its trailing space on
# purpose -- presumably it matches the raw data; confirm.
health_util_measures = [
    'Per 100 000 population',
    '% performed as inpatient cases',
    '% performed as day cases',
    'Per 100 000 females',
    'Inpatient cases per 100 000 population ',
    'Total procedures per 100 000 population',
    'Day cases per 100 000 population',
    'Per 1 000 population',
]
health_util = health_util.loc[health_util['Measure'].isin(health_util_measures)]

# Social healthcare protection.
social_proc_measures = [
    '% of total population',
    '% of total population covered',
]
social_proc = social_proc.loc[social_proc['Measure'].isin(social_proc_measures)]

# Health status.
health_stat_measures = [
    'Deaths per 100 000 females (standardised rates)',
    'Deaths per 100 000 females (crude rates)',
    'Years lost, /100 000 females, aged 75 years old',
    'Deaths per 100 000 males (standardised rates)',
    'Deaths per 100 000 population (standardised rates)',
    'Deaths per 100 000 males (crude rates)',
    'Years lost, /100 000 males, aged 75 years old',
    'Years lost, /100 000 population, aged 75 years old',
    'Deaths per 100 000 population (crude rates)',
    '% of population (crude rate)',
    'Years',
]
health_stat = health_stat.loc[health_stat['Measure'].isin(health_stat_measures)]

Next we have to clean and tidy the data.

In [46]:
# Reshape the two reference tables from long to wide: one row per
# (Country, Year) and one column per Variable, filled from 'Value'.
# NOTE(review): 'Measure' is not part of the pivot and pivot_table aggregates
# with the mean by default, so a Variable reported under several measures for
# the same Country/Year would be averaged together -- confirm this is intended.
wide_layout = dict(index=['Country', 'Year'], columns='Variable', values='Value')

demo_ref_pv = demo_ref.pivot_table(**wide_layout)
econ_ref_pv = econ_ref.pivot_table(**wide_layout)