In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
#load the raw data file
df = pd.read_excel("WDIEXCEL.xlsx", sheet_name = "Data")

In [3]:
#sample the data
df.sample(20)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
154676,Equatorial Guinea,GNQ,Imports of goods and services (constant 2010 US$),NE.IMP.GNFS.KD,,,,,,,...,7747964000.0,9592742000.0,7898337000.0,7452640000.0,6300999000.0,5274790000.0,5714161000.0,6135794000.0,6251749000.0,
267674,Nepal,NPL,"Stocks traded, turnover ratio of domestic shar...",CM.MKT.TRNR,,,,,,,...,,,,,,,,,,
293105,Poland,POL,"Mortality from CVD, cancer, diabetes or CRD be...",SH.DYN.NCOM.MA.ZS,,,,,,,...,,,,,26.4,25.3,,,,
192970,Hungary,HUN,Account ownership at a financial institution o...,FX.OWN.TOTL.60.ZS,,,,,,,...,75.89096,,,76.03838,,,79.77351,,,
126174,Chile,CHL,"Net official flows from UN agencies, UNECE (cu...",DT.NFL.UNEC.CD,,,,,,,...,,,,,,,,,,
364618,United States,USA,DEC alternative conversion factor (LCU per US$),PA.NUS.ATLS,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
135653,Cote d'Ivoire,CIV,Customs and other import duties (current LCU),GC.TAX.IMPT.CN,,,,,,,...,399200000000.0,670600000000.0,752217000000.0,840278600000.0,440162200000.0,448171000000.0,485538700000.0,524717600000.0,,
371604,Vietnam,VNM,Annualized average growth rate in per capita r...,SI.SPR.PC40.ZG,,,,,,,...,,,,,,,,4.91,,
371056,"Venezuela, RB",VEN,People with basic handwashing facilities inclu...,SH.STA.HYGN.UR.ZS,,,,,,,...,,,,,,,,,,
285611,Panama,PAN,Gini index (World Bank estimate),SI.POV.GINI,,,,,,,...,51.3,51.7,51.5,50.5,50.8,50.4,49.9,49.2,,


In [4]:
#check to confirm there are no missing values in the indicator code column
#because we will filter countries based on this code
df.isnull().sum()

Country Name           0
Country Code           0
Indicator Name         0
Indicator Code         0
1960              342527
                   ...  
2016              172632
2017              188422
2018              207180
2019              271505
2020              366138
Length: 65, dtype: int64

These are the variable of interest that could have an impact on life expectancy.

#Birth rate, crude (per 1,000 people)	SP.DYN.CBRT.IN
#Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)	SH.DTH.COMM.ZS
#Cause of death, by non-communicable diseases (% of total)	SH.DTH.NCOM.ZS
#Current health expenditure (% of GDP)	SH.XPD.CHEX.GD.ZS
#Death rate, crude (per 1,000 people)	SP.DYN.CDRT.IN
#Fertility rate, total (births per woman)	SP.DYN.TFRT.IN
#Hospital beds (per 1,000 people)	SH.MED.BEDS.ZS
#Immunization, DPT (% of children ages 12-23 months)	SH.IMM.IDPT
#Immunization, HepB3 (% of one-year-old children)	SH.IMM.HEPB
#Immunization, measles (% of children ages 12-23 months)	SH.IMM.MEAS
#Incidence of HIV, all (per 1,000 uninfected population)	SH.HIV.INCD.TL.P3
#Incidence of tuberculosis (per 100,000 people)	SH.TBS.INCD
#Intentional homicides (per 100,000 people)	VC.IHR.PSRC.P5
#Life expectancy at birth, total (years)	SP.DYN.LE00.IN
#Mortality caused by road traffic injury (per 100,000 people)	SH.STA.TRAF.P5
#Physicians (per 1,000 people)	SH.MED.PHYS.ZS
#Population density (people per sq. km of land area)	EN.POP.DNST
#Prevalence of overweight, weight for height (% of children under 5)	SH.STA.OWGH.ZS
#Smoking prevalence, total (ages 15+)	SH.PRV.SMOK
#Suicide mortality rate (per 100,000 population)	SH.STA.SUIC.P5
#Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)	SH.ALC.PCAP.LI

In [5]:
#drop all date columns except 2018 because we have data till 2018 only although 2019 and 2020 columns also exist
#there are no data for 2019 and 2020
df_2018 = df.drop(columns=['1960', '1961', '1962','1963','1964','1965','1966','1967','1968','1969','1970','1971','1972', '1973', '1974', '1975','1976', '1977','1978', '1979','1980', '1981','1982', '1983', '1984', '1985','1986', '1987','1988', '1989','1990', '1991','1992', '1993', '1994', '1995','1996', '1997','1998', '1999','1990','1991','1992','1993', '1994', '1995','1996', '1997','1998', '1999','2000', '2001','2002','2003', '2004', '2005', '2006','2007', '2008','2009', '2010','2011', '2012','2013', '2014', '2015', '2016','2017', '2019', '2020'])

"Country Name" column seems to have grouped region names in addition to country names. So, lets remove all the rows which have broader region in the dataset.

In [6]:
#check whether the "Country Name" column has only countries name.
df_2018["Country Name"].unique()

array(['Arab World', 'Caribbean small states',
       'Central Europe and the Baltics', 'Early-demographic dividend',
       'East Asia & Pacific',
       'East Asia & Pacific (excluding high income)',
       'East Asia & Pacific (IDA & IBRD countries)', 'Euro area',
       'Europe & Central Asia',
       'Europe & Central Asia (excluding high income)',
       'Europe & Central Asia (IDA & IBRD countries)', 'European Union',
       'Fragile and conflict affected situations',
       'Heavily indebted poor countries (HIPC)', 'High income',
       'IBRD only', 'IDA & IBRD total', 'IDA blend', 'IDA only',
       'IDA total', 'Late-demographic dividend',
       'Latin America & Caribbean',
       'Latin America & Caribbean (excluding high income)',
       'Latin America & the Caribbean (IDA & IBRD countries)',
       'Least developed countries: UN classification',
       'Low & middle income', 'Low income', 'Lower middle income',
       'Middle East & North Africa',
       'Middle East & No

In [7]:
#create a list with all the row values that needs to be removed. We are removing the rows that have data about regions of
#the world because we are only interested in countries
row_values_to_remove = ['Arab World', 
'Caribbean small states',                        
'Central Europe and the Baltics', 
'East Asia & Pacific (excluding high income)',
'Early-demographic dividend', 'East Asia & Pacific',
'Europe & Central Asia (excluding high income)',
'Europe & Central Asia','Euro area',
'European Union', 'Fragile and conflict affected situations',
'High income', 
'Heavily indebted poor countries (HIPC)', 
'IBRD only', 'IDA & IBRD total', 'IDA total',
'IDA blend', 'IDA only', 
'Not classified', 
'Latin America & Caribbean (excluding high income)',
'Latin America & Caribbean',
'Least developed countries: UN classification', 'Low income',
'Lower middle income',
'Low & middle income', 'Late-demographic dividend',
'Middle East & North Africa',
'Middle East & North Africa (excluding high income)', 
'North America', 
'OECD members',
'Other small states', 
'Pre-demographic dividend',
'West Bank and Gaza', 'Pacific island small states',
'South Asia',
'Sub-Saharan Africa (excluding high income)', 
'Sub-Saharan Africa', 'Small states', 'Sao Tome and Principe',
'East Asia & Pacific (IDA & IBRD countries)',
'Europe & Central Asia (IDA & IBRD countries)', 
'Latin America & the Caribbean (IDA & IBRD countries)',
'Middle East & North Africa (IDA & IBRD countries)',
'South Asia (IDA & IBRD)',
'Sub-Saharan Africa (IDA & IBRD countries)', 
'Upper middle income']

In [8]:
#create a for loop to iterate over the length of the list and remove rows with each value from list in each iteratation
for i in range(len(row_values_to_remove)):
    df_2018 = df_2018[df_2018["Country Name"] != row_values_to_remove[i]]

In [9]:
#extract values of variables of interest into separate series 
crude_birth_rate = df_2018.loc[df_2018['Indicator Code'] == 'SP.DYN.CBRT.IN','2018'].reset_index(drop = True)
death_comm_disease = df_2018.loc[df_2018['Indicator Code'] == 'SH.DTH.COMM.ZS','2018'].reset_index(drop = True)
death_noncomm_disease = df_2018.loc[df_2018['Indicator Code'] == 'SH.DTH.NCOM.ZS','2018'].reset_index(drop = True)
health_expenditure = df_2018.loc[df_2018['Indicator Code'] == 'SH.XPD.CHEX.GD.ZS','2018'].reset_index(drop = True)
crude_death_rate = df_2018.loc[df_2018['Indicator Code'] == 'SP.DYN.CDRT.IN','2018'].reset_index(drop = True)
fertility_rate = df_2018.loc[df_2018['Indicator Code'] == 'SP.DYN.TFRT.IN','2018'].reset_index(drop = True)
hospital_bed = df_2018.loc[df_2018['Indicator Code'] == 'SH.MED.BEDS.ZS','2018'].reset_index(drop = True)
DPT_immu = df_2018.loc[df_2018['Indicator Code'] == 'SH.IMM.IDPT','2018'].reset_index(drop = True)
HepB3_immu = df_2018.loc[df_2018['Indicator Code'] == 'SH.IMM.HEPB','2018'].reset_index(drop = True)
measles_immu = df_2018.loc[df_2018['Indicator Code'] == 'SH.IMM.MEAS','2018'].reset_index(drop = True)
HIV_incidence = df_2018.loc[df_2018['Indicator Code'] == 'SH.HIV.INCD.TL.P3','2018'].reset_index(drop = True)
TB_incidence = df_2018.loc[df_2018['Indicator Code'] == 'SH.TBS.INCD','2018'].reset_index(drop = True)
homicides_per100000 = df_2018.loc[df_2018['Indicator Code'] == 'VC.IHR.PSRC.P5','2018'].reset_index(drop = True)
life_expectancy = df_2018.loc[df_2018['Indicator Code'] == 'SP.DYN.LE00.IN','2018'].reset_index(drop = True)
road_traffic_mortality = df_2018.loc[df_2018['Indicator Code'] == 'SH.STA.TRAF.P5','2018'].reset_index(drop = True)
physicians_per1000_people = df_2018.loc[df_2018['Indicator Code'] == 'SH.MED.PHYS.ZS','2018'].reset_index(drop = True)
population_density = df_2018.loc[df_2018['Indicator Code'] == 'EN.POP.DNST','2018'].reset_index(drop = True)
overweight_prevalence = df_2018.loc[df_2018['Indicator Code'] == 'SH.STA.OWGH.ZS','2018'].reset_index(drop = True)
smoking_prevalence = df_2018.loc[df_2018['Indicator Code'] == 'SH.PRV.SMOK','2018'].reset_index(drop = True)
suicide_per1000000 = df_2018.loc[df_2018['Indicator Code'] == 'SH.STA.SUIC.P5','2018'].reset_index(drop = True)
alcohol_consumption_per_capita = df_2018.loc[df_2018['Indicator Code'] == 'SH.ALC.PCAP.LI','2018'].reset_index(drop = True)


In [10]:
#reassign each variable into a series so we can use them to create our dataframe
crude_birth_rate = pd.Series(crude_birth_rate)
death_comm_disease = pd.Series(death_comm_disease)
death_noncomm_disease = pd.Series(death_noncomm_disease)
health_expenditure = pd.Series(health_expenditure)
crude_death_rate = pd.Series(crude_death_rate)
fertility_rate = pd.Series(fertility_rate)
hospital_bed = pd.Series(hospital_bed)
DPT_immu = pd.Series(DPT_immu)
HepB3_immu = pd.Series(HepB3_immu)
measles_immu = pd.Series(measles_immu)
HIV_incidence = pd.Series(HIV_incidence)
TB_incidence = pd.Series(TB_incidence)
homicides_per100000 = pd.Series(homicides_per100000)
life_expectancy = pd.Series(life_expectancy)
road_traffic_mortality = pd.Series(road_traffic_mortality)
physicians_per1000_people = pd.Series(physicians_per1000_people)
population_density = pd.Series(population_density)
overweight_prevalence = pd.Series(overweight_prevalence)
smoking_prevalence = pd.Series(smoking_prevalence)
suicide_per1000000 = pd.Series(suicide_per1000000)
alcohol_consumption_per_capita = pd.Series(alcohol_consumption_per_capita)

In [11]:
#verify each series is same length
crude_birth_rate.shape,death_comm_disease.shape,health_expenditure.shape,crude_death_rate.shape,fertility_rate.shape,hospital_bed.shape,DPT_immu.shape, HepB3_immu.shape,measles_immu.shape,HIV_incidence.shape,TB_incidence.shape,homicides_per100000.shape,life_expectancy.shape,road_traffic_mortality.shape, physicians_per1000_people.shape,population_density.shape,overweight_prevalence.shape,smoking_prevalence.shape,suicide_per1000000.shape,alcohol_consumption_per_capita.shape 


((218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,),
 (218,))

In [12]:
#create a variable to store all variables to create a dataframe
frame = {'Crude_birth_rate':crude_birth_rate, 
         'Death_comm_disease':death_comm_disease,  
         'Death_noncomm_disease':death_noncomm_disease,
         'Health_expenditure':health_expenditure,
         'Crude_death_rate':crude_death_rate,
         'Fertility_rate':fertility_rate,
         'Hospital_bed':hospital_bed,  
         'DPT_immu':DPT_immu,  
         'HepB3_immu':HepB3_immu,  
         'Measles_immu':measles_immu,
         'HIV_incidence':HIV_incidence,  
         'TB_incidence':TB_incidence,  
         'Homicides_per100000':homicides_per100000,  
         'Life_expectancy':life_expectancy,  
         'Road_traffic_mortality':road_traffic_mortality,  
         'Physicians_per1000_people':physicians_per1000_people,  
         'Population_density':population_density,  
         'Overweight_prevalence':overweight_prevalence,  
         'Smoking_prevalence':smoking_prevalence,  
         'Suicide_per1000000':suicide_per1000000,  
         'Alcohol_consumption_per_capita':alcohol_consumption_per_capita
        }

In [13]:
#create the dataframe
result = pd.DataFrame(frame).reset_index(drop = True)

In [14]:
#check the dataframe
result.head(10)


Unnamed: 0,Crude_birth_rate,Death_comm_disease,Death_noncomm_disease,Health_expenditure,Crude_death_rate,Fertility_rate,Hospital_bed,DPT_immu,HepB3_immu,Measles_immu,...,TB_incidence,Homicides_per100000,Life_expectancy,Road_traffic_mortality,Physicians_per1000_people,Population_density,Overweight_prevalence,Smoking_prevalence,Suicide_per1000000,Alcohol_consumption_per_capita
0,17.995227,,,,7.195875,2.335178,,87.045374,86.436571,87.988729,...,148.0,,71.854614,,,71.864149,6.2,,,
1,9.828199,,,,9.55392,1.55168,,93.800214,89.719856,93.554442,...,,,80.659892,,,35.24983,,,,
2,18.174855,,,,7.528826,2.414975,,85.603028,84.737825,85.634452,...,132.0,5.77975,72.563274,,,59.617881,5.6,,,
3,32.487,,,,6.423,4.473,,66.0,66.0,64.0,...,189.0,6.655561,64.486,,,56.93776,4.1,,,
4,11.78,,,,7.898,1.617,,99.0,99.0,94.0,...,18.0,2.289492,78.458,,,104.612263,,,,
5,24.282,,,,4.716,3.023,,91.0,91.0,80.0,...,69.0,,76.693,,1.7193,17.730075,,,,
6,,,,,,,,,,,...,0.0,,,,,277.325,,,,
7,7.2,,,,4.4,,,99.0,98.0,99.0,...,3.0,,,,,163.842553,,,,
8,40.729,,,,8.19,5.519,,63.0,59.0,50.0,...,355.0,,60.782,,,24.713052,,,,
9,15.327,,,,6.366,1.994,,95.0,95.0,96.0,...,6.0,,76.885,,,218.831818,,,,


In [15]:
result.head(5)

Unnamed: 0,Crude_birth_rate,Death_comm_disease,Death_noncomm_disease,Health_expenditure,Crude_death_rate,Fertility_rate,Hospital_bed,DPT_immu,HepB3_immu,Measles_immu,...,TB_incidence,Homicides_per100000,Life_expectancy,Road_traffic_mortality,Physicians_per1000_people,Population_density,Overweight_prevalence,Smoking_prevalence,Suicide_per1000000,Alcohol_consumption_per_capita
0,17.995227,,,,7.195875,2.335178,,87.045374,86.436571,87.988729,...,148.0,,71.854614,,,71.864149,6.2,,,
1,9.828199,,,,9.55392,1.55168,,93.800214,89.719856,93.554442,...,,,80.659892,,,35.24983,,,,
2,18.174855,,,,7.528826,2.414975,,85.603028,84.737825,85.634452,...,132.0,5.77975,72.563274,,,59.617881,5.6,,,
3,32.487,,,,6.423,4.473,,66.0,66.0,64.0,...,189.0,6.655561,64.486,,,56.93776,4.1,,,
4,11.78,,,,7.898,1.617,,99.0,99.0,94.0,...,18.0,2.289492,78.458,,,104.612263,,,,


There seems to be a few columns with no data at all. Drop those columns from the  "result" dataframe.

In [16]:
#drop empty columns
result = result.drop(columns=['Death_comm_disease', 'Death_noncomm_disease','Health_expenditure','Hospital_bed','Road_traffic_mortality','Smoking_prevalence', 'Suicide_per1000000','Alcohol_consumption_per_capita'])


In [17]:
#verify new data frame
result.head()

Unnamed: 0,Crude_birth_rate,Crude_death_rate,Fertility_rate,DPT_immu,HepB3_immu,Measles_immu,HIV_incidence,TB_incidence,Homicides_per100000,Life_expectancy,Physicians_per1000_people,Population_density,Overweight_prevalence
0,17.995227,7.195875,2.335178,87.045374,86.436571,87.988729,,148.0,,71.854614,,71.864149,6.2
1,9.828199,9.55392,1.55168,93.800214,89.719856,93.554442,,,,80.659892,,35.24983,
2,18.174855,7.528826,2.414975,85.603028,84.737825,85.634452,0.23,132.0,5.77975,72.563274,,59.617881,5.6
3,32.487,6.423,4.473,66.0,66.0,64.0,0.04,189.0,6.655561,64.486,,56.93776,4.1
4,11.78,7.898,1.617,99.0,99.0,94.0,0.03,18.0,2.289492,78.458,,104.612263,


There are still columns with NaN values. Lets impute those values with mean of respective columns.

In [18]:
result['Crude_birth_rate'] = result['Crude_birth_rate'].fillna((result['Crude_birth_rate'].mean()))
result['Crude_death_rate'] = result['Crude_death_rate'].fillna((result['Crude_death_rate'].mean()))
result['Fertility_rate'] = result['Fertility_rate'].fillna((result['Fertility_rate'].mean()))
result['DPT_immu'] = result['DPT_immu'].fillna((result['DPT_immu'].mean()))
result['HepB3_immu'] = result['HepB3_immu'].fillna((result['HepB3_immu'].mean()))
result['Measles_immu'] = result['Measles_immu'].fillna((result['Measles_immu'].mean()))
result['HIV_incidence'] = result['HIV_incidence'].fillna((result['HIV_incidence'].mean()))
result['TB_incidence'] = result['TB_incidence'].fillna((result['TB_incidence'].mean()))
result['Homicides_per100000'] = result['Homicides_per100000'].fillna((result['Homicides_per100000'].mean()))
result['Life_expectancy'] = result['Life_expectancy'].fillna((result['Life_expectancy'].mean()))
result['Physicians_per1000_people'] = result['Physicians_per1000_people'].fillna((result['Physicians_per1000_people'].mean()))
result['Population_density'] = result['Population_density'].fillna((result['Population_density'].mean()))
result['Overweight_prevalence'] = result['Overweight_prevalence'].fillna((result['Overweight_prevalence'].mean()))


In [19]:
result.head()

Unnamed: 0,Crude_birth_rate,Crude_death_rate,Fertility_rate,DPT_immu,HepB3_immu,Measles_immu,HIV_incidence,TB_incidence,Homicides_per100000,Life_expectancy,Physicians_per1000_people,Population_density,Overweight_prevalence
0,17.995227,7.195875,2.335178,87.045374,86.436571,87.988729,0.61822,148.0,6.405792,71.854614,1.933048,71.864149,6.2
1,9.828199,9.55392,1.55168,93.800214,89.719856,93.554442,0.61822,102.948086,6.405792,80.659892,1.933048,35.24983,5.077778
2,18.174855,7.528826,2.414975,85.603028,84.737825,85.634452,0.23,132.0,5.77975,72.563274,1.933048,59.617881,5.6
3,32.487,6.423,4.473,66.0,66.0,64.0,0.04,189.0,6.655561,64.486,1.933048,56.93776,4.1
4,11.78,7.898,1.617,99.0,99.0,94.0,0.03,18.0,2.289492,78.458,1.933048,104.612263,5.077778


In [20]:
result.isna().sum()

Crude_birth_rate             0
Crude_death_rate             0
Fertility_rate               0
DPT_immu                     0
HepB3_immu                   0
Measles_immu                 0
HIV_incidence                0
TB_incidence                 0
Homicides_per100000          0
Life_expectancy              0
Physicians_per1000_people    0
Population_density           0
Overweight_prevalence        0
dtype: int64

In [21]:
#save the cleaned dataframe for EDA
result.to_csv("demographics_data.csv")