In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
#load the raw data file
df = pd.read_excel("WDIEXCEL.xlsx", sheet_name = "Data")

In [None]:
#sample the data
df.sample(20)

In [None]:
#check to confirm there are no missing values in the indicator code column
#because we will filter countries based on this code
df.isnull().sum()

In [None]:
#These are the variable of interest that could have an impact on life expectancy

#Birth rate, crude (per 1,000 people)	SP.DYN.CBRT.IN
#Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)	SH.DTH.COMM.ZS
#Cause of death, by non-communicable diseases (% of total)	SH.DTH.NCOM.ZS
#Current health expenditure (% of GDP)	SH.XPD.CHEX.GD.ZS
#Death rate, crude (per 1,000 people)	SP.DYN.CDRT.IN
#Fertility rate, total (births per woman)	SP.DYN.TFRT.IN
#Hospital beds (per 1,000 people)	SH.MED.BEDS.ZS
#Immunization, DPT (% of children ages 12-23 months)	SH.IMM.IDPT
#Immunization, HepB3 (% of one-year-old children)	SH.IMM.HEPB
#Immunization, measles (% of children ages 12-23 months)	SH.IMM.MEAS
#Incidence of HIV, all (per 1,000 uninfected population)	SH.HIV.INCD.TL.P3
#Incidence of tuberculosis (per 100,000 people)	SH.TBS.INCD
#Intentional homicides (per 100,000 people)	VC.IHR.PSRC.P5
#Life expectancy at birth, total (years)	SP.DYN.LE00.IN
#Mortality caused by road traffic injury (per 100,000 people)	SH.STA.TRAF.P5
#Physicians (per 1,000 people)	SH.MED.PHYS.ZS
#Population density (people per sq. km of land area)	EN.POP.DNST
#Prevalence of overweight, weight for height (% of children under 5)	SH.STA.OWGH.ZS
#Smoking prevalence, total (ages 15+)	SH.PRV.SMOK
#Suicide mortality rate (per 100,000 population)	SH.STA.SUIC.P5
#Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)	SH.ALC.PCAP.LI

In [3]:
#drop all date columns except 2018 because we have data till 2018 only although 2019 and 2020 columns also exist
#there are no data for 2019 and 2020
df_2018 = df.drop(columns=['1960', '1961', '1962','1963','1964','1965','1966','1967','1968','1969','1970','1971','1972', '1973', '1974', '1975','1976', '1977','1978', '1979','1980', '1981','1982', '1983', '1984', '1985','1986', '1987','1988', '1989','1990', '1991','1992', '1993', '1994', '1995','1996', '1997','1998', '1999','1990','1991','1992','1993', '1994', '1995','1996', '1997','1998', '1999','2000', '2001','2002','2003', '2004', '2005', '2006','2007', '2008','2009', '2010','2011', '2012','2013', '2014', '2015', '2016','2017', '2019', '2020'])

In [4]:
#check whether the "Country Name" column has only countrie name.
df_2018["Country Name"].unique()

array(['Arab World', 'Caribbean small states',
       'Central Europe and the Baltics', 'Early-demographic dividend',
       'East Asia & Pacific',
       'East Asia & Pacific (excluding high income)',
       'East Asia & Pacific (IDA & IBRD countries)', 'Euro area',
       'Europe & Central Asia',
       'Europe & Central Asia (excluding high income)',
       'Europe & Central Asia (IDA & IBRD countries)', 'European Union',
       'Fragile and conflict affected situations',
       'Heavily indebted poor countries (HIPC)', 'High income',
       'IBRD only', 'IDA & IBRD total', 'IDA blend', 'IDA only',
       'IDA total', 'Late-demographic dividend',
       'Latin America & Caribbean',
       'Latin America & Caribbean (excluding high income)',
       'Latin America & the Caribbean (IDA & IBRD countries)',
       'Least developed countries: UN classification',
       'Low & middle income', 'Low income', 'Lower middle income',
       'Middle East & North Africa',
       'Middle East & No

In [5]:
#create a list with all the row values that needs to be removed. We are removing the rows that have data about regions of
#the world because we are only interested in countries
row_values_to_remove = ['Arab World', 
'Caribbean small states',                        
'Central Europe and the Baltics', 
'East Asia & Pacific (excluding high income)',
'Early-demographic dividend', 'East Asia & Pacific',
'Europe & Central Asia (excluding high income)',
'Europe & Central Asia','Euro area',
'European Union', 'Fragile and conflict affected situations',
'High income', 
'Heavily indebted poor countries (HIPC)', 
'IBRD only', 'IDA & IBRD total', 'IDA total',
'IDA blend', 'IDA only', 
'Not classified', 
'Latin America & Caribbean (excluding high income)',
'Latin America & Caribbean',
'Least developed countries: UN classification', 'Low income',
'Lower middle income',
'Low & middle income', 'Late-demographic dividend',
'Middle East & North Africa',
'Middle East & North Africa (excluding high income)', 
'North America', 
'OECD members',
'Other small states', 
'Pre-demographic dividend',
'West Bank and Gaza', 'Pacific island small states',
'South Asia',
'Sub-Saharan Africa (excluding high income)', 
'Sub-Saharan Africa', 'Small states', 'Sao Tome and Principe',
'East Asia & Pacific (IDA & IBRD countries)',
'Europe & Central Asia (IDA & IBRD countries)', 
'Latin America & the Caribbean (IDA & IBRD countries)',
'Middle East & North Africa (IDA & IBRD countries)',
'South Asia (IDA & IBRD)',
'Sub-Saharan Africa (IDA & IBRD countries)', 
'Upper middle income']

In [6]:
#create a for loop to iterate over the length of the list and remove rows with each value from list in each iteratation
for i in range(len(row_values_to_remove)):
    df_2018 = df_2018[df_2018["Country Name"] != row_values_to_remove[i]]

In [8]:
#extract values of variables of interest into separate series 
crude_birth_rate = df_2018.loc[df_2018['Indicator Code'] == 'SP.DYN.CBRT.IN','2018'].reset_index(drop = True)
death_comm_disease = df_2018.loc[df_2018['Indicator Code'] == 'SH.DTH.COMM.ZS','2018'].reset_index(drop = True)
death_noncomm_disease = df_2018.loc[df_2018['Indicator Code'] == 'SH.DTH.NCOM.ZS','2018'].reset_index(drop = True)
health_expenditure = df_2018.loc[df_2018['Indicator Code'] == 'SH.XPD.CHEX.GD.ZS','2018'].reset_index(drop = True)
crude_death_rate = df_2018.loc[df_2018['Indicator Code'] == 'SP.DYN.CDRT.IN','2018'].reset_index(drop = True)
fertility_rate = df_2018.loc[df_2018['Indicator Code'] == 'SP.DYN.TFRT.IN','2018'].reset_index(drop = True)
hospital_bed = df_2018.loc[df_2018['Indicator Code'] == 'SH.MED.BEDS.ZS','2018'].reset_index(drop = True)
DPT_immu = df_2018.loc[df_2018['Indicator Code'] == 'SH.IMM.IDPT','2018'].reset_index(drop = True)
HepB3_immu = df_2018.loc[df_2018['Indicator Code'] == 'SH.IMM.HEPB','2018'].reset_index(drop = True)
measles_immu = df_2018.loc[df_2018['Indicator Code'] == 'SH.IMM.MEAS','2018'].reset_index(drop = True)
HIV_incidence = df_2018.loc[df_2018['Indicator Code'] == 'SH.HIV.INCD.TL.P3','2018'].reset_index(drop = True)
TB_incidence = df_2018.loc[df_2018['Indicator Code'] == 'SH.TBS.INCD','2018'].reset_index(drop = True)
homicides_per100000 = df_2018.loc[df_2018['Indicator Code'] == 'VC.IHR.PSRC.P5','2018'].reset_index(drop = True)
life_expectancy = df_2018.loc[df_2018['Indicator Code'] == 'SP.DYN.LE00.IN','2018'].reset_index(drop = True)
road_traffic_mortality = df_2018.loc[df_2018['Indicator Code'] == 'SH.STA.TRAF.P5','2018'].reset_index(drop = True)
physicians_per1000_people = df_2018.loc[df_2018['Indicator Code'] == 'SH.MED.PHYS.ZS','2018'].reset_index(drop = True)
population_density = df_2018.loc[df_2018['Indicator Code'] == 'EN.POP.DNST','2018'].reset_index(drop = True)
overweight_prevalence = df_2018.loc[df_2018['Indicator Code'] == 'SH.STA.OWGH.ZS','2018'].reset_index(drop = True)
smoking_prevalence = df_2018.loc[df_2018['Indicator Code'] == 'SH.PRV.SMOK','2018'].reset_index(drop = True)
suicide_per1000000 = df_2018.loc[df_2018['Indicator Code'] == 'SH.STA.SUIC.P5','2018'].reset_index(drop = True)
alcohol_consumption_per_capita = df_2018.loc[df_2018['Indicator Code'] == 'SH.ALC.PCAP.LI','2018'].reset_index(drop = True)


auth_series = pd.Series(author) 
article_series = pd.Series(article) 
  
frame = { 'Author': auth_series, 'Article': article_series } 
  
result = pd.DataFrame(frame) 
  
print(result) 

In [None]:
#reassign each variable into a series so we can use them to create our dataframe
crude_birth_rate = pd.Series(crude_birth_rate)
death_comm_disease = pd.Series(death_comm_disease)
death_noncomm_disease = pd.Series(death_noncomm_disease)
health_expenditure = pd.Series(health_expenditure)
crude_death_rate = pd.Series(crude_death_rate)
fertility_rate = pd.Series(fertility_rate)
hospital_bed = pd.Series(hospital_bed)
DPT_immu = pd.Series(DPT_immu)
HepB3_immu = pd.Series(HepB3_immu)
measles_immu = pd.Series(measles_immu)
HIV_incidence = pd.Series(HIV_incidence)
TB_incidence = pd.Series(TB_incidence)
homicides_per100000 = pd.Series(homicides_per100000)
life_expectancy = pd.Series(life_expectancy)
road_traffic_mortality = pd.Series(road_traffic_mortality)
physicians_per1000_people = pd.Series(physicians_per1000_people)
population_density = pd.Series(population_density)
overweight_prevalence = pd.Series(overweight_prevalence)
smoking_prevalence = pd.Series(smoking_prevalence)
suicide_per1000000 = pd.Series(suicide_per1000000)
alcohol_consumption_per_capita = pd.Series(alcohol_consumption_per_capita)

crude_birth_rate 
death_comm_disease  
death_noncomm_disease
health_expenditure 
crude_death_rate  
fertility_rate  
hospital_bed  
DPT_immu  
HepB3_immu  
measles_immu  
HIV_incidence  
TB_incidence  
homicides_per100000  
life_expectancy  
road_traffic_mortality  
physicians_per1000_people  
population_density  
overweight_prevalence  



In [None]:
#veiry each series is same length
#crude_birth_rate.shape,death_comm_disease.shape,health_expenditure.shape,crude_death_rate.shape,fertility_rate.shape,hospital_bed.shape,DPT_immu.shape, HepB3_immu.shape,measles_immu.shape,HIV_incidence.shape,TB_incidence.shape,homicides_per100000.shape,life_expectancy.shape,road_traffic_mortality.shape, physicians_per1000_people.shape,population_density.shape,overweight_prevalence.shape,smoking_prevalence.shape,suicide_per1000000.shape,alcohol_consumption_per_capita.shape 


In [9]:
#create a variable to store all variables to create a dataframe
frame = {'Crude_birth_rate':crude_birth_rate, 
         'Death_comm_disease':death_comm_disease,  
         'Death_noncomm_disease':death_noncomm_disease,
         'Health_expenditure':health_expenditure,
         'Crude_death_rate':crude_death_rate,
         'Fertility_rate':fertility_rate,
         'Hospital_bed':hospital_bed,  
         'DPT_immu':DPT_immu,  
         'HepB3_immu':HepB3_immu,  
         'Measles_immu':measles_immu,
         'HIV_incidence':HIV_incidence,  
         'TB_incidence':TB_incidence,  
         'Homicides_per100000':homicides_per100000,  
         'Life_expectancy':life_expectancy,  
         'Road_traffic_mortality':road_traffic_mortality,  
         'Physicians_per1000_people':physicians_per1000_people,  
         'Population_density':population_density,  
         'Overweight_prevalence':overweight_prevalence,  
         'Smoking_prevalence':smoking_prevalence,  
         'Suicide_per1000000':suicide_per1000000,  
         'Alcohol_consumption_per_capita':alcohol_consumption_per_capita
        }

In [13]:
#create the dataframe
result = pd.DataFrame(frame).reset_index(drop = True)

In [17]:
#check the dataframe
result.head(10)
result.shape

(218, 21)

#filter data frame to isolate rows with each variables of interest for every country in the dataframe

crude_birth_rate = df.loc[df['Indicator Code'] == 'SP.DYN.CBRT.IN']
death_comm_disease = df.loc[df['Indicator Code'] == 'SH.DTH.COMM.ZS']
death_noncomm_disease = df.loc[df['Indicator Code'] == 'SH.DTH.NCOM.ZS']
health_expenditure = df.loc[df['Indicator Code'] == 'SH.XPD.CHEX.GD.ZS']
crude_death_rate = df.loc[df['Indicator Code'] == 'SP.DYN.CDRT.IN']
fertility_rate = df.loc[df['Indicator Code'] == 'SP.DYN.TFRT.IN']
hospital_bed = df.loc[df['Indicator Code'] == 'SH.MED.BEDS.ZS']
DPT_immu = df.loc[df['Indicator Code'] == 'SH.IMM.IDPT']
HepB3_immu = df.loc[df['Indicator Code'] == 'SH.IMM.HEPB']
measles_immu = df.loc[df['Indicator Code'] == 'SH.IMM.MEAS']
HIV_incidence = df.loc[df['Indicator Code'] == 'SH.HIV.INCD.TL.P3']
TB_incidence = df.loc[df['Indicator Code'] == 'SH.TBS.INCD']
homicides_per100000 = df.loc[df['Indicator Code'] == 'VC.IHR.PSRC.P5']
life_expectancy = df.loc[df['Indicator Code'] == 'SP.DYN.LE00.IN']
road_traffic_mortality = df.loc[df['Indicator Code'] == 'SH.STA.TRAF.P5']
physicians_per1000_people = df.loc[df['Indicator Code'] == 'SH.MED.PHYS.ZS']
population_density = df.loc[df['Indicator Code'] == 'EN.POP.DNST']
overweight_prevalence = df.loc[df['Indicator Code'] == 'SH.STA.OWGH.ZS']
smoking_prevalence = df.loc[df['Indicator Code'] == 'SH.PRV.SMOK']
suicide_per1000000 = df.loc[df['Indicator Code'] == 'SH.STA.SUIC.P5']
alcohol_consumption_per_capita = df.loc[df['Indicator Code'] == 'SH.ALC.PCAP.LI']





"Country Name" column seems to have grouped region names in addition to country names. So, lets remove all the rows which have broader region in the dataset.

#remove rows with non country values. These inlucde 
'Arab World', 
'Central Europe and the Baltics', 
'East Asia & Pacific (excluding high income)',
'Early-demographic dividend', 'East Asia & Pacific',
'Europe & Central Asia (excluding high income)',
'Europe & Central Asia','Euro area',
'European Union', 'Fragile and conflict affected situations',
'High income', 
'Heavily indebted poor countries (HIPC)', 
'IBRD only', 'IDA & IBRD total', 'IDA total',
'IDA blend', 'IDA only', 
'Not classified', 
'Latin America & Caribbean (excluding high income)',
'Latin America & Caribbean',
'Least developed countries: UN classification', 'Low income',
'Lower middle income',
'Low & middle income', 'Late-demographic dividend',
'Middle East & North Africa',
'Middle East & North Africa (excluding high income)', 
'North America', 
'OECD members',
'Other small states', 
'Pre-demographic dividend',
'West Bank and Gaza', 'Pacific island small states',
'South Asia',
'Sub-Saharan Africa (excluding high income)', 
'Sub-Saharan Africa', 'Small states', 'Sao Tome and Principe',
'East Asia & Pacific (IDA & IBRD countries)',
'Europe & Central Asia (IDA & IBRD countries)', 
'Latin America & the Caribbean (IDA & IBRD countries)',
'Middle East & North Africa (IDA & IBRD countries)',
'South Asia (IDA & IBRD)',
'Sub-Saharan Africa (IDA & IBRD countries)', 
'Upper middle income', 



In [12]:
#save the cleaned dataframe for EDA
result.to_csv("demographics_data.csv")