In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
#load the raw data file
df = pd.read_excel("WDIEXCEL.xlsx", sheet_name = "Data")

In [3]:
df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,,
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,87.199474,87.51226,88.129881,87.275323,88.720097,89.308602,90.283638,89.286856,,
2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,75.958878,77.251714,78.165706,75.512153,78.211,79.065508,81.102134,79.2481,,
3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,,,,,,,...,96.466418,96.435957,96.772853,96.466705,96.936319,97.290083,97.467915,97.063959,,
4,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,,,,,,,...,22.260538,,,30.27713,,,37.165211,,,


In [4]:
#check to confirm there are no missing values in the indicator code column
#because we will filter countries based on this code
df["Indicator Code"].isnull().sum()

0

In [5]:
#inetesting variables

#Birth rate, crude (per 1,000 people)	SP.DYN.CBRT.IN
#Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)	SH.DTH.COMM.ZS
#Cause of death, by non-communicable diseases (% of total)	SH.DTH.NCOM.ZS
#Current health expenditure (% of GDP)	SH.XPD.CHEX.GD.ZS
#Death rate, crude (per 1,000 people)	SP.DYN.CDRT.IN
#Fertility rate, total (births per woman)	SP.DYN.TFRT.IN
#Hospital beds (per 1,000 people)	SH.MED.BEDS.ZS
#Immunization, DPT (% of children ages 12-23 months)	SH.IMM.IDPT
#Immunization, HepB3 (% of one-year-old children)	SH.IMM.HEPB
#Immunization, measles (% of children ages 12-23 months)	SH.IMM.MEAS
#Incidence of HIV, all (per 1,000 uninfected population)	SH.HIV.INCD.TL.P3
#Incidence of tuberculosis (per 100,000 people)	SH.TBS.INCD
#Intentional homicides (per 100,000 people)	VC.IHR.PSRC.P5
#Life expectancy at birth, total (years)	SP.DYN.LE00.IN
#Mortality caused by road traffic injury (per 100,000 people)	SH.STA.TRAF.P5
#Physicians (per 1,000 people)	SH.MED.PHYS.ZS
#Population density (people per sq. km of land area)	EN.POP.DNST
#Prevalence of overweight, weight for height (% of children under 5)	SH.STA.OWGH.ZS
#Smoking prevalence, total (ages 15+)	SH.PRV.SMOK
#Suicide mortality rate (per 100,000 population)	SH.STA.SUIC.P5
#Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)	SH.ALC.PCAP.LI

In [6]:
crude_birth_rate = df.loc[df['Indicator Code'] == 'SP.DYN.CBRT.IN']
death_comm_disease = df.loc[df['Indicator Code'] == 'SH.DTH.COMM.ZS']
death_noncomm_disease = df.loc[df['Indicator Code'] == 'SH.DTH.NCOM.ZS']
health_expenditure = df.loc[df['Indicator Code'] == 'SH.XPD.CHEX.GD.ZS']
crude_death_rate = df.loc[df['Indicator Code'] == 'SP.DYN.CDRT.IN']
fertility_rate = df.loc[df['Indicator Code'] == 'SP.DYN.TFRT.IN']
hospital_bed = df.loc[df['Indicator Code'] == 'SH.MED.BEDS.ZS']
DPT_immu = df.loc[df['Indicator Code'] == 'SH.IMM.IDPT']
HepB3_immu = df.loc[df['Indicator Code'] == 'SH.IMM.HEPB']
measles_immu = df.loc[df['Indicator Code'] == 'SH.IMM.MEAS']
HIV_incidence = df.loc[df['Indicator Code'] == 'SH.HIV.INCD.TL.P3']
TB_incidence = df.loc[df['Indicator Code'] == 'SH.TBS.INCD']
homicides_per100000 = df.loc[df['Indicator Code'] == 'VC.IHR.PSRC.P5']
life_expectancy = df.loc[df['Indicator Code'] == 'SP.DYN.LE00.IN']
road_traffic_mortality = df.loc[df['Indicator Code'] == 'SH.STA.TRAF.P5']
physicians_per1000_people = df.loc[df['Indicator Code'] == 'SH.MED.PHYS.ZS']
population_density = df.loc[df['Indicator Code'] == 'EN.POP.DNST']
overweight_prevalence = df.loc[df['Indicator Code'] == 'SH.STA.OWGH.ZS']
smoking_prevalence = df.loc[df['Indicator Code'] == 'SH.PRV.SMOK']
suicide_per1000000 = df.loc[df['Indicator Code'] == 'SH.STA.SUIC.P5']
alcohol_consumtion_per_capita = df.loc[df['Indicator Code'] == 'SH.ALC.PCAP.LI']





In [7]:
crude_birth_rate.shape, crude_death_rate.shape, fertility_rate.shape, life_expectancy.shape,suicide_per1000000.shape

((264, 65), (264, 65), (264, 65), (264, 65), (264, 65))

In [8]:
overweight_prevalence.shape, homicides_per100000.shape, death_comm_disease.shape

((264, 65), (264, 65), (264, 65))

In [9]:
df = pd.concat([crude_birth_rate, death_comm_disease, death_noncomm_disease, health_expenditure, crude_death_rate,fertility_rate,
               hospital_bed, DPT_immu, HepB3_immu, measles_immu, HIV_incidence, TB_incidence, homicides_per100000, life_expectancy,
               road_traffic_mortality, physicians_per1000_people, population_density, overweight_prevalence, smoking_prevalence,
               suicide_per1000000, alcohol_consumtion_per_capita])

In [10]:
df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
121,Arab World,ARB,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,47.790076,47.558391,47.327603,47.091616,46.844209,46.577098,...,27.484867,27.389298,27.211445,26.940781,26.576991,26.134803,25.648015,25.147542,,
1561,Caribbean small states,CSS,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,40.259404,39.949409,39.474934,38.842759,38.076287,37.212835,...,17.346224,17.13868,16.93437,16.655275,16.52699,16.318071,16.104703,15.886769,,
3001,Central Europe and the Baltics,CEB,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,19.117434,18.107956,17.204125,16.994222,16.504625,15.936754,...,9.973223,9.976633,9.638322,9.934669,9.88847,10.134697,10.275287,9.948651,,
4441,Early-demographic dividend,EAR,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,43.539861,43.297248,43.035008,42.754624,42.459891,42.154378,...,22.453785,22.078508,21.739149,21.413317,21.096953,20.785885,20.483421,20.203956,,
5881,East Asia & Pacific,EAS,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,26.299637,24.485282,36.371369,40.272212,37.494831,36.629391,...,13.835658,13.893971,13.759778,13.859393,13.601496,14.020415,13.585424,12.561121,,


In [11]:
df = df.sort_values(by=['Country Code'])

In [12]:
df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
81243,Aruba,ABW,"Incidence of HIV, all (per 1,000 uninfected po...",SH.HIV.INCD.TL.P3,,,,,,,...,,,,,,,,,,
80934,Aruba,ABW,"Death rate, crude (per 1,000 people)",SP.DYN.CDRT.IN,6.388,6.241,6.118,6.012,5.92,5.839,...,8.061,8.205,8.347,8.488,8.627,8.765,8.907,9.053,,
81198,Aruba,ABW,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,,,,,,,...,,,,,,,,,,
81422,Aruba,ABW,Mortality caused by road traffic injury (per 1...,SH.STA.TRAF.P5,,,,,,,...,,,,,,,,,,
81978,Aruba,ABW,Total alcohol consumption per capita (liters o...,SH.ALC.PCAP.LI,,,,,,,...,,,,,,,,,,


In [13]:
df = df.drop(columns=['1960', '1961', '1962','1963','1964','1965','1966','1967','1968','1969','1970','1971','1972', '1973', '1974', '1975','1976', '1977','1978', '1979','1980', '1981','1982', '1983', '1984', '1985','1986', '1987','1988', '1989','1990', '1991','1992', '1993', '1994', '1995','1996', '1997','1998', '1999','1990','2020'])

In [18]:
df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2000,2001,2002,2003,2004,2005,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
81243,Aruba,ABW,"Incidence of HIV, all (per 1,000 uninfected po...",SH.HIV.INCD.TL.P3,,,,,,,...,,,,,,,,,,
80934,Aruba,ABW,"Death rate, crude (per 1,000 people)",SP.DYN.CDRT.IN,6.971,7.022,7.084,7.154,7.233,7.32,...,7.918,8.061,8.205,8.347,8.488,8.627,8.765,8.907,9.053,
81198,Aruba,ABW,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,,,,,,,...,,,,,,,,,,
81422,Aruba,ABW,Mortality caused by road traffic injury (per 1...,SH.STA.TRAF.P5,,,,,,,...,,,,,,,,,,
81978,Aruba,ABW,Total alcohol consumption per capita (liters o...,SH.ALC.PCAP.LI,,,,,,,...,,,,,,,,,,


In [17]:
pd.set_option('display.max_rows', None)
df.isnull().sum(axis=1)

81243     20
80934      1
81198     20
81422     20
81978     20
81060      1
81918     20
81227     20
81278      6
81630     20
81245      1
80777     20
80761      1
81903     20
81340      1
80929     20
81228     20
81229     20
81691      1
80775     20
81738     20
68267      0
67974      1
67801      1
68778     17
68462     18
68943     20
68268      7
68100      1
68670      8
68269      0
69018     18
68731      1
67817     16
67815     16
68380      1
68958     15
68285      1
68283      0
67969      4
68238      4
68318     12
75015     16
76143     20
75518     18
75580      1
75001      1
75485      1
75469      0
75468      7
75978     19
75931      1
75467      0
75174      1
75169      2
75300      1
75438     19
75870     17
75483      0
75662     18
76158     15
76218     18
75017     16
69707      0
69725      1
69255     16
69709      0
69758      1
70398     15
69678      7
69540      1
69257     16
70110      7
69723      0
70171      1
69902     18
69414      1

In [19]:
df.describe()

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
count,3744.0,2803.0,2852.0,2870.0,2935.0,3400.0,2993.0,2972.0,2995.0,3018.0,4242.0,3220.0,3251.0,3383.0,3151.0,3777.0,4165.0,2854.0,2508.0,864.0
mean,59.362219,68.203507,68.126312,68.856608,68.073097,61.909433,68.586002,69.599759,69.61082,69.64034,56.610624,67.126267,66.912994,65.23439,68.876115,63.617634,59.007871,74.782872,75.71184,71.986874
std,468.337428,483.472264,479.227679,484.463325,479.531176,453.155286,485.915024,491.121786,497.181215,500.879818,428.470342,497.417357,503.522024,498.865803,524.631976,485.702146,468.774767,571.646975,479.971767,34.796369
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
25%,5.397378,4.717466,4.755869,4.9,4.6,4.98947,4.651,4.629017,4.55826,4.427603,4.7,4.651688,4.9005,5.1735,5.0175,5.721169,5.734745,4.797,5.89425,68.0
50%,19.378856,17.725,17.4995,17.814475,16.74,15.658373,17.0,17.70659,17.4,17.0795,14.0105,18.3675,17.779544,18.9,19.537,20.196815,16.8,21.6,41.648,88.107101
75%,70.0,74.885354,75.169488,76.0,77.0,72.849923,79.096999,80.0,80.485366,80.685976,72.63975,80.0,80.02561,78.945275,81.326829,78.1,74.8,84.739001,88.185782,95.0
max,21389.1,16843.769849,16622.999706,17043.147847,16842.750287,17244.928865,17265.699934,17277.773521,17645.170772,17844.067508,18121.851677,18422.474858,18864.180542,19055.181698,19478.812065,19805.42788,20159.0792,20479.769994,20777.500261,99.0


In [22]:
df.info

<bound method DataFrame.info of                                            Country Name Country Code  \
0                                                 Aruba          ABW   
1                                                 Aruba          ABW   
2                                                 Aruba          ABW   
3                                                 Aruba          ABW   
4                                                 Aruba          ABW   
5                                                 Aruba          ABW   
6                                                 Aruba          ABW   
7                                                 Aruba          ABW   
8                                                 Aruba          ABW   
9                                                 Aruba          ABW   
10                                                Aruba          ABW   
11                                                Aruba          ABW   
12                              

In [20]:
df = df.reset_index(drop=True)

In [21]:
df.head(4)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2000,2001,2002,2003,2004,2005,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,ABW,"Incidence of HIV, all (per 1,000 uninfected po...",SH.HIV.INCD.TL.P3,,,,,,,...,,,,,,,,,,
1,Aruba,ABW,"Death rate, crude (per 1,000 people)",SP.DYN.CDRT.IN,6.971,7.022,7.084,7.154,7.233,7.32,...,7.918,8.061,8.205,8.347,8.488,8.627,8.765,8.907,9.053,
2,Aruba,ABW,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,,,,,,,...,,,,,,,,,,
3,Aruba,ABW,Mortality caused by road traffic injury (per 1...,SH.STA.TRAF.P5,,,,,,,...,,,,,,,,,,
