In [None]:
import pandas as pd
import numpy as np
import wbgapi as wb
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt


In [2]:
mortality = pd.DataFrame(list(wb.data.fetch('SH.DYN.MORT')))
literacy = pd.DataFrame(list(wb.data.fetch('SE.ADT.LITR.FE.ZS')))
labor = pd.DataFrame(list(wb.data.fetch('SL.TLF.TOTL.FE.ZS')))
poverty = pd.DataFrame(list(wb.data.fetch('SI.POV.NAHC')))
health = pd.DataFrame(list(wb.data.fetch('SH.XPD.CHEX.PC.CD')))
gdp = pd.DataFrame(list(wb.data.fetch('NY.GDP.PCAP.CD')))

In [3]:
def extract(df,name):
    tmp = df[['time','value']][df['economy']=='TUR']
    tmp.index = tmp.time
    del tmp['time']
    tmp.columns = [[name]]
    return tmp

In [4]:
mrt = extract(mortality, 'mortality')
ltr = extract(literacy, 'literacy')
lbr = extract(labor, 'labor')
pvr = extract(poverty, 'poverty')
hlt = extract(health, 'health')
gdpcp = extract(gdp, 'gdp')

In [5]:
son = mrt.join([ltr,lbr,pvr,hlt,gdpcp])
son.dropna(inplace=True)
son

Unnamed: 0_level_0,mortality,literacy,labor,poverty,health,gdp
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
YR2019,10.1,94.424042,33.360649,15.0,396.466827,9121.515167
YR2017,11.4,93.498268,32.799757,13.9,442.617615,10589.667725
YR2016,12.1,93.563011,32.250459,13.5,466.7948,10894.603378
YR2015,13.0,92.645813,31.719798,14.3,453.116486,11006.279524
YR2014,13.8,92.401817,31.027784,14.7,525.844727,12157.990434
YR2013,14.7,92.138077,31.199391,15.0,551.401245,12614.78161
YR2012,15.8,91.604523,30.464131,15.0,524.250305,11795.633457
YR2011,16.9,90.310097,29.832127,16.3,531.418579,11420.555456
YR2010,18.1,88.073174,29.161917,16.1,539.327148,10742.774979
YR2009,19.5,85.34716,28.071295,16.9,500.193054,9103.474051


In [6]:
res = smf.ols(formula='mortality ~ literacy + labor + poverty + health + gdp', data=son).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.986
Method:                 Least Squares   F-statistic:                     169.6
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           3.81e-07
Time:                        13:45:44   Log-Likelihood:                -7.8492
No. Observations:                  13   AIC:                             27.70
Df Residuals:                       7   BIC:                             31.09
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     69.8438      9.987      6.994      0.0



R-squared value for this model is 0.992, very close to 1, which means linear regression function line fits the data well. We are able to explain 99.2% of the data, so it's not a bad model at all. However, we notice that P value for literacy variable is 0.954. Which means that we can not conclude a relationship between Literacy and Mortality. To check it, lets create a new model without Literacy variable.

In [7]:
sm.stats.anova_lm(res)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
literacy,1.0,296.579278,296.579278,815.327082,1.661736e-08
labor,1.0,8.414644,8.414644,23.132725,0.001944516
poverty,1.0,0.940111,0.940111,2.584462,0.1519524
health,1.0,1.829951,1.829951,5.030724,0.05981897
gdp,1.0,0.769732,0.769732,2.116072,0.1890863
Residual,7.0,2.546285,0.363755,,


In [16]:
res = smf.ols(formula='mortality ~ labor + poverty + health + gdp', data=son).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.988
Method:                 Least Squares   F-statistic:                     242.2
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           2.23e-08
Time:                        14:07:21   Log-Likelihood:                -7.8525
No. Observations:                  13   AIC:                             25.70
Df Residuals:                       8   BIC:                             28.53
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     70.1838      7.670      9.151      0.0



Here, we notice something : Adj. R-squared value increased (compared to the 1st model). When we have more than one explanatory variable, it is actually better to look at that value, rather than R-squared. So, it means this model is better. We can also see it when we check P-values for the variables.

In [17]:
res1 = smf.ols(formula='mortality ~ literacy + labor + poverty + gdp', data=son).fit()
print(res1.summary())

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.984
Model:                            OLS   Adj. R-squared:                  0.976
Method:                 Least Squares   F-statistic:                     122.4
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           3.29e-07
Time:                        14:11:00   Log-Likelihood:                -12.235
No. Observations:                  13   AIC:                             34.47
Df Residuals:                       8   BIC:                             37.29
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     64.9877     12.858      5.054      0.0



In [19]:
res2 = smf.ols(formula='mortality ~ literacy + poverty + gdp', data=son).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.988
Method:                 Least Squares   F-statistic:                     242.2
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           2.23e-08
Time:                        14:11:58   Log-Likelihood:                -7.8525
No. Observations:                  13   AIC:                             25.70
Df Residuals:                       8   BIC:                             28.53
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     70.1838      7.670      9.151      0.0



In [10]:
res2 = smf.ols(formula='mortality ~ gdp', data=son).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.986
Method:                 Least Squares   F-statistic:                     169.6
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           3.81e-07
Time:                        13:46:14   Log-Likelihood:                -7.8492
No. Observations:                  13   AIC:                             27.70
Df Residuals:                       7   BIC:                             31.09
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     69.8438      9.987      6.994      0.0



In [11]:
res1 = smf.ols(formula='mortality ~ gdp + literacy + labor', data=son).fit()
print(res1.summary())

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.983
Model:                            OLS   Adj. R-squared:                  0.977
Method:                 Least Squares   F-statistic:                     173.7
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           2.78e-08
Time:                        13:46:20   Log-Likelihood:                -12.592
No. Observations:                  13   AIC:                             33.18
Df Residuals:                       9   BIC:                             35.44
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     67.9910     11.684      5.819      0.0



In [12]:
sm.stats.anova_lm(res1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
gdp,1.0,121.486101,121.486101,207.008959,1.619147e-07
literacy,1.0,177.850062,177.850062,303.051592,3.076386e-08
labor,1.0,6.462062,6.462062,11.011174,0.008964433
Residual,9.0,5.281776,0.586864,,


In [14]:
res = smf.ols(formula='mortality ~ labor + poverty + health + gdp', data=son).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.988
Method:                 Least Squares   F-statistic:                     242.2
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           2.23e-08
Time:                        13:55:07   Log-Likelihood:                -7.8525
No. Observations:                  13   AIC:                             25.70
Df Residuals:                       8   BIC:                             28.53
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     70.1838      7.670      9.151      0.0



These other models didn't produce better results.