## Train and Evaluate Model

In [74]:
import pandas as pd

# Load the cleaned dataset produced by an earlier step.
# NOTE: pd.read_pickle unpickles the file, which can execute arbitrary code — only load
# pickles you produced yourself. The relative path depends on the notebook's working directory.
cleaned_data_10 = pd.read_pickle('./pickles/clean_data_10_20181010_205648.pkl')
# Rename every column positionally to formula-friendly identifiers (no spaces or hyphens),
# so they can be used directly in patsy formulas below. The list length must match the
# frame's column count exactly, and the order must match the pickled frame's column order.
cleaned_data_10.columns = ['CNT', 'YR', 'SUB_REGION', 'log_MORT_RATE', 'log_FORR_AREA',
       'log_GPC_ATL', 'log_POP_TOT', 'log_CO2_PERCAP', 'MORT_RATE', 'POP_GRO',
       'URB_GRO', 'FER_RATE', 'GDP_GROW', 'IMM_MEAS', 'LIF_EXP',
       'TEEN_FER_RATE', 'Australia_and_New_Zealand', 'Central_Asia',
       'Eastern_Asia', 'Eastern_Europe', 'Latin_America_and_the_Caribbean',
       'Melanesia', 'Micronesia', 'Northern_Africa', 'Northern_America',
       'Northern_Europe', 'Polynesia', 'South_eastern_Asia', 'Southern_Asia',
       'Southern_Europe', 'Sub_Saharan_Africa', 'Western_Asia',
       'Western_Europe']

In [75]:
# Preview the first rows to check that the renamed columns line up with the data.
cleaned_data_10.head()

Unnamed: 0,CNT,YR,SUB_REGION,log_MORT_RATE,log_FORR_AREA,log_GPC_ATL,log_POP_TOT,log_CO2_PERCAP,MORT_RATE,POP_GRO,...,Northern_Africa,Northern_America,Northern_Europe,Polynesia,South_eastern_Asia,Southern_Asia,Southern_Europe,Sub_Saharan_Africa,Western_Asia,Western_Europe
0,ARM,2017.0,Western Asia,2.564949,1.223775,8.29405,1.075002,0.641854,13.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,EGY,2017.0,Northern Africa,3.091042,-0.223144,8.009695,4.580365,0.788457,22.0,1.9,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,KAZ,2017.0,Central Asia,2.302585,3.50255,8.973351,2.892592,2.664447,10.0,1.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MEX,2017.0,Latin America and the Caribbean,2.564949,6.492997,9.06068,4.861052,1.353255,13.0,1.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ZAF,2017.0,Sub-Saharan Africa,3.610918,4.527209,8.599694,4.038127,2.195,37.0,1.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Linear Regression on log_GPC_ATL and log_MORT_RATE

In [153]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf



# Baseline single-regressor OLS: regress log_MORT_RATE on log_GPC_ATL.
# Both are columns already present in the frame (presumably pre-computed natural logs,
# judging by their names — confirm against the cleaning step).
results = smf.ols('log_MORT_RATE ~ log_GPC_ATL' , data=cleaned_data_10).fit()

# Inspect the results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          log_MORT_RATE   R-squared:                       0.812
Model:                            OLS   Adj. R-squared:                  0.811
Method:                 Least Squares   F-statistic:                     2497.
Date:                Thu, 11 Oct 2018   Prob (F-statistic):          3.90e-212
Time:                        21:44:57   Log-Likelihood:                -442.26
No. Observations:                 581   AIC:                             888.5
Df Residuals:                     579   BIC:                             897.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       8.6333      0.110     78.184      

In [154]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf



# Multiple regression: log_MORT_RATE on seven regressors, including the
# Sub_Saharan_Africa indicator column and GDP_GROW.
# NOTE: this rebinds `results`, replacing the single-regressor fit from the previous cell.
results = smf.ols('log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + IMM_MEAS + log_GPC_ATL + GDP_GROW' , data=cleaned_data_10).fit()

# Inspect the results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          log_MORT_RATE   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.920
Method:                 Least Squares   F-statistic:                     959.6
Date:                Thu, 11 Oct 2018   Prob (F-statistic):          1.18e-311
Time:                        21:45:07   Log-Likelihood:                -188.55
No. Observations:                 581   AIC:                             393.1
Df Residuals:                     573   BIC:                             428.0
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              8.8469      0

## What happens to the F-statistic if we fit a feature with no predictive value? (For example GDP growth, which, judging by eye, shows no correlation with MORT_RATE.)

In [155]:
# NOTE(review): a bare string literal only echoes the path as the cell's output; it does
# not render the figure. Presumably the scatter plot was meant to be displayed here — confirm.
'./images/GDP_GROW_vs_MORT_RATE.png'

'./images/GDP_GROW_vs_MORT_RATE.png'

In [156]:
# Negative-control fit: regress MORT_RATE on GDP_GROW, a feature expected to have no
# predictive value, to see how the F-statistic behaves.
# Note that neither side is log-transformed here (MORT_RATE, not log_MORT_RATE).
results = smf.ols('MORT_RATE ~ GDP_GROW', data=cleaned_data_10).fit()

# Inspect the results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:              MORT_RATE   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.8842
Date:                Thu, 11 Oct 2018   Prob (F-statistic):              0.347
Time:                        21:45:29   Log-Likelihood:                -3126.5
No. Observations:                 581   AIC:                             6257.
Df Residuals:                     579   BIC:                             6266.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     49.1991      2.935     16.762      0.0

In [157]:
from itertools import combinations 

# 1. Put the candidate features into lists.
# NOTE(review): despite the "aic_" prefix, best_combo() below ranks models by BIC.

# Untransformed candidates; not referenced by any cell visible in this section.
aic_trials = ['POP_GRO','URB_GRO', 'FER_RATE', 'GDP_GROW', 'IMM_MEAS', 'LIF_EXP','TEEN_FER_RATE']

# Candidates passed to best_combo() below. Nine features means 2**9 - 1 = 511
# combinations, i.e. 511 OLS fits per search.
aic_trials2 = ['LIF_EXP','FER_RATE','TEEN_FER_RATE','Sub_Saharan_Africa','IMM_MEAS','log_POP_TOT','log_GPC_ATL', 'GDP_GROW','log_FORR_AREA']


def all_combos_levels(trial_list, target='log_MORT_RATE'):
    """
    Build a patsy formula string for every non-empty combination of features.

    Parameters
    ----------
    trial_list : iterable of str
        Candidate feature (column) names for the right-hand side.
    target : str, optional
        Left-hand side of every formula. Defaults to 'log_MORT_RATE', the
        response that used to be hard-coded.

    Returns
    -------
    list of str
        Formulas such as 'log_MORT_RATE ~ a + b', ordered by combination size
        (all single features first, then pairs, ...) and, within one size, in
        itertools.combinations order. An empty trial_list yields an empty list.

    Notes
    -----
    The result has 2**n - 1 entries for n features, so it grows quickly.
    best_combo() calls this function to enumerate the formulas it scores.
    """
    # Materialise once so len() works for any iterable and the input is not consumed twice.
    features = list(trial_list)
    return [
        f"{target} ~ {' + '.join(combo)}"
        for size in range(1, len(features) + 1)
        for combo in combinations(features, size)
    ]
        
def best_combo(patsy_list, whole_list=False, data=None):
    """
    Fit an OLS model for every combination of the given features and rank them by BIC.

    Each formula produced by all_combos_levels(patsy_list) is fitted with
    statsmodels OLS and paired with its BIC rounded to 4 decimals. Lower BIC
    is better. It is useful to include some negative-control columns (ones
    that do not look like good linear-model candidates) so the effect of BIC
    minimisation can be judged by whether they are excluded.

    Parameters
    ----------
    patsy_list : list of str
        Candidate column names (not formulas); the formulas are generated here.
    whole_list : bool, optional
        If True, return every scored formula; otherwise only the best one.
    data : pandas.DataFrame, optional
        Frame to fit against. Defaults to the notebook-level `cleaned_data_10`,
        which keeps existing calls working unchanged.

    Returns
    -------
    tuple or list of tuple
        `(bic, formula)` for the lowest BIC, or the full list of such tuples
        sorted by ascending BIC when whole_list is True.

    Raises
    ------
    ValueError
        If whole_list is False and there are no formulas to score.
    """
    if data is None:
        # Fall back to the notebook-level frame, as the original implementation did.
        data = cleaned_data_10
    scored = [
        (round(smf.ols(formula, data=data).fit().bic, 4), formula)
        for formula in all_combos_levels(patsy_list)
    ]
    # sorted() is stable, so formulas with equal rounded BIC keep their generation order.
    ranked = sorted(scored, key=lambda pair: pair[0])
    if whole_list:
        return ranked
    if not ranked:
        raise ValueError("best_combo() needs at least one candidate feature")
    return ranked[0]

# 3. Select the feature combination with the lowest BIC (best_combo() scores with
#    results.bic, not AIC, despite the "aic_" prefix on the candidate lists).

# my-model

out = best_combo(aic_trials2, whole_list=False)
# `out` is a (bic, formula) tuple, so this loop prints the score and then the formula.
for o in out:
    print(o)

# Recorded results from earlier runs.
# NOTE(review): these scores are unrounded and the log_MORT_RATE entry differs from the
# output of this cell, so they presumably predate the current rounded-BIC scoring
# (possibly from an AIC-based version) — confirm or remove.

# best for MORT_RATE with aic_trials2
#(4982.721321568792, 'MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + IMM_MEAS + log_GPC_ATL')

# best for log_MORT_RATE with aic_trials2
#(391.2332055194836, 'log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + log_GPC_ATL + GDP_GROW')

# Debugging aid: preview the first 20 generated formulas.
#print(all_combos_levels(aic_trials2)[0:20])

418.4326
log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + log_GPC_ATL


In [148]:
# Materialise every candidate formula once so slices of it can be inspected in later cells.
# NOTE(review): an assignment produces no cell output, so the list shown beneath this cell
# presumably comes from an earlier version of the cell — re-run to refresh.
combo_list = all_combos_levels(aic_trials2)

['log_MORT_RATE ~ LIF_EXP',
 'log_MORT_RATE ~ FER_RATE',
 'log_MORT_RATE ~ TEEN_FER_RATE',
 'log_MORT_RATE ~ Sub_Saharan_Africa',
 'log_MORT_RATE ~ IMM_MEAS',
 'log_MORT_RATE ~ log_POP_TOT',
 'log_MORT_RATE ~ log_GPC_ATL',
 'log_MORT_RATE ~ GDP_GROW',
 'log_MORT_RATE ~ log_FORR_AREA',
 'log_MORT_RATE ~ LIF_EXP + FER_RATE']

In [120]:
# Peek at the tail of the formula list (the largest feature combinations).
# NOTE(review): [-10:-1] returns nine items and leaves out the final element, which is the
# formula containing every feature; use [-10:] if the last ten were intended.
combo_list[-10:-1]

['log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + IMM_MEAS + log_POP_TOT + log_GPC_ATL + GDP_GROW',
 'log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + IMM_MEAS + log_POP_TOT + log_GPC_ATL + log_FORR_AREA',
 'log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + IMM_MEAS + log_POP_TOT + GDP_GROW + log_FORR_AREA',
 'log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + IMM_MEAS + log_GPC_ATL + GDP_GROW + log_FORR_AREA',
 'log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + log_POP_TOT + log_GPC_ATL + GDP_GROW + log_FORR_AREA',
 'log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + IMM_MEAS + log_POP_TOT + log_GPC_ATL + GDP_GROW + log_FORR_AREA',
 'log_MORT_RATE ~ LIF_EXP + FER_RATE + Sub_Saharan_Africa + IMM_MEAS + log_POP_TOT + log_GPC_ATL + GDP_GROW + log_FORR_AREA',
 'log_MORT_RATE ~ LIF_EXP + TEEN_FER_RATE + Sub_Saharan_Africa + IMM_MEAS + log_POP_TOT + log_GPC_ATL + GDP_G

In [150]:
# Refit the formula that best_combo() reported as lowest-BIC so its full summary can be inspected.
# The formula is pasted in by hand, so it must be updated if the search result changes.
results = smf.ols('log_MORT_RATE ~ LIF_EXP + FER_RATE + TEEN_FER_RATE + Sub_Saharan_Africa + log_GPC_ATL', data=cleaned_data_10).fit()

In [152]:
# Display the fit summary as the cell's last expression (rich table rendering instead of print()).
results.summary()

0,1,2,3
Dep. Variable:,log_MORT_RATE,R-squared:,0.921
Model:,OLS,Adj. R-squared:,0.92
Method:,Least Squares,F-statistic:,1340.0
Date:,"Thu, 11 Oct 2018",Prob (F-statistic):,4.13e-314
Time:,21:41:37,Log-Likelihood:,-190.12
No. Observations:,581,AIC:,392.2
Df Residuals:,575,BIC:,418.4
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,8.8525,0.264,33.582,0.000,8.335,9.370
LIF_EXP,-0.0555,0.004,-14.110,0.000,-0.063,-0.048
FER_RATE,0.0955,0.019,5.066,0.000,0.058,0.133
TEEN_FER_RATE,0.0033,0.001,6.021,0.000,0.002,0.004
Sub_Saharan_Africa,-0.2210,0.053,-4.140,0.000,-0.326,-0.116
log_GPC_ATL,-0.2806,0.017,-16.672,0.000,-0.314,-0.248

0,1,2,3
Omnibus:,5.664,Durbin-Watson:,1.655
Prob(Omnibus):,0.059,Jarque-Bera (JB):,5.48
Skew:,-0.222,Prob(JB):,0.0646
Kurtosis:,3.169,Cond. No.,1810.0


In [None]:
## Applying np.power(x, 2) and np.log(x) to selected terms improves the model slightly according to adjusted R^2 and BIC

In [144]:
# Same five regressors as the lowest-BIC model, but with LIF_EXP squared and the two
# fertility rates log-transformed inside the patsy formula (requires `np` to be imported).
# NOTE(review): np.log() gives -inf/NaN for values <= 0 — assumes FER_RATE and
# TEEN_FER_RATE are strictly positive; confirm against the data.
results = smf.ols('log_MORT_RATE ~ np.power(LIF_EXP,2) + np.log(FER_RATE) + np.log(TEEN_FER_RATE) + Sub_Saharan_Africa + log_GPC_ATL', data=cleaned_data_10).fit()

In [146]:
# Plain-text summary of the transformed-terms fit, for comparison with the previous model's R-squared and BIC.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          log_MORT_RATE   R-squared:                       0.942
Model:                            OLS   Adj. R-squared:                  0.941
Method:                 Least Squares   F-statistic:                     1867.
Date:                Thu, 11 Oct 2018   Prob (F-statistic):               0.00
Time:                        21:25:13   Log-Likelihood:                -100.37
No. Observations:                 581   AIC:                             212.7
Df Residuals:                     575   BIC:                             238.9
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 5.90

In [145]:
# Fitted coefficients at full precision; each one applies to the transformed term named in
# its index label (e.g. LIF_EXP squared), not to the raw column.
results.params

Intercept                5.902379
np.power(LIF_EXP, 2)    -0.000460
np.log(FER_RATE)         0.483231
np.log(TEEN_FER_RATE)    0.179653
Sub_Saharan_Africa      -0.214864
log_GPC_ATL             -0.194434
dtype: float64

## Train/Test Set Cross Validation