In [255]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import (
    MinMaxScaler, # No impact
    StandardScaler, # Best combination
    RobustScaler, # No impact
    MaxAbsScaler, # No impact
    QuantileTransformer, # Not good
    PowerTransformer,
    Normalizer, # Not good
    FunctionTransformer,
    KBinsDiscretizer
)
import statsmodels.api as sm
import statsmodels.tools
from sklearn.linear_model import LassoCV

In [40]:
df = pd.read_csv("Life Expectancy Data.csv")

In [41]:
# df.head()

In [42]:
def pre_split_feature_eng(df):
    df = df.copy()
    # df = pd.get_dummies(df, columns = ['Country'], drop_first = True, prefix = 'Country', dtype = int)
    df = pd.get_dummies(df, columns = ['Region'], drop_first = True, prefix = 'Region', dtype = int)
    df['log_GDP'] = np.log(df['GDP_per_capita'])
    df['immunisation_avg'] = (df['Polio'] + df['Diphtheria'] + df['Hepatitis_B']) / 3 # + df['Measles']
    return df

In [43]:
OHE_df = pre_split_feature_eng(df)

In [44]:
OHE_df.head()

Unnamed: 0,Country,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,...,Region_Asia,Region_Central America and Caribbean,Region_European Union,Region_Middle East,Region_North America,Region_Oceania,Region_Rest of Europe,Region_South America,log_GDP,immunisation_avg
0,Turkiye,2015,11.1,13.0,105.824,1.32,97,65,27.8,97,...,0,0,0,1,0,0,0,0,9.306196,97.0
1,Spain,2015,2.7,3.3,57.9025,10.35,97,94,26.0,97,...,0,0,1,0,0,0,0,0,10.155879,97.0
2,India,2007,51.5,67.9,201.0765,1.57,60,35,21.2,67,...,1,0,0,0,0,0,0,0,6.981006,63.666667
3,Guyana,2006,32.8,40.5,222.1965,5.68,93,74,25.3,92,...,0,0,0,0,0,0,0,1,8.329899,92.666667
4,Israel,2012,3.4,4.3,57.951,2.89,97,89,27.0,94,...,0,0,0,1,0,0,0,0,10.433969,95.0


In [45]:
feature_cols = list(OHE_df.columns)
feature_cols.remove('Life_expectancy')

In [46]:
X = OHE_df[feature_cols]
y = OHE_df['Life_expectancy']

In [266]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [267]:
X_train.columns

Index(['Country', 'Year', 'Infant_deaths', 'Under_five_deaths',
       'Adult_mortality', 'Alcohol_consumption', 'Hepatitis_B', 'Measles',
       'BMI', 'Polio', 'Diphtheria', 'Incidents_HIV', 'GDP_per_capita',
       'Population_mln', 'Thinness_ten_nineteen_years',
       'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developed',
       'Economy_status_Developing', 'Region_Asia',
       'Region_Central America and Caribbean', 'Region_European Union',
       'Region_Middle East', 'Region_North America', 'Region_Oceania',
       'Region_Rest of Europe', 'Region_South America', 'log_GDP',
       'immunisation_avg'],
      dtype='object')

In [268]:
scale_cols = [
    'Year', 'Infant_deaths', 'Under_five_deaths',
    'Adult_mortality', 'Alcohol_consumption', 'Hepatitis_B', 'Measles',
    'BMI', 'Polio', 'Diphtheria', 'Incidents_HIV', 'GDP_per_capita',
    'Population_mln', 'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developed',
    'Economy_status_Developing', 'log_GDP', 'immunisation_avg'
]

scaler = StandardScaler()
scaler.fit(X_train[scale_cols])

def feature_eng(df, scaler, scale_cols):
    df = df.copy()
    df[scale_cols] = scaler.transform(df[scale_cols])
    df = sm.add_constant(df)
    return df

In [269]:
X_train_fe = feature_eng(X_train, scaler, scale_cols)

In [320]:
# dropping 'Economy_status_Developing' at index: 17
# dropping 'Polio' at index: 8
# dropping 'Infant_deaths' at index: 1
# dropping 'Diphtheria' at index: 7
# dropping 'Schooling' at index: 12
# dropping 'Hepatitis_B' at index: 4
# dropping 'Thinness_ten_nineteen_years' at index: 9
# dropping 'Adult_mortality' at index: 2
# dropping 'BMI' at index: 4
# dropping 'Measles' at index: 3
# Cond 17.9 RMSE 2.2

# dropping 'Economy_status_Developing' at index: 17
# dropping 'Polio' at index: 8
# dropping 'Infant_deaths' at index: 1
# dropping 'Diphtheria' at index: 7
# dropping 'Schooling' at index: 12
# dropping 'Hepatitis_B' at index: 4
# Cond 39.6 RMSE 1.2

# If cond # doesn't error we're good (<100 maybe even 300, 1000 sounds like a lot)

#'Adult_mortality', 'Under_five_deaths', 'Economy_status_Developed', 'Region_Central America and Caribbean', 
#'GDP_per_capita', 'Region_South America', 'Region_Oceania', 'Region_European Union', 'Year', 'Incidents_HIV', 
#'Region_Asia', 'Region_Rest of Europe', 'Region_North America'

# feature_cols = [
#     'const', 
#     #'Country', 
#     'Year', 
#     #'Infant_deaths', 
#     'Under_five_deaths',
#     'Adult_mortality', 
#     #'Alcohol_consumption', 
#     #'Hepatitis_B', 
#     #'Measles',
#     #'BMI', 
#     #'Polio', 
#     #'Diphtheria', 
#     'Incidents_HIV', 
#     'GDP_per_capita',
#     #'Population_mln', 
#     #'Thinness_ten_nineteen_years',
#     #'Thinness_five_nine_years', 
#     #'Schooling', 
#     'Economy_status_Developed',
#     #'Economy_status_Developing', 
#     'Region_Asia',
#     'Region_Central America and Caribbean', 
#     'Region_European Union',
#     #'Region_Middle East', 
#     'Region_North America', 
#     'Region_Oceania',
#     'Region_Rest of Europe', 
#     'Region_South America',
#     #'log_GDP', 
#     #'immunisation_avg'
# ]

feature_cols = ['const', 'Schooling', 'Adult_mortality', 'Under_five_deaths', 'Economy_status_Developing', 'Region_Central America and Caribbean', 'Region_South America', 'GDP_per_capita', 'Region_Oceania', 'Region_European Union', 'Year', 'log_GDP', 'BMI', 'Incidents_HIV', 'Region_Rest of Europe', 'Region_North America']

# feature_cols = ['const', 'Year', 'Under_five_deaths', 'Adult_mortality', 'Alcohol_consumption',
#        'Measles', 'BMI', 'Incidents_HIV', 'GDP_per_capita', 'Population_mln',
#        'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Schooling',
#        'Economy_status_Developing', 'Region_Asia',
#        'Region_Central America and Caribbean', 'Region_European Union',
#        'Region_Middle East', 'Region_North America', 'Region_Oceania',
#        'Region_Rest of Europe', 'Region_South America', 'immunisation_avg']]
lin_reg = sm.OLS(y_train, X_train_fe[feature_cols])
results = lin_reg.fit()
results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.984
Model:,OLS,Adj. R-squared:,0.984
Method:,Least Squares,F-statistic:,9259.0
Date:,"Thu, 10 Jul 2025",Prob (F-statistic):,0.0
Time:,12:26:13,Log-Likelihood:,-3674.0
No. Observations:,2291,AIC:,7380.0
Df Residuals:,2275,BIC:,7472.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,68.6217,0.048,1427.513,0.000,68.527,68.716
Schooling,0.3440,0.054,6.392,0.000,0.238,0.449
Adult_mortality,-5.4056,0.071,-76.183,0.000,-5.545,-5.266
Under_five_deaths,-3.4429,0.065,-52.861,0.000,-3.571,-3.315
Economy_status_Developing,-1.0182,0.066,-15.357,0.000,-1.148,-0.888
Region_Central America and Caribbean,1.8301,0.097,18.905,0.000,1.640,2.020
Region_South America,1.5339,0.109,14.012,0.000,1.319,1.749
GDP_per_capita,0.1304,0.056,2.345,0.019,0.021,0.239
Region_Oceania,-0.8664,0.125,-6.917,0.000,-1.112,-0.621

0,1,2,3
Omnibus:,33.967,Durbin-Watson:,2.016
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48.839
Skew:,0.165,Prob(JB):,2.48e-11
Kurtosis:,3.635,Cond. No.,21.6


# Testing

In [317]:
y_pred_train = results.predict(X_train_fe[feature_cols])

rmse_train = statsmodels.tools.eval_measures.rmse(y_train, y_pred_train)

print(rmse_train)

1.2028940271721988


In [318]:
X_test_fe = feature_eng(X_test, scaler, scale_cols)
X_test_fe = X_test_fe[feature_cols]

In [319]:
y_test_pred = results.predict(X_test_fe)
rmse = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)
print(rmse)

1.2263385948064993


## Calculate VIF

### Determine which features to use. Bigger threshold removes fewer features.

In [302]:
def calculate_vif(X, thresh = 10):
    variables = list(range(X.shape[1]))
    dropped = True
    while dropped:
        dropped = False
        # this bit uses list comprehension to gather all the VIF values of the different variables
        vif = [variance_inflation_factor(X.iloc[:, variables].values, ix)
               for ix in range(X.iloc[:, variables].shape[1])]
        
        maxloc = vif.index(max(vif)) # getting the index of the highest VIF value
        if max(vif) > thresh:
            print('dropping \'' + X.iloc[:, variables].columns[maxloc] +
                  '\' at index: ' + str(maxloc))
            del variables[maxloc] # we delete the highest VIF value on condition that it's higher than the threshold
            dropped = True # if we deleted anything, we set the 'dropped' value to True to stay in the while loop
    print('Remaining variables:')
    print(X.columns[variables]) # finally, we print the variables that are still in our set
    return X.iloc[:, variables] # and return our X cut down to the remaining variables

In [303]:
VIF_variables = calculate_vif(X_train_fe[['Year', 'Infant_deaths', 'Under_five_deaths',
       'Adult_mortality', 'Alcohol_consumption', 'Hepatitis_B', 'Measles',
       'BMI', 'Polio', 'Diphtheria', 'Incidents_HIV', 'GDP_per_capita',
       'Population_mln', 'Thinness_ten_nineteen_years',
       'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developed',
       'Economy_status_Developing', 'Region_Asia',
       'Region_Central America and Caribbean', 'Region_European Union',
       'Region_Middle East', 'Region_North America', 'Region_Oceania',
       'Region_Rest of Europe', 'Region_South America', 'log_GDP', 'immunisation_avg']])

  vif = 1. / (1. - r_squared_i)


dropping 'Hepatitis_B' at index: 5
dropping 'Economy_status_Developed' at index: 15
dropping 'Infant_deaths' at index: 1
dropping 'Diphtheria' at index: 7
dropping 'Polio' at index: 6
dropping 'log_GDP' at index: 21
Remaining variables:
Index(['Year', 'Under_five_deaths', 'Adult_mortality', 'Alcohol_consumption',
       'Measles', 'BMI', 'Incidents_HIV', 'GDP_per_capita', 'Population_mln',
       'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Schooling',
       'Economy_status_Developing', 'Region_Asia',
       'Region_Central America and Caribbean', 'Region_European Union',
       'Region_Middle East', 'Region_North America', 'Region_Oceania',
       'Region_Rest of Europe', 'Region_South America', 'immunisation_avg'],
      dtype='object')


# Stepwise

In [284]:
def stepwise_selection(X, y, threshold_in = 0.01, threshold_out = 0.3, verbose = True):
    # The function is checking for p-values (whether features are statistically significant) - lower is better
    included = [] # this is going to be the list of features we keep
    while True:
        changed = False
        # forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index = excluded, dtype = 'float64')
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # we add the feature with the lowest (best) p-value under the threshold to our 'included' list
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval)) # specifying the verbose text


        # backward step: removing features if new features added to the list make them statistically insignificant
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        # if the p-value exceeds the upper threshold, the feature will be dropped from the 'included' list
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [285]:
result = stepwise_selection(X_train_fe[VIF_variables.columns], y_train)

print('resulting features:')
print(result)

Add  Schooling                      with p-value 0.0
Add  Adult_mortality                with p-value 0.0
Add  Under_five_deaths              with p-value 0.0
Add  Economy_status_Developing      with p-value 7.57568e-86
Add  Region_Central America and Caribbean with p-value 2.06412e-51
Add  Region_South America           with p-value 1.27536e-38
Add  GDP_per_capita                 with p-value 2.75002e-35
Add  Region_Oceania                 with p-value 1.61421e-27
Add  Region_European Union          with p-value 1.08404e-14
Add  Year                           with p-value 5.84386e-06
Add  log_GDP                        with p-value 1.44027e-06
Add  BMI                            with p-value 8.71722e-12
Add  Incidents_HIV                  with p-value 0.00048103
Add  Region_Rest of Europe          with p-value 0.00723277
Add  Region_North America           with p-value 0.00111448
resulting features:
['Schooling', 'Adult_mortality', 'Under_five_deaths', 'Economy_status_Developing', 'Re

# Lasso

In [315]:
lasso = LassoCV(cv=5)
lasso.fit(X_train_fe[['Schooling', 'Adult_mortality', 'Under_five_deaths', 'Economy_status_Developing', 'Region_Central America and Caribbean', 'Region_South America', 'GDP_per_capita', 'Region_Oceania', 'Region_European Union', 'Year', 'log_GDP', 'BMI', 'Incidents_HIV', 'Region_Rest of Europe', 'Region_North America']], y_train)
print(f'Selected features: {X_train_fe[['Schooling', 'Adult_mortality', 'Under_five_deaths', 'Economy_status_Developing', 'Region_Central America and Caribbean', 'Region_South America', 'GDP_per_capita', 'Region_Oceania', 'Region_European Union', 'Year', 'log_GDP', 'BMI', 'Incidents_HIV', 'Region_Rest of Europe', 'Region_North America']].columns[lasso.coef_ != 0].tolist()}')

Selected features: ['Schooling', 'Adult_mortality', 'Under_five_deaths', 'Economy_status_Developing', 'Region_Central America and Caribbean', 'Region_South America', 'GDP_per_capita', 'Region_Oceania', 'Region_European Union', 'Year', 'log_GDP', 'BMI', 'Incidents_HIV', 'Region_Rest of Europe', 'Region_North America']
