In [248]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import country_converter as coco
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,root_mean_squared_error

In [249]:
# Read the raw life-expectancy CSV (path is relative to the notebook's folder)
# and show its dimensions followed by a preview of the first rows.
raw_csv_path = "../Life Expectancy Data.csv"
df = pd.read_csv(raw_csv_path)
print(df.shape)
df.head()

(2938, 22)


Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


# Strip white spaces from column names

In [250]:
# Remove leading/trailing whitespace from every column label.
# Interior whitespace is untouched, so 'thinness  1-19 years' keeps its double space.
df.columns = [label.strip() for label in df.columns]

In [251]:
# Display the column labels after stripping to confirm the exact spellings used below.
df.columns

Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness  1-19 years',
       'thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')

# Delete 10 rows where life expectancy is missing

In [252]:
# Rows without a target value cannot be used for supervised learning, so discard them.
target_col = 'Life expectancy'
df = df.dropna(subset=[target_col])

# Sanity check: number of missing target values left (expected to be 0).
df[target_col].isna().sum()

np.int64(0)

# Drop correlated features (pairs listed as kept / dropped)
    * under-five deaths / infant deaths
    * GDP / percentage expenditure
    * thinness 1-19 / thinness 5-9 (both thinness columns are dropped)
    * Schooling / Income composition of resources

In [253]:
# Columns removed because they are strongly correlated with a feature that is kept.
# Note that both thinness columns are removed, not just one of the pair.
correlated_cols = [
    "infant deaths",
    "percentage expenditure",
    "thinness 5-9 years",
    "Income composition of resources",
    'thinness  1-19 years',  # the double space is part of the real column label
]
df = df.drop(columns=correlated_cols)
df.shape

(2928, 17)

In [254]:
# Split the columns by kind.
# Numeric columns are selected by dtype family instead of testing `dtype == 'O'`:
# text columns are not guaranteed to be object-dtype (e.g. pandas' dedicated string
# dtype), and the old test would then classify them as numeric.
num_vars = df.select_dtypes(include="number").columns.tolist()
cat_vars = [var for var in df.columns if var not in num_vars]

# Numeric columns that contain at least one missing value; these are the ones imputed later.
num_vars_with_na = [var for var in num_vars if df[var].isna().any()]

## impute

In [255]:
# Order rows chronologically inside each country so interpolation follows the time axis.
df = df.sort_values(['Country', 'Year'])


def _interpolate_both_ways(series):
    """Interpolate gaps in one column of one country, also filling the leading/trailing ends."""
    return series.interpolate(limit_direction='both')


# Step 1: fill gaps inside each country's own series.
per_country = df.groupby('Country')[num_vars_with_na]
df[num_vars_with_na] = per_country.transform(_interpolate_both_ways)

# Step 2: for GDP and Population, fall back to the median of the same development Status.
for col in ('GDP', 'Population'):
    status_median = df.groupby('Status')[col].transform('median')
    df[col] = df[col].fillna(status_median)

# Step 3: anything still missing gets the overall column median.
overall_medians = df[num_vars_with_na].median()
df[num_vars_with_na] = df[num_vars_with_na].fillna(overall_medians)

In [256]:
# Count the remaining missing values per column after imputation.
df.isna().sum()

Country              0
Year                 0
Status               0
Life expectancy      0
Adult Mortality      0
Alcohol              0
Hepatitis B          0
Measles              0
BMI                  0
under-five deaths    0
Polio                0
Total expenditure    0
Diphtheria           0
HIV/AIDS             0
GDP                  0
Population           0
Schooling            0
dtype: int64

# Binarize
    * HIV/AIDS (0 when the value is <= 0.1, otherwise 1)

In [257]:
# Turn HIV/AIDS into a 0/1 flag: 0 for values at or below 0.1, 1 for everything else.
at_or_below_floor = df['HIV/AIDS'] <= 0.1
df['HIV/AIDS'] = np.where(at_or_below_floor, 0, 1)

# Confirm that only the two flag values remain.
df['HIV/AIDS'].unique()

array([0, 1])

# log(1 + x) transform:
    * under-five deaths
    * Population
    * GDP - this one doesn't need the +1 (log1p is still applied to it below)
    * Measles
    

In [258]:
# Columns that get a log(1 + x) transform to compress their range.
log_plus_one_vars = [
    'under-five deaths',
    'Population',
    'Measles',
    'GDP',
]
untransformed = df[log_plus_one_vars]
df[log_plus_one_vars] = np.log1p(untransformed)

# country converter to see continents

In [259]:
# Converter object used to translate country names into continents.
cc = coco.CountryConverter()

# Convert each distinct country name once (far fewer lookups than one per row) ...
unique_countries = df['Country'].unique()
continent_map = cc.convert(names=unique_countries, to='continent')

# ... then build a name -> continent lookup and apply it to every row.
country_to_continent = {
    country: continent
    for country, continent in zip(unique_countries, continent_map)
}
df['Continent'] = df['Country'].map(country_to_continent)

# Show the distinct continents that were produced.
df['Continent'].unique()

array(['Asia', 'Europe', 'Africa', 'America', 'Oceania'], dtype=object)

# Winsorization
    * Adult Mortality (capped at the 95th percentile)
    * Alcohol (capped at the 95th percentile)
    * BMI (more aggressive)
    

In [260]:
# Columns whose upper tail is capped by the clipping cell that follows.
vars_to_clip = ['Adult Mortality', 'Alcohol']

In [261]:
# Cap each listed column at its own 95th percentile; the lower tail is left as it is.
upper_caps = {column: df[column].quantile(0.95) for column in vars_to_clip}
for column, cap in upper_caps.items():
    df[column] = df[column].clip(upper=cap)

In [262]:
# Treat BMI values outside [10, 65] as invalid by blanking them out.
bmi_in_range = (df['BMI'] >= 10) & (df['BMI'] <= 65)
df.loc[~bmi_in_range, 'BMI'] = np.nan

# Refill the blanks from each country's own series first ...
bmi_by_country = df.groupby('Country')['BMI']
df['BMI'] = bmi_by_country.transform(lambda s: s.interpolate(limit_direction='both'))

# ... and use the Status-level median for whatever is still missing.
status_bmi_median = df.groupby('Status')['BMI'].transform('median')
df['BMI'] = df['BMI'].fillna(status_bmi_median)

In [263]:
# Re-check missing values per column after the BMI clean-up.
df.isna().sum()

Country              0
Year                 0
Status               0
Life expectancy      0
Adult Mortality      0
Alcohol              0
Hepatitis B          0
Measles              0
BMI                  0
under-five deaths    0
Polio                0
Total expenditure    0
Diphtheria           0
HIV/AIDS             0
GDP                  0
Population           0
Schooling            0
Continent            0
dtype: int64

# drop columns:
    * year
    * country

In [264]:
# Year and Country have served their purpose (ordering, grouping, continent lookup)
# and are not used as model features.
df = df.drop(columns=["Year", "Country"])

# Categorical Encoding


In [265]:
# One-hot encode the two categorical columns as 0/1 integers.
# drop_first=True omits the first level of each column, which becomes the implicit baseline.
categorical_cols = ['Status', 'Continent']
df = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
    dtype=int,
)
df.head()


Unnamed: 0,Life expectancy,Adult Mortality,Alcohol,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,Schooling,Status_Developing,Continent_America,Continent_Asia,Continent_Europe,Continent_Oceania
15,54.8,321.0,0.01,62.0,8.784622,12.2,4.812184,24.0,8.2,24.0,0,4.74979,12.590508,5.5,1,0,1,0,0
14,55.3,316.0,0.01,63.0,9.078294,12.6,4.812184,35.0,7.8,33.0,0,4.774887,14.902881,5.9,1,0,1,0,0
13,56.2,3.0,0.01,64.0,7.818832,13.0,4.812184,36.0,7.76,36.0,0,5.240932,16.90564,6.2,1,0,1,0,0
12,56.7,295.0,0.01,65.0,6.683361,13.4,4.812184,41.0,8.82,41.0,0,5.296959,14.676226,6.5,1,0,1,0,0
11,57.0,293.0,0.02,67.0,6.146329,13.8,4.795791,5.0,8.79,5.0,0,5.39427,16.99851,6.8,1,0,1,0,0


# Check whether the vaccine coverage columns are correlated
Replace them with a single immunization score


In [266]:
# Display the columns that remain after encoding, before the vaccine columns are combined.
df.columns

Index(['Life expectancy', 'Adult Mortality', 'Alcohol', 'Hepatitis B',
       'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'Schooling',
       'Status_Developing', 'Continent_America', 'Continent_Asia',
       'Continent_Europe', 'Continent_Oceania'],
      dtype='object')

In [267]:
# Collapse the three vaccine-coverage columns into their row-wise mean,
# then remove the originals so only the combined score is kept.
imm_vars = ['Hepatitis B', 'Polio', 'Diphtheria']
df['Immunization_Score'] = df[imm_vars].mean(axis=1)
df = df.drop(columns=imm_vars)
df.columns

Index(['Life expectancy', 'Adult Mortality', 'Alcohol', 'Measles', 'BMI',
       'under-five deaths', 'Total expenditure', 'HIV/AIDS', 'GDP',
       'Population', 'Schooling', 'Status_Developing', 'Continent_America',
       'Continent_Asia', 'Continent_Europe', 'Continent_Oceania',
       'Immunization_Score'],
      dtype='object')

In [268]:
# Separate the feature matrix from the target series.
target_name = "Life expectancy"
y = df[target_name]
X = df.drop(columns=[target_name])

In [269]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is row-wise, so rows of one country can fall on both sides,
# and imputation/clipping ran before the split - the test score may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((2342, 16), (586, 16), (2342,), (586,))

# use RobustScaler to better deal with remaining outliers

In [270]:
# Ask the scaler to hand back DataFrames (column names preserved) instead of arrays.
scaler = RobustScaler().set_output(transform="pandas")

# Learn the scaling statistics from the training split only, then reuse them on the test split.
# This overwrites X_train / X_test, so re-running the cell would scale already-scaled data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The output is a DataFrame, so it can be previewed directly.
X_train.head()

Unnamed: 0,Adult Mortality,Alcohol,Measles,BMI,under-five deaths,Total expenditure,HIV/AIDS,GDP,Population,Schooling,Status_Developing,Continent_America,Continent_Asia,Continent_Europe,Continent_Oceania,Immunization_Score
2077,-0.419355,-0.355969,0.297613,0.616099,-0.473197,-0.97098,0.0,1.675874,0.0,-0.128205,0.0,0.0,1.0,0.0,0.0,0.277108
2825,-0.083871,0.436026,-0.491491,0.306502,-0.269402,0.657255,0.0,0.721196,0.369454,0.435897,0.0,1.0,0.0,0.0,0.0,-0.963855
1295,-0.464516,0.80772,1.035339,0.349845,-0.065607,0.785882,0.0,1.232015,-1.360194,0.74359,-1.0,0.0,0.0,1.0,0.0,0.13253
1581,1.640645,-0.355969,0.680034,-0.916409,0.75862,0.657255,1.0,-0.680791,-0.041429,-0.589744,0.0,0.0,0.0,0.0,0.0,-0.012048
671,-0.187097,0.062902,-0.491491,0.162539,-0.269402,0.17098,0.0,0.335082,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.26506


In [271]:
# A notebook cell only renders its last expression, so a bare `df.isnull().sum()` on the
# first line was computed and silently thrown away. Make the check explicit instead:
# fail loudly if any missing value is still present in the modelling frame.
assert df.isna().sum().sum() == 0, "unexpected missing values remain in df"

# Columns that originally contained missing values (computed before imputation).
num_vars_with_na

['Alcohol',
 'Hepatitis B',
 'BMI',
 'Polio',
 'Total expenditure',
 'Diphtheria',
 'GDP',
 'Population',
 'Schooling']

In [276]:
from sklearn.linear_model import Lasso

# Linear baseline with L1 regularisation, trained on the scaled training split.
reg = Lasso(alpha=0.01).fit(X_train, y_train)

# Score the held-out split with R2.
y_pred = reg.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.8171189220328864


Scale

In [277]:

from sklearn.ensemble import RandomForestRegressor

# Random forest on the same split.
# random_state fixes the forest's internal randomness so the reported R2 is identical
# on every Restart & Run All; without it the score drifts from run to run.
# The exact value will therefore differ slightly from scores recorded with an unseeded forest.
reg = RandomForestRegressor(n_estimators=300, random_state=42)
reg.fit(X_train, y_train)

# Score the held-out split with R2.
y_pred = reg.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(r2)

0.954038946386092


In [278]:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R2 of `reg` on the training split only;
# the mean and spread are steadier than a single train/test score.
scores = cross_val_score(
    estimator=reg,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
print(f"Mean CV R2: {scores.mean():.4f} (+/- {scores.std():.4f})")

Mean CV R2: 0.9511 (+/- 0.0073)


In [279]:
from xgboost import XGBRegressor

# Gradient-boosted trees as a further candidate model.
# NOTE(review): in the recorded run this scored below the random forest on the test split,
# so boosting should not be assumed to be the stronger model here without tuning.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)
xgb_reg = XGBRegressor(**xgb_params)
xgb_reg.fit(X_train, y_train)

print(f"XGBoost R2: {xgb_reg.score(X_test, y_test):.4f}")

XGBoost R2: 0.9367


In [280]:
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter candidates for the random forest.
param_dist = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None] # 'auto' is deprecated
}

# Sample 20 combinations and rank them by 5-fold cross-validated R2.
# Both the search and the forest are seeded: otherwise the sampled combinations and the
# fitted trees change on every run, and best_params_ / best_score_ cannot be reproduced.
random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

print(f"Best Params: {random_search.best_params_}")
print(f"Best CV Score: {random_search.best_score_:.4f}")

Best Params: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 20}
Best CV Score: 0.9506
