In [62]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso

import pickle

%matplotlib inline

In [13]:
df = pd.read_csv('./data/led.csv')
df.head()

Unnamed: 0,Country,Year,Status,Lifeexpectancy,AdultMortality,infantdeaths,Alcohol,percentageexpenditure,HepatitisB,Measles,...,Polio,Totalexpenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness1-19years,thinness5-9years,Incomecompositionofresources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [14]:
df = df[df['Year'] == 2015]
df

Unnamed: 0,Country,Year,Status,Lifeexpectancy,AdultMortality,infantdeaths,Alcohol,percentageexpenditure,HepatitisB,Measles,...,Polio,Totalexpenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness1-19years,thinness5-9years,Incomecompositionofresources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
16,Albania,2015,Developing,77.8,74.0,0,4.60,364.975229,99.0,0,...,99.0,6.00,99.0,0.1,3954.227830,28873.0,1.2,1.3,0.762,14.2
32,Algeria,2015,Developing,75.6,19.0,21,,0.000000,95.0,63,...,95.0,,95.0,0.1,4132.762920,39871528.0,6.0,5.8,0.743,14.4
48,Angola,2015,Developing,52.4,335.0,66,,0.000000,64.0,118,...,7.0,,64.0,1.9,3695.793748,2785935.0,8.3,8.2,0.531,11.4
64,AntiguaandBarbuda,2015,Developing,76.4,13.0,0,,0.000000,99.0,0,...,86.0,,99.0,0.2,13566.954100,,3.3,3.3,0.784,13.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2858,Venezuela(BolivarianRepublicof),2015,Developing,74.1,157.0,9,,0.000000,87.0,0,...,87.0,,87.0,0.1,,,1.6,1.5,0.769,14.3
2874,VietNam,2015,Developing,76.0,127.0,28,,0.000000,97.0,256,...,97.0,,97.0,0.1,,,14.2,14.5,0.678,12.6
2890,Yemen,2015,Developing,65.7,224.0,37,,0.000000,69.0,468,...,63.0,,69.0,0.1,,,13.6,13.4,0.499,9.0
2906,Zambia,2015,Developing,61.8,33.0,27,,0.000000,9.0,9,...,9.0,,9.0,4.1,1313.889646,161587.0,6.3,6.1,0.576,12.5


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 2922
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country                       183 non-null    object 
 1   Year                          183 non-null    int64  
 2   Status                        183 non-null    object 
 3   Lifeexpectancy                183 non-null    float64
 4   AdultMortality                183 non-null    float64
 5   infantdeaths                  183 non-null    int64  
 6   Alcohol                       6 non-null      float64
 7   percentageexpenditure         183 non-null    float64
 8   HepatitisB                    174 non-null    float64
 9   Measles                       183 non-null    int64  
 10  BMI                           181 non-null    float64
 11  under-fivedeaths              183 non-null    int64  
 12  Polio                         183 non-null    float64
 13  Tota

In [16]:
def run_experiment(df, x_columns, y_column, imputer_class, scaler_class,
                   model_class, cv_params = {}, pipeline_params = {}):
    pipeline = Pipeline([
        ['imputer', imputer_class()],
        ['scaler', scaler_class()],
        ['model', model_class()]
    ])
    
    X = df[x_columns]
    y = df[y_column]
    
    scores = cross_val_score(pipeline, X, y, **cv_params, fit_params=pipeline_params)
    
    # After getting the cross-val score, break down the pipeline to get a model that we can analyze
    imputer = pipeline['imputer']
    imputed_data = imputer.fit_transform(X)
    
    scaler = pipeline['scaler']
    scaled_data = scaler.fit_transform(imputed_data)
    model = pipeline['model']
    model.fit(scaled_data, y)
    
    return scores, model, imputer, scaler

In [57]:
x_columns = [
            'AdultMortality', 'infantdeaths', 'percentageexpenditure', 'HepatitisB',
             'BMI', 'Polio', 'Schooling', 'Status_Developing'
            ]

y_column = 'Lifeexpectancy'

cv_params = {
    "scoring": "neg_mean_absolute_error"
}

scores, model, imputer, scaler = run_experiment(df, x_columns, y_column, IterativeImputer, MinMaxScaler, LinearRegression, cv_params)

In [18]:
def print_regression_equation(model):
    print(f"Intercept: {model.intercept_:.3f}")
    print("Coefficients:")
    for name, coef in zip(x_columns, model.coef_):
        print(f" {name}: {coef:.3f}")

In [59]:
equation = f"{model.intercept_}"
for coefficient, feature in zip(model.coef_, x_columns):
    equation = equation + f" + ({coefficient:.3f} x {feature})"
equation

'56.259537733134835 + (-0.036 x AdultMortality) + (-0.003 x infantdeaths) + (0.004 x percentageexpenditure) + (0.031 x HepatitisB) + (0.014 x BMI) + (0.031 x Polio) + (1.241 x Schooling) + (-1.128 x Status_Developing)'

In [19]:
print_regression_equation(model)

Intercept: 61.300
Coefficients:
 AdultMortality: -17.285
 infantdeaths: -4.372
 percentageexpenditure: 1.449
 HepatitisB: 2.781
 Measles: 2.903
 BMI: 0.913
 Polio: 2.824
 GDP: 2.629
 Schooling: 20.111


In [20]:
df['Lifeexpectancy'].std(), df['Lifeexpectancy'].mean(), scores.mean(), scores.std()

(8.123706147645304, 71.61693989071038, -2.682173819508433, 0.20091016961071903)

In [21]:
us = df[df['Country'] == 'UnitedStatesofAmerica'].iloc[0]
expected = us['Lifeexpectancy']
us_data = scaler.transform(imputer.transform([us[x_columns]]))
predicted = model.predict(us_data)
expected, predicted[0]

(79.3, 80.80025606450867)

In [47]:

imputers = [SimpleImputer, IterativeImputer, KNNImputer]
regressors = [LinearRegression, Ridge, Lasso]

best_model = None
best_score = -999

for i, imputer in enumerate(imputers):
    for j, regressor in enumerate(regressors):
        scores, model, curr_imputer, curr_scaler = run_experiment(df, x_columns, y_column,
                                                                  imputer, MinMaxScaler, regressor,
                                                                  cv_params)
        mae = scores.mean()
        
        if (abs(mae) < abs(best_score)):
            best_score = mae
            best_model = model
            print(f"New Best: Imputer {i}, Regression {j}")
            
print(best_score)

New Best: Imputer 0, Regression 0
New Best: Imputer 1, Regression 0
New Best: Imputer 2, Regression 0
-2.690806646347201


In [48]:
print_regression_equation(best_model)

Intercept: 62.640
Coefficients:
 AdultMortality: -17.012
 infantdeaths: -5.364
 percentageexpenditure: 1.735
 HepatitisB: 2.963
 Measles: 4.021
 BMI: 1.187
 Polio: 2.711
 GDP: 2.198
 Schooling: 18.649
 Status_Developing: -1.058


In [30]:
df = pd.get_dummies(df, columns=['Status'], drop_first=True)

KeyError: "None of [Index(['Status'], dtype='object')] are in the [columns]"

In [63]:
imputer = IterativeImputer()
X = imputer.fit_transform(df[x_columns])
y = df[y_column]
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
mae = mean_absolute_error(y_pred, y)
mae

2.568150877196889

In [41]:
print_regression_equation(model)

Intercept: 56.260
Coefficients:
 AdultMortality: -0.036
 infantdeaths: -0.003
 percentageexpenditure: 0.004
 HepatitisB: 0.031
 BMI: 0.014
 Polio: 0.031
 Schooling: 1.241
 Status_Developing: -1.128


In [64]:
exports = {
    "imputer": imputer,
    "model": model,
    "features": x_columns,
    "target": y_column,
    "mean_absolute_error": mae
}

pickle.dump(exports, open('./model.pickle', 'wb'))