In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import country_converter as coco
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,root_mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from feature_engine.selection import DropFeatures
from feature_engine.transformation import LogCpTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.compose import ColumnTransformer
from feature_engine.imputation import MeanMedianImputer
import sklearn
import preprocessors as pp

sklearn.set_config(transform_output="pandas")

In [18]:
df = pd.read_csv("../Life Expectancy Data.csv")

In [19]:
df.columns = df.columns.str.strip()
df.dropna(subset=['Life expectancy'], inplace=True)
df = df.sort_values(['Country', 'Year'])

In [20]:
df.columns 

Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness  1-19 years',
       'thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')

In [21]:
# 2. Define X and y
X = df.drop('Life expectancy', axis=1)
y = df['Life expectancy']

# 3. THE SPLIT
# Do this BEFORE the pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
drop_vars = ["Year","Country",'Hepatitis B', 'Polio', 'Diphtheria',"infant deaths","percentage expenditure","thinness 5-9 years", "Income composition of resources",'thinness  1-19 years']
num_vars_with_na = ['Alcohol',
 'Hepatitis B',
 'BMI',
 'Polio',
 'Total expenditure',
 'Diphtheria',
 'GDP',
 'Population',
 'Schooling']
log_plus_one_vars = ['under-five deaths','Population','Measles','GDP']
binarize_vars = ['HIV/AIDS']
ohe_vars = ['Status', 'Continent']

In [36]:
pipe = Pipeline([
    # 1. Feature Generation (Needs 'Country')
    ('continent_gen', pp.ContinentConverter(country_col='Country')),

    # 2. Custom Imputers (Needs 'Country' and 'Status')
    ('by_country_imputer', pp.CountryInterpolator(variables=num_vars_with_na)),
    ('group_imputer', pp.GroupedMedianImputer(variables=num_vars_with_na)),
    ('final_imputer', MeanMedianImputer(
        imputation_method='median', 
        variables=num_vars_with_na
    )),
    # 3. CLEANING STEP (Crucial!) 
    # Drop 'Country' here so SimpleImputer only sees numbers and 'Status'/'Continent'
    # 'Status' and 'Continent' are still strings, so we must drop or encode them too.
    ('drop', DropFeatures(features_to_drop=drop_vars)), 


    
    # 5. Transformations
    ('binarizer', pp.Binarizer(variables=binarize_vars)),
    ('log', LogCpTransformer(variables=log_plus_one_vars, C=1)),
    
    # 6. Encode Strings to Numbers
    ('one_hot', OneHotEncoder(variables=ohe_vars, drop_last=True)),
    ('robust_scaler',RobustScaler()),
    ('reg', RandomForestRegressor(n_estimators=300))
])


In [37]:
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test,y_pred)
print(r2)

-0.10059773058649535
