In [28]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupShuffleSplit

# Notebook-wide settings: a shared seed constant, and no cap on the number of
# columns shown when a DataFrame is displayed.
RANDOM_STATE = 42
pd.options.display.max_columns = None

In [29]:
# Reload the raw life-expectancy CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer="data/raw/Life Expectancy Data.csv")
df.head(n=5)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [30]:
# Normalise the column labels: trim surrounding whitespace, then turn every
# single space into "_" (two consecutive spaces therefore become "__").
# rename() hands back a new DataFrame, which is bound to `df`.
df = df.rename(columns=lambda label: label.strip().replace(" ", "_"))

# Roles of the special columns used by the following cells
target, group_col, time_col = "Life_expectancy", "Country", "Year"

df[[target, group_col, time_col]].head()

Unnamed: 0,Life_expectancy,Country,Year
0,65.0,Afghanistan,2015
1,59.9,Afghanistan,2014
2,59.9,Afghanistan,2013
3,59.5,Afghanistan,2012
4,59.2,Afghanistan,2011


In [31]:
# Dataset size, then the 12 columns with the highest fraction of missing cells
print("Shape:", df.shape)
missing_ratio = (
    df.isnull()                      # True where a cell is missing
    .mean()                          # per-column share of missing cells
    .sort_values(ascending=False)    # worst columns first
)
display(missing_ratio.iloc[:12])

Shape: (2938, 22)


Population                         0.221920
Hepatitis_B                        0.188223
GDP                                0.152485
Total_expenditure                  0.076923
Alcohol                            0.066031
Income_composition_of_resources    0.056841
Schooling                          0.055480
thinness_5-9_years                 0.011572
thinness__1-19_years               0.011572
BMI                                0.011572
Polio                              0.006467
Diphtheria                         0.006467
dtype: float64

In [32]:
# Separate the prediction target from the candidate feature columns
X = df.drop(columns=[target])
y = df[target]

# The grouping and time columns must not be used as model features, so they
# are filtered out of both the numeric and the categorical column lists.
non_feature_cols = (group_col, time_col)
num_cols = [
    name
    for name in X.select_dtypes(include=[np.number]).columns
    if name not in non_feature_cols
]
cat_cols = [
    name
    for name in X.select_dtypes(exclude=[np.number]).columns
    if name not in non_feature_cols
]

print("Numeric cols:", len(num_cols), num_cols)
print("Categorical cols:", cat_cols)

Numeric cols: 18 ['Adult_Mortality', 'infant_deaths', 'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness__1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']
Categorical cols: ['Status']


In [33]:
# Log transform
# Right-skewed columns (identified during EDA) that receive a log1p transform.
log_cols = [
    "GDP", "Population", "percentage_expenditure", "Measles",
    "HIV/AIDS", "infant_deaths", "under-five_deaths",
]

def log1p_selected(X_df):
    """Return a copy of ``X_df`` with ``log1p`` applied to the skewed columns.

    Every name in the module-level ``log_cols`` list that is also a column of
    ``X_df`` is cast to float and replaced by ``np.log1p`` of its values.
    Names that are absent from ``X_df`` are skipped, all other columns are
    carried over unchanged, and the input frame itself is not modified.
    """
    present = [name for name in log_cols if name in X_df.columns]
    replacements = {name: np.log1p(X_df[name].astype(float)) for name in present}
    # assign() works on a copy and overwrites the existing columns in place,
    # so the column order of the input is preserved.
    return X_df.assign(**replacements)

# Wrap the function as a scikit-learn transformer so it can act as a pipeline
# step; "one-to-one" declares that output feature names equal the input names.
log_transformer = FunctionTransformer(
    func=log1p_selected,
    feature_names_out="one-to-one",
)

In [34]:
# Missing-value handling and encoding, one sub-pipeline per column type

# Numeric columns: fill gaps with the column median, then standardise.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent level, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each column list to its sub-pipeline; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
)

In [35]:
# Full preprocessing chain: log1p on the skewed columns first, then the
# per-type imputation / scaling / encoding.
full_pipeline = Pipeline([("log", log_transformer), ("prep", preprocessor)])

In [36]:
# split train/test
# Split by group (country) so that all rows of one country end up on the same
# side of the split. Note that for GroupShuffleSplit a float test_size is the
# share of *groups* placed in the test set, not the share of rows.
# The groups are read from X itself so they always line up with the rows being
# split, and the seed comes from the notebook-wide RANDOM_STATE constant.
groups = X[group_col]
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train_raw, X_test_raw = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
y_train, y_test = y.iloc[train_idx].copy(), y.iloc[test_idx].copy()

# Invariants of a grouped split: every row is used exactly once and no group
# appears on both sides.
assert len(train_idx) + len(test_idx) == len(X), "split does not cover every row"
assert set(X_train_raw[group_col]).isdisjoint(X_test_raw[group_col]), \
    "group leakage: a group appears in both train and test"

# NOTE(review): the visible cells never remove rows whose target is missing.
# If the target column contains NaN, y_train / y_test will contain them too —
# confirm and drop those rows before fitting an estimator.

# drop columns that should not be used as features
# (the ColumnTransformer only selects num_cols/cat_cols and drops the rest, so
# this mainly keeps the reported input shape limited to real feature columns)
drop_cols = [c for c in [group_col, time_col] if c in X_train_raw.columns]
X_train_fit = X_train_raw.drop(columns=drop_cols)
X_test_fit  = X_test_raw.drop(columns=drop_cols)

# run the full pipeline: log → impute → scale/encode
# Statistics (medians, means/stds, categories) are learned on the training
# rows only and then re-applied to the test rows.
X_train_proc = full_pipeline.fit_transform(X_train_fit, y_train)
X_test_proc  = full_pipeline.transform(X_test_fit)

print("Train:", X_train_fit.shape, "→", X_train_proc.shape)
print("Test :", X_test_fit.shape,  "→", X_test_proc.shape)
print("NaN after preprocessing? Train:",
      np.isnan(X_train_proc).any(), " Test:", np.isnan(X_test_proc).any())

Train: (2344, 19) → (2344, 20)
Test : (594, 19) → (594, 20)
NaN after preprocessing? Train: False  Test: False
