In [1]:
# Make project root importable so "from src...." works
import sys, os
sys.path.insert(0, os.path.abspath(".."))  # from notebooks/ -> project root


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

TARGET = "Life expectancy " 

df = pd.read_csv("../data/Life Expectancy Data.csv")

# we cannot train on missing targets
df = df.dropna(subset=[TARGET]).reset_index(drop=True)

print("Shape:", df.shape)
df.head(3)

Shape: (2928, 22)


Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9


In [None]:
from sklearn.model_selection import KFold, cross_val_score
from src.model import build_model_pipeline  # uses FeatureMaker + Preprocessor + chosen model

X = df.drop(TARGET, axis=1)
y = df[TARGET]

# 5-fold CV object (shuffle=True is recommended)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Building a pipeline with Ridge(alpha=1.0) as an example
pipe = build_model_pipeline("Ridge", alpha=1.0)

# RMSE scores (note: scikit-learn returns NEGATIVE RMSE for minimization)
rmse_scores = -cross_val_score(pipe, X, y,
                               scoring="neg_root_mean_squared_error",
                               cv=kf, n_jobs=-1)

# R2 scores
r2_scores = cross_val_score(pipe, X, y, scoring="r2", cv=kf, n_jobs=-1)

print("RMSE (each fold):", np.round(rmse_scores, 3))
print("RMSE mean ± std :", f"{rmse_scores.mean():.3f} ± {rmse_scores.std():.3f}")
print("R2   (each fold):", np.round(r2_scores, 3))
print("R2   mean ± std :", f"{r2_scores.mean():.3f} ± {r2_scores.std():.3f}")

RMSE (each fold): [2.09  2.149 2.348 2.068 2.062]
RMSE mean ± std : 2.144 ± 0.107
R2   (each fold): [0.949 0.951 0.941 0.952 0.951]
R2   mean ± std : 0.949 ± 0.004


In [4]:
# GridSearch for Ridge (I choosed Ridge but it can be changed by Lasso)
from sklearn.model_selection import GridSearchCV

which_model = "Ridge" 

pipe = build_model_pipeline(which_model)

# logarithmic grid for alpha: from 1e-3 to 1e+2
alphas = np.logspace(-3, 2, 20)

param_grid = {
    "regressor__alpha": alphas
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",  # minimize RMSE
    cv=5,
    n_jobs=-1,
    return_train_score=True
)

gs.fit(X, y)

best_rmse = -gs.best_score_
best_params = gs.best_params_
print("Best RMSE:", round(best_rmse, 3))
print("Best params:", best_params)

#inspecting full results as a DataFrame
cvres = pd.DataFrame(gs.cv_results_)
cvres_sorted = cvres.sort_values("mean_test_score", ascending=False)
cvres_sorted.head()


Best RMSE: 4.63
Best params: {'regressor__alpha': np.float64(54.555947811685144)}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
18,0.049389,0.008232,0.01747,0.008377,54.555948,{'regressor__alpha': 54.555947811685144},-4.654123,-4.030162,-4.424986,-4.502795,...,-4.62996,0.498515,1,-3.543819,-3.639641,-3.605942,-3.566529,-3.54081,-3.579348,0.03809
19,0.044275,0.00609,0.012994,0.00504,100.0,{'regressor__alpha': 100.0},-4.644341,-4.025408,-4.48247,-4.532503,...,-4.630827,0.469387,2,-3.748534,-3.854831,-3.823286,-3.764787,-3.755686,-3.789425,0.042052
17,0.046944,0.010187,0.017064,0.00723,29.763514,{'regressor__alpha': 29.763514416313193},-4.693577,-4.036146,-4.398035,-4.477669,...,-4.638306,0.519135,3,-3.270109,-3.350387,-3.315193,-3.299504,-3.251289,-3.297296,0.03464
16,0.050699,0.014533,0.015962,0.007533,16.237767,{'regressor__alpha': 16.23776739188721},-4.776412,-4.058564,-4.425299,-4.465566,...,-4.671587,0.531501,4,-2.950248,-3.006367,-2.973436,-2.981374,-2.91143,-2.964571,0.032053
15,0.057264,0.006683,0.016621,0.001023,8.858668,{'regressor__alpha': 8.858667904100823},-4.921162,-4.126653,-4.533927,-4.485778,...,-4.749261,0.528583,5,-2.636599,-2.657514,-2.633774,-2.658267,-2.577421,-2.632715,0.029467


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from src.model import build_model_pipeline

pipe = build_model_pipeline("RandomForest")

dist = {
    "regressor__n_estimators": randint(100, 200),
    "regressor__max_depth":    randint(3, 10),
    "regressor__min_samples_leaf": randint(3, 10),
    "regressor__min_samples_split": randint(5, 20),
    "regressor__max_features": ["sqrt", "log2"]
}

rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=dist,
    n_iter=40,
    scoring="neg_root_mean_squared_error",
    cv=5,
    random_state=42,
    n_jobs=-1,
    return_train_score=True
)

rs.fit(X, y)

best_rmse = -rs.best_score_
best_params = rs.best_params_

print("Best RMSE:", round(best_rmse, 3))
print("Best params:", best_params)


Best RMSE: 3.584
Best params: {'regressor__max_depth': 9, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 3, 'regressor__min_samples_split': 16, 'regressor__n_estimators': 107}


In [None]:
import os, joblib

os.makedirs("models", exist_ok=True)

best_model = rs.best_estimator_ 
save_path = "models/best_pipeline.pkl"
joblib.dump(best_model, save_path)

print("Saved to:", save_path)


Saved to: models/best_pipeline.pkl
