In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import joblib

# Paste your RAW file URL here
CSV_URL = "https://raw.githubusercontent.com/tshewangla/bhutan-healthcare-ds-project/refs/heads/main/data/processed/life_expectancy_btn_features.csv"

df = pd.read_csv(CSV_URL)
df.head()

Unnamed: 0,year,country,sex,life_expectancy,ci_low,ci_high,years_since_2000,is_female,is_male,ci_span,life_expectancy_3yr_ma
0,2000,Bhutan,Both sexes,65.989561,65.33172,66.840308,0,0,0,1.508588,65.989561
1,2001,Bhutan,Both sexes,67.180309,66.594776,67.916677,1,0,0,1.321901,66.584935
2,2002,Bhutan,Both sexes,67.845665,67.083124,68.558745,2,0,0,1.475621,67.005179
3,2003,Bhutan,Both sexes,68.384536,67.766933,69.153985,3,0,0,1.387051,67.803504
4,2004,Bhutan,Both sexes,68.885143,68.229492,69.604473,4,0,0,1.374981,68.371781


In [None]:
#life_expectancy
# Target
y = df['life_expectancy']

# Feature columns — all except target, country, sex (sex is already encoded)
feature_cols = [
    'year',
    'years_since_2000',
    'is_female',
    'is_male',
    'ci_low',
    'ci_high',
    'ci_span',
    'life_expectancy_3yr_ma'
]

X = df[feature_cols]

X.head()

Unnamed: 0,year,years_since_2000,is_female,is_male,ci_low,ci_high,ci_span,life_expectancy_3yr_ma
0,2000,0,0,0,65.33172,66.840308,1.508588,65.989561
1,2001,1,0,0,66.594776,67.916677,1.321901,66.584935
2,2002,2,0,0,67.083124,68.558745,1.475621,67.005179
3,2003,3,0,0,67.766933,69.153985,1.387051,67.803504
4,2004,4,0,0,68.229492,69.604473,1.374981,68.371781


In [None]:
#train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape

((52, 8), (14, 8))

In [None]:
#Train Regression Model
model = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)

model.fit(X_train, y_train)

In [None]:
# Predict
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) # Calculate RMSE by taking the square root of MSE
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R²:", r2)

#comparison
results = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': y_pred
})
results

RMSE: 0.12647108270692478
R²: 0.9974906152576739


Unnamed: 0,Actual,Predicted
0,70.739731,70.903863
1,72.69367,72.771425
2,65.989561,66.252911
3,66.748684,66.829945
4,69.368954,69.320938
5,72.868018,72.876511
6,73.284246,73.23959
7,72.455353,72.430651
8,73.927206,73.945034
9,71.954482,71.821955


In [12]:
joblib.dump(model, "model.pkl")


['model.pkl']

In [13]:
from google.colab import files
files.download("model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>