In [1]:
import pandas as pd
import numpy as np

# Raw-file URL of the processed life-expectancy CSV hosted on GitHub.
CSV_URL = "https://raw.githubusercontent.com/tshewangla/bhutan-healthcare-ds-project/refs/heads/main/data/processed/life_expectancy_btn_clean.csv"

# Fetch the CSV over HTTP and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=CSV_URL)

# Show the first five rows as this cell's displayed result.
df.head(n=5)


Unnamed: 0,year,country,sex,life_expectancy,ci_low,ci_high
0,2000,Bhutan,Both sexes,65.989561,65.33172,66.840308
1,2000,Bhutan,Female,66.27221,65.492444,67.08742
2,2000,Bhutan,Male,65.733433,65.095977,66.570549
3,2001,Bhutan,Both sexes,67.180309,66.594776,67.916677
4,2001,Bhutan,Female,67.663072,66.964688,68.370021


In [2]:
def _trailing_mean(values):
    """Mean over the current row and up to two preceding rows of one group."""
    return values.rolling(window=3, min_periods=1).mean()


# Order rows by sex, then year, so the rolling window further down walks each
# sex's series in ascending year order; the index is renumbered from 0.
df = df.sort_values(by=['sex', 'year']).reset_index(drop=True)

# Time index: offset of each row's year from 2000.
df['years_since_2000'] = df['year'].sub(2000)

# Indicator flags for sex. Rows labelled "Both sexes" get 0 in both columns,
# which makes them the baseline category.
df['is_female'] = df['sex'].eq('Female').astype(int)
df['is_male'] = df['sex'].eq('Male').astype(int)

# Width of the confidence interval (upper bound minus lower bound).
df['ci_span'] = df['ci_high'].sub(df['ci_low'])

# Moving average of life expectancy over a 3-row window, computed per sex.
# The window counts rows, not calendar years, so it equals a 3-year average
# only if each sex has one row per consecutive year (TODO confirm).
# min_periods=1 means the first rows of each group average fewer than 3 values.
df['life_expectancy_3yr_ma'] = df.groupby('sex')['life_expectancy'].transform(_trailing_mean)

# Preview the first ten rows of the enriched frame.
df.head(n=10)


Unnamed: 0,year,country,sex,life_expectancy,ci_low,ci_high,years_since_2000,is_female,is_male,ci_span,life_expectancy_3yr_ma
0,2000,Bhutan,Both sexes,65.989561,65.33172,66.840308,0,0,0,1.508588,65.989561
1,2001,Bhutan,Both sexes,67.180309,66.594776,67.916677,1,0,0,1.321901,66.584935
2,2002,Bhutan,Both sexes,67.845665,67.083124,68.558745,2,0,0,1.475621,67.005179
3,2003,Bhutan,Both sexes,68.384536,67.766933,69.153985,3,0,0,1.387051,67.803504
4,2004,Bhutan,Both sexes,68.885143,68.229492,69.604473,4,0,0,1.374981,68.371781
5,2005,Bhutan,Both sexes,69.368954,68.771921,70.041252,5,0,0,1.269331,68.879544
6,2006,Bhutan,Both sexes,69.95563,69.435534,70.512895,6,0,0,1.077361,69.403242
7,2007,Bhutan,Both sexes,70.476308,69.885664,71.191696,7,0,0,1.306032,69.933631
8,2008,Bhutan,Both sexes,70.909038,70.348314,71.601786,8,0,0,1.253471,70.446992
9,2009,Bhutan,Both sexes,71.270071,70.590981,71.964892,9,0,0,1.373911,70.885139


In [3]:
# List every column now present on the frame, under a heading line.
print("Columns after feature engineering:", df.columns.tolist(), sep="\n")

# Descriptive statistics from DataFrame.describe(); the leading "\n" in the
# heading leaves a blank line between the two reports.
print("\nSummary of numeric columns:", df.describe(), sep="\n")


Columns after feature engineering:
['year', 'country', 'sex', 'life_expectancy', 'ci_low', 'ci_high', 'years_since_2000', 'is_female', 'is_male', 'ci_span', 'life_expectancy_3yr_ma']

Summary of numeric columns:
              year  life_expectancy     ci_low    ci_high  years_since_2000  \
count    66.000000        66.000000  66.000000  66.000000         66.000000   
mean   2010.500000        71.363622  70.742094  72.062175         10.500000   
std       6.392905         2.606439   2.624229   2.579809          6.392905   
min    2000.000000        65.733433  65.095977  66.570549          0.000000   
25%    2005.000000        69.435956  68.838191  70.116164          5.000000   
50%    2010.500000        71.814607  71.217063  72.552234         10.500000   
75%    2016.000000        73.281179  72.690825  73.954405         16.000000   
max    2021.000000        75.939756  75.477979  76.664141         21.000000   

       is_female    is_male    ci_span  life_expectancy_3yr_ma  
count  66.0

In [4]:
# Destination file for the engineered feature table (relative to the working directory).
output_path = "life_expectancy_btn_features.csv"

# Write the frame without the row index, so only the data columns are saved.
df.to_csv(path_or_buf=output_path, index=False)

# Echo the path as this cell's displayed result.
output_path


'life_expectancy_btn_features.csv'

In [5]:
# Offer the exported feature CSV as a browser download when running in Google Colab.
# The import stays in this cell and is guarded because `google.colab` is only
# importable inside the Colab runtime; on any other kernel the unguarded import
# would raise and break a full "Restart & Run All".
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; skipping browser download of {output_path!r}.")
else:
    # Reuse the path defined by the export cell instead of repeating the
    # filename literal, so the download always matches the file just written.
    files.download(output_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>