In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [57]:
data=pd.read_csv("/kaggle/input/life-expectancy-who/Life Expectancy Data.csv")

In [58]:
data.shape

(2938, 22)

In [59]:
data.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [61]:
data=data.drop(columns=["Country","Year"])

In [62]:
data.columns

Index(['Status', 'Life expectancy ', 'Adult Mortality', 'infant deaths',
       'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
       'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ',
       ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years',
       ' thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')

In [63]:
data['Status'].unique()


array(['Developing', 'Developed'], dtype=object)

In [64]:
data['Status'] = data['Status'].map({
    'Developing': 1,
    'Developed': 0
})


In [65]:
data['Status'].unique()


array([1, 0])

In [66]:
data['Status'].value_counts()

Status
1    2426
0     512
Name: count, dtype: int64

In [67]:
data.head()
# Clean column names
data.columns = data.columns.str.strip()

# Continue with train-test split and model training


In [68]:
data.isnull().sum()


Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
BMI                                 34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
GDP                                448
Population                         652
thinness  1-19 years                34
thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

In [69]:
data = data.fillna(data.mean())


In [70]:
data.isnull().sum()

Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
BMI                                0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
HIV/AIDS                           0
GDP                                0
Population                         0
thinness  1-19 years               0
thinness 5-9 years                 0
Income composition of resources    0
Schooling                          0
dtype: int64

In [72]:
X=data.drop("Life expectancy",axis=1)
Y=data['Life expectancy']

In [73]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

In [74]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [75]:
y_pred = model.predict(X_test)

In [None]:
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")


In [79]:


# Custom input in the same order and with exact names as in training
custom_input = pd.DataFrame([{
    'Status': 1,  # 1 = Developing, 0 = Developed (example, adjust as per your encoding)
    'Adult Mortality': 150,
    'infant deaths': 5,
    'Alcohol': 3.2,
    'percentage expenditure': 1500,
    'Hepatitis B': 90,
    'Measles': 20,  # Removed extra space
    'BMI': 24.5,
    'under-five deaths': 2,  # Removed extra space
    'Polio': 90,
    'Total expenditure': 5.5,
    'Diphtheria': 85,  # Removed extra space
    'HIV/AIDS': 0.1,
    'GDP': 9000,
    'Population': 25000000,
    'thinness  1-19 years': 2.5,  # Leave as-is (double space is correct based on your data)
    'thinness 5-9 years': 2.3,
    'Income composition of resources': 0.65,
    'Schooling': 12
}])

# Ensure no extra spaces in column names
custom_input.columns = custom_input.columns.str.strip()

# Predict
predicted_life_expectancy = model.predict(custom_input)
print(f"🎯 Predicted Life Expectancy: {predicted_life_expectancy[0]:.2f} years")


🎯 Predicted Life Expectancy: 73.65 years


In [80]:
import joblib
joblib.dump(model, 'model.sav')  # Best choice for scikit-learn


['model.sav']

In [81]:
joblib.dump(model, 'life_expectancy_model.pkl')


['life_expectancy_model.pkl']