In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from pandas_profiling import ProfileReport
%matplotlib inline

In [3]:
wids = pd.read_csv('training_v2.csv')
wids.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [4]:
df = wids[['age', 'bmi', 'weight', 'height', 'aids', 
'cirrhosis',
'diabetes_mellitus',
'hepatic_failure',
'immunosuppression',
'leukemia',
'lymphoma',
'solid_tumor_with_metastasis']]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91713 entries, 0 to 91712
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   age                          87485 non-null  float64
 1   bmi                          88284 non-null  float64
 2   weight                       88993 non-null  float64
 3   height                       90379 non-null  float64
 4   aids                         90998 non-null  float64
 5   cirrhosis                    90998 non-null  float64
 6   diabetes_mellitus            90998 non-null  float64
 7   hepatic_failure              90998 non-null  float64
 8   immunosuppression            90998 non-null  float64
 9   leukemia                     90998 non-null  float64
 10  lymphoma                     90998 non-null  float64
 11  solid_tumor_with_metastasis  90998 non-null  float64
dtypes: float64(12)
memory usage: 8.4 MB


In [6]:
df2 = df[['age', 'bmi', 'weight', 'height']]

#### Use bmi, weight and height to predict age

In [14]:
# Create training dataset. 
X = df2[df2['age'].notna()].drop('age', axis=1)
y = df2[df2['age'].notna()]['age']

# Create holdout dataset
X_holdout = df2[df2['age'].isna()].drop('age', axis=1)
y_holdout = df2[df2['age'].isna()]['age']

In [41]:
# Split training dataset to test and train.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 30)


In [42]:
# Standardize X_train and X_test
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [46]:
N_FOLDS = 3
MAX_BOOST_ROUNDS = 700
LEARNING_RATE = .1

In [56]:
# Use LightGBM to predict age 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


# Create lgb dataset
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.1 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'mae'          # or 'mae'
params['sub_feature'] = 0.50      # feature_fraction 
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 600  # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval)

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

Starting training...
[1]	valid_0's l1: 13.6042
[2]	valid_0's l1: 13.5746
[3]	valid_0's l1: 13.5503
[4]	valid_0's l1: 13.5259
[5]	valid_0's l1: 13.5056
[6]	valid_0's l1: 13.4881
[7]	valid_0's l1: 13.474
[8]	valid_0's l1: 13.4635
[9]	valid_0's l1: 13.4545
[10]	valid_0's l1: 13.4451
[11]	valid_0's l1: 13.4371
[12]	valid_0's l1: 13.4303
[13]	valid_0's l1: 13.4248
[14]	valid_0's l1: 13.42
[15]	valid_0's l1: 13.4157
[16]	valid_0's l1: 13.4121
[17]	valid_0's l1: 13.4094
[18]	valid_0's l1: 13.4067
[19]	valid_0's l1: 13.4045
[20]	valid_0's l1: 13.4025
Saving model...
Starting predicting...
The rmse of prediction is: 16.653059073952342


In [55]:
cv_results = lgb.cv(params, lgb_train, num_boost_round=MAX_BOOST_ROUNDS, nfold=N_FOLDS, 
                    verbose_eval=20, early_stopping_rounds=40)

[20]	cv_agg's l1: 13.2883 + 0.012111
[40]	cv_agg's l1: 13.279 + 0.0142361
[60]	cv_agg's l1: 13.2721 + 0.0151502
[80]	cv_agg's l1: 13.2717 + 0.0157793
[100]	cv_agg's l1: 13.2754 + 0.0172355


#### Use everything to predict age


In [62]:
# Take out hospital death
df3 = wids.drop('hospital_death', axis=1 )

In [66]:
# Drop irrelevant columns
df3.drop(['encounter_id',
'hospital_id',
'patient_id','icu_id'], axis=1, inplace=True)

In [67]:
# Clean apache_2_diagnosis and apache_3j_diagnosis
df3['apache_3j_diagnosis'] = df3['apache_3j_diagnosis'].astype('str').str.split('.',n=1,expand=True)[0]
df3['apache_3j_diagnosis'] = df3['apache_3j_diagnosis'].astype(str)

In [68]:
df3['apache_2_diagnosis'] = df3['apache_2_diagnosis'].astype('str').str.split('.',n=1,expand=True)[0]
df3['apache_2_diagnosis'] = df3['apache_2_diagnosis'].astype(str)

In [73]:
# One-hot encoding
df3 = pd.get_dummies(df3)

In [75]:
# Create training dataset. 
X = df3[df3['age'].notna()].drop('age', axis=1)
y = df3[df3['age'].notna()]['age']

# Create holdout dataset
X_holdout = df3[df3['age'].isna()].drop('age', axis=1)
y_holdout = df3[df3['age'].isna()]['age']

In [76]:
# Split training dataset to test and train.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 30)


In [77]:
N_FOLDS = 3
MAX_BOOST_ROUNDS = 700
LEARNING_RATE = .1

In [81]:
# Use LightGBM to predict age 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


# Create lgb dataset
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.1 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'mae'          # or 'mae'
params['sub_feature'] = 0.50      # feature_fraction 
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 600  # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0
params['early_stopping_round'] = 5

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval)

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

Starting training...




[1]	valid_0's l1: 13.1986
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 12.8741
[3]	valid_0's l1: 12.5103
[4]	valid_0's l1: 12.241
[5]	valid_0's l1: 11.9591
[6]	valid_0's l1: 11.6984
[7]	valid_0's l1: 11.4527
[8]	valid_0's l1: 11.2816
[9]	valid_0's l1: 11.0813
[10]	valid_0's l1: 10.9407
[11]	valid_0's l1: 10.815
[12]	valid_0's l1: 10.655
[13]	valid_0's l1: 10.5063
[14]	valid_0's l1: 10.3779
[15]	valid_0's l1: 10.2712
[16]	valid_0's l1: 10.1942
[17]	valid_0's l1: 10.124
[18]	valid_0's l1: 10.0193
[19]	valid_0's l1: 9.9449
[20]	valid_0's l1: 9.86723
Did not meet early stopping. Best iteration is:
[20]	valid_0's l1: 9.86723
Saving model...
Starting predicting...
The rmse of prediction is: 12.457221242838877


In [82]:
cv_results = lgb.cv(params, lgb_train, num_boost_round=MAX_BOOST_ROUNDS, nfold=N_FOLDS, 
                    verbose_eval=20, early_stopping_rounds=40)



[20]	cv_agg's l1: 10.0861 + 0.030792
[40]	cv_agg's l1: 9.23013 + 0.0257659
[60]	cv_agg's l1: 8.83549 + 0.0160841
[80]	cv_agg's l1: 8.59302 + 0.0139971
[100]	cv_agg's l1: 8.44422 + 0.00403659
[120]	cv_agg's l1: 8.3349 + 0.00553144
[140]	cv_agg's l1: 8.24531 + 0.0152682
[160]	cv_agg's l1: 8.18206 + 0.0194748
[180]	cv_agg's l1: 8.136 + 0.01989
[200]	cv_agg's l1: 8.10186 + 0.018832
[220]	cv_agg's l1: 8.06277 + 0.0223721
[240]	cv_agg's l1: 8.03035 + 0.0229205
[260]	cv_agg's l1: 8.00973 + 0.0262268
[280]	cv_agg's l1: 7.99237 + 0.0287952
[300]	cv_agg's l1: 7.97687 + 0.0248108
[320]	cv_agg's l1: 7.96284 + 0.0268796
[340]	cv_agg's l1: 7.94835 + 0.0302806
[360]	cv_agg's l1: 7.93724 + 0.0296999
[380]	cv_agg's l1: 7.92916 + 0.028784


In [83]:
sum(wids['age'].isna())

4228