# Setting up the environment

In [None]:
## loading packages

import os
import time
import math
import random
from random import sample
from random import seed

import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from scipy.stats import randint
from scipy.stats import uniform
from scipy.stats import pearsonr

import joblib



In [None]:
# Mount Google Drive into the Colab runtime so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch to the project folder so that the relative paths used later in the
# notebook (e.g. 'data/train_all_dummies.csv') resolve against it.
wd_path = '/content/drive/MyDrive/nhanes_crf_model'
os.chdir(wd_path)
# Last expression of the cell: echoes the current working directory as output.
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive/nhanes_crf_model'

## Extended model

In [None]:
# Predictor columns of the extended model, in the order they are passed to the
# regressor. Every name must exist as a column in both CSV files read below.
extended_variables = [
    'INDFMPIR', 'health_insurance', 'htn_history', 'age',
    'ALT', 'AST', 'BUN', 'GLU', 'LDH', 'CHOL', 'TOTPRO', 'POTAS', 'SODI',
    'albuminuria', 'CHL', 'HBA1C', 'CREATININE', 'BILIRUBIN', 'CALCIUM',
    'BICARBONATE',
    'SY_mean', 'DI_mean', 'BMXBMI', 'BMXWAIST', 'BPXPLS', 'BMXWT', 'BMXHT',
    'WT_DIFF_KG', 'lwlstyr', 'exerc2lwlastyr',
    'ARM_PF', 'ARM_BMD', 'ARM_LEAN',
    'LEG_PF', 'LEG_BMD', 'LEG_LEAN',
    'TR_PF', 'TR_BMD', 'TR_LEAN',
    'TWMT_log', 'walk_bike', 'gender_Male',
    'race_ethn_Non-Hispanic Black', 'race_ethn_Non-Hispanic White',
    'race_ethn_Other',
    'educ_High school diploma or GED', 'educ_Less than high school',
    'marital_status_other',
    'family_income_Low income',
    'employment_status_Unemployed', 'employment_status_Working',
    'smoking_status_Former smoker', 'smoking_status_Never smoker',
    'alcohol_intake_Heavier drinker', 'alcohol_intake_Light drinker',
    'alcohol_intake_Moderate drinker', 'alcohol_intake_Never drinker',
    'physical_activity_Inactive', 'physical_activity_Insufficient active',
]

# Read both splits (paths are relative to the working directory) and keep only
# the selected predictors plus the target column of each split.
train_extended_dmy = pd.read_csv('data/train_all_dummies.csv')[
    [*extended_variables, 'y_train']
]
test_extended_dmy = pd.read_csv('data/test_all_dummies.csv')[
    [*extended_variables, 'y_test']
]

# Training split: predictors (X) and target (y).
X_train_extended_dmy = train_extended_dmy.drop(columns=['y_train'])
y_train_extended_dmy = train_extended_dmy['y_train']

# Test split: predictors (X) and target (y).
X_test_extended_dmy = test_extended_dmy.drop(columns=['y_test'])
y_test_extended_dmy = test_extended_dmy['y_test']

In [None]:
## random search 100 times
search_iter = 100

# Hyperparameter search space. List entries are sampled uniformly from their
# values; scipy's uniform(loc, scale) draws from the interval [loc, loc + scale].
# NOTE(review): LightGBM only applies bagging when bagging_freq > 0, so
# 'subsample' has no effect for candidates that draw bagging_freq = 0 --
# confirm this is intended.
param_dist_lgb = {
    'learning_rate': [0.1, 0.01, 0.05],
    'num_leaves': [15, 31, 63, 127, 255, 511, 1023, 2047],
    'min_child_samples': [1, 5, 10, 15, 20],
    'subsample': uniform(loc=0.4, scale=0.5),
    'reg_lambda': uniform(loc=0.0, scale=0.3),
    'bagging_freq': [0, 5, 10, 15, 30],
    'n_estimators': [30, 50, 100, 200, 500, 1000],
    'feature_fraction': uniform(loc=0.4, scale=0.5),
}

# Base estimator; the search overrides the hyperparameters listed above.
lgb_reg_extended = lgb.LGBMRegressor(objective='regression',
                                     random_state=42,
                                     min_split_gain=0.00001)

# Randomized search with 5-fold cross-validation. The scorer is the negated
# RMSE, so a larger (closer to zero) score is better. refit=True retrains the
# best configuration on the full training data afterwards.
randomsearch_lgb_extended_dmy_rmse = RandomizedSearchCV(
    estimator=lgb_reg_extended,
    param_distributions=param_dist_lgb,
    n_iter=search_iter,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    refit=True,
    random_state=42,
    verbose=1)

# Make sure the output folder exists BEFORE the long-running search starts, so
# a missing directory cannot cause the fitted model to be lost at save time.
model_path_extended = '/content/drive/MyDrive/nhanes_crf_model/model/lgb_extended.pkl'
os.makedirs(os.path.dirname(model_path_extended), exist_ok=True)

# Run the search and time it; the timer is stopped right after fit() so the
# reported duration covers the search only.
start = time.time()
randomsearch_lgb_extended_dmy_rmse.fit(X=X_train_extended_dmy,
                                       y=y_train_extended_dmy)
end = time.time()

# best_score_ is the mean cross-validated score of the best candidate; with
# this scorer it is the negated RMSE, hence the printed value is negative.
print('Best score rmse in validation set:')
print(randomsearch_lgb_extended_dmy_rmse.best_score_)

# Elapsed time in minutes.
print('Execution time is:')
print((end - start)/60)

# Persist the refitted best estimator with joblib. Kept as the last expression
# so the cell displays the list of written file names.
joblib.dump(randomsearch_lgb_extended_dmy_rmse.best_estimator_, model_path_extended)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5699
[LightGBM] [Info] Number of data points in the train set: 4534, number of used features: 59
[LightGBM] [Info] Start training from score 41.556134
Best score rmse in validation set:
-8.786756794100942
Execution time is:
43.422518730163574


['/content/drive/MyDrive/nhanes_crf_model/model/lgb_extended.pkl']

In [None]:
# Reload the persisted estimator so the evaluation uses exactly what was saved.
lgb_extended = joblib.load('/content/drive/MyDrive/nhanes_crf_model/model/lgb_extended.pkl')

# Predictions for the test-split predictors.
pred_vo2max_lgb_extended_dmy = lgb_extended.predict(X_test_extended_dmy)

# Test-set metrics: RMSE (square root of the MSE), MAE and R^2.
lgb_extended_dmy_rmse = np.sqrt(
    mean_squared_error(y_true=y_test_extended_dmy,
                       y_pred=pred_vo2max_lgb_extended_dmy)
)
lgb_extended_dmy_mae = mean_absolute_error(
    y_true=y_test_extended_dmy,
    y_pred=pred_vo2max_lgb_extended_dmy,
)
lgb_extended_dmy_r2 = r2_score(
    y_true=y_test_extended_dmy,
    y_pred=pred_vo2max_lgb_extended_dmy,
)

# Report the three metrics.
print(f'test rmse: {lgb_extended_dmy_rmse}')
print(f'test mae: {lgb_extended_dmy_mae}')
print(f'test r2: {lgb_extended_dmy_r2}')



test rmse: 8.3120286753255
test mae: 5.907036352010363
test r2: 0.3098009927955049


## Parsimonious model

In [None]:
# Predictor columns of the parsimonious model, in the order they are passed to
# the regressor. Every name must exist as a column in both CSV files read below.
parsimonious_variables = [
    'INDFMPIR', 'health_insurance', 'htn_history', 'age',
    'SY_mean', 'DI_mean', 'BMXBMI', 'BMXWAIST', 'BPXPLS', 'BMXWT', 'BMXHT',
    'WT_DIFF_KG', 'lwlstyr', 'exerc2lwlastyr',
    'TWMT_log', 'walk_bike', 'gender_Male',
    'race_ethn_Non-Hispanic Black', 'race_ethn_Non-Hispanic White',
    'race_ethn_Other',
    'educ_High school diploma or GED', 'educ_Less than high school',
    'marital_status_other',
    'family_income_Low income',
    'employment_status_Unemployed', 'employment_status_Working',
    'smoking_status_Former smoker', 'smoking_status_Never smoker',
    'alcohol_intake_Heavier drinker', 'alcohol_intake_Light drinker',
    'alcohol_intake_Moderate drinker', 'alcohol_intake_Never drinker',
    'physical_activity_Inactive', 'physical_activity_Insufficient active',
]

# Read both splits (paths are relative to the working directory) and keep only
# the selected predictors plus the target column of each split.
train_parsimonious_dmy = pd.read_csv('data/train_all_dummies.csv')[
    [*parsimonious_variables, 'y_train']
]
test_parsimonious_dmy = pd.read_csv('data/test_all_dummies.csv')[
    [*parsimonious_variables, 'y_test']
]

# Training split: predictors (X) and target (y).
X_train_parsimonious_dmy = train_parsimonious_dmy.drop(columns=['y_train'])
y_train_parsimonious_dmy = train_parsimonious_dmy['y_train']

# Test split: predictors (X) and target (y).
X_test_parsimonious_dmy = test_parsimonious_dmy.drop(columns=['y_test'])
y_test_parsimonious_dmy = test_parsimonious_dmy['y_test']



In [None]:
# random search 100
search_iter = 100

# Hyperparameter search space. List entries are sampled uniformly from their
# values; scipy's uniform(loc, scale) draws from the interval [loc, loc + scale].
# NOTE(review): LightGBM only applies bagging when bagging_freq > 0, so
# 'subsample' has no effect for candidates that draw bagging_freq = 0 --
# confirm this is intended.
param_dist_lgb = {
    'learning_rate': [0.1, 0.01, 0.05],
    'num_leaves': [15, 31, 63, 127, 255, 511, 1023, 2047],
    'min_child_samples': [1, 5, 10, 15, 20],
    'subsample': uniform(loc=0.4, scale=0.5),
    'reg_lambda': uniform(loc=0.0, scale=0.3),
    'bagging_freq': [0, 5, 10, 15, 30],
    'n_estimators': [30, 50, 100, 200, 500, 1000],
    'feature_fraction': uniform(loc=0.4, scale=0.5),
}

# Base estimator; the search overrides the hyperparameters listed above.
lgb_reg_parsimonious = lgb.LGBMRegressor(objective='regression',
                                         random_state=42,
                                         min_split_gain=0.00001)

# Randomized search with 5-fold cross-validation. The scorer is the negated
# RMSE, so a larger (closer to zero) score is better. refit=True retrains the
# best configuration on the full training data afterwards.
randomsearch_lgb_parsimonious_dmy_rmse = RandomizedSearchCV(
    estimator=lgb_reg_parsimonious,
    param_distributions=param_dist_lgb,
    n_iter=search_iter,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    refit=True,
    random_state=42,
    verbose=1)

# Make sure the output folder exists BEFORE the long-running search starts, so
# a missing directory cannot cause the fitted model to be lost at save time.
model_path_parsimonious = '/content/drive/MyDrive/nhanes_crf_model/model/lgb_parsimonious.pkl'
os.makedirs(os.path.dirname(model_path_parsimonious), exist_ok=True)

# Run the search and time it; the timer is stopped right after fit() so the
# reported duration covers the search only.
start = time.time()
randomsearch_lgb_parsimonious_dmy_rmse.fit(X=X_train_parsimonious_dmy,
                                           y=y_train_parsimonious_dmy)
end = time.time()

# best_score_ is the mean cross-validated score of the best candidate; with
# this scorer it is the negated RMSE, hence the printed value is negative.
print('Best score rmse in validation set:')
print(randomsearch_lgb_parsimonious_dmy_rmse.best_score_)

# Elapsed time in minutes.
print('Execution time:')
print((end - start)/60)

# Persist the refitted best estimator with joblib. Kept as the last expression
# so the cell displays the list of written file names.
joblib.dump(randomsearch_lgb_parsimonious_dmy_rmse.best_estimator_, model_path_parsimonious)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1999
[LightGBM] [Info] Number of data points in the train set: 4534, number of used features: 34
[LightGBM] [Info] Start training from score 41.556134
Best score rmse in validation set:
-9.063324016536665
Execution time:
20.565823829174043


['/content/drive/MyDrive/nhanes_crf_model/model/lgb_parsimonious.pkl']

In [None]:
# Reload the persisted estimator so the evaluation uses exactly what was saved.
lgb_parsimonious = joblib.load('/content/drive/MyDrive/nhanes_crf_model/model/lgb_parsimonious.pkl')

# Predictions for the test-split predictors.
pred_vo2max_lgb_parsimonious_dmy = lgb_parsimonious.predict(X_test_parsimonious_dmy)

# Test-set metrics: RMSE (square root of the MSE), MAE and R^2.
lgb_parsimonious_dmy_rmse = np.sqrt(
    mean_squared_error(y_true=y_test_parsimonious_dmy,
                       y_pred=pred_vo2max_lgb_parsimonious_dmy)
)
lgb_parsimonious_dmy_mae = mean_absolute_error(
    y_true=y_test_parsimonious_dmy,
    y_pred=pred_vo2max_lgb_parsimonious_dmy,
)
lgb_parsimonious_dmy_r2 = r2_score(
    y_true=y_test_parsimonious_dmy,
    y_pred=pred_vo2max_lgb_parsimonious_dmy,
)

# Report the three metrics.
print(f'test rmse: {lgb_parsimonious_dmy_rmse}')
print(f'test mae: {lgb_parsimonious_dmy_mae}')
print(f'test r2: {lgb_parsimonious_dmy_r2}')

test rmse: 8.523236665565024
test mae: 6.128975659586947
test r2: 0.2742795518882555
