In [13]:
target_cols = ['Glycohemoglobin',
       'Fasting glucose (mg/dL)', 'Fasting glucose (mmol/L)',
       'Insulin (pmol/L)', 'HDL cholesterol (mg/dL)',
       'HDL cholesterol (mmol/L)',
       'Total cholesterol (mmol/L)', 'Triglycerides (mmol/L)',
       'LDL cholesterol (mg/dL)', 'Trunk Fat (%)','Total Fat (%)']

# Step 2: Select the relevant columns for input features and target variables
input_cols = ['Weight (kg)', 
              'Height (cm)',
       'Leg Length (cm)', 
       'Arm Length (cm)', 
       'Arm Circumference (cm)',
       'Waist Circumference (cm)', 
       'Gender', 'Age (years)',
       'Vigorous work activity (Y/N)',
         'Minutes vigorous-intensity work',
       'Minutes moderate-intensity work',
       'Minutes walk/bicycle for transportation',
       'Minutes vigorous recreational activities',
       'Minutes sedentary activity', 
       'Hours watch TV or videos past 30 days',
       'Hours use computer past 30 days']

In [14]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pickle
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, explained_variance_score
import optuna
import pandas as pd

# Step 1: Load your dataset into a Pandas DataFrame
data = pd.read_csv('populated.csv')

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[input_cols], data[target_cols], test_size=0.2, random_state=0)

# Step 4: Check and preprocess the target variable to handle invalid values
for col in target_cols:
    # Check for NaN or infinity values
    nan_indices = np.isnan(y_train[col])
    inf_indices = np.isinf(y_train[col])
    valid_indices = ~(nan_indices | inf_indices)

    # Remove invalid values
    X_train = X_train[valid_indices]
    y_train = y_train[valid_indices]
    
# Step 4: Scale the input features
scaler = StandardScaler()
scaler.fit(X_train[input_cols])
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [15]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pickle
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, explained_variance_score
import optuna
import pandas as pd

# Create a dictionary to store the models and best parameters for each target column
models = {}
best_params = {}
best_rmse = {}
best_mae = {}

# Define the objective function for Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }

    model = XGBRegressor(random_state=0, **params)
    model.fit(X_train_scaled, y_train[col])

    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test[col], y_pred))

    return rmse

# Fit XGBoost models for each target column with hyperparameter tuning
for col in target_cols:
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100, timeout=20)  # Adjust the timeout value according to your time-bound constraint

    best_params[col] = study.best_params
    best_model = XGBRegressor(random_state=0, **best_params[col])
    best_model.fit(X_train_scaled, y_train[col])
    models[col] = best_model

    # Print evaluation metrics
    y_pred = best_model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test[col], y_pred))
    mae = mean_absolute_error(y_test[col], y_pred)
    r2 = r2_score(y_test[col], y_pred)
    explained_var = explained_variance_score(y_test[col], y_pred)

    best_rmse[col] = rmse
    best_mae[col] = mae

    print(f"Column: {col}")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"R2: {r2}")
    print(f"Explained Variance: {explained_var}")
    print()

# Save the best model parameters for each column in a text file
with open('best_model_params.txt', 'w') as file:
    for col, params in best_params.items():
        file.write(f"Column: {col}\n")
        file.write(f"Best Parameters: {params}\n\n")

# Save the best RMSE and MAE for each column in a text file
with open('best_metrics.txt', 'w') as file:
    for col, rmse in best_rmse.items():
        file.write(f"Column: {col}\n")
        file.write(f"Best RMSE: {rmse}\n")
        file.write(f"Best MAE: {best_mae[col]}\n\n")

# Pickle the models and scaler
with open('models.pkl', 'wb') as file:
    pickle.dump(models, file)


[I 2023-06-30 08:27:20,394] A new study created in memory with name: no-name-2e6f3010-c832-46f8-8a32-9a7c7fcdeaf8
[I 2023-06-30 08:27:24,467] Trial 0 finished with value: 0.39512041129082653 and parameters: {'n_estimators': 168, 'learning_rate': 0.024415468685401863, 'max_depth': 10}. Best is trial 0 with value: 0.39512041129082653.
[I 2023-06-30 08:27:25,249] Trial 1 finished with value: 0.6714068444893633 and parameters: {'n_estimators': 51, 'learning_rate': 0.06015251528558077, 'max_depth': 7}. Best is trial 0 with value: 0.39512041129082653.
[I 2023-06-30 08:27:27,697] Trial 2 finished with value: 0.6881711408310122 and parameters: {'n_estimators': 149, 'learning_rate': 0.03704314732601082, 'max_depth': 5}. Best is trial 0 with value: 0.39512041129082653.
[I 2023-06-30 08:27:31,998] Trial 3 finished with value: 1.0000078103784409 and parameters: {'n_estimators': 172, 'learning_rate': 0.01167987192943215, 'max_depth': 7}. Best is trial 0 with value: 0.39512041129082653.
[I 2023-06-3

Column: Glycohemoglobin
RMSE: 0.3738854511667039
MAE: 0.2406635546418074
R2: 0.860721620512657
Explained Variance: 0.8608852129533108



[I 2023-06-30 08:27:55,033] Trial 0 finished with value: 6.862940052223307 and parameters: {'n_estimators': 195, 'learning_rate': 0.30621471958819, 'max_depth': 10}. Best is trial 0 with value: 6.862940052223307.
[I 2023-06-30 08:27:57,389] Trial 1 finished with value: 8.610272852719657 and parameters: {'n_estimators': 84, 'learning_rate': 0.3635401931868705, 'max_depth': 7}. Best is trial 0 with value: 6.862940052223307.
[I 2023-06-30 08:27:59,232] Trial 2 finished with value: 19.6872906747668 and parameters: {'n_estimators': 162, 'learning_rate': 0.2818492911994847, 'max_depth': 3}. Best is trial 0 with value: 6.862940052223307.
[I 2023-06-30 08:28:01,253] Trial 3 finished with value: 24.185933490396568 and parameters: {'n_estimators': 71, 'learning_rate': 0.03005702540412554, 'max_depth': 7}. Best is trial 0 with value: 6.862940052223307.
[I 2023-06-30 08:28:05,879] Trial 4 finished with value: 16.57228027798559 and parameters: {'n_estimators': 184, 'learning_rate': 0.03706927494031

Column: Fasting glucose (mg/dL)
RMSE: 6.862940052223307
MAE: 2.9855078449294323
R2: 0.9530391883587291
Explained Variance: 0.9530937908788663



[I 2023-06-30 08:28:23,400] Trial 0 finished with value: 1.3258639499709315 and parameters: {'n_estimators': 101, 'learning_rate': 0.019119686009535602, 'max_depth': 8}. Best is trial 0 with value: 1.3258639499709315.
[I 2023-06-30 08:28:26,377] Trial 1 finished with value: 0.6664707422563959 and parameters: {'n_estimators': 128, 'learning_rate': 0.16596299457923222, 'max_depth': 6}. Best is trial 1 with value: 0.6664707422563959.
[I 2023-06-30 08:28:28,329] Trial 2 finished with value: 0.9837642156025314 and parameters: {'n_estimators': 171, 'learning_rate': 0.4744181396352374, 'max_depth': 3}. Best is trial 1 with value: 0.6664707422563959.
[I 2023-06-30 08:28:29,851] Trial 3 finished with value: 1.0945935129251765 and parameters: {'n_estimators': 130, 'learning_rate': 0.40100327913792355, 'max_depth': 3}. Best is trial 1 with value: 0.6664707422563959.
[I 2023-06-30 08:28:30,997] Trial 4 finished with value: 1.6656928655893104 and parameters: {'n_estimators': 71, 'learning_rate': 0.

Column: Fasting glucose (mmol/L)
RMSE: 0.4213822699789756
MAE: 0.23583638976324822
R2: 0.9427209200384645
Explained Variance: 0.9427356337066961



[I 2023-06-30 08:28:49,988] Trial 0 finished with value: 5.94485502711475 and parameters: {'n_estimators': 69, 'learning_rate': 0.037806256815614274, 'max_depth': 8}. Best is trial 0 with value: 5.94485502711475.
[I 2023-06-30 08:28:52,229] Trial 1 finished with value: 9.672153651727907 and parameters: {'n_estimators': 191, 'learning_rate': 0.018140767690956967, 'max_depth': 3}. Best is trial 0 with value: 5.94485502711475.
[I 2023-06-30 08:28:57,090] Trial 2 finished with value: 2.5923422851422635 and parameters: {'n_estimators': 106, 'learning_rate': 0.07783729027400665, 'max_depth': 10}. Best is trial 2 with value: 2.5923422851422635.
[I 2023-06-30 08:29:01,844] Trial 3 finished with value: 3.3805672032869887 and parameters: {'n_estimators': 191, 'learning_rate': 0.3634260306424065, 'max_depth': 6}. Best is trial 2 with value: 2.5923422851422635.
[I 2023-06-30 08:29:07,063] Trial 4 finished with value: 4.8827661677577225 and parameters: {'n_estimators': 173, 'learning_rate': 0.03350

Column: Insulin (pmol/L)
RMSE: 2.5923422851422635
MAE: 1.383434239984421
R2: 0.9538389871909835
Explained Variance: 0.9538415227367193



[I 2023-06-30 08:29:15,924] Trial 0 finished with value: 11.140447746010377 and parameters: {'n_estimators': 112, 'learning_rate': 0.03860768390945853, 'max_depth': 5}. Best is trial 0 with value: 11.140447746010377.
[I 2023-06-30 08:29:20,232] Trial 1 finished with value: 3.2945536420108397 and parameters: {'n_estimators': 109, 'learning_rate': 0.4273398547912394, 'max_depth': 10}. Best is trial 1 with value: 3.2945536420108397.
[I 2023-06-30 08:29:23,729] Trial 2 finished with value: 3.111932436914887 and parameters: {'n_estimators': 82, 'learning_rate': 0.20314555362227255, 'max_depth': 10}. Best is trial 2 with value: 3.111932436914887.
[I 2023-06-30 08:29:25,704] Trial 3 finished with value: 13.097232738268895 and parameters: {'n_estimators': 175, 'learning_rate': 0.017551516078044203, 'max_depth': 3}. Best is trial 2 with value: 3.111932436914887.
[I 2023-06-30 08:29:30,002] Trial 4 finished with value: 9.493455782129757 and parameters: {'n_estimators': 182, 'learning_rate': 0.03

Column: HDL cholesterol (mg/dL)
RMSE: 3.111932436914887
MAE: 2.0446950504662436
R2: 0.9587050703164286
Explained Variance: 0.9587464237153017



[I 2023-06-30 08:29:45,029] Trial 0 finished with value: 0.12098917123348289 and parameters: {'n_estimators': 141, 'learning_rate': 0.04441059104869336, 'max_depth': 10}. Best is trial 0 with value: 0.12098917123348289.
[I 2023-06-30 08:29:47,174] Trial 1 finished with value: 0.3212491200632996 and parameters: {'n_estimators': 195, 'learning_rate': 0.03555160500718109, 'max_depth': 3}. Best is trial 0 with value: 0.12098917123348289.
[I 2023-06-30 08:29:50,178] Trial 2 finished with value: 0.13632979361749123 and parameters: {'n_estimators': 117, 'learning_rate': 0.3376839075585318, 'max_depth': 7}. Best is trial 0 with value: 0.12098917123348289.
[I 2023-06-30 08:29:51,409] Trial 3 finished with value: 0.29264743793468806 and parameters: {'n_estimators': 117, 'learning_rate': 0.20685590745116722, 'max_depth': 3}. Best is trial 0 with value: 0.12098917123348289.
[I 2023-06-30 08:29:52,872] Trial 4 finished with value: 0.31090015455713843 and parameters: {'n_estimators': 138, 'learning_

Column: HDL cholesterol (mmol/L)
RMSE: 0.12098917123348289
MAE: 0.09178642906961218
R2: 0.90822192459123
Explained Variance: 0.9083608532153888



[I 2023-06-30 08:30:09,602] Trial 0 finished with value: 0.8374281940905909 and parameters: {'n_estimators': 159, 'learning_rate': 0.019959487101414278, 'max_depth': 6}. Best is trial 0 with value: 0.8374281940905909.
[I 2023-06-30 08:30:17,317] Trial 1 finished with value: 0.7042650357819826 and parameters: {'n_estimators': 184, 'learning_rate': 0.013127916329328882, 'max_depth': 10}. Best is trial 1 with value: 0.7042650357819826.
[I 2023-06-30 08:30:22,285] Trial 2 finished with value: 0.7217280945777067 and parameters: {'n_estimators': 195, 'learning_rate': 0.02249227564567069, 'max_depth': 7}. Best is trial 1 with value: 0.7042650357819826.
[I 2023-06-30 08:30:23,605] Trial 3 finished with value: 0.8594420416336225 and parameters: {'n_estimators': 122, 'learning_rate': 0.13427136845096538, 'max_depth': 3}. Best is trial 1 with value: 0.7042650357819826.
[I 2023-06-30 08:30:29,083] Trial 4 finished with value: 0.6313482113323525 and parameters: {'n_estimators': 189, 'learning_rate'

Column: Total cholesterol (mmol/L)
RMSE: 0.6313482113323525
MAE: 0.4806609244118549
R2: 0.6431111353440443
Explained Variance: 0.6450994913374242



[I 2023-06-30 08:30:40,043] Trial 0 finished with value: 0.7445449031080289 and parameters: {'n_estimators': 128, 'learning_rate': 0.0182600021564917, 'max_depth': 5}. Best is trial 0 with value: 0.7445449031080289.
[I 2023-06-30 08:30:46,963] Trial 1 finished with value: 0.2010362493394328 and parameters: {'n_estimators': 135, 'learning_rate': 0.2761226710062154, 'max_depth': 9}. Best is trial 1 with value: 0.2010362493394328.
[I 2023-06-30 08:30:52,298] Trial 2 finished with value: 0.2578218801570129 and parameters: {'n_estimators': 162, 'learning_rate': 0.35851793771129353, 'max_depth': 7}. Best is trial 1 with value: 0.2010362493394328.
[I 2023-06-30 08:30:55,841] Trial 3 finished with value: 0.9319156337184249 and parameters: {'n_estimators': 66, 'learning_rate': 0.010666100642576552, 'max_depth': 4}. Best is trial 1 with value: 0.2010362493394328.
[I 2023-06-30 08:31:00,451] Trial 4 finished with value: 0.46018153424666786 and parameters: {'n_estimators': 132, 'learning_rate': 0.

Column: Triglycerides (mmol/L)
RMSE: 0.2010362493394328
MAE: 0.13175632858123912
R2: 0.9525371417209397
Explained Variance: 0.9525404559754337



[I 2023-06-30 08:31:10,482] Trial 0 finished with value: 23.329138862915176 and parameters: {'n_estimators': 182, 'learning_rate': 0.11945060664331612, 'max_depth': 5}. Best is trial 0 with value: 23.329138862915176.
[I 2023-06-30 08:31:16,677] Trial 1 finished with value: 18.27208677632505 and parameters: {'n_estimators': 195, 'learning_rate': 0.1392915643898134, 'max_depth': 6}. Best is trial 1 with value: 18.27208677632505.
[I 2023-06-30 08:31:21,308] Trial 2 finished with value: 16.92337719037189 and parameters: {'n_estimators': 140, 'learning_rate': 0.07307302591576073, 'max_depth': 8}. Best is trial 2 with value: 16.92337719037189.
[I 2023-06-30 08:31:23,271] Trial 3 finished with value: 30.55748386907709 and parameters: {'n_estimators': 133, 'learning_rate': 0.05177217551203112, 'max_depth': 3}. Best is trial 2 with value: 16.92337719037189.
[I 2023-06-30 08:31:25,631] Trial 4 finished with value: 26.850697266674583 and parameters: {'n_estimators': 100, 'learning_rate': 0.047425

Column: LDL cholesterol (mg/dL)
RMSE: 16.92337719037189
MAE: 12.729094496186582
R2: 0.762112828787461
Explained Variance: 0.7621128288355276



[I 2023-06-30 08:31:39,933] Trial 0 finished with value: 0.9030975203845353 and parameters: {'n_estimators': 179, 'learning_rate': 0.2041600609992856, 'max_depth': 8}. Best is trial 0 with value: 0.9030975203845353.
[I 2023-06-30 08:31:45,716] Trial 1 finished with value: 1.0927755115347957 and parameters: {'n_estimators': 181, 'learning_rate': 0.1127310082624359, 'max_depth': 8}. Best is trial 0 with value: 0.9030975203845353.
[I 2023-06-30 08:31:47,694] Trial 2 finished with value: 2.692792338881918 and parameters: {'n_estimators': 178, 'learning_rate': 0.05976658404953586, 'max_depth': 3}. Best is trial 0 with value: 0.9030975203845353.
[I 2023-06-30 08:31:48,487] Trial 3 finished with value: 2.6927320830718586 and parameters: {'n_estimators': 53, 'learning_rate': 0.09640825370012306, 'max_depth': 4}. Best is trial 0 with value: 0.9030975203845353.
[I 2023-06-30 08:31:54,066] Trial 4 finished with value: 0.8100250051821997 and parameters: {'n_estimators': 152, 'learning_rate': 0.162

Column: Trunk Fat (%)
RMSE: 0.8100250051821997
MAE: 0.5157294107255387
R2: 0.9914947639079421
Explained Variance: 0.9914948219253704



[I 2023-06-30 08:32:02,406] Trial 0 finished with value: 15.852409044439534 and parameters: {'n_estimators': 66, 'learning_rate': 0.011412276910268169, 'max_depth': 8}. Best is trial 0 with value: 15.852409044439534.
[I 2023-06-30 08:32:04,892] Trial 1 finished with value: 1.851519659009503 and parameters: {'n_estimators': 124, 'learning_rate': 0.25128765354194765, 'max_depth': 5}. Best is trial 1 with value: 1.851519659009503.
[I 2023-06-30 08:32:08,162] Trial 2 finished with value: 5.33913551886877 and parameters: {'n_estimators': 94, 'learning_rate': 0.020402540834086543, 'max_depth': 9}. Best is trial 1 with value: 1.851519659009503.
[I 2023-06-30 08:32:09,506] Trial 3 finished with value: 2.18719261348133 and parameters: {'n_estimators': 90, 'learning_rate': 0.37538986530216445, 'max_depth': 4}. Best is trial 1 with value: 1.851519659009503.
[I 2023-06-30 08:32:10,834] Trial 4 finished with value: 2.814448544474159 and parameters: {'n_estimators': 57, 'learning_rate': 0.0536190168

Column: Total Fat (%)
RMSE: 0.8097163372146976
MAE: 0.528037032159889
R2: 0.9903154292999047
Explained Variance: 0.990316953358431

