In [1]:
import numpy as np
import pandas as pd
import os
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [2]:
# Candidate regressors, keyed by a short display name.
# Every estimator that accepts a seed uses random_state=123.
models = {
    "SVM": make_pipeline(
        StandardScaler(),
        SVR(kernel='linear'),
    ),
    "XGBoost": xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=123,
    ),
    "RandomForest": RandomForestRegressor(
        n_estimators=100,
        max_depth=6,
        random_state=123,
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=123,
    ),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "MLP": MLPRegressor(
        hidden_layer_sizes=(50, 50),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=123,
    ),
}

In [3]:
# Root directory of the UK Biobank-derived tables. The default is the original
# cluster location; set the UKB_DATA_DIR environment variable to point the
# notebook at a copy elsewhere without editing every path.
UKB_DIR = os.environ.get(
    'UKB_DATA_DIR',
    '/share2/pub/zhangyr/zhangyr/myIdea/bioAge/data/UKB',
)

# Participant list; downstream cells read its 'eid' column.
label = pd.read_csv(os.path.join(UKB_DIR, 'label', 'health_self_i0.csv'))

# The three tables below use their first CSV column as the row index
# (presumably the participant id — downstream cells index them with label['eid']).
sex = pd.read_csv(
    os.path.join(UKB_DIR, 'body', '502137_sex_20241226.csv'),
    header=0,
    index_col=0,
)
age = pd.read_csv(
    os.path.join(UKB_DIR, 'body', '502137_age_20241226.csv'),
    index_col=0,
)
baseline = pd.read_csv(
    os.path.join(UKB_DIR, 'body', '502137_baseline_78features_20241226.csv'),
    header=0,
    index_col=0,
)


In [4]:
# Field catalogue: one row per field, with (at least) an 'Organ' column and a
# 'Filed ID' column (column name spelled as in the CSV).
body = pd.read_csv(
    '/share/pub/zhangyr/database/UKB-old//body_fileID_20241216.csv',
    header=0,
)

# organ name -> list of field ids belonging to that organ
grouped_dict = (
    body
    .groupby('Organ')['Filed ID']
    .apply(list)
    .to_dict()
)

# Filled in by the next cell: organ name -> list of feature column names.
organ_dic = {}


In [5]:
# Turn each organ's field ids into column names of the form
# 'participant.p<field id>_i0'. The 'Body' group is left out.
organ_dic.update({
    organ: [f'participant.p{field_id!s}_i0' for field_id in field_ids]
    for organ, field_ids in grouped_dict.items()
    if organ != 'Body'
})

# Extra named features that are not addressed by a field id.
organ_dic['Pulmonary'] = [*organ_dic['Pulmonary'], 'FEV1-FVC_ratio']
organ_dic['Musculoskeletal'] = [
    *organ_dic['Musculoskeletal'],
    'Waist-hip_ratio',
    'BMD_avg',
    'Ankle_spacing_width_avg',
    'Hand_grip_strength_avg',
]



In [7]:
# Registry of fitted estimators: organ name -> model. Populated by the cells below.
organ_models = dict()

In [8]:
# Fit one age regressor per organ on the participants listed in label['eid']
# (presumably the healthy reference group — confirm against how the label file
# was built). Features are that organ's columns of `baseline`; the target is
# the single column of `age`.

# Loop-invariant: select the training participants once instead of per organ.
health_data = baseline.loc[label['eid']]

for k in organ_dic:
    print(f"Running organ {k}...")

    # Keep only the organ's columns that actually exist in `baseline`.
    # A set cannot be used as a DataFrame indexer (deprecated in pandas 1.5,
    # TypeError from pandas 2.0), so it is materialised as a list. The column
    # order is still the set's iteration order, i.e. unchanged from before, and
    # the prediction cell builds its columns from the same set expression.
    organ_cols = list(set(health_data.columns) & set(organ_dic[k]))

    # Complete cases only: drop every participant with a missing organ feature.
    health_organ_data = health_data[organ_cols].dropna()
    print(health_organ_data.shape)

    # Align the target with the rows that survived dropna().
    test_age = age.loc[health_organ_data.index]
    x = health_organ_data.values
    y = test_age.values.ravel()

    # Same hyper-parameters as the "GradientBoosting" entry of `models`.
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=123)
    model.fit(x, y)
    organ_models[k] = model

Running organ Cardiovascular...
(109418, 3)


  health_organ_data = health_data[set(health_data.columns) & set(organ_dic[k])]


Running organ Hepatic...
(86170, 8)


  health_organ_data = health_data[set(health_data.columns) & set(organ_dic[k])]


Running organ Immune...
(104792, 32)


  health_organ_data = health_data[set(health_data.columns) & set(organ_dic[k])]


Running organ Metabolic...
(76203, 9)


  health_organ_data = health_data[set(health_data.columns) & set(organ_dic[k])]


Running organ Musculoskeletal...
(85756, 11)


  health_organ_data = health_data[set(health_data.columns) & set(organ_dic[k])]


Running organ Pulmonary...
(85565, 4)


  health_organ_data = health_data[set(health_data.columns) & set(organ_dic[k])]


Running organ Renal...
(96896, 10)


  health_organ_data = health_data[set(health_data.columns) & set(organ_dic[k])]


In [11]:
# Brain feature table; the first CSV column becomes the row index.
brain = pd.read_csv(
    '/share2/pub/zhangyr/zhangyr/myIdea/bioAge/data/UKB/brain/502137_baseline_803features_20241226.csv',
    index_col=0,
)

# Rows for the participants in label['eid'], complete cases only.
health_brain = (
    brain
    .loc[label['eid']]
    .dropna()
)


In [12]:
# Design matrix: all brain features of the complete-case training rows.
x = health_brain.values

# Target: the age rows aligned to the same participants, flattened to 1-D.
test_age = age.loc[health_brain.index]
y = test_age.values.ravel()

In [13]:
# Brain uses a standardised linear-kernel SVR rather than gradient boosting.
# Pipeline.fit returns the pipeline itself, so fit and bind in one statement.
model = make_pipeline(StandardScaler(), SVR(kernel='linear')).fit(x, y)

# Register it next to the per-organ models.
organ_models['Brain'] = model

In [14]:
# Display the registry to confirm every organ (plus 'Brain') has a fitted model.
organ_models

{'Cardiovascular': GradientBoostingRegressor(max_depth=6, random_state=123),
 'Hepatic': GradientBoostingRegressor(max_depth=6, random_state=123),
 'Immune': GradientBoostingRegressor(max_depth=6, random_state=123),
 'Metabolic': GradientBoostingRegressor(max_depth=6, random_state=123),
 'Musculoskeletal': GradientBoostingRegressor(max_depth=6, random_state=123),
 'Pulmonary': GradientBoostingRegressor(max_depth=6, random_state=123),
 'Renal': GradientBoostingRegressor(max_depth=6, random_state=123),
 'Brain': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('svr', SVR(kernel='linear'))])}

In [16]:
# Apply each organ's fitted model to every `baseline` participant that is NOT
# listed in label['eid']. Results are plain prediction arrays keyed by organ.
organ_res = {}

# Loop-invariant: build the held-out frame once instead of per organ.
# The row order comes from iterating the set difference. A later cell
# re-derives this exact expression to attach participant ids to the
# predictions, so the expression is intentionally left unchanged.
disease_data = baseline.loc[list(set(baseline.index) - set(label['eid']))]

for k in organ_dic:
    print(f"Predicting organ {k}...")

    # A set cannot be used as a DataFrame indexer (deprecated in pandas 1.5,
    # TypeError from pandas 2.0), so it is materialised as a list. The column
    # order is still the set's iteration order, matching the training cell,
    # which builds its columns from the same set expression.
    organ_cols = list(set(disease_data.columns) & set(organ_dic[k]))

    # Complete cases only, as in training.
    disease_organ_data = disease_data[organ_cols].dropna()
    print(disease_organ_data.shape)

    predictions = organ_models[k].predict(disease_organ_data.values)
    organ_res[k] = predictions

Predicting organ Cardiovascular...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(362612, 3)
Predicting organ Hepatic...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(278208, 8)
Predicting organ Immune...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(343733, 32)
Predicting organ Metabolic...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(245156, 9)
Predicting organ Musculoskeletal...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(278885, 11)
Predicting organ Pulmonary...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(267502, 4)
Predicting organ Renal...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(316294, 10)


In [17]:
# Brain rows of every participant that is not in label['eid'], complete cases only.
disease_brain = (
    brain
    .loc[list(set(brain.index) - set(label['eid']))]
    .dropna()
)

In [18]:
# Predict with the brain pipeline; rows follow the order of disease_brain.
organ_res['Brain'] = organ_models['Brain'].predict(
    disease_brain.values
)

In [19]:
# Check that a prediction array exists for every organ and for 'Brain'.
organ_res.keys()

dict_keys(['Cardiovascular', 'Hepatic', 'Immune', 'Metabolic', 'Musculoskeletal', 'Pulmonary', 'Renal', 'Brain'])

In [20]:
# Persist the predictions. organ_res is a dict, not an array, so np.save stores
# it as a pickled 0-d object array; read it back with
# np.load(path, allow_pickle=True).item().
# At this point the values are bare prediction arrays without participant ids.
np.save("/share2/pub/zhangyr/zhangyr/myIdea/bioAge/results/202503_8organs_predictions.npy", organ_res)

In [26]:
# Re-derive, for each organ, the rows that were fed to predict() and use their
# index to turn the bare prediction arrays into participant-indexed Series.
#
# The original loop contained `if k == 'Cardiovascular': next`. A bare `next`
# only names the builtin and does nothing (it is not `continue`), so every
# organ was processed anyway. The dead branch is removed; no organ is skipped.

# Loop-invariant. Must be the same expression as in the prediction cell so the
# rows come out in the same order as the stored predictions.
disease_data = baseline.loc[list(set(baseline.index) - set(label['eid']))]

for k in organ_dic:
    print(f"Running organ {k}...")

    # A set cannot be used as a DataFrame indexer (deprecated in pandas 1.5,
    # TypeError from pandas 2.0), so it is materialised as a list. Column order
    # is irrelevant here: only the surviving row index is used.
    organ_cols = list(set(disease_data.columns) & set(organ_dic[k]))
    disease_organ_data = disease_data[organ_cols].dropna()
    print(disease_organ_data.shape)

    # np.asarray strips any existing index, so the assignment is positional and
    # re-running this cell on an already-converted Series is harmless.
    # pd.Series raises ValueError if the lengths do not match.
    organ_res[k] = pd.Series(np.asarray(organ_res[k]), index=disease_organ_data.index)

Running organ Cardiovascular...
(362612, 3)
Running organ Hepatic...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]
  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(278208, 8)
Running organ Immune...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(343733, 32)
Running organ Metabolic...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(245156, 9)
Running organ Musculoskeletal...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(278885, 11)
Running organ Pulmonary...


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


(267502, 4)
Running organ Renal...
(316294, 10)


  disease_organ_data = disease_data[set(disease_data.columns) & set(organ_dic[k])]


In [27]:
# Label the brain predictions with the participant ids of the rows they came from.
organ_res['Brain'] = pd.Series(
    list(organ_res['Brain']),
    index=disease_brain.index,
)

In [28]:
# Overwrite the earlier save (same path) now that every value is a
# participant-indexed Series. organ_res is a dict, so np.save pickles it into a
# 0-d object array; read it back with np.load(path, allow_pickle=True).item().
np.save("/share2/pub/zhangyr/zhangyr/myIdea/bioAge/results/202503_8organs_predictions.npy", organ_res)

In [29]:
# Inspect the brain predictions, now indexed by participant id.
organ_res['Brain']

participant.eid
4194348    45.619489
1048668    50.619777
1048673    58.169868
5243019    48.963368
4194475    58.910777
             ...    
3145644    51.847736
4194231    48.383904
3145658    44.597603
5242857    48.582966
3145722    40.376412
Length: 30536, dtype: float64

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

ModuleNotFoundError: No module named 'lifelines'

In [31]:
# Participant ids that have both a brain prediction and an entry in `age`.
# NOTE(review): the original statement ended with a dangling `&`, which is a
# SyntaxError. A third set was presumably meant to be intersected here —
# confirm which one and append it.
eid = set(organ_res['Brain'].index) & set(age.index)

Unnamed: 0_level_0,participant.p21022
participant.eid,Unnamed: 1_level_1
1000019.0,62.0
1000022.0,59.0
1000035.0,56.0
1000046.0,59.0
1000054.0,42.0
...,...
6023625.0,53.0
6023638.0,64.0
6023642.0,64.0
6023656.0,53.0
