In [1]:
import numpy as np
import pandas as pd
import random
import argparse
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sksurv.linear_model import CoxnetSurvivalAnalysis as CoxPH
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from utils import *
from model_functions import *

random.seed(7)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [14]:
trainset, set2, set3, eids_train, eids_set2, eids_set3, _ = get_data({'dset': 'cmb', 'target': 'frailty'})
trainset_cmbmet, set2_cmbmet, set3_cmbmet, eids_train_cmbmet, eids_set2_cmbmet, eids_set3_cmbmet, _ = get_data({'dset': 'cmb_met', 'target': 'frailty'})

mort_trainset, mort_set2, mort_set3, mort_eids_train, mort_eids_set2, mort_eids_set3, _ = get_data({'dset': 'cmb', 'target': 'mort'})
mort_trainset_cmbmet, mort_set2_cmbmet, mort_set3_cmbmet, mort_eids_train_cmbmet, mort_eids_set2_cmbmet, mort_eids_set3_cmbmet, _ = get_data({'dset': 'cmb_met', 'target': 'mort'})

In [95]:
#Mortality
set1_mort = pd.read_csv("Data/Processed/Full/mort_full_train.csv")
set2_mort = pd.read_csv("Data/Processed/Full/mort_full_test.csv")
set3_mort = pd.read_csv("Data/Processed/Full/mort_full_val.csv") #differently named, chosen based on number of individuals

#Frailty
frailty_set1 = pd.read_csv('Data/frailty_clean_train.csv')
frailty_set2 = pd.read_csv('Data/frailty_clean_set2.csv')
frailty_set3 = pd.read_csv('Data/frailty_clean_set3.csv')


lifestyle1 = pd.read_csv('Data/UKB_Lifestyle.csv') 
#PPP
PPP_participants_selected = pd.read_csv('Data/PPP_Participant_table.csv')
#CVD
cvd = pd.read_csv('Data/FollowUpCVD.csv')

#Cancer
cancer = pd.read_csv('Data/FollowUpCancer.csv')

#Handgrip and selfperceived health
#Note: originally higher score on self perceived health means lower self perceived health but changed it for easier interpretability
hgsph = pd.read_csv('Data/HandgripSelfperceived.csv')
hgsph['poor_health'] = np.where(hgsph['self_perceived_health'].isin([0, 1]), 1, 0)
hgsph['poor_health'] = hgsph['poor_health'].where(hgsph['self_perceived_health'].notna(), np.nan)

#Basic info
basicinfo = pd.read_csv('Data/basicinfo_instance_0.csv')
basicinfo = basicinfo[['eid', 'hbp.0.0']]

# Concatenate mortality and frailty datasets
mortality_combined = pd.concat([set1_mort, set2_mort, set3_mort], ignore_index=True)
mortality_combined = mortality_combined[['eid', 'died']]
frailty_combined = pd.concat([frailty_set1, frailty_set2, frailty_set3], ignore_index=True)

# Merge all datasets on 'eid' using outer join
lifestyle = mortality_combined \
    .merge(frailty_combined, on='eid', how='outer') \
    .merge(lifestyle1, on='eid', how='outer') \
    .merge(PPP_participants_selected, on='eid', how='outer') \
    .merge(cvd, on='eid', how='outer') \
    .merge(cancer, on='eid', how='outer') \
    .merge(hgsph, on='eid', how='outer') \
    .merge(basicinfo, on='eid', how='outer')

In [88]:
mortality_combined.columns

Index(['Unnamed: 0', 'eid', 'died', 'censorage', 'sex'], dtype='object')

In [96]:
lifestyle.head()
# Convert numpy arrays to sets for faster lookup
mort_eids_train_set = set(mort_eids_train)
mort_eids_set2_set = set(mort_eids_set2)
mort_eids_set3_set = set(mort_eids_set3)
mort_eids_train_cmbmet_set = set(mort_eids_train_cmbmet)
mort_eids_set2_cmbmet_set = set(mort_eids_set2_cmbmet)
mort_eids_set3_cmbmet_set = set(mort_eids_set3_cmbmet)

# Filter the DataFrame where 'eid' is in any of the three arrays
df_met = lifestyle[lifestyle['eid'].isin(mort_eids_train_cmbmet_set) |
                       lifestyle['eid'].isin(mort_eids_set2_cmbmet_set) |
                       lifestyle['eid'].isin(mort_eids_set3_cmbmet_set)]

# Filter the DataFrame where 'eid' is not in any of the three arrays
df_not_met = lifestyle[~lifestyle['eid'].isin(mort_eids_train_cmbmet_set) &
                           ~lifestyle['eid'].isin(mort_eids_set2_cmbmet_set) &
                           ~lifestyle['eid'].isin(mort_eids_set3_cmbmet_set) &
                        lifestyle['eid'].isin(mort_eids_train_set) |
                       lifestyle['eid'].isin(mort_eids_set2_set) |
                       lifestyle['eid'].isin(mort_eids_set3_set)
]

In [78]:
list(df_met.columns)

['Unnamed: 0',
 'eid',
 'died',
 'censorage',
 'sex_x',
 'FI_0',
 'waist.0.0',
 'waist.1.0',
 'waist.2.0',
 'waist.3.0',
 'bmi.0.0',
 'bmi.1.0',
 'bmi.2.0',
 'bmi.3.0',
 'metmod.0.0',
 'metvig.0.0',
 'actday.0.0',
 'actmin.0.0',
 'smoking.0.0',
 'smoking.1.0',
 'smoking.2.0',
 'smoking.3.0',
 'alcohol.0.0',
 'alcohol.1.0',
 'alcohol.2.0',
 'alcohol.3.0',
 'sleepdr.0.0',
 'sleepdr.1.0',
 'sleepdr.2.0',
 'sleepdr.3.0',
 'veggies.0.0',
 'veggies.1.0',
 'veggies.2.0',
 'veggies.3.0',
 'salad.0.0',
 'salad.1.0',
 'salad.2.0',
 'salad.3.0',
 'frfruit.0.0',
 'frfruit.1.0',
 'frfruit.2.0',
 'frfruit.3.0',
 'drfruit.0.0',
 'drfruit.1.0',
 'drfruit.2.0',
 'drfruit.3.0',
 'oilfish.0.0',
 'oilfish.1.0',
 'oilfish.2.0',
 'oilfish.3.0',
 'noilfish.0.0',
 'noilfish.1.0',
 'noilfish.2.0',
 'noilfish.3.0',
 'prmeat.0.0',
 'prmeat.1.0',
 'prmeat.2.0',
 'prmeat.3.0',
 'beef.0.0',
 'beef.1.0',
 'beef.2.0',
 'beef.3.0',
 'lamb.0.0',
 'lamb.1.0',
 'lamb.2.0',
 'lamb.3.0',
 'pork.0.0',
 'pork.1.0',
 'pork.2.

In [100]:
def summarize_data(df, numeric_cols, categorical_conditions):
    summary = {}
    
    # Descriptive statistics for numeric columns
    if numeric_cols:
        summary['numeric_stats'] = df[numeric_cols].agg(['mean', 'std']).T
    
    # N and Proportions for categorical conditions
    categorical_summary = {}
    for col, condition in categorical_conditions.items():
        count = (df[col] == condition).sum()
        proportion = count / df.shape[0] * 100
        categorical_summary[col] = {'count': count, 'proportion': proportion}
    
    summary['categorical_summary'] = categorical_summary
    
    return summary


# Define numeric columns and categorical conditions
numeric_cols = ['age.0.0', 'bmi.0.0', 'lymf.0.0', 'mono.0.0', 'FI_0', 'max_handgrip']
categorical_conditions_met = {
    'sex': 'w',
    'smokingyn.0.0': 1,
    'UKB-PPP Consortium selected participant | Instance 0': 'Yes',
    'CVD_prev' : 1,
    'CVD_inc': 1,
    'cancer_prev' : 1,
    'cancer_inc' : 1,
    'died' : 1,
    'poor_health': 1,
    'hbp.0.0': 1
    
}
categorical_conditions_not_met = categorical_conditions_met.copy()

# Summarize df_met
summary_met = summarize_data(df_met, numeric_cols, categorical_conditions_met)

# Summarize df_not_met
summary_not_met = summarize_data(df_not_met, numeric_cols, categorical_conditions_not_met)

# Print summaries
print("Summary for df_met:")
print("Numeric Stats (Mean and Std):\n", summary_met['numeric_stats'])
print("\nCategorical Summary:")
for col, stats in summary_met['categorical_summary'].items():
    print(f"{col}: Count = {stats['count']} ({stats['proportion']:.2f})")

print("\nSummary for df_not_met:")
print("Numeric Stats (Mean and Std):\n", summary_not_met['numeric_stats'])
print("\nCategorical Summary:")
for col, stats in summary_not_met['categorical_summary'].items():
    print(f"{col}: Count = {stats['count']} ({stats['proportion']:.2f})")

Summary for df_met:
Numeric Stats (Mean and Std):
                    mean        std
age.0.0       57.585412   8.158157
bmi.0.0       27.463915   4.731376
lymf.0.0      28.765919   7.615295
mono.0.0       7.081803   2.597622
FI_0           0.128632   0.077235
hbp.0.0        0.286321   0.452052
max_handgrip  32.537598  11.495143

Categorical Summary:
sex: Count = 11559 (53.93)
smokingyn.0.0: Count = 2205 (10.29)
UKB-PPP Consortium selected participant | Instance 0: Count = 2635 (12.29)
CVD_prev: Count = 91 (0.42)
CVD_inc: Count = 184 (0.86)
cancer_prev: Count = 195 (0.91)
cancer_inc: Count = 326 (1.52)
died: Count = 2329 (10.87)
poor_health: Count = 542 (2.53)
hbp.0.0: Count = 6137 (28.63)

Summary for df_not_met:
Numeric Stats (Mean and Std):
                    mean        std
age.0.0       57.362063   8.208127
bmi.0.0       27.489036   4.813677
lymf.0.0      28.900060   7.752818
mono.0.0       7.068076   2.557174
FI_0           0.128920   0.077796
hbp.0.0        0.284717   0.451288


In [24]:
(len(mort_eids_train) + len(mort_eids_set2) +len(mort_eids_set3)) - (len(mort_eids_train_cmbmet) + len(mort_eids_set2_cmbmet) +len(mort_eids_set3_cmbmet))
#len(mort_eids_train_cmbmet)
#len(mort_eids_set2_cmbmet)
#len(mort_eids_set3_cmbmet)

19262

In [8]:
# Check number of participants in combination
trained_combined = set(mort_eids_train_cmbmet).union(mort_eids_train)
len(trained_combined)

28465

In [53]:
PPP_participants_selected = pd.read_csv('Data/PPP_Participant_table.csv', index_col = "eid")
sum_selected_train = sum(eid in eids_train for eid in PPP_participants_selected.index)
sum_selected_set2 = sum(eid in eids_set2 for eid in PPP_participants_selected.index)
print(sum_selected_train, sum_selected_set2)
print(sum_selected_train/len(eids_train), sum_selected_set2/len(eids_set2))
print(PPP_participants_selected.shape)

3456 957
0.12141226067099947 0.11761091311294089
(6229, 1)


In [55]:
sum_selected_train = sum(eid in eids_train_cmbmet for eid in PPP_participants_selected.index)
sum_selected_set2 = sum(eid in eids_set2_cmbmet for eid in PPP_participants_selected.index)
print(sum_selected_train, sum_selected_set2)
print(sum_selected_train/len(eids_train_cmbmet), sum_selected_set2/len(eids_set2_cmbmet))
print(PPP_participants_selected.shape)

1877 513
0.12471760797342192 0.11875
(6229, 1)


In [39]:
# #PPP_participants_selected = PPP_participants_selected.drop(PPP_participants_selected.columns[1], axis = 1)
# PPP_participants_selected['eid'] = PPP_participants_selected.index
# PPP_participants_selected.head()
# #PPP_participants_selected.to_csv('Data/PPP_Participant_table.csv')

Unnamed: 0_level_0,UKB-PPP Consortium selected participant | Instance 0,eid
Participant ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1001389,Yes,1001389
1001843,Yes,1001843
1002479,Yes,1002479
1004582,Yes,1004582
1008170,Yes,1008170


In [45]:
# PPP_participants_selected_save = PPP_participants_selected[['eid', 'UKB-PPP Consortium selected participant | Instance 0']]
# PPP_participants_selected_save.head()
# PPP_participants_selected_save.to_csv('Data/PPP_Participant_table.csv', index=False)

9