# Imputation fro Lab Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
print("Start Imputation Lab data")

Start Imputation Lab data


In [3]:
df_impute = pd.read_csv(r'../data/train_data_not_imputed.csv')

In [4]:
df_impute_test = pd.read_csv(r'../data/test_data_not_imputed.csv')

In [5]:
print(f'Sparsity of lab values of trainset before imputation, in percent is:\n{df_impute.iloc[:,6:13].isna().mean()*100}')

Sparsity of lab values of trainset before imputation, in percent is:
COR     16.923077
FSH     21.538462
FT4      9.743590
IGF1    12.307692
LH      21.538462
PROL     3.589744
TEST    46.153846
dtype: float64


In [6]:
print(f'Sparsity of lab values of testset before imputation, in percent is:\n{df_impute_test.iloc[:,6:13].isna().mean()*100}')

Sparsity of lab values of testset before imputation, in percent is:
COR     28.0
FSH     40.0
FT4     12.0
IGF1    40.0
LH      40.0
PROL    20.0
TEST    80.0
dtype: float64


In [7]:
def impute_based_on_age(row, column, age_ranges):
    age = row['Patient_age']
    col_value = row[column]
    if pd.isna(col_value):
        for start, end, imputation_value in age_ranges:
            if start <= age <= end:
                return imputation_value
        return np.nan  
    else:
        return col_value
    
def impute_based_on_age_gender(row, column, age_gender_ranges):
    age = row['Patient_age']
    gender = row['Patient_gender']
    col_value = row[column]
    if pd.isna(col_value):
        for age_range in age_gender_ranges:
            if (age_range['age_range'][0] <= age <= age_range['age_range'][1]) and age_range['gender'] == gender:
                return age_range['imputation_value']
    
    return col_value 

## Imputation Hormones

### Impute Testosteron

In [8]:
# we impute the mean of our population
testo_mean_male = df_impute[df_impute['Patient_gender'] == 'male']['TEST'].mean()
testo_mean_female = df_impute[df_impute['Patient_gender'] == 'female']['TEST'].mean()
age_gender_ranges_prol= [
    {'age_range': (df_impute['Patient_age'].min(), df_impute['Patient_age'].max()), 'gender': 'male', 'imputation_value': testo_mean_male},
    {'age_range': (df_impute['Patient_age'].min(), df_impute['Patient_age'].max()), 'gender': 'female', 'imputation_value': testo_mean_female}]
df_impute['TEST'] = df_impute.apply(impute_based_on_age_gender,args=(['TEST',age_gender_ranges_prol]), axis=1)
df_impute_test['TEST'] = df_impute_test.apply(impute_based_on_age_gender,args=(['TEST',age_gender_ranges_prol]), axis=1)

### Impute LH

In [9]:
# we impute the mean of our trainingset population, we differentiate by gender
lh_mean_male = df_impute[df_impute['Patient_gender'] == 'male']['LH'].mean()
lh_mean_female = df_impute[df_impute['Patient_gender'] == 'female']['LH'].mean()
age_gender_ranges_prol= [
    {'age_range': (df_impute['Patient_age'].min(), df_impute['Patient_age'].max()), 'gender': 'male', 'imputation_value': lh_mean_male},
    {'age_range': (df_impute['Patient_age'].min(), df_impute['Patient_age'].max()), 'gender': 'female', 'imputation_value': lh_mean_female}]
df_impute['LH'] = df_impute.apply(impute_based_on_age_gender,args=(['LH',age_gender_ranges_prol]), axis=1)
df_impute_test['LH'] = df_impute_test.apply(impute_based_on_age_gender,args=(['LH',age_gender_ranges_prol]), axis=1)

### Impute FSH

In [10]:
# we impute the mean of our trainingset population, we differentiate by gender
fsh_mean_male = df_impute[df_impute['Patient_gender'] == 'male']['FSH'].mean()
fsh_mean_female = df_impute[df_impute['Patient_gender'] == 'female']['FSH'].mean()
age_gender_ranges_prol= [
    {'age_range': (df_impute['Patient_age'].min(), df_impute['Patient_age'].max()), 'gender': 'male', 'imputation_value': fsh_mean_male},
    {'age_range': (df_impute['Patient_age'].min(), df_impute['Patient_age'].max()), 'gender': 'female', 'imputation_value': fsh_mean_female}]
df_impute['FSH'] = df_impute.apply(impute_based_on_age_gender,args=(['FSH',age_gender_ranges_prol]), axis=1)
df_impute_test['FSH'] = df_impute_test.apply(impute_based_on_age_gender,args=(['FSH',age_gender_ranges_prol]), axis=1)

### Impute FT4

In [11]:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5794073/ - Table 3 - Mean
age_ranges_ft4 = [(df_impute['Patient_age'].min(), 19, 1.30*12.871), (20, 29, 1.31*12.871), (30, 39, 1.26*12.871), (40, 49, 1.22*12.871), (50, 59, 1.20*12.871), (60, 69, 1.2*12.871), (70, df_impute['Patient_age'].max(), 1.2*12.871)]
df_impute['FT4'] = df_impute.apply(impute_based_on_age,args=(['FT4',age_ranges_ft4]), axis=1)
df_impute_test['FT4'] = df_impute_test.apply(impute_based_on_age,args=(['FT4',age_ranges_ft4]), axis=1)

### Impute IGF1

In [12]:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9206165/ - Table 1 - Median
age_ranges_igf1 = [(df_impute['Patient_age'].min(), 25, 265.00*0.131), (26, 30, 222*0.131), (31, 35, 183*0.131  ),
                   (36, 40, 171*0.131), (41, 45, 148*0.131), (46, 50, 129*0.131),
                   (51, 55, 129.5*0.131),(56, 60, 130*0.131),(61, 65, 129.5*0.131),(66, 70, 128*0.131)
                   ,(71, 75, 123*0.131),(76, df_impute['Patient_age'].max(), 109*0.131)]
df_impute['IGF1'] = df_impute.apply(impute_based_on_age,args=(['IGF1',age_ranges_igf1]), axis=1)
df_impute_test['IGF1'] = df_impute_test.apply(impute_based_on_age,args=(['IGF1',age_ranges_igf1]), axis=1)


### Impute Cortisol

In [13]:
# regular value for cortisol is at least 550 nmol/l
df_impute['COR'] = df_impute['COR'].fillna(550)
df_impute_test['COR'] = df_impute_test['COR'].fillna(550)

### Impute Prolactin

In [14]:
# https://labosud.fr/wp-content/uploads/sites/7/2023/08/Verification_of_Roche_reference_ranges_for_serum_prolactin_in_children_adolescents_adults_and_the_elderly-2023-07-11-05-21.pdf - Table 2 - Median by age and gender
age_gender_ranges_prol = [
    {'age_range': (df_impute['Patient_age'].min(), 9), 'gender': 'male', 'imputation_value': 194.8},
    {'age_range': (10, 12), 'gender': 'male', 'imputation_value': 273.9},
    {'age_range': (13, 16), 'gender': 'male', 'imputation_value': 352.6},
    {'age_range': (17, 19), 'gender': 'male', 'imputation_value': 397.},
    {'age_range': (20, 30), 'gender': 'male', 'imputation_value': 356.5},
    {'age_range': (31, 40), 'gender': 'male', 'imputation_value': 325.4},
    {'age_range': (41, 50), 'gender': 'male', 'imputation_value': 292.9},
    {'age_range': (51, 60), 'gender': 'male', 'imputation_value': 272.4},
    {'age_range': (61, 70), 'gender': 'male', 'imputation_value': 227.9},
    {'age_range': (71, df_impute['Patient_age'].max()), 'gender': 'male', 'imputation_value': 250.3},
    
    {'age_range': (df_impute['Patient_age'].min(), 10), 'gender': 'female', 'imputation_value': 211.2},
    {'age_range': (11, 13), 'gender': 'female', 'imputation_value': 211.2},
    {'age_range': (14, 16), 'gender': 'female', 'imputation_value': 262.0},
    {'age_range': (17, 19), 'gender': 'female', 'imputation_value': 283.8},
    {'age_range': (20, 30), 'gender': 'female', 'imputation_value': 291.7},
    {'age_range': (31, 40), 'gender': 'female', 'imputation_value': 260.5},
    {'age_range': (41, 50), 'gender': 'female', 'imputation_value': 252.5},
    {'age_range': (51, 60), 'gender': 'female', 'imputation_value': 241.},
    {'age_range': (61, 70), 'gender': 'female', 'imputation_value': 232.2},
    {'age_range': (71, df_impute['Patient_age'].max()), 'gender': 'female', 'imputation_value': 252.4},
    ]

df_impute['PROL'] = df_impute.apply(impute_based_on_age_gender,args=(['PROL',age_gender_ranges_prol]), axis=1)
df_impute_test['PROL'] = df_impute_test.apply(impute_based_on_age_gender,args=(['PROL',age_gender_ranges_prol]), axis=1)


In [15]:
print(f'Sparsity of lab values in trainset after imputation, in percent is:\n{df_impute.iloc[:,5:12].isna().mean()*100}')
print(f'Sparsity of lab values in testset after imputation, in percent is:\n{df_impute_test.iloc[:,5:12].isna().mean()*100}')

Sparsity of lab values in trainset after imputation, in percent is:
MRI_Case_ID    0.0
COR            0.0
FSH            0.0
FT4            0.0
IGF1           0.0
LH             0.0
PROL           0.0
dtype: float64
Sparsity of lab values in testset after imputation, in percent is:
MRI_Case_ID    0.0
COR            0.0
FSH            0.0
FT4            0.0
IGF1           0.0
LH             0.0
PROL           0.0
dtype: float64


In [16]:
df_impute.to_csv(r'../data/train_data.csv',index=False)
df_impute_test.to_csv(r'../data/test_data.csv',index=False)

In [17]:
print("End Imputation Lab data")

End Imputation Lab data
