# Imputation for Lab Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Progress marker: announces that the lab-data imputation step is starting.
print("Start Imputation Lab data")

In [None]:
# Load the input CSV into the frame that every imputation cell below modifies.
# The path is relative, so it resolves against the current working directory.
df_impute = pd.read_csv(r'../raw_data/data_full_merge.csv')

In [None]:
# TODO: impute hyperprolactinemia patients; impute deficiencies of the other hormones

In [None]:
# impute sex hormones (testosterone and probably others)

In [None]:
# TODO: remove gender placeholders
# Placeholder only: every row is labelled 'M', then the rows with index labels
# 0 through 300 are overwritten with 'F' (.loc slices are label-based and
# include both end points).
# NOTE(review): the PROL imputation further down picks its reference value by
# Patient_gender, so its results depend on these placeholder values.
df_impute["Patient_gender"] = 'M'
df_impute.loc[0:300, 'Patient_gender'] = 'F'

In [None]:
def impute_based_on_age(row, column, age_ranges):
    """Return ``row[column]``, or an age-specific reference value if it is missing.

    Intended for ``DataFrame.apply(..., axis=1)``; any mapping-like ``row``
    works as long as it provides ``'Patient_age'`` and ``column``.

    Args:
        row: Record holding the patient's age and the lab value.
        column: Name of the lab value to impute.
        age_ranges: Iterable of ``(start, end, imputation_value)`` tuples.
            Both bounds are inclusive and the first matching range wins, so
            an age that falls between two ranges (e.g. 19.5 with ranges
            ending at 19 and starting at 20) matches nothing.

    Returns:
        The original value when it is present. Otherwise the imputation value
        of the first range containing the age, or ``np.nan`` when the age is
        missing or no range matches.
    """
    age = row['Patient_age']
    col_value = row[column]

    # pd.isna also recognises None / pd.NA / NaT, for which np.isnan raises
    # TypeError (e.g. when the column was read with object dtype).
    if not pd.isna(col_value):
        return col_value

    # Without a usable age no range can be selected; comparing None or pd.NA
    # against the bounds would raise instead of leaving the value missing.
    if pd.isna(age):
        return np.nan

    for start, end, imputation_value in age_ranges:
        if start <= age <= end:
            return imputation_value
    return np.nan
    
def impute_based_on_age_gender(row, column, age_gender_ranges):
    """Return ``row[column]``, or an age- and gender-specific value if it is missing.

    Intended for ``DataFrame.apply(..., axis=1)``; any mapping-like ``row``
    works as long as it provides ``'Patient_age'``, ``'Patient_gender'`` and
    ``column``.

    Args:
        row: Record holding the patient's age, gender and the lab value.
        column: Name of the lab value to impute.
        age_gender_ranges: Iterable of dicts with the keys ``'age_range'``
            (a ``(start, end)`` pair, both bounds inclusive), ``'gender'``
            and ``'imputation_value'``. The first entry matching both age
            and gender wins.

    Returns:
        The original value when it is present. Otherwise the imputation value
        of the first matching entry, or ``np.nan`` when the age is missing or
        no entry matches.
    """
    age = row['Patient_age']
    gender = row['Patient_gender']
    col_value = row[column]

    # pd.isna also recognises None / pd.NA / NaT, for which np.isnan raises
    # TypeError (e.g. when the column was read with object dtype).
    if not pd.isna(col_value):
        return col_value

    # Without a usable age no entry can be selected; comparing None or pd.NA
    # against the bounds would raise instead of leaving the value missing.
    if pd.isna(age):
        return np.nan

    for entry in age_gender_ranges:
        bounds = entry['age_range']
        if (bounds[0] <= age <= bounds[1]) and entry['gender'] == gender:
            return entry['imputation_value']
    return np.nan

In [None]:
# Inspect the smallest age in the data (shown as the cell output); the
# reference tables below use this same minimum as the lower bound of their
# first age band.
df_impute.Patient_age.min()

In [None]:
# Number of missing FT4 values before imputation (shown as the cell output).
df_impute['FT4'].isna().sum()

In [None]:
# Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5794073/ - Table 3 - Mean
# Each band is (first age, last age, table mean) with inclusive bounds; the
# outermost bounds follow the youngest and oldest patient in the data.
# NOTE(review): 12.871 is presumably a unit-conversion factor applied to the
# table means - confirm the source and target units.
age_ranges_ft4 = [
    (first_age, last_age, table_mean*12.871)
    for first_age, last_age, table_mean in (
        (df_impute['Patient_age'].min(), 19, 1.30),
        (20, 29, 1.31),
        (30, 39, 1.26),
        (40, 49, 1.22),
        (50, 59, 1.20),
        (60, 69, 1.2),
        (70, df_impute['Patient_age'].max(), 1.2),
    )
]
# Row-wise pass: existing FT4 values are kept, missing ones take the value of
# the band that contains the patient's age.
df_impute['FT4'] = df_impute.apply(
    impute_based_on_age,
    axis=1,
    args=('FT4', age_ranges_ft4),
)


In [None]:
# Number of missing IGF1 values before imputation (shown as the cell output).
df_impute['IGF1'].isna().sum()

In [None]:
# Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9206165/ - Table 1 - Median
# Each band is (first age, last age, table median) with inclusive bounds; the
# outermost bounds follow the youngest and oldest patient in the data.
# NOTE(review): 0.131 is presumably a unit-conversion factor applied to the
# table medians - confirm the source and target units.
age_ranges_igf1 = [
    (first_age, last_age, table_median*0.131)
    for first_age, last_age, table_median in (
        (df_impute['Patient_age'].min(), 25, 265.00),
        (26, 30, 222),
        (31, 35, 183),
        (36, 40, 171),
        (41, 45, 148),
        (46, 50, 129),
        (51, 55, 129.5),
        (56, 60, 130),
        (61, 65, 129.5),
        (66, 70, 128),
        (71, 75, 123),
        (76, df_impute['Patient_age'].max(), 109),
    )
]
# Row-wise pass: existing IGF1 values are kept, missing ones take the value of
# the band that contains the patient's age.
df_impute['IGF1'] = df_impute.apply(
    impute_based_on_age,
    axis=1,
    args=('IGF1', age_ranges_igf1),
)


In [None]:
# Number of missing COR60 values before imputation (shown as the cell output).
df_impute['COR60'].isna().sum()

In [None]:
# regular value for cortisol is at least 500 nmol/l
# Every missing COR60 entry is replaced with that single constant (500),
# independent of the patient's age or gender.
df_impute['COR60']= df_impute['COR60'].fillna(500)

In [None]:
# Number of missing PROL values before imputation (shown as the cell output).
df_impute['PROL'].isna().sum()

In [None]:
# Source: https://labosud.fr/wp-content/uploads/sites/7/2023/08/Verification_of_Roche_reference_ranges_for_serum_prolactin_in_children_adolescents_adults_and_the_elderly-2023-07-11-05-21.pdf - Table 2 - Median by age and gender
# Bands are listed per gender as (first age, last age, median) with inclusive
# bounds and expanded into the dict layout impute_based_on_age_gender expects.
# All female bands come first, then all male bands.
# NOTE(review): the first two male bands carry the same value (211.2) and the
# youngest female/male bands end at different ages (9 vs 10) - confirm both
# against the source table.
age_gender_ranges_prol = [
    {'age_range': (first_age, last_age), 'gender': gender_code, 'imputation_value': median}
    for gender_code, bands in (
        ('F', (
            (df_impute['Patient_age'].min(), 9, 194.8),
            (10, 12, 273.9),
            (13, 16, 352.6),
            (17, 19, 397.),
            (20, 30, 356.5),
            (31, 40, 325.4),
            (41, 50, 292.9),
            (51, 60, 272.4),
            (61, 70, 227.9),
            (71, df_impute['Patient_age'].max(), 250.3),
        )),
        ('M', (
            (df_impute['Patient_age'].min(), 10, 211.2),
            (11, 13, 211.2),
            (14, 16, 262.0),
            (17, 19, 283.8),
            (20, 30, 291.7),
            (31, 40, 260.5),
            (41, 50, 252.5),
            (51, 60, 241.),
            (61, 70, 232.2),
            (71, df_impute['Patient_age'].max(), 252.4),
        )),
    )
    for first_age, last_age, median in bands
]

# Row-wise pass: existing PROL values are kept, missing ones take the value of
# the band matching the patient's age and gender.
df_impute['PROL'] = df_impute.apply(
    impute_based_on_age_gender,
    axis=1,
    args=('PROL', age_gender_ranges_prol),
)


In [None]:
# Report, per column, the percentage of values that are still missing for the
# columns at positions 2 through 8 (the end of an iloc slice is exclusive).
# NOTE(review): the positional slice presumably selects the lab-value columns;
# confirm against the CSV column order.
print(f'Sparsity of lab values in percent is:\n{df_impute.iloc[:,2:9].isna().mean()*100}')

In [None]:
# Write the imputed frame to CSV; index=False leaves the row index out of the file.
df_impute.to_csv(r'../raw_data/data_imputed.csv',index=False)

In [None]:
# Progress marker: announces that the lab-data imputation step has finished.
print("End Imputation Lab data")