In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [None]:
# Load the raw extract; the cells below add columns to `df` and reassign it step by step.
df = pd.read_csv("../03_datasets/frailty_ntpro_raw.csv")

In [None]:
def check_icd_prefix(icd, codes):
    """ Tell whether one ICD-10 code begins with any of the given prefixes

    icd   -- a single ICD-10 code string
    codes -- iterable of prefix strings to test the code against

    Returns True as soon as one prefix matches, False if none do
    (including when `codes` is empty).

    """
    for prefix in codes:
        if icd.startswith(prefix):
            return True
    return False

# ICD-10 prefixes that switch each comorbidity flag on.
# The key order is the order of the keys in the dict returned by calculate_charlson_score.
_CHARLSON_ICD_PREFIXES = {
    'Cancer_flag': [
        'C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6',
        'C70', 'C71', 'C72', 'C73', 'C74', 'C75', 'C76',
        'C81', 'C82', 'C83', 'C84', 'C85', 'C86', 'C87', 'C88', 'C89',
        'C90', 'C91', 'C92', 'C93', 'C94', 'C95', 'C96', 'C97',
    ],
    'Acute_myocardial_infarction_flag': ['I21', 'I22', 'I23', 'I252', 'I258'],
    'Congestive_heart_failure_flag': ['I50'],
    'Peripheral_vascular_disease_flag': ['I71', 'R02', 'I790', 'I739', 'Z958', 'Z959'],
    'Cerebral_vascular_accident_flag': [
        'I60', 'I61', 'I62', 'I63', 'I65', 'I66', 'G46', 'I64', 'I69', 'I67', 'I68',
        'G450', 'G451', 'G452', 'G458', 'G459', 'G454',
    ],
    'Dementia_flag': ['F00', 'F01', 'F02', 'F03', 'F051'],
    'Pulmonary_disease_flag': [
        'J40', 'J41', 'J42', 'J44', 'J43', 'J45', 'J46', 'J47',
        'J67', 'J60', 'J61', 'J62', 'J63', 'J66', 'J64', 'J65',
    ],
    'Connective_tissue_disorder_flag': ['M32', 'M34', 'M05', 'M332', 'M060', 'M063', 'M069', 'M353'],
    'Peptic_ulcer_flag': ['K25', 'K26', 'K27', 'K28'],
    'Liver_disease_flag': ['K73', 'K74', 'K702', 'K703', 'K717'],
    'Paraplegia_flag': ['G81', 'G041', 'G820', 'G821', 'G822'],
    'Renal_disease_flag': [
        'N03', 'N01', 'N18', 'N19', 'N25', 'I12', 'I13',
        'N052', 'N053', 'N054', 'N055', 'N056', 'N072', 'N073', 'N074',
    ],
    'Metastatic_cancer_flag': ['C77', 'C78', 'C79', 'C80'],
    'HIV_flag': ['B20', 'B21', 'B22', 'B23', 'B24', 'O987'],
    'Severe_liver_disease_flag': ['K729', 'K766', 'K767', 'K721'],
    'Diabetes_flag': [
        'E101', 'E105', 'E106', 'E108', 'E109',
        'E111', 'E115', 'E116', 'E118', 'E119',
        'E131', 'E136', 'E138', 'E139',
        'E141', 'E145', 'E146', 'E148', 'E149',
    ],
    'Diabetes_complications_flag': [
        'E102', 'E112', 'E132', 'E142',
        'E103', 'E113', 'E133', 'E143',
        'E104', 'E114', 'E134', 'E144',
        'E107', 'E117', 'E137', 'E147',
    ],
}


def calculate_charlson_score(diagnosis_codes):

    """ Turn one episode's diagnosis string into a dict of Charlson comorbidity flags

    diagnosis_codes -- ICD-10 codes joined by ' / ' (database format
                       code / code / code / None / None ...); 'None' entries are skipped.

    Returns a dict with one key per comorbidity flag: 1 if any code of the episode
    starts with one of that comorbidity's prefixes, otherwise 0.
    If `diagnosis_codes` is not a string, every flag is returned as 0.

    """

    flags = dict.fromkeys(_CHARLSON_ICD_PREFIXES, 0)

    if not isinstance(diagnosis_codes, str):
        return flags

    for code in diagnosis_codes.split(' / '):
        if code == 'None':
            continue  # placeholder for an unused diagnosis slot

        # A single code may raise several flags, so every comorbidity is tested
        for flag_name, prefixes in _CHARLSON_ICD_PREFIXES.items():
            if check_icd_prefix(code, prefixes):
                flags[flag_name] = 1

    return flags

# Build one dict of comorbidity flags per row. Rows whose 'Diagnosis Codes' value is
# missing or blank get an empty dict, so their flag cells come out empty (NaN) instead of 0.
df_flags = df['Diagnosis Codes'].apply(lambda x: calculate_charlson_score(x) if pd.notna(x) and x.strip() else {})
# Expand the dicts into a frame with one column per flag
df_flags = pd.json_normalize(df_flags)

# Append the computed flag columns to the original dataframe (axis=1 aligns rows on index labels).
# NOTE(review): re-running this cell on the same `df` appends a second copy of every flag column.
df = pd.concat([df, df_flags], axis=1)

In [None]:
#### Inclusion in NICOR means they have HF
# Rows with any non-missing 'Diabetes_NICOR' value get the heart-failure flag.
# NOTE(review): presumably 'Diabetes_NICOR' is populated for every NICOR record and is used as the inclusion marker -- confirm.
df.loc[df['Diabetes_NICOR'].notna(), 'Congestive_heart_failure_flag'] = 1

### Diagnosis confirmed in HI
# Rows with HF_Diagnosis_Confirmed equal to 1 also get the heart-failure flag
df.loc[(df['HF_Diagnosis_Confirmed'] == 1), 'Congestive_heart_failure_flag'] = 1

In [None]:
#### Comorbidities from NICOR -- 0 can be changed to 1, 1 cannot be changed to 0
# Each comorbidity is handled by a pair of statements:
#   1st: a flag that is 0 or missing becomes 1 when the NICOR field is '1. Yes'
#   2nd: a flag that is missing becomes 0 when the NICOR field is '0. No' (an existing 0 or 1 is kept)

# Cerebral vascular accident
df.loc[df['Cerebral_vascular_accident_flag'].isin([0, np.nan]) & (df['4.15 Cerebral vascular accident (CVA)'] == '1. Yes'), 'Cerebral_vascular_accident_flag'] = 1
df.loc[df['Cerebral_vascular_accident_flag'].isna() & (df['4.15 Cerebral vascular accident (CVA)'] == '0. No'), 'Cerebral_vascular_accident_flag'] = 0

# Cancer (NICOR 'current malignancy')
df.loc[df['Cancer_flag'].isin([0, np.nan]) & (df['5.05 Current malignancy'] == '1. Yes'), 'Cancer_flag'] = 1
df.loc[df['Cancer_flag'].isna() & (df['5.05 Current malignancy'] == '0. No'), 'Cancer_flag'] = 0

# Pulmonary disease (NICOR COPD field)
df.loc[df['Pulmonary_disease_flag'].isin([0, np.nan]) & (df['4.17 Chronic obstructive pulmonary disease (COPD)'] == '1. Yes'), 'Pulmonary_disease_flag'] = 1
df.loc[df['Pulmonary_disease_flag'].isna() & (df['4.17 Chronic obstructive pulmonary disease (COPD)'] == '0. No'), 'Pulmonary_disease_flag'] = 0

# Diabetes: the NICOR answer is matched both as text ('1. Yes' / '0. No') and as numeric 1 / 0.
# The flag is not set to 1 on rows already flagged as diabetes with complications.
df.loc[df['Diabetes_flag'].isin([0, np.nan]) & df['4.14 Diabetes'].isin(['1. Yes', 1]) & (df['Diabetes_complications_flag'] != 1), 'Diabetes_flag'] = 1
df.loc[df['Diabetes_flag'].isna() & df['4.14 Diabetes'].isin(['0. No', 0]), 'Diabetes_flag'] = 0


In [None]:
### HBA1C of 48 or above indicates diabetes (the threshold is inclusive: >= 48)
# Rows already flagged as diabetes with complications are left alone
df.loc[(df['HBA1C_Result 1'] >= 48) & (df['Diabetes_complications_flag'] != 1), 'Diabetes_flag'] = 1

In [None]:
# Order episodes by patient and then by admission date: the consistency functions below
# compare each row with the previous / next row of the same MRN, so row order matters.
df = df.sort_values(by=['MRN', 'Admission_Date'])

In [None]:
def consistent_inbetween(df, columns):
    ''' Fill in a flag for an episode that sits between two agreeing episodes of the same patient

    For every column in `columns`, each row is compared with the row directly before and
    directly after it within the same MRN (in the current row order of `df`):
      * a missing or 0 value with 1 on both sides becomes 1
      * a missing value with 0 on both sides becomes 0
    The first and last row of a patient have no neighbour on one side and are never changed.
    `df` is modified in place and also returned.

    '''
    for flag in columns:
        per_patient = df.groupby("MRN")[flag]
        previous_episode = per_patient.shift(1)     # value of the patient's preceding row
        next_episode = per_patient.shift(-1)        # value of the patient's following row

        current = df[flag]
        empty_or_zero = current.isna() | (current == 0)

        # Both masks are built before either write so they see the same original values
        between_ones = empty_or_zero & (previous_episode == 1) & (next_episode == 1)
        between_zeros = current.isna() & (previous_episode == 0) & (next_episode == 0)

        df.loc[between_ones, flag] = 1
        df.loc[between_zeros, flag] = 0

    return df

# Flags to smooth between agreeing neighbouring episodes.
# Acute MI, cerebral vascular accident and the two diabetes flags are not in this list.
columns_to_process = ["Cancer_flag", 'Congestive_heart_failure_flag', 'Peripheral_vascular_disease_flag',
                     'Dementia_flag', 'Pulmonary_disease_flag', 'Connective_tissue_disorder_flag', 'Peptic_ulcer_flag', 
                      'Liver_disease_flag', 'Paraplegia_flag', 'Renal_disease_flag', 'Metastatic_cancer_flag', 'HIV_flag', 
                      'Severe_liver_disease_flag']

df = consistent_inbetween(df, columns_to_process)


In [None]:
def consistent_after(df, columns):
    ''' Carry a positive flag forward to all of a patient's later episodes
    Should only be used for conditions that patients don't recover from

    For every column in `columns`, each row from a patient's first row with value 1
    onwards (in the current row order of `df`, grouped by 'MRN') is set to 1.
    Rows before the first positive, and patients with no positive, are left untouched.
    `df` should already be sorted chronologically within each patient.
    `df` is modified in place and also returned.

    '''
    for col in columns:
        positive = df[col].eq(1).astype(int)        # missing values compare as False -> 0

        # Running maximum within each patient: 0 until the first positive row, 1 from then on.
        # Rows with a missing MRN belong to no group and come back as NaN, which '== 1' turns into False.
        seen_positive = positive.groupby(df['MRN']).cummax() == 1

        # Write to df itself: assigning into the per-patient frames produced by
        # iterating a groupby would not change df.
        df.loc[seen_positive, col] = 1
    return df

# Flags carried forward once positive.
# Cancer, metastatic cancer and peptic ulcer are in the in-between list above but not in this one.
columns_to_process = ['Peripheral_vascular_disease_flag', 'Congestive_heart_failure_flag', 'Dementia_flag','Pulmonary_disease_flag',
                      'Connective_tissue_disorder_flag', 'Liver_disease_flag', 'Paraplegia_flag',
                     'Renal_disease_flag', 'HIV_flag', 'Severe_liver_disease_flag']

df = consistent_after(df, columns_to_process)



In [None]:
def consistent_inbetween_diabetes(df, columns):
    ''' Fill in a diabetes flag that sits between two agreeing episodes of the same patient
    Diabetes is a special case: rows with Diabetes_complications_flag == 1 are never changed,
    so that diabetes is not counted on top of diabetes with complications

    For every column in `columns`, a missing or 0 value whose previous and next rows
    within the same MRN are both 1 becomes 1; one whose neighbours are both 0 becomes 0.
    `df` is modified in place and also returned.

    '''
    for flag in columns:
        no_complications = df['Diabetes_complications_flag'] != 1

        per_patient = df.groupby("MRN")[flag]
        previous_episode = per_patient.shift(1)     # value of the patient's preceding row
        next_episode = per_patient.shift(-1)        # value of the patient's following row

        # Only empty / 0 rows without complications are candidates for correction
        fillable = (df[flag].isna() | (df[flag] == 0)) & no_complications

        # Both masks are built before either write so they see the same original values
        between_positives = fillable & (previous_episode == 1) & (next_episode == 1)
        between_negatives = fillable & (previous_episode == 0) & (next_episode == 0)

        df.loc[between_positives, flag] = 1
        df.loc[between_negatives, flag] = 0

    return df

# Only the plain diabetes flag is smoothed; the complications flag is used as a guard inside the function
columns_to_process = ["Diabetes_flag"]

df = consistent_inbetween_diabetes(df, columns_to_process)


In [None]:
def consistent_after_diabetes(df, columns):
    ''' Make sure any episodes following a positive diabetes case are all positive
    Diabetes is a special case: rows with Diabetes_complications_flag == 1 keep their
    current value, so that diabetes is not counted on top of diabetes with complications

    For every column in `columns`, each row from a patient's first row with value 1
    onwards (in the current row order of `df`, grouped by 'MRN') is set to 1 unless
    that row has Diabetes_complications_flag == 1.
    `df` should already be sorted chronologically within each patient.
    `df` is modified in place and also returned.

    '''
    for col in columns:
        no_complications = df['Diabetes_complications_flag'] != 1
        positive = df[col].eq(1).astype(int)        # missing values compare as False -> 0

        # Running maximum within each patient: 0 until the first positive row, 1 from then on.
        # Rows with a missing MRN belong to no group and come back as NaN, which '== 1' turns into False.
        seen_positive = positive.groupby(df['MRN']).cummax() == 1

        df.loc[seen_positive & no_complications, col] = 1
    return df

# Carry a positive diabetes flag forward to the patient's later episodes
columns_to_process = ['Diabetes_flag']

df = consistent_after_diabetes(df, columns_to_process)


In [None]:
def calculate_charlson_scoring(df):
    ''' Add a 'Charlson_Score' column holding the weighted sum of the comorbidity flags
    Based on new Dr Foster methodology

    Cancer_flag (weight 8) only counts on rows where Metastatic_cancer_flag equals 0.
    'Cancer_flag' and 'Metastatic_cancer_flag' must exist; any other flag column that is
    absent contributes 0. A missing value in any flag makes the row's score missing.
    `df` is modified in place and also returned.

    '''
    weights = {
        'Acute_myocardial_infarction_flag': 5,
        'Congestive_heart_failure_flag': 13,
        'Peripheral_vascular_disease_flag': 6,
        'Cerebral_vascular_accident_flag': 11,
        'Dementia_flag': 14,
        'Pulmonary_disease_flag': 4,
        'Connective_tissue_disorder_flag': 4,
        'Peptic_ulcer_flag': 9,
        'Liver_disease_flag': 8,
        'Diabetes_flag': 3,
        'Diabetes_complications_flag': -1,
        'Paraplegia_flag': 1,
        'Renal_disease_flag': 10,
        'Metastatic_cancer_flag': 14,
        'Severe_liver_disease_flag': 18,
        'HIV_flag': 2,
    }

    # Include Cancer_flag only if Metastatic_cancer_flag is 0
    non_metastatic_cancer = df['Cancer_flag'] * (df['Metastatic_cancer_flag'] == 0)
    score = non_metastatic_cancer * 8

    for flag_name, weight in weights.items():
        score = score + df.get(flag_name, 0) * weight

    df['Charlson_Score'] = score
    return df

df = calculate_charlson_scoring(df)

## Set negative values to zero
# (the only negative weight is -1 for Diabetes_complications_flag)
df.loc[(df['Charlson_Score'] < 0), 'Charlson_Score'] = 0


In [None]:
def age_adjust_charlson(df):
    ''' Add an 'AgeAdjust_Charlson_Score' column: Charlson score plus age points
    Based on new Dr Foster methodology

    Age points from 'Age_on_Admission': under 50 -> 0, 50-59 -> 1, 60-69 -> 2, 70-79 -> 3,
    80-89 -> 4, 90-99 -> 5, 100 and over -> 6. A missing age adds 0.
    `df` is modified in place and also returned.

    '''
    age = df['Age_on_Admission']

    # One point for every decade boundary the age has reached; a missing age reaches none
    decade_boundaries = (50, 60, 70, 80, 90, 100)
    age_points = sum((age >= boundary).astype(int) for boundary in decade_boundaries)

    df['AgeAdjust_Charlson_Score'] = df['Charlson_Score'] + age_points
    return df

# Adds the 'AgeAdjust_Charlson_Score' column
df = age_adjust_charlson(df)


In [None]:
# Save the scored dataset. This runs before the NT-proBNP clean-up and numeric coercion in the cells below,
# so the file does not contain those changes. The dataframe index is written as the first column (index=False is not passed).
df.to_csv("../03_datasets/frailty_ntpro_analysis.csv")

In [None]:
# NT-proBNP results recorded as censored text ('<50', '>35000', '>70000') are replaced by the bound itself.
# Every 'NTproBNP_Result <n>' column gets a cleaned copy named with an underscore in place of the space,
# e.g. 'NTproBNP_Result 1' -> 'NTproBNP_Result_1'.
censored_values = {'<50': 50,
                   '>35000': 35000,
                   '>70000': 70000}

for col in list(df.columns):          # snapshot of the names: columns are added inside the loop
    if col.startswith("NTproBNP_Result "):
        df[col.replace(' ', '_')] = df[col].replace(censored_values)

In [None]:
# Columns used as predictors in the regression below
columns_to_check = ['Age_on_Admission', 'Frailty_Score', 'NTproBNP_Result_1', 'Cancer_flag', 'Acute_myocardial_infarction_flag', 
                    'Congestive_heart_failure_flag', 'Peripheral_vascular_disease_flag', 'Cerebral_vascular_accident_flag', 'Dementia_flag', 
                    'Pulmonary_disease_flag', 'Connective_tissue_disorder_flag', 'Peptic_ulcer_flag', 'Liver_disease_flag', 'Paraplegia_flag', 
                    'Renal_disease_flag', 'Metastatic_cancer_flag', 'HIV_flag', 'Severe_liver_disease_flag', 'Diabetes_flag', 
                    'Diabetes_complications_flag']

# Force each column to a numeric dtype; any value that cannot be parsed becomes NaN (errors='coerce')
for col in columns_to_check:
    df[col] = pd.to_numeric(df[col], errors='coerce')


In [None]:
# Logistic regression of Charlson comorbidities, age, NT-proBNP, and frailty on all outcomes

outcomes = ['Died_within_30days_of_Discharge', 'Am_Died_within_90days_of_Discharge', 'An_Died_within_365days_of_Discharge',
            'Readmission7d', 'Readmission14d', 'Readmission30d', 'Readmission180d']

# One model per outcome, each fitted on the rows that are complete for that outcome
for outcome in outcomes:
    # AgeAdjust_Charlson_Score is missing whenever any Charlson flag is missing, so including it
    # in the subset drops rows with an incomplete flag even though the score itself is not a predictor
    df_clean_model = df.dropna(subset=['Age_on_Admission', 'NTproBNP_Result_1', 'Frailty_Score', outcome,'AgeAdjust_Charlson_Score'])  ### Charlson score is applicable to all charlson cols but don't want it in final model
    
    # Predictors: age, frailty, NT-proBNP and the individual comorbidity flags
    X = df_clean_model[['Age_on_Admission','Frailty_Score', 'NTproBNP_Result_1', 'Cancer_flag', 'Acute_myocardial_infarction_flag', 
                        'Congestive_heart_failure_flag', 'Peripheral_vascular_disease_flag', 'Cerebral_vascular_accident_flag', 'Dementia_flag', 
                        'Pulmonary_disease_flag', 'Connective_tissue_disorder_flag', 'Peptic_ulcer_flag', 'Liver_disease_flag', 'Paraplegia_flag', 
                        'Renal_disease_flag', 'Metastatic_cancer_flag', 'HIV_flag', 'Severe_liver_disease_flag', 'Diabetes_flag', 
                        'Diabetes_complications_flag']]
    
    # Add the intercept column
    # NOTE(review): if a predictor is constant in the filtered rows (e.g. a flag that is 1 for everyone),
    # check how add_constant and the fit behave -- confirm against the data.
    X = sm.add_constant(X)
    
    y = df_clean_model[outcome]
    
    model = sm.Logit(y, X)
    result = model.fit()
    
    print(result.summary())
