In [1]:
##### REQUIRES THE DATAFRAME FOLDER TO BE NAMED 'Cohorts', WHICH INCLUDES ALL PRECOMPUTED DATAFRAMES #####
import fiber
from fiber.cohort import Cohort
from fiber.condition import Patient, MRNs
from fiber.condition import Diagnosis
from fiber.condition import Measurement, Encounter, Drug
from fiber.storage import yaml as fiberyaml
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import os
from functools import reduce 

DB Password: ········


In [2]:
# Load the precomputed, filtered case and control cohort frames from the 'Cohorts' folder
Case_filtered_15_540 = (
    pq.read_table('Cohorts/Case/Case_filtered_15_540.parquet')
    .to_pandas()
)
Control_filtered_15_540 = (
    pq.read_table('Cohorts/Control/Control_filtered_15_540.parquet')
    .to_pandas()
)

In [3]:
#generic function get cohort 
def df_to_cohort(df):
    """Build a FIBER ``Cohort`` restricted to the MRNs found in the index of ``df``."""
    # The index values of the frame are taken as the medical record numbers.
    return Cohort(MRNs(list(df.index.values)))

In [4]:
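# condition    = FIBER condition to look up for the cohort
# df_mrn       = pandas DataFrame with the patient MRNs as index
# name_feature = feature name, used as suffix of the new column
# gap_in_days  = minimum number of days between a condition entry and the onset / last encounter
# frequency    = type of occurrence: EVER, COUNT, WINDOW
# cohort_type  = Case/Control
# (the former name_df argument, the name of the new DataFrame, is no longer part of the signature)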
def get_has_certain_condition(condition, df_mrn, name_feature, gap_in_days, frequency, cohort_type):
    """Return ``df_mrn`` extended by one feature column derived from ``condition``.

    The condition entries of all patients in ``df_mrn`` are fetched through a
    FIBER cohort. For 'EVER' and 'COUNT' only entries whose ``age_in_days`` lies
    at least ``gap_in_days`` before the reference column are considered.

    Parameters
    ----------
    condition
        FIBER condition handed to ``Cohort.get``.
    df_mrn : pandas.DataFrame
        Cohort frame with the MRNs as index. Must contain the reference column
        (``HT_Onset`` for cases, ``last_encounter`` otherwise).
    name_feature : str
        Suffix of the generated column name.
    gap_in_days : number
        Minimum distance (reference column minus ``age_in_days``) an entry
        needs in order to be counted.
    frequency : str
        'EVER'   -> column ``has_condition_<name_feature>``: 1 if at least one
                    qualifying entry exists, else 0.
        'COUNT'  -> column ``number_of_occurences_<name_feature>``: number of
                    qualifying entries, 0 if none.
        'WINDOW' -> column ``has_condition_<name_feature>`` filled with the
                    result of ``get_mrn_has_certain_condition_WINDOW`` per MRN.
    cohort_type : str
        "Case" selects ``HT_Onset`` as reference column; any other value
        selects ``last_encounter``.

    Returns
    -------
    pandas.DataFrame
        A new frame with all columns of ``df_mrn`` plus the feature column.
        For an unsupported ``frequency`` the message "function not available"
        is printed and an empty list is returned (unchanged legacy behaviour).
    """
    # Reject unsupported modes before any data is fetched for the cohort.
    if frequency not in ('EVER', 'COUNT', 'WINDOW'):
        print("function not available")
        return []

    onset_column = "HT_Onset" if cohort_type == "Case" else "last_encounter"

    # All entries of the condition for the patients of this cohort
    cohort_condition = df_to_cohort(df_mrn).get(condition)

    if frequency == 'WINDOW':
        feature_column = "has_condition_" + name_feature
        # NOTE(review): get_mrn_has_certain_condition_WINDOW is not defined in the
        # visible part of this notebook -- it has to be available in the kernel.
        # The helper receives the MRN, the raw condition frame and the reference
        # age reduced by the gap; its return value is stored as-is.
        window_values = [
            get_mrn_has_certain_condition_WINDOW(mrn, cohort_condition, onset - gap_in_days)
            for mrn, onset in df_mrn[onset_column].items()
        ]
        return df_mrn.assign(**{feature_column: window_values})

    # Attach the reference column to every condition entry (inner join on the MRN)
    entries = df_mrn.merge(cohort_condition, left_index=True, right_on="medical_record_number")
    entries = entries.set_index("medical_record_number")[["age_in_days", onset_column]]
    # Keep only the entries which fulfill the gap condition
    relevant = entries.loc[(entries[onset_column] - entries["age_in_days"]) >= gap_in_days]

    if frequency == 'EVER':
        feature_column = "has_condition_" + name_feature
        # One row per MRN, flagged with 1 because the MRN has a qualifying entry.
        # assign() builds a new frame, so no value is written into a slice.
        feature = (
            relevant.loc[~relevant.index.duplicated(keep='first')]
            .assign(**{feature_column: 1})[[feature_column]]
        )
    else:  # frequency == 'COUNT'
        feature_column = "number_of_occurences_" + name_feature
        # Number of qualifying entries per MRN
        feature = (
            relevant.groupby(relevant.index)[onset_column]
            .count()
            .to_frame(name=feature_column)
        )

    # Left outer join keeps every patient; patients without a qualifying entry get 0.
    result = df_mrn.merge(feature, left_index=True, right_index=True, how="left")
    # Re-assign instead of a chained fillna(inplace=True), which does not reliably
    # update the parent frame (and is a no-op under pandas Copy-on-Write).
    result[feature_column] = result[feature_column].fillna(0)
    return result

In [5]:
# Ischemic heart disease: all listed ICD-9 / ICD-10 code patterns, OR-ed together
# from left to right in the order given below.
condition_ihd = reduce(
    lambda combined, diagnosis: combined | diagnosis,
    [
        Diagnosis(code_pattern, coding_system)
        for code_pattern, coding_system in [
            ("410%", "ICD-9"),
            ("I20%", "ICD-10"),
            ("I21%", "ICD-10"),
            ("I22%", "ICD-10"),
            ("I23%", "ICD-10"),
            ("I24%", "ICD-10"),
            ("I25%", "ICD-10"),
            ("411%", "ICD-9"),
            ("412%", "ICD-9"),
            ("413%", "ICD-9"),
            ("414%", "ICD-9"),
        ]
    ],
)



In [6]:
# Cases: number of IHD entries per patient lying at least 180 days before HT_Onset
ihd_COUNT = get_has_certain_condition(
    condition=condition_ihd,
    df_mrn=Case_filtered_15_540,
    name_feature="ihd",
    gap_in_days=180,
    frequency="COUNT",
    cohort_type="Case",
)


Fetching data for Diagnosis (...)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
ihd_COUNT

Unnamed: 0_level_0,number_of_encounters,Age_BP_condition,Age_ICD_condition,Age_MED_condition,HT_Onset,Earliest_Condition,first_encounter,total_record_time,year_of_birth,race,religion,gender,number_of_occurences_ihd
medical_record_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100067499,82,24103.0,,24106.0,24103.0,Age_BP_condition,21049,3054.0,1951,White,Other,Male,0.0
100091035,154,11369.0,12938.0,12881.0,11369.0,Age_BP_condition,9141,2228.0,1978,Ba,Other,Male,0.0
1004105957,43,22961.0,,24104.0,22961.0,Age_BP_condition,19914,3047.0,1952,White,Greek Orthodox,Male,1.0
100423963,20,,23584.0,23584.0,23584.0,Age_ICD_condition,19827,3757.0,1945,Other,Pt Declined,Male,0.0
1005188482,279,20636.0,20514.0,20514.0,20514.0,Age_ICD_condition,19884,630.0,1958,Ba,Jewish,Male,0.0
1006954100,28,15242.0,15536.0,15536.0,15242.0,Age_BP_condition,13828,1414.0,1973,Unknown,Catholic,Female,0.0
1006987597,72,27618.0,27618.0,27618.0,27618.0,Age_BP_condition,26706,912.0,1941,White,Catholic,Male,0.0
1007749798,93,,27233.0,27233.0,27233.0,Age_ICD_condition,25235,1998.0,1939,White,Catholic,Female,0.0
1007977434,221,25355.0,24347.0,24347.0,24347.0,Age_ICD_condition,23605,742.0,1943,White,Unknown,Male,0.0
1008498682,87,23048.0,23112.0,23112.0,23048.0,Age_BP_condition,18738,4310.0,1948,Black Or African-American,Baptist,Male,0.0


In [8]:
ihd_COUNT.loc[ihd_COUNT['number_of_occurences_ihd'] != 0]

Unnamed: 0_level_0,number_of_encounters,Age_BP_condition,Age_ICD_condition,Age_MED_condition,HT_Onset,Earliest_Condition,first_encounter,total_record_time,year_of_birth,race,religion,gender,number_of_occurences_ihd
medical_record_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1004105957,43,22961.0,,24104.0,22961.0,Age_BP_condition,19914,3047.0,1952,White,Greek Orthodox,Male,1.0
1103697264,67,19522.0,19305.0,19305.0,19305.0,Age_ICD_condition,18511,794.0,1962,Unknown,,Male,4.0
1105883263,96,,27035.0,25975.0,25975.0,Age_MED_condition,24820,1155.0,1937,White,Jewish,Male,1.0
1179362152,16,,27168.0,27168.0,27168.0,Age_ICD_condition,26333,835.0,1935,White,,Male,1.0
120727686,294,24961.0,24790.0,24790.0,24790.0,Age_ICD_condition,23209,1581.0,1938,Other,Christian,Male,3.0
1219715906,232,21285.0,,19905.0,19905.0,Age_MED_condition,18547,1358.0,1958,White,Jewish,Male,2.0
1249160854,54,,25767.0,25767.0,25767.0,Age_ICD_condition,23172,2595.0,1939,White,,Male,1.0
1265134213,175,29101.0,28606.0,28609.0,28606.0,Age_ICD_condition,27282,1324.0,1932,White,Unknown,Female,1.0
1331507545,77,22681.0,22429.0,22429.0,22429.0,Age_ICD_condition,21724,705.0,1952,Unknown,,Male,1.0
1334811141,413,26840.0,25350.0,26726.0,25350.0,Age_ICD_condition,23946,1404.0,1933,Cambodian,Catholic,Female,1.0


In [9]:
# Cases: flag (1/0) whether a patient has any IHD entry at least 180 days before HT_Onset
ihd_EVER = get_has_certain_condition(
    condition=condition_ihd,
    df_mrn=Case_filtered_15_540,
    name_feature="ihd",
    gap_in_days=180,
    frequency="EVER",
    cohort_type="Case",
)


Fetching data for Diagnosis (...)


In [10]:
ihd_EVER

Unnamed: 0_level_0,number_of_encounters,Age_BP_condition,Age_ICD_condition,Age_MED_condition,HT_Onset,Earliest_Condition,first_encounter,total_record_time,year_of_birth,race,religion,gender,has_condition_ihd
medical_record_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100067499,82,24103.0,,24106.0,24103.0,Age_BP_condition,21049,3054.0,1951,White,Other,Male,0.0
100091035,154,11369.0,12938.0,12881.0,11369.0,Age_BP_condition,9141,2228.0,1978,Ba,Other,Male,0.0
1004105957,43,22961.0,,24104.0,22961.0,Age_BP_condition,19914,3047.0,1952,White,Greek Orthodox,Male,1.0
100423963,20,,23584.0,23584.0,23584.0,Age_ICD_condition,19827,3757.0,1945,Other,Pt Declined,Male,0.0
1005188482,279,20636.0,20514.0,20514.0,20514.0,Age_ICD_condition,19884,630.0,1958,Ba,Jewish,Male,0.0
1006954100,28,15242.0,15536.0,15536.0,15242.0,Age_BP_condition,13828,1414.0,1973,Unknown,Catholic,Female,0.0
1006987597,72,27618.0,27618.0,27618.0,27618.0,Age_BP_condition,26706,912.0,1941,White,Catholic,Male,0.0
1007749798,93,,27233.0,27233.0,27233.0,Age_ICD_condition,25235,1998.0,1939,White,Catholic,Female,0.0
1007977434,221,25355.0,24347.0,24347.0,24347.0,Age_ICD_condition,23605,742.0,1943,White,Unknown,Male,0.0
1008498682,87,23048.0,23112.0,23112.0,23048.0,Age_BP_condition,18738,4310.0,1948,Black Or African-American,Baptist,Male,0.0


In [12]:
ihd_EVER.loc[ihd_EVER['has_condition_ihd'] == 1]

Unnamed: 0_level_0,number_of_encounters,Age_BP_condition,Age_ICD_condition,Age_MED_condition,HT_Onset,Earliest_Condition,first_encounter,total_record_time,year_of_birth,race,religion,gender,has_condition_ihd
medical_record_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1004105957,43,22961.0,,24104.0,22961.0,Age_BP_condition,19914,3047.0,1952,White,Greek Orthodox,Male,1.0
1103697264,67,19522.0,19305.0,19305.0,19305.0,Age_ICD_condition,18511,794.0,1962,Unknown,,Male,1.0
1105883263,96,,27035.0,25975.0,25975.0,Age_MED_condition,24820,1155.0,1937,White,Jewish,Male,1.0
1179362152,16,,27168.0,27168.0,27168.0,Age_ICD_condition,26333,835.0,1935,White,,Male,1.0
120727686,294,24961.0,24790.0,24790.0,24790.0,Age_ICD_condition,23209,1581.0,1938,Other,Christian,Male,1.0
1219715906,232,21285.0,,19905.0,19905.0,Age_MED_condition,18547,1358.0,1958,White,Jewish,Male,1.0
1249160854,54,,25767.0,25767.0,25767.0,Age_ICD_condition,23172,2595.0,1939,White,,Male,1.0
1265134213,175,29101.0,28606.0,28609.0,28606.0,Age_ICD_condition,27282,1324.0,1932,White,Unknown,Female,1.0
1331507545,77,22681.0,22429.0,22429.0,22429.0,Age_ICD_condition,21724,705.0,1952,Unknown,,Male,1.0
1334811141,413,26840.0,25350.0,26726.0,25350.0,Age_ICD_condition,23946,1404.0,1933,Cambodian,Catholic,Female,1.0


In [13]:
# Cases: put the EVER flag and the COUNT column side by side (join on the MRN index)
Ischemic_heart_disease = ihd_EVER.merge(
    ihd_COUNT['number_of_occurences_ihd'],
    left_index=True,
    right_index=True,
)


In [14]:
Ischemic_heart_disease

Unnamed: 0_level_0,number_of_encounters,Age_BP_condition,Age_ICD_condition,Age_MED_condition,HT_Onset,Earliest_Condition,first_encounter,total_record_time,year_of_birth,race,religion,gender,has_condition_ihd,number_of_occurences_ihd
medical_record_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100067499,82,24103.0,,24106.0,24103.0,Age_BP_condition,21049,3054.0,1951,White,Other,Male,0.0,0.0
100091035,154,11369.0,12938.0,12881.0,11369.0,Age_BP_condition,9141,2228.0,1978,Ba,Other,Male,0.0,0.0
1004105957,43,22961.0,,24104.0,22961.0,Age_BP_condition,19914,3047.0,1952,White,Greek Orthodox,Male,1.0,1.0
100423963,20,,23584.0,23584.0,23584.0,Age_ICD_condition,19827,3757.0,1945,Other,Pt Declined,Male,0.0,0.0
1005188482,279,20636.0,20514.0,20514.0,20514.0,Age_ICD_condition,19884,630.0,1958,Ba,Jewish,Male,0.0,0.0
1006954100,28,15242.0,15536.0,15536.0,15242.0,Age_BP_condition,13828,1414.0,1973,Unknown,Catholic,Female,0.0,0.0
1006987597,72,27618.0,27618.0,27618.0,27618.0,Age_BP_condition,26706,912.0,1941,White,Catholic,Male,0.0,0.0
1007749798,93,,27233.0,27233.0,27233.0,Age_ICD_condition,25235,1998.0,1939,White,Catholic,Female,0.0,0.0
1007977434,221,25355.0,24347.0,24347.0,24347.0,Age_ICD_condition,23605,742.0,1943,White,Unknown,Male,0.0,0.0
1008498682,87,23048.0,23112.0,23112.0,23048.0,Age_BP_condition,18738,4310.0,1948,Black Or African-American,Baptist,Male,0.0,0.0


In [15]:
# Persist the combined case features as a parquet file in the current working directory
Ischemic_heart_disease.to_parquet("Ischemic_heart_disease.parquet")

In [None]:
##### Controls ####

In [16]:
# Controls: number of IHD entries per patient lying at least 180 days before last_encounter.
# NOTE: this rebinds ihd_COUNT, which held the case result in the section above.
ihd_COUNT = get_has_certain_condition(
    condition=condition_ihd,
    df_mrn=Control_filtered_15_540,
    name_feature="ihd",
    gap_in_days=180,
    frequency="COUNT",
    cohort_type="Control",
)

Fetching data for Diagnosis (...)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [17]:
# Controls: flag (1/0) whether a patient has any IHD entry at least 180 days before last_encounter.
# NOTE: this rebinds ihd_EVER, which held the case result in the section above.
ihd_EVER = get_has_certain_condition(
    condition=condition_ihd,
    df_mrn=Control_filtered_15_540,
    name_feature="ihd",
    gap_in_days=180,
    frequency="EVER",
    cohort_type="Control",
)


Fetching data for Diagnosis (...)


In [18]:
# Controls: put the EVER flag and the COUNT column side by side (join on the MRN index)
Ischemic_heart_disease_Control = ihd_EVER.merge(
    ihd_COUNT['number_of_occurences_ihd'],
    left_index=True,
    right_index=True,
)


In [19]:
Ischemic_heart_disease_Control

Unnamed: 0_level_0,last_encounter,number_of_encounters,first_encounter,total_record_time,year_of_birth,race,religion,gender,has_condition_ihd,number_of_occurences_ihd
medical_record_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100002558,21414,15,19951,1463,1959,White,Other,Female,0.0,0.0
100002884,23929,44,18526,5403,1952,Unknown,,Male,0.0,0.0
1000068212,10116,29,9228,888,1990,White,Unknown,Female,0.0,0.0
1000083464,25630,20,23026,2604,1948,White,Catholic,Male,0.0,0.0
1000119948,17087,163,15757,1330,1971,African-American,,Male,0.0,0.0
1000165222,17506,627,12215,5291,1970,Other,Catholic,Female,1.0,1.0
1000199143,26941,141,22805,4136,1944,White,Jewish,Female,0.0,0.0
1000205924,14867,108,10618,4249,1976,Unknown,Unknown,Female,0.0,0.0
1000212584,19220,15,16818,2402,1965,White,,Male,0.0,0.0
1000238281,16497,56,13475,3022,1972,White,,Male,0.0,0.0


In [20]:
# Persist the combined control features as a parquet file in the current working directory
Ischemic_heart_disease_Control.to_parquet("Ischemic_heart_disease_Control.parquet")