In [1]:
##### REQUIRES THE DATAFRAME FOLDER TO BE NAMED 'Cohorts', WHICH INCLUDES ALL PRECOMPUTED DATAFRAMES #####
import fiber
from fiber.cohort import Cohort
from fiber.condition import Patient, MRNs
from fiber.condition import Diagnosis
from fiber.condition import Measurement, Encounter, Drug
from fiber.storage import yaml as fiberyaml
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import os
from functools import reduce 

DB Password: ········


In [2]:
#load Cohorts
# Locations of the precomputed cohort tables inside the 'Cohorts' folder.
CASE_PARQUET = 'Cohorts/Case/Case_filtered_15_540.parquet'
CONTROL_PARQUET = 'Cohorts/Control/Control_filtered_15_540.parquet'

# Read each parquet file through pyarrow and convert it to a pandas DataFrame.
Case_filtered_15_540 = pq.read_table(CASE_PARQUET).to_pandas()
Control_filtered_15_540 = pq.read_table(CONTROL_PARQUET).to_pandas()

In [3]:
def df_to_cohort(df):
    """Build a FIBER ``Cohort`` from the index values of ``df``.

    The index values of ``df`` are passed to ``MRNs`` and the resulting
    condition is wrapped in a ``Cohort``.
    """
    index_mrns = list(df.index.values)
    return Cohort(MRNs(index_mrns))

In [4]:
def get_has_certain_condition(condition, df_mrn, name_feature, gap_in_days, window_in_days, frequency, cohort_type):
    """Add a condition-occurrence feature column to a cohort DataFrame.

    The records matching ``condition`` are fetched for the cohort built from
    ``df_mrn`` (via ``df_to_cohort(df_mrn).get(condition)``), joined to
    ``df_mrn`` on the medical record number, and reduced to the records whose
    ``age_in_days`` lies at least ``gap_in_days`` before the reference column
    (``HT_Onset`` for cases, ``last_encounter`` for every other cohort type).

    Parameters
    ----------
    condition : FIBER condition passed to ``Cohort.get``.
    df_mrn : pandas.DataFrame
        Cohort frame indexed by medical record number. Must contain the
        reference column selected by ``cohort_type``.
    name_feature : str
        Suffix of the generated column name.
    gap_in_days : number
        Minimum distance (reference value minus ``age_in_days``) a record
        needs in order to be considered.
    window_in_days : number
        Only used for ``frequency == "WINDOW"``: records further away than
        ``window_in_days + gap_in_days`` from the reference value are dropped.
    frequency : {"EVER", "COUNT", "WINDOW"}
        ``"EVER"`` adds ``has_condition_<name_feature>`` (1 if any record
        qualifies, else 0). ``"COUNT"`` and ``"WINDOW"`` add
        ``number_of_occurences_<name_feature>`` holding the number of
        qualifying records (0 when there are none).
    cohort_type : str
        ``"Case"`` selects ``HT_Onset`` as reference column; any other value
        selects ``last_encounter``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df_mrn`` left-joined with the feature column.
        ``df_mrn`` itself is not modified.

    Raises
    ------
    ValueError
        If ``frequency`` is not one of the supported values. This is checked
        before any data is fetched.
    SystemExit
        If the fetched records cannot be joined to ``df_mrn`` (for example
        when they lack a ``medical_record_number`` column). The exit status
        is the error message, i.e. non-zero; the original error is chained.
    """
    if frequency not in ("EVER", "COUNT", "WINDOW"):
        # Fail fast: previously an unknown value triggered the fetch and then
        # silently returned an empty list instead of a DataFrame.
        raise ValueError(
            "function not available: frequency must be 'EVER', 'COUNT' or 'WINDOW', got %r" % (frequency,)
        )

    onset_column = "HT_Onset" if cohort_type == "Case" else "last_encounter"

    # Fetch every record matching the condition for the patients of this cohort.
    cohort = df_to_cohort(df_mrn)
    cohort_condition = cohort.get(condition)

    # Inner join: keeps one row per fetched record, enriched with the cohort columns.
    try:
        events = df_mrn.merge(cohort_condition, left_index=True, right_on="medical_record_number")
    except Exception as exc:
        # Keep the historical SystemExit, but report failure (non-zero status)
        # and preserve the root cause instead of swallowing it.
        raise SystemExit("DataFrame is empty. Condition does not fit to any record!") from exc

    # Only the record age and the reference age are needed from here on.
    events = events.set_index("medical_record_number")[["age_in_days", onset_column]]

    # Distance between the reference age and the record.
    days_before_onset = events[onset_column] - events["age_in_days"]
    keep = days_before_onset >= gap_in_days
    if frequency == "WINDOW":
        # Additionally bound the look-back period.
        keep = keep & (days_before_onset <= (window_in_days + gap_in_days))
    events = events.loc[keep]

    if frequency == "EVER":
        feature_name = "has_condition_" + name_feature
        # One row per MRN that has at least one qualifying record, flagged with 1.
        first_hits = events.loc[~events.index.duplicated(keep="first")]
        feature = first_hits.assign(**{feature_name: 1})[[feature_name]]
    else:
        feature_name = "number_of_occurences_" + name_feature
        # Number of qualifying records per MRN. The reference column is
        # non-null for every remaining row, so counting it counts rows.
        feature = events.groupby(level=0)[onset_column].count().to_frame(name=feature_name)

    # Left join so that patients without any qualifying record are kept ...
    result = df_mrn.merge(feature, left_index=True, right_index=True, how="left")
    # ... and receive 0. Assign the result back instead of using a chained
    # in-place fillna, which does not update the frame under Copy-on-Write.
    result[feature_name] = result[feature_name].fillna(0)
    return result

In [5]:
# Diagnosis code patterns for the "cr" feature (cerebrovascular disease),
# listed in the order in which they are OR-combined.
CR_CODE_PATTERNS = [
    ("430%", "ICD-9"), ("I60%", "ICD-10"),
    ("I61%", "ICD-10"), ("I62%", "ICD-10"),
    ("I63%", "ICD-10"), ("I64%", "ICD-10"),
    ("I65%", "ICD-10"), ("431%", "ICD-9"),
    ("432%", "ICD-9"), ("433%", "ICD-9"),
    ("434%", "ICD-9"), ("435%", "ICD-9"),
    ("436%", "ICD-9"), ("437%", "ICD-9"),
    ("438%", "ICD-9"), ("I66%", "ICD-10"),
    ("I67%", "ICD-10"), ("I68%", "ICD-10"),
]

# Left-to-right fold with `|`, equivalent to writing d1 | d2 | ... | dn.
condition_cr = reduce(
    lambda combined, following: combined | following,
    (Diagnosis(code, coding_system) for code, coding_system in CR_CODE_PATTERNS),
)


In [8]:
# Case cohort: occurrences of the condition with a 180-day gap and a 720-day window.
cr_WINDOW = get_has_certain_condition(
    condition=condition_cr,
    df_mrn=Case_filtered_15_540,
    name_feature="cr",
    gap_in_days=180,
    window_in_days=720,
    frequency="WINDOW",
    cohort_type="Case",
)


Fetching data for Diagnosis (...)


In [10]:
# cr_COUNT is not assigned in any cell visible in this notebook, so displaying it
# fails on a fresh kernel. Compute it here so the notebook runs top to bottom.
# NOTE(review): the arguments mirror the case WINDOW call with frequency "COUNT";
# confirm the gap that was used for the original table.
cr_COUNT = get_has_certain_condition(condition_cr, Case_filtered_15_540, "cr", 180, 720, "COUNT", "Case")
cr_COUNT

Unnamed: 0_level_0,number_of_encounters,Age_BP_condition,Age_ICD_condition,Age_MED_condition,HT_Onset,Earliest_Condition,first_encounter,total_record_time,year_of_birth,race,religion,gender,number_of_occurences_cr
medical_record_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100067499,82,24103.0,,24106.0,24103.0,Age_BP_condition,21049,3054.0,1951,White,Other,Male,0.0
100091035,154,11369.0,12938.0,12881.0,11369.0,Age_BP_condition,9141,2228.0,1978,Ba,Other,Male,0.0
1004105957,43,22961.0,,24104.0,22961.0,Age_BP_condition,19914,3047.0,1952,White,Greek Orthodox,Male,0.0
100423963,20,,23584.0,23584.0,23584.0,Age_ICD_condition,19827,3757.0,1945,Other,Pt Declined,Male,0.0
1005188482,279,20636.0,20514.0,20514.0,20514.0,Age_ICD_condition,19884,630.0,1958,Ba,Jewish,Male,0.0
1006954100,28,15242.0,15536.0,15536.0,15242.0,Age_BP_condition,13828,1414.0,1973,Unknown,Catholic,Female,0.0
1006987597,72,27618.0,27618.0,27618.0,27618.0,Age_BP_condition,26706,912.0,1941,White,Catholic,Male,0.0
1007749798,93,,27233.0,27233.0,27233.0,Age_ICD_condition,25235,1998.0,1939,White,Catholic,Female,0.0
1007977434,221,25355.0,24347.0,24347.0,24347.0,Age_ICD_condition,23605,742.0,1943,White,Unknown,Male,0.0
1008498682,87,23048.0,23112.0,23112.0,23048.0,Age_BP_condition,18738,4310.0,1948,Black Or African-American,Baptist,Male,0.0


In [11]:
# Show only the case patients with a non-zero occurrence count.
case_has_occurrence = cr_WINDOW['number_of_occurences_cr'] != 0
cr_WINDOW.loc[case_has_occurrence]

Unnamed: 0_level_0,number_of_encounters,Age_BP_condition,Age_ICD_condition,Age_MED_condition,HT_Onset,Earliest_Condition,first_encounter,total_record_time,year_of_birth,race,religion,gender,number_of_occurences_cr
medical_record_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1173163870,402,20441.0,20504.0,20504.0,20441.0,Age_BP_condition,12432,8009.0,1956,Other,Pt Declined,Male,19.0
1214485155,2354,19820.0,20020.0,19838.0,19820.0,Age_BP_condition,16427,3393.0,1957,White,Catholic,Male,5.0
1672330252,81,,21774.0,21774.0,21774.0,Age_ICD_condition,21099,675.0,1949,White,Jewish,Male,1.0
1705858715,41,28974.0,,,28974.0,Age_BP_condition,27693,1281.0,1939,Other,Catholic,Male,1.0
171582846,115,22146.0,21944.0,21944.0,21944.0,Age_ICD_condition,21186,758.0,1951,Other,Muslim,Female,1.0
1717908522,152,16430.0,16430.0,16430.0,16430.0,Age_BP_condition,14325,2105.0,1969,Other,Unknown,Male,2.0
1967956140,252,18339.0,,18192.0,18192.0,Age_MED_condition,15504,2688.0,1965,Unknown,Other,Male,13.0
2001073573,638,28389.0,28418.0,28418.0,28389.0,Age_BP_condition,24654,3735.0,1937,Other,Catholic,Female,1.0
2082928298,168,,19832.0,,19832.0,Age_ICD_condition,18547,1285.0,1961,Unknown,Unknown,Female,12.0
2210631906,62,26874.0,,26392.0,26392.0,Age_MED_condition,20586,5806.0,1944,White,,Male,5.0


In [13]:
# Persist the case-cohort feature table.
case_output_file = "Cerebrovascular_disease_2yr_case.parquet"
cr_WINDOW.to_parquet(case_output_file)

In [None]:
#### CONTROLS ####

In [12]:
# Control cohort: same gap and window as for the cases, measured from last_encounter.
cr_Window = get_has_certain_condition(
    condition=condition_cr,
    df_mrn=Control_filtered_15_540,
    name_feature="cr",
    gap_in_days=180,
    window_in_days=720,
    frequency="WINDOW",
    cohort_type="Control",
)


Fetching data for Diagnosis (...)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [15]:
# Show only the control patients with at least one occurrence.
control_has_occurrence = cr_Window['number_of_occurences_cr'] >= 1
cr_Window.loc[control_has_occurrence]

Unnamed: 0_level_0,last_encounter,number_of_encounters,first_encounter,total_record_time,year_of_birth,race,religion,gender,number_of_occurences_cr
medical_record_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1000925805,23372,79,21264,2108,1954,White,,Female,1.0
1017944292,23973,48,22644,1329,1951,White,Unknown,Female,6.0
1024769011,22343,62,21413,930,1957,Other,Catholic,Female,4.0
1025961249,15495,197,12905,2590,1972,Black Or African-American,,Male,3.0
1037512434,15838,143,15047,791,1974,Unknown,Catholic,Female,37.0
1040722046,25685,119,19126,6559,1947,White,Unknown,Male,2.0
1043363103,30135,48,29389,746,1935,Ba,Protestant,Male,6.0
1050027815,14730,59,12907,1823,1978,White,,Female,3.0
1053020818,18135,156,16046,2089,1968,White,,Female,1.0
1063864771,24545,97,20052,4493,1950,Other,Catholic,Female,12.0


In [16]:
# Persist the control-cohort feature table.
# The control result is held in `cr_Window`; `cr_WINDOW` is the case frame, and
# writing it here would store the case data under the control file name.
cr_Window.to_parquet("Cerebrovascular_disease_2yr_control.parquet")