In [None]:
import fiber
from fiber.cohort import Cohort
from fiber.condition import Patient, MRNs, Weight, TobaccoUse, AlcoholUse, Weight
from fiber.condition import Diagnosis
from fiber.condition import Measurement, Encounter, Drug
from fiber.storage import yaml as fiberyaml
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import os
from functools import reduce

In [None]:
def df_to_cohort(df):
    """Build a fiber ``Cohort`` from the index of a dataframe.

    The index values of ``df`` are handed to ``MRNs`` as the list of medical
    record numbers, and the resulting condition is wrapped in a ``Cohort`` so
    that ``Cohort.get`` can be called on it.
    """
    return Cohort(MRNs(list(df.index.values)))


In [None]:
def get_diagram(dataframe, diagramm_type, column_name, bin_size):
    """Draw a diagram for one column of a dataframe.

    Only ``diagramm_type == 'hist'`` is supported: a 20x8 figure is opened, a
    histogram of ``dataframe[column_name]`` with ``bin_size`` bins is drawn and
    the x tick labels are rotated vertically. For any other type a message is
    printed and nothing is drawn. The function always returns ``None``.
    """
    # NOTE(review): `plt` is not imported in the cells visible here; it is
    # presumably matplotlib.pyplot and must already be in scope — confirm.
    if diagramm_type != 'hist':
        print("diagramm type not available")
        return

    plt.figure(figsize=[20, 8])
    plt.hist(dataframe[column_name], bins=bin_size, rwidth=4)
    plt.xticks(rotation='vertical')

In [None]:
## Example: iterate over the rows of a dataframe and overwrite one column in place.
# NOTE(review): `x` is not defined in any cell visible above this one, so this
# cell only runs if `x` already exists in the kernel — confirm where it comes from.

for i in x.index:
    # Replace the stored "date_of_birth" value of row `i` with its `.year` attribute.
    x.at[i, "date_of_birth"] = x.at[i, "date_of_birth"].year

In [None]:
def filter_by_encounters(df, number_of_encounter, total_record_time):
    """Keep the rows that satisfy both minimum thresholds.

    A row is kept when its ``number_of_encounters`` is at least
    ``number_of_encounter`` and its ``total_record_time`` is at least
    ``total_record_time``. The selection is returned; ``df`` is not modified.
    """
    enough_encounters = df["number_of_encounters"] >= number_of_encounter
    long_enough_record = df["total_record_time"] >= total_record_time
    return df.loc[enough_encounters & long_enough_record]

In [None]:
def get_has_certain_condition(condition, df_mrn, name_feature, gap_in_days, frequency, cohort_type):
    """Derive a per-patient feature from the occurrences of a fiber condition.

    The condition is fetched for every MRN in ``df_mrn``. Only occurrences whose
    ``age_in_days`` lies at least ``gap_in_days`` before the patient's reference
    day are taken into account.

    Parameters
    ----------
    condition
        FIBER condition, passed unchanged to ``Cohort.get``. The returned frame
        must provide the columns ``medical_record_number`` and ``age_in_days``.
    df_mrn : pandas.DataFrame
        Patient frame indexed by MRN. It must contain the reference column:
        ``"HT_Onset"`` when ``cohort_type == "Case"``, ``"last_encounter"``
        otherwise.
    name_feature : str
        Suffix used to build the name of the new column.
    gap_in_days : number
        Minimum distance between an occurrence and the reference day.
    frequency : str
        ``"EVER"``, ``"COUNT"`` or ``"WINDOW"`` (see Returns).
    cohort_type : str
        ``"Case"`` selects ``"HT_Onset"`` as reference column; any other value
        selects ``"last_encounter"``.

    Returns
    -------
    pandas.DataFrame or list
        * ``"EVER"``: ``df_mrn`` plus ``has_condition_<name_feature>``, 1 for
          patients with at least one qualifying occurrence and 0 otherwise.
        * ``"COUNT"``: ``df_mrn`` plus ``number_of_occurences_<name_feature>``,
          the number of qualifying occurrences (0 when there are none).
        * ``"WINDOW"``: a list of ``[mrn, value]`` pairs, where ``value`` is the
          result of ``get_mrn_has_certain_condition_WINDOW`` for that MRN.
        * anything else: a message is printed and an empty list is returned.
    """
    onset_column = "HT_Onset" if cohort_type == "Case" else "last_encounter"
    feature_flag = "has_condition_" + name_feature
    # The spelling "occurences" is kept: it is the column name callers see.
    feature_count = "number_of_occurences_" + name_feature

    # Fetch every occurrence of the condition for the patients in df_mrn.
    cohort_condition = df_to_cohort(df_mrn).get(condition)

    # Attach the reference day to each occurrence, then keep only the
    # occurrences that fulfil the gap requirement.
    occurrences = df_mrn.merge(cohort_condition, left_index=True, right_on="medical_record_number")
    occurrences = occurrences.set_index("medical_record_number")
    occurrences = occurrences[["age_in_days", onset_column]]
    occurrences = occurrences.loc[
        (occurrences[onset_column] - occurrences["age_in_days"]) >= gap_in_days
    ]

    if frequency == 'EVER':
        # One row per MRN is enough: presence in `occurrences` means "has condition".
        first_hits = occurrences.loc[~occurrences.index.duplicated(keep='first')]
        to_merge = first_hits.assign(**{feature_flag: 1})[[feature_flag]]
        # Left join keeps every patient; those without a hit get NaN -> 0.
        result = df_mrn.merge(to_merge, left_index=True, right_index=True, how="left")
        # Assign the filled column back: in-place fillna on a selected column
        # is chained assignment and is not guaranteed to modify `result`.
        result[feature_flag] = result[feature_flag].fillna(0)
        return result

    if frequency == 'COUNT':
        # Rows that passed the gap filter have a non-null reference day, so
        # counting that column counts the qualifying occurrences per MRN.
        counts = occurrences.groupby(level=0)[onset_column].count().to_frame(name=feature_count)
        result = df_mrn.merge(counts, left_index=True, right_index=True, how="left")
        result[feature_count] = result[feature_count].fillna(0)
        return result

    if frequency == 'WINDOW':
        # NOTE(review): get_mrn_has_certain_condition_WINDOW is not defined in
        # the visible part of this notebook; it must exist in the kernel.
        # Uses df_mrn.index (the previous loop variable was undefined) and the
        # same cohort-dependent reference column as the branches above.
        window_results = []
        for mrn in df_mrn.index:
            latest_allowed_day = df_mrn.loc[mrn, onset_column] - gap_in_days
            value = get_mrn_has_certain_condition_WINDOW(mrn, cohort_condition, latest_allowed_day)
            window_results.append([mrn, value])
        return window_results

    print("function not available")
    return []

In [None]:
def filter_df_Onset(to_check, df_with_HT, min_gap_in_days=180):
    """Keep only the rows recorded sufficiently long before ``HT_Onset``.

    Can be used for lab values and any other fiber frame that provides
    ``medical_record_number`` and ``age_in_days`` columns.

    Parameters
    ----------
    to_check : pandas.DataFrame
        Frame with the columns ``medical_record_number`` and ``age_in_days``.
    df_with_HT : pandas.DataFrame
        Frame indexed by MRN that contains an ``HT_Onset`` column.
    min_gap_in_days : number, optional
        A row is kept when ``HT_Onset - age_in_days`` is strictly greater than
        this value. Defaults to 180, the previously hard-coded threshold.

    Returns
    -------
    pandas.DataFrame
        The qualifying rows of ``to_check`` with an added ``HT_Onset`` column.
        Rows whose MRN has no ``HT_Onset`` are dropped, because the comparison
        against a missing value is false.
    """
    with_onset = to_check.merge(
        df_with_HT[["HT_Onset"]],
        left_on="medical_record_number",
        right_index=True,
        how="left",
    )
    days_before_onset = with_onset["HT_Onset"] - with_onset["age_in_days"]
    # NOTE(review): strict ">" here, whereas get_has_certain_condition uses
    # ">=" for its gap check — confirm which boundary is intended.
    return with_onset.loc[days_before_onset > min_gap_in_days]

In [1]:
def min_max_mean(df, lab_value_name):
    """Summarise ``numeric_value`` per patient as minimum, maximum and mean.

    Intended for the output of ``filter_df_Onset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``medical_record_number`` and ``numeric_value``.
    lab_value_name : str
        Prefix for the result columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``medical_record_number`` with the columns
        ``<lab_value_name>_min``, ``<lab_value_name>_max`` and
        ``<lab_value_name>_mean``.
    """
    # One grouped pass computes all three statistics; the result columns come
    # back in the order requested, so no merging of separate frames is needed.
    stats = df.groupby("medical_record_number")["numeric_value"].agg(["min", "max", "mean"])
    return stats.rename(
        columns={
            "min": lab_value_name + "_min",
            "max": lab_value_name + "_max",
            "mean": lab_value_name + "_mean",
        }
    )

In [None]:
# Load the control and case cohort tables from parquet files into pandas dataframes.
# The index of Control_complete is used as the list of control MRNs in the following cell.
Control_complete = pq.read_table('Cohorts/Control/control_encounter_mrns.parquet').to_pandas()
Case_complete = pq.read_table('Cohorts/Case/Case_complete.parquet').to_pandas()

In [None]:
# Collect, for the control patients, the smallest non-zero age_in_days over their encounters.
# Each loop iteration appends one dataframe (indexed by medical_record_number) to
# Control_First_Encounters.
# NOTE(review): Control_Last_Encounters and Control_Count_Encounters are created here but
# nothing is appended to them in the visible part of this cell — confirm whether that is intended.
Control_First_Encounters = []
Control_Last_Encounters = []
Control_Count_Encounters = []
# The index values of Control_complete are treated as the MRNs of the control patients.
Control_MRNs = list(Control_complete.index.values)
print(len(Control_MRNs))
# Work through the MRNs in slices of 500000 (presumably to keep each fiber query manageable).
for limit in range(0, len(Control_MRNs), 500000):
    print("Begin of iteration: " +  str(limit))
    # MRNs of the current slice; the last slice may be shorter.
    temp = Control_MRNs[limit:(limit+500000)]
    p_condition = MRNs(temp) #how to create cohort from dataframe
    cohort = Cohort(p_condition)
    print(len(cohort))
    # Fetch the encounters of the patients in this slice.
    enc = cohort.get(Encounter())
    # Drop encounters recorded with age_in_days == 0 before taking the minimum
    # (presumably placeholder values — TODO confirm).
    Encounter_WO_0 = enc.loc[enc["age_in_days"] != 0]
    group = Encounter_WO_0.groupby(['medical_record_number'])['age_in_days']
    # Smallest remaining age_in_days per patient.
    Control_First_Encounters.append(group.min().to_frame())
    print("End of iteration: " +  str(limit))