In [None]:
""" 
Goals: Find the candidate features (ICD codes) and response (ICD code)
Ideally, we want features to have different frequencies for different domains

Explore in the following way:
- Target/source features by (frequency, ratio, description)
- Correlation matrix (target vs source feature correlation)

"""

In [None]:
import getpass
user_id = getpass.getuser()
import sys
sys.path.append(f"/home/{user_id}/OTTEHR/")

import collections
from common import *
from ast import literal_eval
# %matplotlib notebook
import matplotlib.pyplot as plt
import math
from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import random
from select_codes import *
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, accuracy_score
import seaborn as sns


In [None]:
def _relative_code_frequencies(group_df):
    """
    Count every code found in group_df['ICD codes'] and normalise the counts.

    Each cell of the 'ICD codes' column is iterated as a collection of codes.

    Returns: (relative frequency dictionary, total number of codes).
        The dictionary maps code -> count / total and keeps first-seen order.
    """
    code_counts = collections.Counter(
        code for codes in group_df['ICD codes'] for code in codes
    )
    total = sum(code_counts.values())
    # An empty group yields an empty Counter, so the division below never runs with total == 0
    freq_dict = {code: count / total for code, count in code_counts.items()}
    return freq_dict, total


def divide_dataset(df, divide_feature, group_1, group_2):
    """
    Split df into two groups by the value of column divide_feature and compute
    the relative frequency of every ICD code within each group.

    Args:
        df: dataframe with a divide_feature column and an 'ICD codes' column
        divide_feature: name of the column used to split the dataframe
        group_1: value of divide_feature selecting group 1
        group_2: value of divide_feature selecting group 2

    Returns: group 1 relative frequency dictionary, group 2 relative frequency dictionary,
        group 1 total number of codes, group 2 total number of codes,
        total number of unique codes across both groups

    Side effects: prints the shapes of df and of the two group dataframes.
    """
    print("df shape is:", df.shape)

    group_1_df = df.loc[df[divide_feature] == group_1]
    print("group_1_df shape is:", group_1_df.shape)
    group_1_freq_dict, group_1_total = _relative_code_frequencies(group_1_df)

    group_2_df = df.loc[df[divide_feature] == group_2]
    print("group_2_df shape is:", group_2_df.shape)
    group_2_freq_dict, group_2_total = _relative_code_frequencies(group_2_df)

    # A code present in both groups is counted once
    num_unique_codes = len(set(group_1_freq_dict) | set(group_2_freq_dict))

    return group_1_freq_dict, group_2_freq_dict, group_1_total, group_2_total, num_unique_codes


In [None]:
def find_differ_codes(dict_1, dict_2, diff_percent):
    """
    Find codes in dictionary 1 (dict_1) and dictionary 2 (dict_2)
    whose relative frequencies differ by more than diff_percent.

    A code missing from one dictionary is treated as having frequency 0 there,
    so it is reported when its frequency in the other dictionary exceeds diff_percent.

    Args:
        dict_1: code -> relative frequency
        dict_2: code -> relative frequency
        diff_percent: threshold on the absolute frequency difference

    Returns: (diff_codes, dict_diff_1, dict_diff_2)
        diff_codes: list of differing codes (dict_1 order first, then codes only in dict_2)
        dict_diff_1: entries of dict_1 restricted to diff_codes
        dict_diff_2: entries of dict_2 restricted to diff_codes

    Side effects: prints one line per differing code.
    """
    diff_codes = []
    for key, value in dict_1.items():
        if key in dict_2:
            if abs(dict_2[key] - value) > diff_percent:
                diff_codes.append(key)
                print(f"code {key}: dictionary 1 frequency - {value}, dictionary 2 frequency - {dict_2[key]}")
        elif value > diff_percent:
            # Bug fix: the original wrote `diff_codes.append` without calling it,
            # so codes that exist only in dict_1 were printed but never recorded.
            diff_codes.append(key)
            print(f"code {key}: dictionary 1 frequency - {value}, dictionary 2 frequency - 0")

    for key, value in dict_2.items():
        if key not in dict_1 and value > diff_percent:
            diff_codes.append(key)
            print(f"code {key}: dictionary 1 frequency - 0, dictionary 2 frequency - {value}")

    # Set lookup avoids scanning the diff_codes list once per dictionary entry
    diff_code_set = set(diff_codes)
    dict_diff_1 = {key: value for key, value in dict_1.items() if key in diff_code_set}
    dict_diff_2 = {key: value for key, value in dict_2.items() if key in diff_code_set}

    return diff_codes, dict_diff_1, dict_diff_2


In [None]:
""" 
Read in dataset and select codes
"""

# Load the admission-level diagnosis table. The 'ICD codes' column is stored as text in the
# CSV and is parsed back into a Python object with literal_eval.
df_path = f"/home/{user_id}/OTTEHR/outputs/mimic/admission_patient_diagnosis_ICD.csv"
admid_diag_df = pd.read_csv(df_path, index_col=0, header=0, converters={'ICD codes': literal_eval})
# NOTE(review): a bare expression that is not the last statement of a cell is not displayed.
admid_diag_df

# construct_freq_dict_group and select_codes are not defined in this notebook; presumably they
# come from the star-imports of `common` / `select_codes` — TODO confirm their contracts.
male_freq_dict, female_freq_dict = construct_freq_dict_group(admid_diag_df, 'gender', 'M', 'F')
# NOTE(review): magic thresholds; both are doubled when passed to select_codes below.
male_min_count = 120
female_min_count = 100
selected_codes = select_codes(male_freq_dict, female_freq_dict, male_min_count*2, female_min_count*2)
# Spot check shown as the cell output: is code 'V50.2' among the selected codes?
'V50.2' in selected_codes

In [None]:
# Number of distinct codes recorded for the male group
len(male_freq_dict)

In [None]:
# preprocess the dataset to keep only the top codes

# Work on a deep copy so that admid_diag_df itself is left unmodified.
admid_diag_filtered_df = admid_diag_df.copy(deep=True)
empty_indices = []  # index labels of rows left without any selected code
for index, row in admid_diag_filtered_df.iterrows():
    codes = row['ICD codes']
    # Keep only the codes found in selected_codes, preserving their original order.
    # NOTE(review): the cost of this membership test depends on the type of selected_codes,
    # which is produced by select_codes and is not visible here.
    filtered_codes = [code for code in codes if code in selected_codes]
    admid_diag_filtered_df.at[index, 'ICD codes'] = filtered_codes
    if len(filtered_codes) == 0:
        empty_indices.append(index)
# Drop the rows whose code list became empty after filtering.
admid_diag_filtered_df = admid_diag_filtered_df.drop(empty_indices)
print(admid_diag_filtered_df.shape)
print(admid_diag_df.shape)

# Flatten the remaining codes of every row into a single list.
# NOTE(review): all_codes is not used anywhere in the visible part of the notebook.
all_codes = []
for index, row in admid_diag_filtered_df.iterrows():
    all_codes.extend(row['ICD codes'])


# NOTE(review): this path is relative to the working directory, whereas df_path is built from
# /home/{user_id}/OTTEHR — confirm both resolve to the same outputs folder.
data_path = "../../outputs/mimic/ADMID_DIAGNOSIS_selected.csv"
admid_diag_filtered_df.to_csv(data_path, index=True, header=True)
admid_diag_filtered_df


# Divide by gender

In [None]:
""" 
Calculates ICD code frequencies for target and source
"""
# Uses the unfiltered dataframe, split by gender ('M' vs 'F').
# This rebinds male_freq_dict / female_freq_dict to the relative frequencies returned by divide_dataset.
male_freq_dict, female_freq_dict, male_total, female_total, num_unique_codes = divide_dataset(admid_diag_df, 'gender', 'M', 'F')
print("male total number of code is:", male_total)
print("female total number of code is:", female_total)
print("number of unique codes is:", num_unique_codes)
# Relative frequency each code would have if all unique codes were equally frequent;
# used afterwards as the unit for the difference threshold.
avg_percent = 1/num_unique_codes
print("avg percent is:", avg_percent)

In [None]:
# Find frequencies in female and male dictionaries differing by diff_percent
# The threshold passed as diff_percent is 10 times avg_percent (avg_percent = 1/num_unique_codes).
diff_codes, male_diff_dict, female_diff_dict = find_differ_codes(male_freq_dict, female_freq_dict, 10*avg_percent)
print("number of differing codes is:", len(diff_codes))

# report the statistics in REB application

In [None]:
# Find frequency difference for the filtered dataframe
# NOTE: this rebinds male_freq_dict, female_freq_dict, the totals, num_unique_codes and
# avg_percent, replacing the values computed from the unfiltered dataframe.
male_freq_dict, female_freq_dict, male_total, female_total, num_unique_codes = divide_dataset(admid_diag_filtered_df, 'gender', 'M', 'F')
avg_percent = 1/num_unique_codes
print("avg percent is:", avg_percent)

# The threshold here is avg_percent itself, not 10*avg_percent as for the unfiltered dataframe.
diff_codes, male_diff_dict, female_diff_dict = find_differ_codes(male_freq_dict, female_freq_dict, avg_percent)
print("number of differing codes is:", len(diff_codes))

In [None]:
# visualize differing codes
# Overlay the male and female relative frequencies of every differing code;
# alpha=0.5 keeps both bars visible where they overlap.
plt.bar(list(male_diff_dict.keys()), list(male_diff_dict.values()), alpha=0.5, label="male")
plt.bar(list(female_diff_dict.keys()), list(female_diff_dict.values()), alpha=0.5, label="female")
plt.xticks(rotation=90)
# Label the figure so it can be read on its own when the notebook is skimmed
plt.xlabel("ICD code")
plt.ylabel("relative frequency")
plt.title("ICD codes with differing relative frequency: male vs female")
plt.legend()
plt.show()

# Divide by admission type

In [None]:
""" 
Calculate the number of emergency, elective and newborn
"""
# Materialise the admission-type column once, then report one count per type of interest
types = list(admid_diag_df['adm_type'])
for message, adm_type in (
    ("emergency count is:", 'EMERGENCY'),
    ("elective count is:", 'ELECTIVE'),
    ("newborn count is:", 'NEWBORN'),
):
    print(message, types.count(adm_type))

In [None]:
# Relative ICD code frequencies of emergency vs elective admissions (unfiltered dataframe)
(
    emergency_freq_dict,
    elective_freq_dict,
    emergency_total,
    elective_total,
    num_unique_codes,
) = divide_dataset(admid_diag_df, 'adm_type', 'EMERGENCY', 'ELECTIVE')
# Frequency each code would have if all unique codes were equally frequent
avg_percent = 1/num_unique_codes
print("avg percent is:", avg_percent)

In [None]:
# Find frequencies in emergency and elective dictionaries differing by diff_percent
# find_differ_codes returns (diff_codes, dict_diff_1, dict_diff_2). Unpack it so that
# len(diff_codes) counts the differing codes instead of always reporting the tuple length 3.
diff_codes, emergency_diff_dict, elective_diff_dict = find_differ_codes(emergency_freq_dict, elective_freq_dict, 10*avg_percent)
print("number of differing codes is:", len(diff_codes))

In [None]:
# Find frequency difference for the filtered dataframe
# (rebinds the emergency/elective frequency dictionaries, totals and avg_percent)
(
    emergency_freq_dict,
    elective_freq_dict,
    emergency_total,
    elective_total,
    num_unique_codes,
) = divide_dataset(admid_diag_filtered_df, 'adm_type', 'EMERGENCY', 'ELECTIVE')
avg_percent = 1/num_unique_codes
print("avg percent is:", avg_percent)

# Threshold is avg_percent itself for the filtered dataframe
diff_codes, emergency_diff_dict, elective_diff_dict = find_differ_codes(
    emergency_freq_dict, elective_freq_dict, avg_percent
)
print("number of differing codes is:", len(diff_codes))

In [None]:
# visualize differing codes
# Overlay the emergency and elective relative frequencies of every differing code;
# alpha=0.5 keeps both bars visible where they overlap.
plt.bar(list(emergency_diff_dict.keys()), list(emergency_diff_dict.values()), alpha=0.5, label="emergency")
plt.bar(list(elective_diff_dict.keys()), list(elective_diff_dict.values()), alpha=0.5, label="elective")
plt.xticks(rotation=90)
# Label the figure so it can be read on its own when the notebook is skimmed
plt.xlabel("ICD code")
plt.ylabel("relative frequency")
plt.title("ICD codes with differing relative frequency: emergency vs elective")
plt.legend()
plt.show()