In [1]:
# So we can use the *thesislib* package
import sys
import os

# Absolute path of the parent directory, which holds the *thesislib* package
module_path = os.path.abspath("..")

# Register it on the import path exactly once
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
from thesislib.utils import pathutils

In [4]:
import pandas as pd
import json
import time
import numpy as np
import matplotlib.pyplot as plt
from dateutil.parser import parse as date_parser
from dateutil.relativedelta import relativedelta
import math

In [8]:
def _read_synthea_csv(relative_path):
    """
    Resolves a data file through pathutils and reads it with pandas.
    Returns the resolved path together with the loaded dataframe.
    """
    csv_path = pathutils.get_data_file(relative_path)
    return csv_path, pd.read_csv(csv_path)

# the four Synthea tables used throughout this notebook
conditions_csv, conditions = _read_synthea_csv("plain-synthea/data/conditions.csv")
encounters_csv, encounters = _read_synthea_csv("plain-synthea/data/encounters.csv")
observations_csv, observations = _read_synthea_csv("plain-synthea/data/observations.csv")
patients_csv, patients = _read_synthea_csv("plain-synthea/data/patients.csv")

In [9]:
def _read_json(json_path):
    """
    Loads and returns the content of a JSON file.
    """
    with open(json_path) as handle:
        return json.load(handle)

# lookup tables keyed by code (see how they are indexed further down)
conditions_db_file = pathutils.get_data_file("plain-synthea/data/conditions.json")
conditions_db = _read_json(conditions_db_file)

observations_db_file = pathutils.get_data_file("plain-synthea/data/observations.json")
observations_db = _read_json(observations_db_file)

In [10]:
# some important parameters

# These are the so-called "encounters for symptoms".
# The rationale is that they usually indicate when the conditions are diagnosed and as such might
# provide the right kind of information!
# This value is compared against the CODE column of the encounters table.
SYMPTOM_ENCOUNTER_CODE = 185345009

In [14]:
# utils
def get_output_folder():
    """
    Returns the output folder where generated data should be stored.

    The folder is "plain-synthea/output" below the data directory reported by
    pathutils.get_data_directory(). It is created when it does not exist yet,
    together with any missing parent directories.
    """
    output_folder = os.path.join(pathutils.get_data_directory(), "plain-synthea/output")

    # makedirs (unlike mkdir) also creates a missing "plain-synthea" parent, and
    # exist_ok=True removes the check-then-create race of isdir() + mkdir()
    os.makedirs(output_folder, exist_ok=True)

    return output_folder

def _count_key(label_count):
    return label_count[2]

def _print_list(obj):
    """
    Pretty list printing
    """
    for item in obj:
        print(item)

def form_design_matrix(observation_db):
    """
    Using the observations, builds an empty 'design matrix' that would hold all the
    observed features!

    Parameters
    ----------
    observation_db : dict or str or os.PathLike
        Either a mapping whose keys are observation codes, or the path to a JSON
        file containing such a mapping.

    Returns
    -------
    dict
        Maps every column name to its own empty list: first the fixed
        condition/patient columns, then one column per observation code.
    """
    # a path was given: load the mapping from disk
    # (the previous `type(...) == "str"` comparison was always False)
    if isinstance(observation_db, (str, os.PathLike)):
        with open(observation_db) as fp:
            observation_db = json.load(fp)

    data_matrix = {
        "condition_code": [],
        "condition_start": [],
        "condition_stop": [],
        "patient_id": [],
        "encounter_id": [],
        "patient_age": [],
        "marital_status": [],
        "race": [],
        "ethnicity": [],
        "gender": [],
    }

    # use the argument, not the notebook-level `observations_db` global
    data_matrix.update({k: [] for k in observation_db.keys()})
    return data_matrix

def get_design_matrix(observation_db, combined_df):
    """
    Using a dataframe that combines the selected conditions, encounters and patients,
    this function fills up the design matrix with the proper values!

    Parameters
    ----------
    observation_db : dict or str
        Forwarded to form_design_matrix to create the (empty) columns.
    combined_df : pandas.DataFrame
        Merged frame. The columns read here are ENCOUNTER, CODE, START, STOP,
        PATIENT, MARITAL, RACE, ETHNICITY, GENDER, CODE_obv, VALUE, START_enc
        and BIRTHDATE.

    Returns
    -------
    dict
        Column name -> list of values, with one entry per (ENCOUNTER, CODE) group.
        Columns that have no value for a group hold np.nan.
    """
    data_matrix = form_design_matrix(observation_db)

    # group the frame that was passed in (previously the notebook-level
    # `reduced_combined` global was used and `combined_df` was ignored)
    grouped = combined_df.groupby(["ENCOUNTER", "CODE"])

    data_keys = list(data_matrix.keys())
    # set for constant-time membership tests inside the observation loop
    known_keys = set(data_keys)

    for (encounter_id, condition_code), df in grouped:
        vector = {k: np.nan for k in data_keys}
        vector["encounter_id"] = encounter_id
        vector["condition_code"] = condition_code

        # these values are taken from the first row of the group
        vector["condition_start"] = df["START"].iloc[0]
        vector["condition_stop"] = df["STOP"].iloc[0]
        vector["patient_id"] = df["PATIENT"].iloc[0]
        vector["marital_status"] = df["MARITAL"].iloc[0]
        vector["race"] = df["RACE"].iloc[0]
        vector["ethnicity"] = df["ETHNICITY"].iloc[0]
        vector["gender"] = df["GENDER"].iloc[0]

        # fill in the observations; codes without a column are skipped
        for idx, obv_code in df["CODE_obv"].items():
            if obv_code not in known_keys:
                continue
            vector[obv_code] = df["VALUE"].loc[idx]

        # handle the age: difference of calendar years between the encounter
        # start and the birth date (month and day are not taken into account)
        start_encounter_date = date_parser(df["START_enc"].iloc[0])
        patient_birthdate = date_parser(df["BIRTHDATE"].iloc[0])
        vector["patient_age"] = abs(patient_birthdate.year - start_encounter_date.year)

        for k, v in vector.items():
            data_matrix[k].append(v)
    return data_matrix

def filter_data(design_db):
    """
    Using a design dataframe, this function drops columns that have no data.

    Parameters
    ----------
    design_db : pandas.DataFrame
        The design dataframe to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new dataframe without the columns whose values are all missing.
    """
    # previously this was an empty stub that silently returned None
    return design_db.dropna(axis="columns", how="all")
    

In [15]:
# Folder that receives every file written by this notebook
# (get_output_folder creates it when it does not exist yet)
DATA_DIR = get_output_folder()

In [17]:
# encounters whose code marks them as "encounter for symptom"
is_symptom_encounter = encounters["CODE"] == SYMPTOM_ENCOUNTER_CODE
symptom_encounters = encounters.loc[is_symptom_encounter]

# which conditions are related to these encounters
symptom_encounter_ids = symptom_encounters["Id"]
symptom_conditions = conditions.loc[conditions['ENCOUNTER'].isin(symptom_encounter_ids)]

In [18]:
# group the conditions by code
# NOTE: group by the scalar "CODE" rather than the list ["CODE"]. With a one-element
# list, pandas >= 2.0 yields 1-tuples as group keys, which would break the
# conditions_db lookup below and leak tuples into the list of codes.
conditions_group = symptom_conditions.groupby("CODE")

# one (label, code, number of rows) record per condition code
condition_label_counts = [
    (conditions_db.get(str(group_name)), group_name, len(group))
    for group_name, group in conditions_group
]

# most frequent condition first
condition_label_counts = sorted(condition_label_counts, key=_count_key, reverse=True)

In [20]:
# codes of the ten most frequent conditions (records are (label, code, count))
top_10_conditions = [code for _label, code, _count in condition_label_counts[:10]]
print(top_10_conditions)

[444814009, 195662009, 10509002, 40055000, 65363002, 43878008, 75498004, 36971009, 301011002, 232353008]


In [21]:
# the targets are the top 10 conditions according to the condition label counts
# target_condition_codes = [444814009, 195662009, 10509002, 40055000, 65363002, 43878008, 75498004, 36971009, 301011002, 232353008]
target_condition_codes = top_10_conditions

# each condition code is mapped to its position in the target list
condition_labels = dict(zip(target_condition_codes, range(len(target_condition_codes))))

# keep the code -> label mapping next to the generated data
condition_labels_json_file = os.path.join(DATA_DIR, "condition_labels.json")
with open(condition_labels_json_file, 'w') as labels_fp:
    json.dump(condition_labels, labels_fp)

In [22]:
# keep only the rows whose condition code is one of the targets
is_target_condition = symptom_conditions["CODE"].isin(target_condition_codes)
target_conditions = symptom_conditions.loc[is_target_condition]

In [23]:
# left-join everything onto the target conditions so we end up with one wide frame;
# clashing column names coming from the right-hand table get a suffix
combined = (
    target_conditions
    # observations recorded during the same encounter
    .merge(observations, how='left', left_on='ENCOUNTER', right_on='ENCOUNTER', suffixes=('', '_obv'))
    # the patient the condition belongs to
    .merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
    # the encounter itself
    .merge(encounters, how='left', left_on='ENCOUNTER', right_on='Id', suffixes=('', '_enc'))
)

In [24]:
# Columns that need to be dropped: none of them is read when the design matrix
# is filled in further down.
# How do we know they don't hold any useful information ?? Erm well ..
# ('ADDRESS' used to be listed twice; each column is now listed once.)
to_drop = ['ADDRESS', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX', 'FIRST', 'LAST', 'SUFFIX', 'MAIDEN',
           'BIRTHPLACE', 'CITY', 'STATE', 'COUNTY', 'ZIP', 'LAT', 'LON', 'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE',
           'PROVIDER', 'PAYER', 'BASE_ENCOUNTER_COST', 'TOTAL_CLAIM_COST', 'PAYER_COVERAGE'
          ]
reduced_combined = combined.drop(columns=to_drop)

In [25]:
# Fill the design matrix: one entry per (ENCOUNTER, CODE) group.
# Note that the first argument is the path of observations.json, not the loaded dict.
data_matrix = get_design_matrix(observations_db_file, reduced_combined)

In [26]:
# turn the column -> values mapping into a dataframe ...
data_df = pd.DataFrame(data_matrix)

# ... and store it in our results folder
data_csv_file = os.path.join(DATA_DIR, "data.csv")
data_df.to_csv(path_or_buf=data_csv_file)

In [27]:
# The dataset is super sparse, but by how much? Count the non-null values of every
# observation column and report both extremes.
observation_keys = list(observations_db)
available_count = [data_df[code].notnull().sum() for code in observation_keys]

aidx = np.argmax(available_count)  # position of the best covered observation
uidx = np.argmin(available_count)  # position of the worst covered observation

most_available_label = observations_db[observation_keys[aidx]]
least_available_label = observations_db[observation_keys[uidx]]
print("The most available observation: %s with %d occurences" % (most_available_label, available_count[aidx]))
print("The least available observation: %s with %d occurences" % (least_available_label, available_count[uidx]))

The most available observation: Oral temperature with 8720 occurences
The least available observation: Polyp size greatest dimension by CAP cancer protocols with 0 occurences


In [28]:
# which observations are (almost) not available: every observation column holding
# fewer non-null values than the threshold below is collected here
MIN_AVAILABLE_COUNT = 20
is_sparse = np.array(available_count) < MIN_AVAILABLE_COUNT
completely_empty = np.where(is_sparse)[0]

completely_empty_codes = []
completely_empty_dicts = {}
for position in completely_empty:
    sparse_code = observation_keys[position]
    completely_empty_codes.append(sparse_code)
    completely_empty_dicts[sparse_code] = observations_db[sparse_code]

# keep a record of the discarded observations as json
completely_empty_json_file = os.path.join(DATA_DIR, "completely_empty.json")
with open(completely_empty_json_file, "w") as sparse_fp:
    json.dump(completely_empty_dicts, sparse_fp)

In [29]:
# drop the observation columns collected in completely_empty_codes, i.e. those with
# fewer than 20 non-null values (not only the columns that are entirely empty)
filtered_data = data_df.drop(columns=completely_empty_codes)

In [30]:
# save the filtered design matrix to csv in the output folder
filtered_data_csv_file = os.path.join(DATA_DIR, "filtered_data.csv")
filtered_data.to_csv(path_or_buf=filtered_data_csv_file)

In [31]:
# do some more cleaning to get things more ready to be fed into some training alg.
# some useful functions
def _condition_transform_fxn(value, labels={}):
    return labels[value]

def _gender_transform_fxn(value):
    if value == 'M':
        return 1 # encoding for Male
    elif value == 'F':
        return 0 # encoding for female
    else:
        return 2 # encode the nan's

def _marital_transform_fxn(value):
    if value == 'M':
        return 1 # encoding for Married
    elif value == 'S':
        return 0 # encoding for single
    else:
        return 2 # encode the nan's

def _race_transform_fxn(value):
    if value == 'white':
        return 0
    elif value == 'black':
        return 1
    elif value == 'asian':
        return 2
    elif value == 'native':
        return 3
    elif value == 'other':
        return 4
    else:
        return value # nan's ?? there didn;t seem to be any though

In [33]:
# (new column, source column, encoder, extra keyword arguments for the encoder)
_categorical_encoders = (
    ('condition_labels', 'condition_code', _condition_transform_fxn, {'labels': condition_labels}),
    ('marital_status_code', 'marital_status', _marital_transform_fxn, {}),
    ('gender_code', 'gender', _gender_transform_fxn, {}),
    ('race_code', 'race', _race_transform_fxn, {}),
)

# add the encoded columns in the order given above
for _new_column, _source_column, _encoder, _encoder_kwargs in _categorical_encoders:
    filtered_data[_new_column] = filtered_data[_source_column].transform(_encoder, **_encoder_kwargs)

In [37]:
# now we also need to handle the categorical observation data
# there are two of them
# NOTE(review): only one code is listed below - confirm whether a second nominal
# observation is missing from this list
nominal_observations = ['72166-2'] # tobacco smoking status
import imput

def _transform_obv(value, code=None):
    """
    Encodes a single observation value by delegating to imput.get_encoding,
    called with the observation code, the value and imput.NA_GUESS.
    NOTE(review): presumably NA_GUESS selects how missing values are handled -
    confirm against the imput module.
    """
    return imput.get_encoding(code, value, imput.NA_GUESS)

# add an encoded "<code>_code" column for every nominal observation
for obv in nominal_observations:
    new_obv = obv + "_code"
    filtered_data[new_obv] = filtered_data[obv].transform(_transform_obv, code=obv)


In [42]:
# encounter / condition bookkeeping columns that are removed from the ml dataset
non_feature_columns = ['encounter_id', 'condition_code', 'condition_start', 'condition_stop']
ml_data = filtered_data.drop(columns=non_feature_columns)

# save this as well
ml_data_csv_file = os.path.join(DATA_DIR, "filtered_for_ml.csv")
ml_data.to_csv(ml_data_csv_file)

In [43]:
# prep a test set that we won't ever touch, so we can do some unbiased evaluation.
# fractions of the data that go to the train and the test set
train_split, test_split = 0.8, 0.2

# both sets start out empty, with the same columns as the ml dataset
train_df = pd.DataFrame(columns=ml_data.columns)
test_df = pd.DataFrame(columns=ml_data.columns)

In [47]:
# Stratified split: for every condition label, train_split of its rows go to the
# train set and the remaining rows go to the test set.
#
# Changes compared to the previous version of this cell:
#  * a seeded random generator makes the split reproducible between runs
#  * the parts are collected in lists and concatenated once; DataFrame.append was
#    removed in pandas 2.0 and appending inside a loop is quadratic
#  * train_df / test_df are rebuilt from scratch, so re-running this cell no longer
#    adds the same rows a second time
SPLIT_SEED = 42
_split_rng = np.random.RandomState(SPLIT_SEED)

# scalar key: a one-element list would yield 1-tuple group keys on pandas >= 2.0
label_grp = ml_data.groupby('condition_labels')

_train_parts = []
_test_parts = []
for _label, _label_rows in label_grp:
    index = _label_rows.index
    num_train = math.ceil(train_split * len(index))
    train_selection = _split_rng.choice(index, num_train, replace=False)

    # add these to the train set
    _train_parts.append(_label_rows.loc[train_selection])

    # add what's left, keeping the original row order
    _test_parts.append(_label_rows.loc[~index.isin(train_selection)])

# pd.concat raises on an empty list, so fall back to an empty frame with the
# right columns when there is no data at all
train_df = pd.concat(_train_parts) if _train_parts else ml_data.iloc[0:0]
test_df = pd.concat(_test_parts) if _test_parts else ml_data.iloc[0:0]

In [49]:
# where the two splits are stored
train_csv_file = os.path.join(DATA_DIR, "train.csv")
test_csv_file = os.path.join(DATA_DIR, "test.csv")

# write the train set first, then the test set
for _split_frame, _split_path in ((train_df, train_csv_file), (test_df, test_csv_file)):
    _split_frame.to_csv(_split_path)