### Description

Going to time the initial parsing approach and the updated parsing approach to see which is faster!

In [1]:
# So we can use the *thesislib* package
import sys
import os

module_path = os.path.abspath("..")

if module_path not in sys.path:
    sys.path.append(module_path)

In [4]:
import pandas as pd
import numpy as np

In [22]:
from thesislib.utils import pathutils
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from dateutil.parser import parse as date_parser
import json
import time

In [67]:
patients_csv = pathutils.get_data_file("prob-synthea-1/data/patients.csv")
conditions_csv = pathutils.get_data_file("prob-synthea-1/data/patient_conditions.csv")
symptoms_csv = pathutils.get_data_file("prob-synthea-1/data/patient_condition_symptoms.csv")

In [93]:
def _race_txform(item):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(item)

def _transform_label(item, labels):
    return labels.get(item)

def form_matrix(patients_csv, conditions_csv, symptoms_csv):
    patient_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']
    patients = pd.read_csv(
        patients_csv,
        usecols=patient_columns, 
        parse_dates=['BIRTHDATE'],
        infer_datetime_format=True,
        dtype={
            'RACE': 'category',
            'GENDER': 'category'
        }
    )

    condition_columns = ['Id', 'PATIENT', 'CODE', 'ONSET']
    conditions = pd.read_csv(conditions_csv, usecols=condition_columns, parse_dates=['ONSET'], infer_datetime_format=True)

    symptom_colums = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE']
    symptoms = pd.read_csv(symptoms_csv, usecols=symptom_colums)
    
    condition_codes = conditions['CODE'].unique().tolist()
    condition_labels = {itm: idx for idx, itm in enumerate(sorted(condition_codes))}
    
    symptom_vector = set(symptoms['SYMPTOM_CODE'].unique().tolist())
    
    patients['RACE'] = patients['RACE'].transform(_race_txform).astype(np.uint8)
    patients['GENDER'] = patients['GENDER'].transform(lambda gender: 0 if gender == 'F' else 1).astype(np.bool)
    
    df = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
    df = symptoms.merge(df, how='left', left_on='CONDITION_ID', right_on='Id', suffixes=('_symp', ''))
    
    # free up memory ?
    #     del conditions
    #     del symptoms
    #     del patients
    
    df['AGE'] = abs((df['ONSET'] - df['BIRTHDATE']).astype('timedelta64[Y]')).astype(np.uint8)
    df['LABEL'] = df['CODE'].transform(_transform_label, labels=condition_labels).astype(np.uint16)
    
    # don't drop anything ..
    # df = df.drop(columns=['PATIENT_symp', 'Id', 'Id_pat', 'PATIENT', 'ONSET', 'BIRTHDATE', 'CODE'])
    
    _grp = df.groupby('CONDITION_ID')
    
    design_matrix = {
        "LABEL": [],
        "AGE": [],
        "GENDER": [],
        "RACE": [],
    }

    for item in symptom_vector:
        design_matrix[item] = []
    
    _k1 = ['RACE', 'GENDER', 'AGE', 'LABEL']
    for cond, df in _grp:
        _dict = df.iloc[0]
        
        for _k in _k1:
            design_matrix[_k].append(int(_dict.get(_k)))
        
        _in = {_symp for _symp in df['SYMPTOM_CODE']}
        _out = symptom_vector - _in
        
        for _symp in _in:
            design_matrix[_symp].append(1)
        for _symp in _out:
            design_matrix[_symp].append(0)
    
    return design_matrix

In [94]:
def old_form_matrix(patients_csv, conditions_csv, symptoms_csv):
    patients = pd.read_csv(patients_csv)
    conditions = pd.read_csv(conditions_csv)
    symptoms = pd.read_csv(symptoms_csv)
    
    condition_codes = set(conditions['CODE'].unique())
    
    condition_labels = {code: idx for idx, code in enumerate(condition_codes)}
    
    symptom_vector = symptoms['SYMPTOM_CODE'].unique().tolist()
    symptom_vector.sort()
    
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    
    combined = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
    complete = symptoms.merge(combined, how='left', left_on='CONDITION_ID', right_on='Id', suffixes=('_symp', ''))
    
    to_drop = ['DESCRIPTION', 'DIAGNOSED', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX', 'FIRST', 'LAST', 
              'SUFFIX', 'MAIDEN', 'MARITAL', 'ETHNICITY','BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE',
           'COUNTY', 'ZIP', 'LAT', 'LON', 'HEALTHCARE_EXPENSES', 'Id', 'Id_pat', 'SYMPTOM_DISPLAY', 'VALUE_CODE',
           'VALUE_DISPLAY', 'HEALTHCARE_COVERAGE'
          ]
    
    complete = complete.drop(columns=to_drop)
    
    condition_grp = complete.groupby(['CONDITION_ID'])
    
    design_matrix = {
        "LABEL": [],
        "AGE": [],
        "GENDER": [],
        "RACE": [],
    }

    for item in symptom_vector:
        design_matrix[item] = []
    # build the design matrix
    for item, df in condition_grp:
        vector = {_: 0 for _ in symptom_vector}

        onset_date = date_parser(df['ONSET'].iloc[0])
        patient_birthdate = date_parser(df["BIRTHDATE"].iloc[0])
        vector['AGE'] =  abs(patient_birthdate.year - onset_date.year)
        vector['GENDER'] = 0 if df['GENDER'].iloc[0] == 'F' else 1
        vector['RACE'] = race_code[df['RACE'].iloc[0]]
        vector['LABEL'] = condition_labels[df['CODE'].iloc[0]]

        # fill in the observations
        for idx, symptom_code in df["SYMPTOM_CODE"].items():
            vector[symptom_code] = 1

        for k,v in vector.items():
            design_matrix[k].append(v)
    
    return design_matrix

In [95]:
_t1_start = time.time()
output_path = pathutils.get_data_file("compare-parsing/output/npanda.design.json")
res = form_matrix(patients_csv, conditions_csv, symptoms_csv)
_t1_end = time.time()
print("Took %.2f seconds to parse with npanda" % (_t1_end - _t1_start))

_t2_start = time.time()
with open(output_path, "w") as fp:
    json.dump(res, fp)
_t2_end = time.time()
print("Took %.2f seconds to dump npanda json" % (_t2_end - _t2_start))

Took 339.31 seconds to parse with npanda
Took 16.08 seconds to run npanda


In [96]:
_t1_start = time.time()
output_path = pathutils.get_data_file("compare-parsing/output/old.design.json")
res = old_form_matrix(patients_csv, conditions_csv, symptoms_csv)
_t1_end = time.time()
print("Took %.2f seconds to parse with old method" % (_t1_end - _t1_start))

_t2_start = time.time()
with open(output_path, "w") as fp:
    json.dump(res, fp)
_t2_end = time.time()
print("Took %.2f seconds to dump old method" % (_t2_end - _t2_start))

Took 377.41 seconds to parse with old method
Took 16.59 seconds to dump old method


In [98]:
# %%timeit -n 10 -r 1
# timing running using defined columns and parsing dates, and time inference
patient_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']
patients = pd.read_csv(
    patients_csv,
    usecols=patient_columns, 
    parse_dates=['BIRTHDATE'],
    infer_datetime_format=True,
    dtype={
        'RACE': 'category',
        'GENDER': 'category'
    }
)

condition_columns = ['Id', 'PATIENT', 'CODE', 'ONSET']
conditions = pd.read_csv(conditions_csv, usecols=condition_columns, parse_dates=['ONSET'], infer_datetime_format=True)

symptom_colums = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE']
symptoms = pd.read_csv(symptoms_csv)

In [39]:
%%timeit -n 5 -r 5

# timing processing up to group
condition_codes = conditions['CODE'].unique().tolist()
condition_labels = {itm: idx for idx, itm in enumerate(sorted(condition_codes))}


symptom_vector = symptoms['SYMPTOM_CODE'].unique().tolist()
symptom_vector.sort()

patients['RACE'] = patients['RACE'].transform(_race_txform)
patients['GENDER'] = patients['GENDER'].transform(lambda gender: 0 if gender == 'F' else 1)

df = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
df = symptoms.merge(df, how='left', left_on='CONDITION_ID', right_on='Id', suffixes=('_symp', ''))

# free up memory ?
#     del conditions
#     del symptoms
#     del patients

df['AGE'] = abs((df['ONSET'] - df['BIRTHDATE']).astype('timedelta64[Y]'))
df['LABEL'] = df['CODE'].transform(_transform_label, labels=condition_labels)

# don't drop anything ..
# df = df.drop(columns=['PATIENT_symp', 'Id', 'Id_pat', 'PATIENT', 'ONSET', 'BIRTHDATE', 'CODE'])

_grp = df.groupby('CONDITION_ID')

3.52 s ± 139 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [36]:
%%timeit -n 10 -r 1
# timing plain reading
patients = pd.read_csv(patients_csv)
conditions = pd.read_csv(conditions_csv)
symptoms = pd.read_csv(symptoms_csv)

4.48 s ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


In [52]:
patients['gender'] = patients['GENDER'].transform(lambda gender: 0 if gender == 'F' else 1)

In [53]:
patients['gender'].unique()

array([0, 1])

In [54]:
condition_labels = {itm: idx for idx, itm in enumerate(sorted(condition_codes))}

In [56]:
conditions['LABEL'] = conditions['CODE'].transform(_transform_label, labels=condition_labels)

In [57]:
conditions['LABEL'].unique()

array([8, 1, 3, 7, 6, 5, 2, 0, 4])

In [58]:
symptoms.columns

Index(['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE', 'SYMPTOM_DISPLAY',
       'VALUE_CODE', 'VALUE_DISPLAY'],
      dtype='object')

In [59]:
_g = symptoms.groupby('CONDITION_ID')

In [61]:
_gs = _g.get_group('9230eb47-e102-49a9-98ca-b233bafa24d4')

In [65]:
_s = _gs['SYMPTOM_CODE']

In [69]:
_d = _gs.iloc[0]

In [73]:
_kys = ['PATIENT', 'SYMPTOM_CODE', 'CONDITION_ID', 'SYMPTOM_DISPLAY']
for _k in _kys:
    print(_k, _d.get(_k))

PATIENT b3d23730-2323-416e-9fb6-190c7adf8b02
SYMPTOM_CODE 85ec196b9fcfd32cbdee5a044b20d7c406ff7846cb02be6803c93ba2
CONDITION_ID 9230eb47-e102-49a9-98ca-b233bafa24d4
SYMPTOM_DISPLAY Painful urination


In [72]:
_d

CONDITION_ID                    9230eb47-e102-49a9-98ca-b233bafa24d4
PATIENT                         b3d23730-2323-416e-9fb6-190c7adf8b02
SYMPTOM_CODE       85ec196b9fcfd32cbdee5a044b20d7c406ff7846cb02be...
SYMPTOM_DISPLAY                                    Painful urination
VALUE_CODE         85ec196b9fcfd32cbdee5a044b20d7c406ff7846cb02be...
VALUE_DISPLAY                            Painful urination (finding)
Name: 0, dtype: object

In [76]:
for idx, k in enumerate(set(_d)):
    print(idx, k)

0 Painful urination
1 Painful urination (finding)
2 85ec196b9fcfd32cbdee5a044b20d7c406ff7846cb02be6803c93ba2
3 9230eb47-e102-49a9-98ca-b233bafa24d4
4 b3d23730-2323-416e-9fb6-190c7adf8b02


In [77]:
p = {k for k in _s}

In [78]:
type(p)

set

In [79]:
p

{'2fe6d99eb788392eb93382757b162bfcab7936d47c789f5e20f00de6',
 '703cf77867ac879e0da8b007eefe9d407476a0cdaf1132c11066081f',
 '85ec196b9fcfd32cbdee5a044b20d7c406ff7846cb02be6803c93ba2'}

In [80]:
p = {1,2,3,4}
pp = {2,3}
p - pp

{1, 4}