In [1]:
# So we can use the *thesislib* package
import sys
import os

module_path = os.path.abspath("..")

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np

In [3]:
from thesislib.utils import pathutils
from thesislib.utils.ml import report

In [4]:
patients_csv = pathutils.get_data_file("prob-synthea-1/data/patients.csv")
patient_conditions_csv = pathutils.get_data_file("prob-synthea-1/data/patient_conditions.csv")
condition_symptom_csv = pathutils.get_data_file("prob-synthea-1/data/patient_condition_symptoms.csv")

In [128]:
patient_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']
patients = pd.read_csv(patients_csv, usecols=patient_columns, parse_dates=['BIRTHDATE'], infer_datetime_format=True)

condition_columns = ['Id', 'PATIENT', 'CODE', 'ONSET']
conditions = pd.read_csv(patient_conditions_csv, usecols=condition_columns, parse_dates=['ONSET'], infer_datetime_format=True)

symptom_colums = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE']
symptoms = pd.read_csv(condition_symptom_csv)

In [129]:
symptoms.columns

Index(['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE', 'SYMPTOM_DISPLAY',
       'VALUE_CODE', 'VALUE_DISPLAY'],
      dtype='object')

In [6]:
condition_codes = conditions['CODE'].unique().tolist()

In [7]:
symptom_vector = symptoms['SYMPTOM_CODE'].unique().tolist()
symptom_vector.sort()

In [8]:
condition_labels = {code: idx for idx, code in enumerate(condition_codes)}

In [9]:
race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}

In [10]:
def _race_txform(item):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(item, np.nan)
    
patients['race'] = patients['RACE'].transform(_race_txform)

In [11]:
patients['gender'] = patients['GENDER'].transform(lambda gender: 0 if gender == 'F' else 1)

In [12]:
patients = patients.drop(columns=['GENDER', 'RACE'])

In [13]:
df = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'), )

In [14]:
df = symptoms.merge(df, how='left', left_on='CONDITION_ID', right_on='Id', suffixes=('_symp', ''))

In [15]:
df = df.drop(columns=['PATIENT_symp', 'Id', 'Id_pat', 'PATIENT'])

In [16]:
df['age'] = abs((df['ONSET'] - df['BIRTHDATE']).astype('timedelta64[Y]'))

In [17]:
df = df.drop(columns=['ONSET', 'BIRTHDATE'])

In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
_ = le.fit(condition_codes)

In [19]:
df['label'] = le.transform(df['CODE'])

In [20]:
df = df.drop(columns=['CODE'])

In [21]:
_grp = df.groupby('CONDITION_ID')

In [24]:
df['CONDITION_ID'].iloc[0]

'9230eb47-e102-49a9-98ca-b233bafa24d4'

In [25]:
grp_1 = _grp.get_group('9230eb47-e102-49a9-98ca-b233bafa24d4')

In [29]:
grp_1

Unnamed: 0,CONDITION_ID,SYMPTOM_CODE,race,gender,age,label
0,9230eb47-e102-49a9-98ca-b233bafa24d4,85ec196b9fcfd32cbdee5a044b20d7c406ff7846cb02be...,0,0,22.0,8
1,9230eb47-e102-49a9-98ca-b233bafa24d4,2fe6d99eb788392eb93382757b162bfcab7936d47c789f...,0,0,22.0,8
2,9230eb47-e102-49a9-98ca-b233bafa24d4,703cf77867ac879e0da8b007eefe9d407476a0cdaf1132...,0,0,22.0,8


In [154]:
def grp_apply(df, symptomcodes):
    cols = ['race', 'gender', 'age', 'label']
    _s = df.head(1)[cols].copy()
    symptoms = {itm: 1 for itm in df['SYMPTOM_CODE'].values}
    for _symp in symptomcodes:
        _s[_symp] = symptoms.get(_symp, 0)
    
    return _s

In [None]:
dd = _grp.apply(grp_apply, symptomcodes=symptom_vector)

In [121]:
from collections import defaultdict
_k1 = ['race', 'gender', 'age', 'label']
keys = _k1 + symptom_vector
design_matrix = defaultdict(list)
for cond, df in _grp:
    _dict = df[_k1].iloc[0].to_dict()
    symptoms = {_symp: 1 for _symp in df['SYMPTOM_CODE'].to_list()}
    _dict.update({_symp: symptoms.get(_symp, 0) for _symp in keys})
    
    for k, v in _dict.items():
        design_matrix[k].append(v)

In [124]:
import json
with open('design_matrix.json', 'w') as fp:
    json.dump(design_matrix, fp)

In [125]:
del design_matrix

In [147]:
_kk = grp_1.head(1).copy()

In [148]:
_kk['']

Unnamed: 0,CONDITION_ID,SYMPTOM_CODE,race,gender,age,label
0,9230eb47-e102-49a9-98ca-b233bafa24d4,85ec196b9fcfd32cbdee5a044b20d7c406ff7846cb02be...,0,0,22.0,8


In [150]:
_kk['cool'] = 1

In [151]:
_kk

Unnamed: 0,CONDITION_ID,SYMPTOM_CODE,race,gender,age,label,cool
0,9230eb47-e102-49a9-98ca-b233bafa24d4,85ec196b9fcfd32cbdee5a044b20d7c406ff7846cb02be...,0,0,22.0,8,1
