In [5]:
import matplotlib.pyplot as plt
import numpy as np
import math
import os
import pandas as pd
import time
import seaborn as sns
from sklearn import metrics
from sklearn.decomposition import PCA
from collections import Counter

In [12]:
datasetPath = '../datasets/diabetes_data_initial.csv'

In [13]:
data = pd.read_csv(datasetPath)

In [14]:
replaceDict = {'[0-10)' : 5,
'[10-20)' : 15,
'[20-30)' : 25,
'[30-40)' : 35,
'[40-50)' : 45,
'[50-60)' : 55,
'[60-70)' : 65,
'[70-80)' : 75,
'[80-90)' : 85,
'[90-100)' : 95}

data['age'] = data['age'].apply(lambda x : replaceDict[x])

In [15]:
data['prev_visits'] = data.groupby('patient_nbr').encounter_id.transform(lambda x: pd.Series(range(len(x))))

In [16]:
high_frequency = ['InternalMedicine', 'Family/GeneralPractice', 'Cardiology', 'Surgery-General', 'Orthopedics', 'Orthopedics-Reconstructive', 'Emergency/Trauma', 'Urology','ObstetricsandGynecology','Psychiatry','Pulmonology ','Nephrology','Radiologist']
low_frequency = ['Surgery-PlasticwithinHeadandNeck','Psychiatry-Addictive','Proctology','Dermatology','SportsMedicine','Speech','Perinatology', 'Neurophysiology','Resident','Pediatrics-Hematology-Oncology','Pediatrics-EmergencyMedicine','Dentistry','DCPTEAM','Psychiatry-Child/Adolescent', 'Pediatrics-Pulmonology','Surgery-Pediatric','AllergyandImmunology','Pediatrics-Neurology','Anesthesiology','Pathology','Cardiology-Pediatric', 'Endocrinology-Metabolism','PhysicianNotFound','Surgery-Colon&Rectal','OutreachServices', 'Surgery-Maxillofacial','Rheumatology','Anesthesiology-Pediatric','Obstetrics','Obsterics&Gynecology-GynecologicOnco']
pediatrics = ['Pediatrics','Pediatrics-CriticalCare','Pediatrics-EmergencyMedicine','Pediatrics-Endocrinology','Pediatrics-Hematology-Oncology', 'Pediatrics-Neurology','Pediatrics-Pulmonology', 'Anesthesiology-Pediatric', 'Cardiology-Pediatric', 'Surgery-Pediatric']
psychic = ['Psychiatry-Addictive', 'Psychology', 'Psychiatry',  'Psychiatry-Child/Adolescent', 'PhysicalMedicineandRehabilitation', 'Osteopath']
neurology = ['Neurology', 'Surgery-Neuro',  'Pediatrics-Neurology', 'Neurophysiology']
surgery = ['Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial', 'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck',  'Surgery-Thoracic', 'Surgery-Vascular', 'SurgicalSpecialty', 'Podiatry']
ungrouped = ['Endocrinology','Gastroenterology','Gynecology','Hematology','Hematology/Oncology','Hospitalist','InfectiousDiseases', 'Oncology','Ophthalmology','Otolaryngology','Pulmonology','Radiology']
missing = ['?']

def get_specialty_type(specialty):
    if specialty in pediatrics : return 'pediatrics'
    elif specialty in psychic : return 'psychic'
    elif specialty in neurology : return 'neurology'
    elif specialty in surgery : return 'surgery'
    elif specialty in high_frequency : return 'high_freq'
    elif specialty in low_frequency : return 'low_freq'
    elif specialty in ungrouped : return 'ungrouped'
    elif specialty in missing : return 'missing'

data['medical_specialty'] = data['medical_specialty'].apply(get_specialty_type)

In [17]:
def get_diag(diag):
    if '?' in diag: return 'unknown'
    if 'V' in diag:  return 'health_contact'
    if 'E' in diag: return 'injury_poisoning'
    diag_no = int(float(diag))
    if diag_no in range(0, 140): return 'infectious'
    if diag_no in range(140, 240): return 'neoplasms'
    if diag_no == 250: return 'diabetes'
    if diag_no in range(251, 260): return 'endocrine'
    if diag_no in range(240, 280): return 'long_term'
    if diag_no in range(280, 290): return 'blood'
    if diag_no in range(290, 320): return 'mental'
    if diag_no in range(320, 390): return 'nervous'
    if diag_no in range(390, 460): return 'respiratory'
    if diag_no in range(520, 580): return 'digestive'
    if diag_no in range(580, 630): return 'genitourinary'
    if diag_no in range(630, 680): return 'birth'
    if diag_no in range(680, 710): return 'skin'
    if diag_no in range(710, 740): return 'musculoskeletal'
    if diag_no in range(740, 760): return 'congenital'
    if diag_no in range(760, 780): return 'prenital'
    if diag_no in range(780, 800): return 'symptoms'
    if diag_no in range(800, 1000): return 'injury_poisoning'
    return 'other'

In [18]:
data['diagnosis_1'] = data['diag_1'].apply(get_diag)
data['diagnosis_2'] = data['diag_2'].apply(get_diag)
data['diagnosis_3'] = data['diag_3'].apply(get_diag)

In [19]:
def get_diabetes_type(diag):
    if 'V' in diag or 'E' in diag or '?' in diag: return 'no_diabetes'
    diag_float = round(float(diag),1)
    if diag_float == 250.0: return 'no_complications'
    if diag_float == 250.1: return 'ketoacidosis'
    if diag_float == 250.2: return 'hyperosmolarity'
    if diag_float == 250.3: return 'other_coma'
    if diag_float == 250.4: return 'renal'
    if diag_float == 250.5: return 'ophthalmic'
    if diag_float == 250.6: return 'nuerological'
    if diag_float == 250.7: return 'peripheral'
    if diag_float == 250.8: return 'other'
    if diag_float == 250.9: return 'unspecificed'
    return 'no_diabetes'

In [20]:
data['diabetes_type_1'] = data['diag_1'].apply(get_diabetes_type)
data['diabetes_type_2'] = data['diag_2'].apply(get_diabetes_type)
data['diabetes_type_3'] = data['diag_3'].apply(get_diabetes_type)

In [21]:
data['readmitted'] = data['readmitted'].apply(lambda x : 0 if (x == '>30' or x == 'NO') else 1)
data = data.reset_index()

In [22]:
def get_aic(r):
    if r == 'None':
        return [1,0,0]
    if r == 'Norm':
        return [0,1,0]
    if r == '>7' or r == '>8':
        return [0,0,1]

def get_glu(r):
    if r == 'None':
        return [1,0,0]
    if r == 'Norm':
        return [0,1,0]
    if r == '>200' or r == '>300':
        return [0,0,1]

def get_medicine(m):
    if m == 'No' or m == 'Steady':
        return 0
    if m == 'Up' or m == 'Down':
        return 1

def get_diabetes(d):
    if d == 'Yes':
        return 1
    if d == 'No':
        return 0

def get_ch(c):
    if c == 'Ch':
        return 1
    if c == 'No':
        return 0

data['A1Cresult'] = data['A1Cresult'].apply(get_aic)
data['max_glu_serum'] = data['max_glu_serum'].apply(get_glu)
for m in data.columns[25:48]:
    data[m] = data[m].apply(get_medicine)
data['diabetesMed'] = data['diabetesMed'].apply(get_diabetes)
data['change'] = data['change'].apply(get_ch)

In [23]:
def logfunc(m):
    return math.log(m+1)

data['num_medications'] = data['num_medications'].apply(logfunc)
data['num_procedures'] = data['num_procedures'].apply(logfunc)
data['num_lab_procedures'] = data['num_lab_procedures'].apply(logfunc)
data['number_diagnoses'] = data['number_diagnoses'].apply(logfunc)
data['number_emergency'] = data['number_emergency'].apply(logfunc)
data['number_inpatient'] = data['number_inpatient'].apply(logfunc)
data['number_outpatient'] = data['number_outpatient'].apply(logfunc)
data['time_in_hospital'] = data['time_in_hospital'].apply(logfunc)

In [24]:
data['total_services'] = data['number_inpatient'] + data['number_outpatient'] + data['number_emergency']
data['change_meds'] = np.sum([data[m] for m in data.columns[25:48]],axis=0)



In [25]:
admission_type = {
    'Emergency': 1,
    'Urgent': 2,
    'Elective': 3,
    'Newborn': 4,
    'Not available': 5,
    '': 6,
    'Trauma Center': 7,
    'Not Mapped': 8
}
admission_type_reverse = {}

for key in admission_type:
    admission_type_reverse[admission_type[key]] = key
    
data['admission_type_id'] = data['admission_type_id'].replace(admission_type['Urgent'], admission_type['Emergency']) # Urgent -> emergency
data['admission_type_id'] = data['admission_type_id'].replace(admission_type['Trauma Center'],admission_type['Emergency']) # Trauma -> emergency
data['admission_type_id'] = data['admission_type_id'].replace(admission_type[''],admission_type['Not available']) # Not available
data['admission_type_id'] = data['admission_type_id'].replace(admission_type['Not Mapped'],admission_type['Not available']) # Not available



In [26]:
discharge_map = {
    'Discharged home': 1,
    'Discharged to short term hospital': 2,
    'Discharged to skilled nursing facility': 3,
    'Discharged to intermediate care facility': 4,
    'Discharged to inpatient care institution': 5,
    'Discharged to home with home health service': 6,
    'Left against medical advice': 7,
    'Discharged to home under HOME IV provider': 8,
    'Admitted as inpatient to this hospital': 9,
    'Neonate discharged': 10,
    'Expired': 11,
    'Still patient': 12,
    'Hospice at home': 13,
    'Hospice at facility': 14,
    'Transferred to swing bed': 15,
    'Discharged to another institution for outpatient services': 16,
    'Discharged to this institution for outpatient services': 17,
    '': 18,
    'Expired in home': 19,
    'Expired in facility': 20,
    'Expired in unknown place': 21,
    'Discharged to rehab': 22,
    'Transferred to long-term hospital': 23,
    'Discharged to Medicaid but not Medicare-certified facility': 24,
    'Not mapped': 25,
    'Unknown/Invalid': 26,
    'Discharged to federal health care facility': 27,
    'Discharged to psychiatric hospital': 28,
    'Discharged to CAH': 29,
    'Discharged to other health care institution': 30
}


discharge_map_reverse = {}

for key in discharge_map:
    discharge_map_reverse[discharge_map[key]] = key


data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(6,1) # home w/ health services -> home
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(8,1) # 
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(9,1)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(13,1)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(3,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(4,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(5,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(14,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(22,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(23,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(24,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(12,10)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(15,10)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(16,10)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(17,10)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(25,18)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(26,18)

In [27]:
admission_source = {
    'Physician referral': 1,
    'Clinic referral': 2,
    'HMO Referral': 3,
    'Transfer from hospital': 4,
    'Transfer from skilled nursing facility': 5,
    'Transfer from another health facility': 6,
    'Emergency Room': 7,
    'Law enforcement': 8,
    'Not available': 9,
    'Transfer from critical access hospital': 10,
    'Normal delivery': 11,
    'Premature delivery': 12,
    'Sick baby': 13,
    'Extramural birth': 14,
    'Not available': 15,
    '': 17,
    'Transfer from another home health agency': 18,
    'Readmission to same home health agency': 19,
    'Not mapped': 20,
    'Unknown/Invalid': 21,
    'Transfer from hospital inpatient': 22,
    'Born inside this hospital': 23,
    'Born outside this hospital': 24,
    'Transfer from ambulatory surgery center': 25,
    'Transfer from hospice': 26
}

admission_source_reverse = {}

for key in admission_source:
    admission_source_reverse[admission_source[key]] = key


data['admission_source_id'] = data['admission_source_id'].replace(2,1)
data['admission_source_id'] = data['admission_source_id'].replace(3,1)
data['admission_source_id'] = data['admission_source_id'].replace(5,4)
data['admission_source_id'] = data['admission_source_id'].replace(6,4)
data['admission_source_id'] = data['admission_source_id'].replace(10,4)
data['admission_source_id'] = data['admission_source_id'].replace(22,4)
data['admission_source_id'] = data['admission_source_id'].replace(25,4)
data['admission_source_id'] = data['admission_source_id'].replace(15,9)
data['admission_source_id'] = data['admission_source_id'].replace(17,9)
data['admission_source_id'] = data['admission_source_id'].replace(20,9)
data['admission_source_id'] = data['admission_source_id'].replace(21,9)
data['admission_source_id'] = data['admission_source_id'].replace(13,11)
data['admission_source_id'] = data['admission_source_id'].replace(14,11)

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer
def multi_encode(cols):
    to_encode = data[cols].apply(list, axis=1)
    mlb = MultiLabelBinarizer()
    return pd.DataFrame(mlb.fit_transform(to_encode),
                   columns=mlb.classes_,
                   index=to_encode.index)
diagnosis_cols = ['diagnosis_1', 'diagnosis_2', 'diagnosis_3']
diabetes_cols = ['diabetes_type_1', 'diabetes_type_2', 'diabetes_type_3']
data = data.join(multi_encode(diagnosis_cols).add_prefix('diagnosis_'))
data = data.join(multi_encode(diabetes_cols).add_prefix('diabetes_'))

In [29]:
cat_vars = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'medical_specialty']
for var in cat_vars:
    data = data.join(pd.get_dummies(data[var], prefix=var))
    data = data.drop(var, axis=1)

In [30]:
data

Unnamed: 0,index,encounter_id,patient_nbr,age,weight,time_in_hospital,payer_code,num_lab_procedures,num_procedures,num_medications,...,admission_source_id_9,admission_source_id_11,medical_specialty_high_freq,medical_specialty_low_freq,medical_specialty_missing,medical_specialty_neurology,medical_specialty_pediatrics,medical_specialty_psychic,medical_specialty_surgery,medical_specialty_ungrouped
0,0,2278392,8222157,5,?,0.693147,?,3.737670,0.000000,0.693147,...,0,0,0,0,0,0,1,0,0,0
1,1,149190,55629189,15,?,1.386294,?,4.094345,0.000000,2.944439,...,0,0,0,0,1,0,0,0,0,0
2,2,64410,86047875,25,?,1.098612,?,2.484907,1.791759,2.639057,...,0,0,0,0,1,0,0,0,0,0
3,3,500364,82442376,35,?,1.098612,?,3.806662,0.693147,2.833213,...,0,0,0,0,1,0,0,0,0,0
4,4,16680,42519267,45,?,0.693147,?,3.951244,0.000000,2.197225,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,101761,443847548,100162476,75,?,1.386294,MC,3.951244,0.000000,2.833213,...,0,0,0,0,1,0,0,0,0,0
101762,101762,443847782,74694222,85,?,1.791759,MC,3.526361,1.386294,2.944439,...,0,0,0,0,1,0,0,0,0,0
101763,101763,443854148,41088789,75,?,0.693147,MC,3.988984,0.000000,2.302585,...,0,0,0,0,1,0,0,0,0,0
101764,101764,443857166,31693671,85,?,2.397895,MC,3.828641,1.098612,3.091042,...,0,0,0,0,0,0,0,0,1,0


In [31]:
copy = data.copy()

In [34]:
copy

Unnamed: 0,index,encounter_id,patient_nbr,age,weight,time_in_hospital,payer_code,num_lab_procedures,num_procedures,num_medications,...,admission_source_id_9,admission_source_id_11,medical_specialty_high_freq,medical_specialty_low_freq,medical_specialty_missing,medical_specialty_neurology,medical_specialty_pediatrics,medical_specialty_psychic,medical_specialty_surgery,medical_specialty_ungrouped
0,0,2278392,8222157,5,?,0.693147,?,3.737670,0.000000,0.693147,...,0,0,0,0,0,0,1,0,0,0
1,1,149190,55629189,15,?,1.386294,?,4.094345,0.000000,2.944439,...,0,0,0,0,1,0,0,0,0,0
2,2,64410,86047875,25,?,1.098612,?,2.484907,1.791759,2.639057,...,0,0,0,0,1,0,0,0,0,0
3,3,500364,82442376,35,?,1.098612,?,3.806662,0.693147,2.833213,...,0,0,0,0,1,0,0,0,0,0
4,4,16680,42519267,45,?,0.693147,?,3.951244,0.000000,2.197225,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,101761,443847548,100162476,75,?,1.386294,MC,3.951244,0.000000,2.833213,...,0,0,0,0,1,0,0,0,0,0
101762,101762,443847782,74694222,85,?,1.791759,MC,3.526361,1.386294,2.944439,...,0,0,0,0,1,0,0,0,0,0
101763,101763,443854148,41088789,75,?,0.693147,MC,3.988984,0.000000,2.302585,...,0,0,0,0,1,0,0,0,0,0
101764,101764,443857166,31693671,85,?,2.397895,MC,3.828641,1.098612,3.091042,...,0,0,0,0,0,0,0,0,1,0


In [60]:
y = data['readmitted']
y

0         0
1         0
2         0
3         0
4         0
         ..
101761    0
101762    0
101763    0
101764    0
101765    0
Name: readmitted, Length: 101766, dtype: int64

In [61]:
X = data.loc[:, ~data.columns.isin(['readmitted', 'index'])]
X

Unnamed: 0,encounter_id,patient_nbr,age,weight,time_in_hospital,payer_code,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,admission_source_id_9,admission_source_id_11,medical_specialty_high_freq,medical_specialty_low_freq,medical_specialty_missing,medical_specialty_neurology,medical_specialty_pediatrics,medical_specialty_psychic,medical_specialty_surgery,medical_specialty_ungrouped
0,2278392,8222157,5,?,0.693147,?,3.737670,0.000000,0.693147,0.000000,...,0,0,0,0,0,0,1,0,0,0
1,149190,55629189,15,?,1.386294,?,4.094345,0.000000,2.944439,0.000000,...,0,0,0,0,1,0,0,0,0,0
2,64410,86047875,25,?,1.098612,?,2.484907,1.791759,2.639057,1.098612,...,0,0,0,0,1,0,0,0,0,0
3,500364,82442376,35,?,1.098612,?,3.806662,0.693147,2.833213,0.000000,...,0,0,0,0,1,0,0,0,0,0
4,16680,42519267,45,?,0.693147,?,3.951244,0.000000,2.197225,0.000000,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,75,?,1.386294,MC,3.951244,0.000000,2.833213,0.000000,...,0,0,0,0,1,0,0,0,0,0
101762,443847782,74694222,85,?,1.791759,MC,3.526361,1.386294,2.944439,0.000000,...,0,0,0,0,1,0,0,0,0,0
101763,443854148,41088789,75,?,0.693147,MC,3.988984,0.000000,2.302585,0.693147,...,0,0,0,0,1,0,0,0,0,0
101764,443857166,31693671,85,?,2.397895,MC,3.828641,1.098612,3.091042,0.000000,...,0,0,0,0,0,0,0,0,1,0


In [62]:
X = X.to_numpy()
X = np.asarray(X).astype('float32')
X

ValueError: could not convert string to float: '?'

In [63]:
y = y.to_numpy()
y = np.asarray(y).astype('float32')
y

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [55]:
from sklearn.model_selection import train_test_split

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter
print('Original dataset shape {}'.format(Counter(y_train)))
sm = SMOTE()
train_input_new, train_output_new = sm.fit_sample(X_train, y_train)
print('New dataset shape {}'.format(Counter(train_output_new)))


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [57]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout

In [58]:
model = Sequential()
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.15))
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.15))
model.add(Dense(1, activation='sigmoid'))

In [59]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=150, batch_size=10)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))