In [1]:
import matplotlib.pyplot as plt
import numpy as np
import math
import os
import pandas as pd
import time
import seaborn as sns
from sklearn import metrics
from sklearn.decomposition import PCA
from collections import Counter

In [2]:
datasetPath = 'datasets/diabetic_data_initial.csv'
idMappingsPath = 'datasets/id_mapping.csv'

In [3]:
data = pd.read_csv(datasetPath)
idMappings = pd.read_csv(idMappingsPath)

In [4]:
replaceDict = {'[0-10)' : 5,
'[10-20)' : 15,
'[20-30)' : 25, 
'[30-40)' : 35, 
'[40-50)' : 45, 
'[50-60)' : 55,
'[60-70)' : 65, 
'[70-80)' : 75,
'[80-90)' : 85,
'[90-100)' : 95}

data['age'] = data['age'].apply(lambda x : replaceDict[x])

In [5]:
data.dropna(inplace = True)
data.drop_duplicates(['patient_nbr'], keep = 'first', inplace = True)
data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,5,?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,15,?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,25,?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,35,?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,45,?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101754,443842016,183087545,Caucasian,Female,75,?,1,1,7,9,...,No,Steady,No,No,No,No,No,Ch,Yes,>30
101755,443842022,188574944,Other,Female,45,?,1,1,7,14,...,No,Up,No,No,No,No,No,Ch,Yes,>30
101756,443842070,140199494,Other,Female,65,?,1,1,7,2,...,No,Steady,No,No,No,No,No,No,Yes,>30
101758,443842340,120975314,Caucasian,Female,85,?,1,1,7,5,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [6]:
list(data)

['encounter_id',
 'patient_nbr',
 'race',
 'gender',
 'age',
 'weight',
 'admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'time_in_hospital',
 'payer_code',
 'medical_specialty',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'diag_1',
 'diag_2',
 'diag_3',
 'number_diagnoses',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'readmitted']

In [7]:
high_frequency = ['InternalMedicine', 'Family/GeneralPractice', 'Cardiology', 'Surgery-General', 'Orthopedics', 'Orthopedics-Reconstructive', 'Emergency/Trauma', 'Urology','ObstetricsandGynecology','Psychiatry','Pulmonology ','Nephrology','Radiologist']

low_frequency = ['Surgery-PlasticwithinHeadandNeck','Psychiatry-Addictive','Proctology','Dermatology','SportsMedicine','Speech','Perinatology', 'Neurophysiology','Resident','Pediatrics-Hematology-Oncology','Pediatrics-EmergencyMedicine','Dentistry','DCPTEAM','Psychiatry-Child/Adolescent', 'Pediatrics-Pulmonology','Surgery-Pediatric','AllergyandImmunology','Pediatrics-Neurology','Anesthesiology','Pathology','Cardiology-Pediatric', 'Endocrinology-Metabolism','PhysicianNotFound','Surgery-Colon&Rectal','OutreachServices', 'Surgery-Maxillofacial','Rheumatology','Anesthesiology-Pediatric','Obstetrics','Obsterics&Gynecology-GynecologicOnco']

pediatrics = ['Pediatrics','Pediatrics-CriticalCare','Pediatrics-EmergencyMedicine','Pediatrics-Endocrinology','Pediatrics-Hematology-Oncology', 'Pediatrics-Neurology','Pediatrics-Pulmonology', 'Anesthesiology-Pediatric', 'Cardiology-Pediatric', 'Surgery-Pediatric']

psychic = ['Psychiatry-Addictive', 'Psychology', 'Psychiatry',  'Psychiatry-Child/Adolescent', 'PhysicalMedicineandRehabilitation', 'Osteopath']


neurology = ['Neurology', 'Surgery-Neuro',  'Pediatrics-Neurology', 'Neurophysiology']


surgery = ['Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial', 'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck',  'Surgery-Thoracic', 'Surgery-Vascular', 'SurgicalSpecialty', 'Podiatry']
             
ungrouped = ['Endocrinology','Gastroenterology','Gynecology','Hematology','Hematology/Oncology','Hospitalist','InfectiousDiseases', 'Oncology','Ophthalmology','Otolaryngology','Pulmonology','Radiology']


missing = ['?']

colMedical = []

for val in data['medical_specialty'] :
    if val in pediatrics :
        colMedical.append('pediatrics')
    elif val in psychic :
        colMedical.append('psychic')
    elif val in neurology :
        colMedical.append('neurology')
    elif val in surgery :
        colMedical.append('surgery')
    elif val in high_frequency :
        colMedical.append('high_freq')
    elif val in low_frequency :
        colMedical.append('low_freq')
    elif val in ungrouped :
        colMedical.append('ungrouped')
    elif val in missing :
        colMedical.append('missing')

data['medical_specialty'] = colMedical

In [8]:
diag_1 = Counter(list(data['diag_1'])).most_common(1)[0][0]
diag_2 = Counter(list(data['diag_2'])).most_common(1)[0][0]
diag_3 = Counter(list(data['diag_3'])).most_common(1)[0][0]

data['diag_1'] = data['diag_1'].apply(lambda x : diag_1 if x == '?' else x)
data['diag_2'] = data['diag_1'].apply(lambda x : diag_2 if x == '?' else x)
data['diag_3'] = data['diag_3'].apply(lambda x : diag_3 if x == '?' else x)

In [9]:
data['diag_1'] = data['diag_1'].apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1)  
                                        else ('circulatory' if int(float(x)) in range(390, 460) or int(float(x)) == 785
                                        else     ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786
                                        else     ('digestive'   if int(float(x)) in range(520, 580) or int(float(x)) == 787
                                        else     ('diabetes'    if int(float(x)) == 250
                                        else     ('injury'      if int(float(x)) in range(800, 1000)
                                        else ('musculoskeletal' if int(float(x)) in range(710, 740)
                                        else ('genitourinary'   if int(float(x)) in range(580, 630) or int(float(x)) == 788
                                        else ('neoplasms'       if int(float(x)) in range(140, 240)
                                        else ('pregnecy'        if int(float(x)) in range(630, 680)
                                        else 'other'))))))))))

data['diag_2'] = data['diag_2'].apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1)  
                                        else ('circulatory' if int(float(x)) in range(390, 460) or int(float(x)) == 785
                                        else     ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786
                                        else     ('digestive'   if int(float(x)) in range(520, 580) or int(float(x)) == 787
                                        else     ('diabetes'    if int(float(x)) == 250
                                        else     ('injury'      if int(float(x)) in range(800, 1000)
                                        else ('musculoskeletal' if int(float(x)) in range(710, 740)
                                        else ('genitourinary'   if int(float(x)) in range(580, 630) or int(float(x)) == 788
                                        else ('neoplasms'       if int(float(x)) in range(140, 240)
                                        else ('pregnecy'        if int(float(x)) in range(630, 680)
                                        else 'other'))))))))))

data['diag_3'] = data['diag_3'].apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1)  
                                        else ('circulatory' if int(float(x)) in range(390, 460) or int(float(x)) == 785
                                        else     ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786
                                        else     ('digestive'   if int(float(x)) in range(520, 580) or int(float(x)) == 787
                                        else     ('diabetes'    if int(float(x)) == 250
                                        else     ('injury'      if int(float(x)) in range(800, 1000)
                                        else ('musculoskeletal' if int(float(x)) in range(710, 740)
                                        else ('genitourinary'   if int(float(x)) in range(580, 630) or int(float(x)) == 788
                                        else ('neoplasms'       if int(float(x)) in range(140, 240)
                                        else ('pregnecy'        if int(float(x)) in range(630, 680)
                                        else 'other'))))))))))  
 

In [10]:
data['readmitted'] = data['readmitted'].apply(lambda x : 0 if (x == '>30' or x == 'NO') else 1)
data = data.reset_index()

In [13]:
data

Unnamed: 0,index,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,0,2278392,8222157,Caucasian,Female,5,?,6,25,1,...,No,No,No,No,No,No,No,No,No,0
1,1,149190,55629189,Caucasian,Female,15,?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,0
2,2,64410,86047875,AfricanAmerican,Female,25,?,1,1,7,...,No,No,No,No,No,No,No,No,Yes,0
3,3,500364,82442376,Caucasian,Male,35,?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,0
4,4,16680,42519267,Caucasian,Male,45,?,1,1,7,...,No,Steady,No,No,No,No,No,Ch,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71513,101754,443842016,183087545,Caucasian,Female,75,?,1,1,7,...,No,Steady,No,No,No,No,No,Ch,Yes,0
71514,101755,443842022,188574944,Other,Female,45,?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,0
71515,101756,443842070,140199494,Other,Female,65,?,1,1,7,...,No,Steady,No,No,No,No,No,No,Yes,0
71516,101758,443842340,120975314,Caucasian,Female,85,?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,0


In [11]:
def getValuesByGroup(tbl, group):
    grouped = tbl.groupby(group)
    groups = np.unique(tbl[group])
    output = {}
    for g in groups:
        output[g] = grouped.get_group(g)
    return output

In [12]:
pc = getValuesByGroup(data, 'payer_code')
readmitted = len(np.where(data['readmitted'] == 1)[0])
for k in pc:
    print(len(np.where(pc[k]['readmitted'] == 1)[0]) / readmitted, k, len(np.where(pc[k]['readmitted'] == 1)[0])/len(pc[k]))

0.4797393929763229 ? 0.09725219856328318
0.03225806451612903 BC 0.05914918414918415
0.001430160495788972 CH 0.07692307692307693
0.01668520578420467 CM 0.07871064467766117
0.019863340219291276 CP 0.06294058408862034
0.005402828539647227 DM 0.09090909090909091
0.0 FR 0.0
0.0441760686477038 HM 0.06911984087518648
0.29477196885428253 MC 0.09086901146272167
0.027490862863499128 MD 0.07874374146563495
0.0003178134435086604 MP 0.0625
0.00794533608771651 OG 0.07598784194528875
0.0004767201652629906 OT 0.046153846153846156
0.004131574765612585 PO 0.056155507559395246
0.0003178134435086604 SI 0.05405405405405406
0.041792467821388846 SP 0.07857783089333732
0.02240584776736056 UN 0.07503991484832358
0.0007945336087716511 WC 0.042735042735042736


In [18]:
copy = data.copy()

In [14]:
data

Unnamed: 0,index,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,0,2278392,8222157,Caucasian,Female,5,?,6,25,1,...,No,No,No,No,No,No,No,No,No,0
1,1,149190,55629189,Caucasian,Female,15,?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,0
2,2,64410,86047875,AfricanAmerican,Female,25,?,1,1,7,...,No,No,No,No,No,No,No,No,Yes,0
3,3,500364,82442376,Caucasian,Male,35,?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,0
4,4,16680,42519267,Caucasian,Male,45,?,1,1,7,...,No,Steady,No,No,No,No,No,Ch,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71513,101754,443842016,183087545,Caucasian,Female,75,?,1,1,7,...,No,Steady,No,No,No,No,No,Ch,Yes,0
71514,101755,443842022,188574944,Other,Female,45,?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,0
71515,101756,443842070,140199494,Other,Female,65,?,1,1,7,...,No,Steady,No,No,No,No,No,No,Yes,0
71516,101758,443842340,120975314,Caucasian,Female,85,?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,0


In [19]:
data['weight'].value_counts()

?            68665
[75-100)      1195
[50-75)        781
[100-125)      566
[125-150)      131
[25-50)         89
[0-25)          46
[150-175)       33
[175-200)        9
>200             3
Name: weight, dtype: int64

In [20]:
data['readmitted'].value_counts()

0    65225
1     6293
Name: readmitted, dtype: int64

In [21]:
data['admission_type_id'].value_counts()

1    36490
3    13917
2    13028
6     4588
5     3174
8      291
7       21
4        9
Name: admission_type_id, dtype: int64

In [22]:
data['discharge_disposition_id'].value_counts()

1     44317
3      8784
6      8289
18     2474
2      1539
22     1410
11     1077
5       913
25      778
4       541
7       409
23      260
13      243
14      218
28       90
8        73
15       40
24       25
9         9
17        8
19        6
10        6
16        3
27        3
12        2
20        1
Name: discharge_disposition_id, dtype: int64

In [33]:
admission_type = {
    'Emergency': 1,
    'Urgent': 2,
    'Elective': 3,
    'Newborn': 4,
    'Not available': 5,
    '': 6,
    'Trauma Center': 7,
    'Not Mapped': 8
}


In [38]:
admission_type_reverse = {}

for key in admission_type:
    admission_type_reverse[admission_type[key]] = key
    
admission_type_reverse

{1: 'Emergency',
 2: 'Urgent',
 3: 'Elective',
 4: 'Newborn',
 5: 'Not available',
 6: '',
 7: 'Trauma Center',
 8: 'Not Mapped'}

In [34]:
data['admission_type_id'] = data['admission_type_id'].replace(admission_type['Urgent'], admission_type['Emergency']) # Urgent -> emergency
data['admission_type_id'] = data['admission_type_id'].replace(admission_type['Trauma Center'],admission_type['Emergency']) # Trauma -> emergency
data['admission_type_id'] = data['admission_type_id'].replace(admission_type[''],admission_type['Not available']) # Not available
data['admission_type_id'] = data['admission_type_id'].replace(admission_type['Not Mapped'],admission_type['Not available']) # Not available

In [35]:
data['admission_type_id'].value_counts()

1    49539
3    13917
5     8053
4        9
Name: admission_type_id, dtype: int64

In [27]:
# Most people came by emergency, next by choice, then unknown

In [None]:
admission_type_concise = {
     1: 'Emergency',
     3: 'Elective',
     4: 'Newborn',
     5: 'Unavailable'
}

In [36]:
discharge_map = {
    'Discharged home': 1,
    'Discharged to short term hospital': 2,
    'Discharged to skilled nursing facility': 3,
    'Discharged to intermediate care facility': 4,
    'Discharged to inpatient care institution': 5,
    'Discharged to home with home health service': 6,
    'Left against medical advice': 7,
    'Discharged to home under HOME IV provider': 8,
    'Admitted as inpatient to this hospital': 9,
    'Neonate discharged': 10,
    'Expired': 11,
    'Still patient': 12,
    'Hospice at home': 13,
    'Hospice at facility': 14,
    'Transferred to swing bed': 15,
    'Discharged to another institution for outpatient services': 16,
    'Discharged to this institution for outpatient services': 17,
    '': 18,
    'Expired in home': 19,
    'Expired in facility': 20,
    'Expired in unknown place': 21,
    'Discharged to rehab': 22,
    'Transferred to long-term hospital': 23,
    'Discharged to Medicaid but not Medicare-certified facility': 24,
    'Not mapped': 25,
    'Unknown/Invalid': 26,
    'Discharged to federal health care facility': 27,
    'Discharged to psychiatric hospital': 28,
    'Discharged to CAH': 29,
    'Discharged to other health care institution': 30
}

In [37]:
discharge_map_reverse = {}

for key in discharge_map:
    discharge_map_reverse[discharge_map[key]] = key
    
discharge_map_reverse

{1: 'Discharged home',
 2: 'Discharged to short term hospital',
 3: 'Discharged to skilled nursing facility',
 4: 'Discharged to intermediate care facility',
 5: 'Discharged to inpatient care institution',
 6: 'Discharged to home with home health service',
 7: 'Left against medical advice',
 8: 'Discharged to home under HOME IV provider',
 9: 'Admitted as inpatient to this hospital',
 10: 'Neonate discharged',
 11: 'Expired',
 12: 'Still patient',
 13: 'Hospice at home',
 14: 'Hospice at facility',
 15: 'Transferred to swing bed',
 16: 'Discharged to another institution for outpatient services',
 17: 'Discharged to this institution for outpatient services',
 18: '',
 19: 'Expired in home',
 20: 'Expired in facility',
 21: 'Expired in unknown place',
 22: 'Discharged to rehab',
 23: 'Transferred to long-term hospital',
 24: 'Discharged to Medicaid but not Medicare-certified facility',
 25: 'Not mapped',
 26: 'Unknown/Invalid',
 27: 'Discharged to federal health care facility',
 28: 'Dis

In [25]:
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(6,1) # home w/ health services -> home
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(8,1) # 
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(9,1)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(13,1)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(3,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(4,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(5,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(14,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(22,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(23,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(24,2)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(12,10)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(15,10)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(16,10)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(17,10)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(25,18)
data['discharge_disposition_id'] = data['discharge_disposition_id'].replace(26,18)

In [26]:
data['discharge_disposition_id'].value_counts()

1     52931
2     13690
18     3252
11     1077
7       409
28       90
10       59
19        6
27        3
20        1
Name: discharge_disposition_id, dtype: int64

In [28]:
# Lots of people expired -> 3252, most discharged to home

In [None]:
discharge_disposition_concise = {
     1: 'Discharged home',
     2: 'Discharged hospital',
     7: 'Left AMA',
     10: 'Neonate discharged',
     11: 'Expired',
     18: 'Unavailable',
     19: 'Expired in home',
     20: 'Expired in facility',
     27: 'Discharged federal facility',
     28: 'Discharged psychiatric hospital'
}

In [39]:
admission_source = {
    'Physician referral': 1,
    'Clinic referral': 2,
    'HMO Referral': 3,
    'Transfer from hospital': 4,
    'Transfer from skilled nursing facility': 5,
    'Transfer from another health facility': 6,
    'Emergency Room': 7,
    'Law enforcement': 8,
    'Not available': 9,
    'Transfer from critical access hospital': 10,
    'Normal delivery': 11,
    'Premature delivery': 12,
    'Sick baby': 13,
    'Extramural birth': 14,
    'Not available': 15,
    '': 17,
    'Transfer from another home health agency': 18,
    'Readmission to same home health agency': 19,
    'Not mapped': 20,
    'Unknown/Invalid': 21,
    'Transfer from hospital inpatient': 22,
    'Born inside this hospital': 23,
    'Born outside this hospital': 24,
    'Transfer from ambulatory surgery center': 25,
    'Transfer from hospice': 26
}

In [40]:
admission_source_reverse = {}

for key in admission_source:
    admission_source_reverse[admission_source[key]] = key
    
admission_source_reverse

{1: 'Physician referral',
 2: 'Clinic referral',
 3: 'HMO Referral',
 4: 'Transfer from hospital',
 5: 'Transfer from skilled nursing facility',
 6: 'Transfer from another health facility',
 7: 'Emergency Room',
 8: 'Law enforcement',
 15: 'Not available',
 10: 'Transfer from critical access hospital',
 11: 'Normal delivery',
 12: 'Premature delivery',
 13: 'Sick baby',
 14: 'Extramural birth',
 17: '',
 18: 'Transfer from another home health agency',
 19: 'Readmission to same home health agency',
 20: 'Not mapped',
 21: 'Unknown/Invalid',
 22: 'Transfer from hospital inpatient',
 23: 'Born inside this hospital',
 24: 'Born outside this hospital',
 25: 'Transfer from ambulatory surgery center',
 26: 'Transfer from hospice'}

In [29]:
data['admission_source_id'] = data['admission_source_id'].replace(2,1)
data['admission_source_id'] = data['admission_source_id'].replace(3,1)
data['admission_source_id'] = data['admission_source_id'].replace(5,4)
data['admission_source_id'] = data['admission_source_id'].replace(6,4)
data['admission_source_id'] = data['admission_source_id'].replace(10,4)
data['admission_source_id'] = data['admission_source_id'].replace(22,4)
data['admission_source_id'] = data['admission_source_id'].replace(25,4)
data['admission_source_id'] = data['admission_source_id'].replace(15,9)
data['admission_source_id'] = data['admission_source_id'].replace(17,9)
data['admission_source_id'] = data['admission_source_id'].replace(20,9)
data['admission_source_id'] = data['admission_source_id'].replace(21,9)
data['admission_source_id'] = data['admission_source_id'].replace(13,11)
data['admission_source_id'] = data['admission_source_id'].replace(14,11)


In [30]:
data['admission_source_id'].value_counts()

7     38290
1     23071
9      5199
4      4942
8        12
11        4
Name: admission_source_id, dtype: int64

In [31]:
# Most came from emergency room, next by physician referral

In [None]:
admission_source_concise = {
     1: 'Physician referral',
     4: 'Transfer hospital',
     7: 'Emergency Room',
     8: 'Law enforcement',
     9: 'Not available',
     11: 'Delivery'
}