# Imports

In [1]:
import pandas as pd
import numpy as np

# Read Data

In [2]:
diabetes_raw = pd.read_csv("../data/raw/diabetic_data.csv")

In [3]:
diabetes_raw.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
diabetes_raw['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

# Pre-processing

## Lidando com inconsistências

In [5]:
diabetes_raw.replace("?", np.nan, inplace=True)

In [6]:
print('Unique entries = ', len(np.unique(diabetes_raw['patient_nbr'])))
diabetes_raw.drop_duplicates(['patient_nbr'], keep = 'first', inplace = True)
print('Length after removing Duplicates:', len(diabetes_raw))

Unique entries =  71518
Length after removing Duplicates: 71518


In [7]:
# 1,13,14,19,20,21 estão relacionados a morte ou hospício. Obviamente tais pessoas não
# serão reinternadas ao hospital. Precisamos deletar tais amostras.
filter = diabetes_raw['discharge_disposition_id'].isin([11,13,14,19,20,21])
diabetes_raw = diabetes_raw[~filter]

In [8]:
to_drop = ["patient_nbr", "weight", "encounter_id", "examide", "citoglipton"]
diabetes_raw.drop(columns=to_drop, axis=1, inplace=True)
diabetes_raw.shape

(69973, 45)

## Ajustando escopo da classe target

In [9]:
diabetes_raw['readmitted'].replace(">30", "NO", inplace=True)

## Imputação de valores

In [10]:
num_features = diabetes_raw.select_dtypes(include=np.number).columns
num_features

Index(['admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses'],
      dtype='object')

In [11]:
cat_features = diabetes_raw.select_dtypes(include=object).columns
cat_features

Index(['race', 'gender', 'age', 'payer_code', 'medical_specialty', 'diag_1',
       'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [12]:
# Sem necessidade de imputação nas features numéricas
diabetes_raw[num_features].isna().sum()

admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
number_diagnoses            0
dtype: int64

In [13]:
diabetes_raw[cat_features].isna().sum()

race                         1918
gender                          0
age                             0
payer_code                  30415
medical_specialty           33639
diag_1                         10
diag_2                        293
diag_3                       1224
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide                       0
glyburide                       0
tolbutamide                     0
pioglitazone                    0
rosiglitazone                   0
acarbose                        0
miglitol                        0
troglitazone                    0
tolazamide                      0
insulin                         0
glyburide-metformin             0
glipizide-metformin             0
glimepiride-pioglitazone        0
metformin-rosi

In [14]:
diabetes_raw['medical_specialty'] = diabetes_raw['medical_specialty'].fillna("Missing")
diabetes_raw['payer_code'] = diabetes_raw['payer_code'].fillna("Missing")

In [15]:
from sklearn.impute import SimpleImputer

col_to_input = ['race', 'diag_1', 'diag_2', 'diag_3']
si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
si.fit(diabetes_raw[col_to_input])
diabetes_raw[col_to_input] = si.transform(diabetes_raw[col_to_input])

In [16]:
diabetes_raw[cat_features].isna().sum()

race                        0
gender                      0
age                         0
payer_code                  0
medical_specialty           0
diag_1                      0
diag_2                      0
diag_3                      0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazone                0
tolazamide                  0
insulin                     0
glyburide-metformin         0
glipizide-metformin         0
glimepiride-pioglitazone    0
metformin-rosiglitazone     0
metformin-pioglitazone      0
change                      0
diabetesMed                 0
readmitted

## Feature Engineering

### Age feature

In [17]:
import re

def age_to_number(age_interval):
    match = re.findall(r'\d+', age_interval)
    return (int(match[0]) + int(match[1])) / 2

diabetes_raw['age'] = diabetes_raw['age'].apply(age_to_number)

In [18]:
diabetes_raw.age

0          5.0
1         15.0
2         25.0
3         35.0
4         45.0
          ... 
101754    75.0
101755    45.0
101756    65.0
101758    85.0
101765    75.0
Name: age, Length: 69973, dtype: float64

### Clustering some features

#### diag feature

In [19]:
def categorize_icd9_code(x):
    
    if 'V' in str(x) or 'E' in str(x):
        return 'other'

    x_int = int(float(x))
    
    icd9_categories = {
        'circulatory': list(range(390, 460)) + [785],
        'respiratory': list(range(460, 520)) + [786],
        'digestive': list(range(520, 580)) + [787],
        'diabetes': [250],
        'injury': list(range(800, 1000)),
        'musculoskeletal': list(range(710, 740)),
        'genitourinary': list(range(580, 630)) + [788],
        'neoplasms': list(range(140, 240)),
    }

    for category, icd9_range in icd9_categories.items():
        if x_int in icd9_range:
            return category
    
    return 'other'

In [20]:
diabetes_raw['diag_1'] = diabetes_raw['diag_1'].apply(categorize_icd9_code)
diabetes_raw['diag_2'] = diabetes_raw['diag_2'].apply(categorize_icd9_code)
diabetes_raw['diag_3'] = diabetes_raw['diag_3'].apply(categorize_icd9_code)

#### medical_specialty feature

In [22]:
#pd.set_option('display.max_rows', 500)
print(diabetes_raw['medical_specialty'].value_counts())

Missing                                 33639
InternalMedicine                        10641
Family/GeneralPractice                   4978
Emergency/Trauma                         4393
Cardiology                               4207
Surgery-General                          2205
Orthopedics                              1128
Orthopedics-Reconstructive               1041
Radiologist                               821
Nephrology                                797
Pulmonology                               637
Psychiatry                                613
ObstetricsandGynecology                   593
Urology                                   530
Surgery-Cardiovascular/Thoracic           488
Surgery-Neuro                             404
Gastroenterology                          383
Surgery-Vascular                          359
Oncology                                  205
Pediatrics                                195
PhysicalMedicineandRehabilitation         194
Neurology                         

In [40]:
def cluster_medical_specialty(x):
    
    # Dicionário com as categorias semelhantes
    specialty_groups = {
        
        'Missing': ['Missing'],
        
        'cardiology': ['Cardiology', 'Cardiology-Pediatric'],
        
        'medical-care': ['InternalMedicine', 'Family/GeneralPractice', 'Emergency/Trauma',
                        'Hospitalist'],
        
        'surgery': ['Surgery-General', 'Surgery-PlasticwithinHeadandNeck',
                    'Surgery-Pediatric', 'Surgery-Maxillofacial', 'Surgery-Colon&Rectal',
                    'Surgery-Plastic', 'Surgeon', 'Surgery-Vascular', 'Surgery-Neuro',
                    'Surgery-Cardiovascular/Thoracic', 'Surgery-Cardiovascular',
                    'SurgicalSpecialty', 'Surgery-Thoracic'],
        
        'ortopedic' : ['Orthopedics', 'Orthopedics-Reconstructive'],
        
        'neuro' : ['Neurology', 'Neurophysiology'],
        
        'pediatrics' : ['Pediatrics','Pediatrics-CriticalCare','Pediatrics-EmergencyMedicine',
                        'Pediatrics-Endocrinology','Pediatrics-Hematology-Oncology',
                'Pediatrics-Neurology','Pediatrics-Pulmonology', 'Anesthesiology-Pediatric'],

        'psychic' : ['Psychiatry-Addictive', 'Psychology', 'Psychiatry',
                    'Psychiatry-Child/Adolescent', 'PhysicalMedicineandRehabilitation'],
        
        'endo-obstetric': ['Obsterics&Gynecology-GynecologicOnco', 'Obstetrics', 'Endocrinology-Metabolism',
                        'Endocrinology', 'ObstetricsandGynecology', 'Gynecology'],
        
        'ungrouped': ['Pathology', 'Ophthalmology', 'Dermatology', 'Proctology', 'Dentistry',
                    'Perinatology', 'Anesthesiology', 'Rheumatology', 'Hematology',
                    'Pulmonology', 'Urology', 'Oncology', 'Radiologist', 'Gastroenterology',
                    'Otolaryngology', 'Hematology/Oncology', 'Podiatry', 'Radiology'],

        'others': ['OutreachServices', 'Resident', 'DCPTEAM', 'Speech', 'Osteopath',
                'SportsMedicine', 'PhysicianNotFound', 'InfectiousDiseases',
                'AllergyandImmunology'],
    }

    for category, groups in specialty_groups.items():
        if x in groups:
            return category


In [44]:
diabetes_raw['medical_specialty'] = diabetes_raw['medical_specialty'].apply(cluster_medical_specialty)

In [45]:
diabetes_raw['medical_specialty'].value_counts()

Missing           33639
medical-care      20048
cardiology         4214
surgery            3751
ungrouped          2992
ortopedic          2169
psychic             867
endo-obstetric      786
pediatrics          447
neuro               168
others               95
Name: medical_specialty, dtype: int64

### Feture Creation

In [48]:
diabetes_raw['service_utilization'] = (
    diabetes_raw['number_emergency']
    + diabetes_raw['number_inpatient']
    + diabetes_raw['number_outpatient']
)

In [55]:
diabetes_raw['severity_of_disease'] = (
    diabetes_raw['time_in_hospital']
    + diabetes_raw['num_procedures']
    + diabetes_raw['num_medications']
    + diabetes_raw['num_lab_procedures']
    + diabetes_raw['number_diagnoses']
)

In [56]:
diabetes_raw.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'service_utilization', 'severity_of_disease'],
      dtype='object')

In [58]:
from tqdm import tqdm

number_of_changes = []
drugList = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone']


for i in tqdm(range(len(diabetes_raw))) :
    changeCount = 0
    for col in drugList : 
        if diabetes_raw.iloc[i][col] in ['Down', 'Up'] :
            changeCount += 1
    number_of_changes.append(changeCount)

diabetes_raw['number_of_changes'] = number_of_changes

 14%|█▎        | 9573/69973 [00:51<05:40, 177.62it/s]

### Encoding

#### Scalling

### Feature Selection

## Class Imbalanced

SMOTE

### Save Dataframe