# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read Data

In [2]:
diabetes_raw = pd.read_csv("../data/raw/diabetic_data.csv")

In [3]:
diabetes_raw.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
diabetes_raw['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

# Pre-processing

## Lidando com inconsistências

In [5]:
diabetes_raw.replace("?", np.nan, inplace=True)

In [6]:
print('Unique entries = ', len(np.unique(diabetes_raw['patient_nbr'])))
diabetes_raw.drop_duplicates(['patient_nbr'], keep = 'first', inplace = True)
print('Length after removing Duplicates:', len(diabetes_raw))

Unique entries =  71518
Length after removing Duplicates: 71518


In [7]:
# 1,13,14,19,20,21 estão relacionados a morte ou hospício. Obviamente tais pessoas não
# serão reinternadas ao hospital. Precisamos deletar tais amostras.
filter = diabetes_raw['discharge_disposition_id'].isin([11,13,14,19,20,21])
diabetes_raw = diabetes_raw[~filter]

In [8]:
to_drop = ["patient_nbr", "weight", "encounter_id", "examide", "citoglipton"]
diabetes_raw.drop(columns=to_drop, axis=1, inplace=True)
diabetes_raw.shape

(69973, 45)

## Ajustando escopo da classe target

In [9]:
diabetes_raw['readmitted'].replace(">30", "NO", inplace=True)

## Imputação de valores

In [10]:
num_features = diabetes_raw.select_dtypes(include=np.number).columns
num_features

Index(['admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses'],
      dtype='object')

In [11]:
cat_features = diabetes_raw.select_dtypes(include=object).columns
cat_features

Index(['race', 'gender', 'age', 'payer_code', 'medical_specialty', 'diag_1',
       'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [12]:
# Sem necessidade de imputação nas features numéricas
diabetes_raw[num_features].isna().sum()

admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
number_diagnoses            0
dtype: int64

In [13]:
diabetes_raw[cat_features].isna().sum()

race                         1918
gender                          0
age                             0
payer_code                  30415
medical_specialty           33639
diag_1                         10
diag_2                        293
diag_3                       1224
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide                       0
glyburide                       0
tolbutamide                     0
pioglitazone                    0
rosiglitazone                   0
acarbose                        0
miglitol                        0
troglitazone                    0
tolazamide                      0
insulin                         0
glyburide-metformin             0
glipizide-metformin             0
glimepiride-pioglitazone        0
metformin-rosi

In [14]:
diabetes_raw['medical_specialty'] = diabetes_raw['medical_specialty'].fillna("Missing")
diabetes_raw['payer_code'] = diabetes_raw['payer_code'].fillna("Missing")

In [15]:
from sklearn.impute import SimpleImputer

col_to_input = ['race', 'diag_1', 'diag_2', 'diag_3']
si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
si.fit(diabetes_raw[col_to_input])
diabetes_raw[col_to_input] = si.transform(diabetes_raw[col_to_input])

In [16]:
diabetes_raw[cat_features].isna().sum()

race                        0
gender                      0
age                         0
payer_code                  0
medical_specialty           0
diag_1                      0
diag_2                      0
diag_3                      0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazone                0
tolazamide                  0
insulin                     0
glyburide-metformin         0
glipizide-metformin         0
glimepiride-pioglitazone    0
metformin-rosiglitazone     0
metformin-pioglitazone      0
change                      0
diabetesMed                 0
readmitted

## Feature Engineering

### Age feature

In [17]:
import re

def age_to_number(age_interval):
    match = re.findall(r'\d+', age_interval)
    return (int(match[0]) + int(match[1])) / 2

diabetes_raw['age'] = diabetes_raw['age'].apply(age_to_number)

In [18]:
diabetes_raw.age

0          5.0
1         15.0
2         25.0
3         35.0
4         45.0
          ... 
101754    75.0
101755    45.0
101756    65.0
101758    85.0
101765    75.0
Name: age, Length: 69973, dtype: float64

### Clustering some features

#### diag feature

In [19]:
def categorize_icd9_code(x):
    
    if 'V' in str(x) or 'E' in str(x):
        return 'other'

    x_int = int(float(x))
    
    icd9_categories = {
        'circulatory': list(range(390, 460)) + [785],
        'respiratory': list(range(460, 520)) + [786],
        'digestive': list(range(520, 580)) + [787],
        'diabetes': [250],
        'injury': list(range(800, 1000)),
        'musculoskeletal': list(range(710, 740)),
        'genitourinary': list(range(580, 630)) + [788],
        'neoplasms': list(range(140, 240)),
    }

    for category, icd9_range in icd9_categories.items():
        if x_int in icd9_range:
            return category
    
    return 'other'

In [20]:
diabetes_raw['diag_1'] = diabetes_raw['diag_1'].apply(categorize_icd9_code)
diabetes_raw['diag_2'] = diabetes_raw['diag_2'].apply(categorize_icd9_code)
diabetes_raw['diag_3'] = diabetes_raw['diag_3'].apply(categorize_icd9_code)

#### medical_specialty feature

In [21]:
#pd.set_option('display.max_rows', 500)
print(diabetes_raw['medical_specialty'].value_counts())

Missing                             33639
InternalMedicine                    10641
Family/GeneralPractice               4978
Emergency/Trauma                     4393
Cardiology                           4207
                                    ...  
SportsMedicine                          1
Dermatology                             1
Proctology                              1
Surgery-PlasticwithinHeadandNeck        1
Resident                                1
Name: medical_specialty, Length: 71, dtype: int64


In [22]:
def cluster_medical_specialty(x):
    
    # Dicionário com as categorias semelhantes
    specialty_groups = {
        
        'Missing': ['Missing'],
        
        'cardiology': ['Cardiology', 'Cardiology-Pediatric'],
        
        'medical-care': ['InternalMedicine', 'Family/GeneralPractice', 'Emergency/Trauma',
                        'Hospitalist'],
        
        'surgery': ['Surgery-General', 'Surgery-PlasticwithinHeadandNeck',
                    'Surgery-Pediatric', 'Surgery-Maxillofacial', 'Surgery-Colon&Rectal',
                    'Surgery-Plastic', 'Surgeon', 'Surgery-Vascular', 'Surgery-Neuro',
                    'Surgery-Cardiovascular/Thoracic', 'Surgery-Cardiovascular',
                    'SurgicalSpecialty', 'Surgery-Thoracic'],
        
        'ortopedic' : ['Orthopedics', 'Orthopedics-Reconstructive'],
        
        'neuro' : ['Neurology', 'Neurophysiology'],
        
        'pediatrics' : ['Pediatrics','Pediatrics-CriticalCare','Pediatrics-EmergencyMedicine',
                        'Pediatrics-Endocrinology','Pediatrics-Hematology-Oncology',
                'Pediatrics-Neurology','Pediatrics-Pulmonology', 'Anesthesiology-Pediatric'],

        'psychic' : ['Psychiatry-Addictive', 'Psychology', 'Psychiatry',
                    'Psychiatry-Child/Adolescent', 'PhysicalMedicineandRehabilitation'],
        
        'endo-obstetric': ['Obsterics&Gynecology-GynecologicOnco', 'Obstetrics', 'Endocrinology-Metabolism',
                        'Endocrinology', 'ObstetricsandGynecology', 'Gynecology'],
        
        'ungrouped': ['Pathology', 'Ophthalmology', 'Dermatology', 'Proctology', 'Dentistry',
                    'Perinatology', 'Anesthesiology', 'Rheumatology', 'Hematology',
                    'Pulmonology', 'Urology', 'Oncology', 'Radiologist', 'Gastroenterology',
                    'Otolaryngology', 'Hematology/Oncology', 'Podiatry', 'Radiology'],

        'others': ['OutreachServices', 'Resident', 'DCPTEAM', 'Speech', 'Osteopath',
                'SportsMedicine', 'PhysicianNotFound', 'InfectiousDiseases',
                'AllergyandImmunology'],
    }

    for category, groups in specialty_groups.items():
        if x in groups:
            return category


In [23]:
diabetes_raw['medical_specialty'] = diabetes_raw['medical_specialty'].apply(cluster_medical_specialty)

In [24]:
diabetes_raw['medical_specialty'].value_counts()

Missing           33639
medical-care      20048
cardiology         4214
surgery            3751
ungrouped          2992
ortopedic          2169
psychic             867
endo-obstetric      786
pediatrics          447
neuro               168
others               95
Name: medical_specialty, dtype: int64

### Feture Creation

In [25]:
diabetes_raw['service_utilization'] = (
    diabetes_raw['number_emergency']
    + diabetes_raw['number_inpatient']
    + diabetes_raw['number_outpatient']
)

In [26]:
diabetes_raw['severity_of_disease'] = (
    diabetes_raw['time_in_hospital']
    + diabetes_raw['num_procedures']
    + diabetes_raw['num_medications']
    + diabetes_raw['num_lab_procedures']
    + diabetes_raw['number_diagnoses']
)

In [27]:
diabetes_raw.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'service_utilization', 'severity_of_disease'],
      dtype='object')

In [28]:
from tqdm import tqdm

number_of_changes = []
drugList = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone']


for i in tqdm(range(len(diabetes_raw))) :
    changeCount = 0
    for col in drugList : 
        if diabetes_raw.iloc[i][col] in ['Down', 'Up'] :
            changeCount += 1
    number_of_changes.append(changeCount)

diabetes_raw['number_of_changes'] = number_of_changes

100%|██████████| 69973/69973 [04:26<00:00, 262.77it/s]


### Encoding

In [29]:
num_features = diabetes_raw.select_dtypes(include=np.number).columns

In [30]:
num_features = num_features.drop(
    ['admission_type_id',
    'discharge_disposition_id',
    'admission_source_id'])

In [31]:
cat_features = diabetes_raw.select_dtypes(include=object).columns
cat_features = cat_features.drop(['readmitted'])

In [32]:
cols_num_cat = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']
diabetes_raw[cols_num_cat] = diabetes_raw[cols_num_cat].astype('str')

cat_features = list(cat_features) + cols_num_cat
cat_features

['race',
 'gender',
 'payer_code',
 'medical_specialty',
 'diag_1',
 'diag_2',
 'diag_3',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'admission_type_id',
 'discharge_disposition_id',
 'admission_source_id']

In [33]:
df_dum = pd.get_dummies(diabetes_raw, columns=cat_features, drop_first=True)

In [34]:
df_dum.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted,...,admission_source_id_20,admission_source_id_22,admission_source_id_25,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9
0,5.0,1,41,0,1,0,0,0,1,NO,...,0,0,0,0,0,0,0,0,0,0
1,15.0,3,59,0,18,0,0,0,9,NO,...,0,0,0,0,0,0,0,1,0,0
2,25.0,2,11,5,13,2,0,1,6,NO,...,0,0,0,0,0,0,0,1,0,0
3,35.0,2,44,1,16,0,0,0,7,NO,...,0,0,0,0,0,0,0,1,0,0
4,45.0,1,51,0,8,0,0,0,5,NO,...,0,0,0,0,0,0,0,1,0,0


In [35]:
diabetes_raw = pd.concat([diabetes_raw, df_dum], axis=1)

In [36]:
diabetes_raw.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       ...
       'admission_source_id_20', 'admission_source_id_22',
       'admission_source_id_25', 'admission_source_id_3',
       'admission_source_id_4', 'admission_source_id_5',
       'admission_source_id_6', 'admission_source_id_7',
       'admission_source_id_8', 'admission_source_id_9'],
      dtype='object', length=214)

In [37]:
diabetes_raw.drop(columns=cat_features, inplace=True)
diabetes_raw

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted,...,admission_source_id_20,admission_source_id_22,admission_source_id_25,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9
0,5.0,1,41,0,1,0,0,0,1,NO,...,0,0,0,0,0,0,0,0,0,0
1,15.0,3,59,0,18,0,0,0,9,NO,...,0,0,0,0,0,0,0,1,0,0
2,25.0,2,11,5,13,2,0,1,6,NO,...,0,0,0,0,0,0,0,1,0,0
3,35.0,2,44,1,16,0,0,0,7,NO,...,0,0,0,0,0,0,0,1,0,0
4,45.0,1,51,0,8,0,0,0,5,NO,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101754,75.0,9,50,2,33,0,0,0,9,NO,...,0,0,0,0,0,0,0,1,0,0
101755,45.0,14,73,6,26,0,1,0,9,NO,...,0,0,0,0,0,0,0,1,0,0
101756,65.0,2,46,6,17,1,1,1,9,NO,...,0,0,0,0,0,0,0,1,0,0
101758,85.0,5,76,1,22,0,1,0,9,NO,...,0,0,0,0,0,0,0,1,0,0


In [39]:
diabetes_raw = diabetes_raw.T.drop_duplicates().T

## Class Imbalanced

In [41]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [55]:
diabetes_raw['readmitted'].value_counts()

NO     63696
<30     6277
Name: readmitted, dtype: int64

In [56]:
X = diabetes_raw.drop('readmitted', axis=1)
y = diabetes_raw['readmitted']

In [57]:
smote = SMOTE()
rus = RandomUnderSampler(sampling_strategy=0.17 , random_state=42)


In [58]:
X, y = rus.fit_resample(X, y)
X, y = smote.fit_resample(X, y)

In [59]:
print(y.value_counts())

<30    36923
NO     36923
Name: readmitted, dtype: int64


#### Scalling

In [60]:
num_features

Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'service_utilization',
       'severity_of_disease', 'number_of_changes'],
      dtype='object')

In [61]:
X[num_features].columns

Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'service_utilization',
       'severity_of_disease', 'number_of_changes'],
      dtype='object')

In [62]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

ct = ColumnTransformer([
        ('somename', StandardScaler(), list(num_features))
    ], remainder='passthrough')


In [63]:
X_scaled = ct.fit_transform(X)

In [64]:
X_scaled.shape

(73846, 165)

In [65]:
X_scaled

array([[-0.09774875857340863, 0.873108855113323, 0.9313706480281035, ...,
        0.0, 0.0, 0.0],
       [-1.3789455549787002, 0.873108855113323, 0.8291649705957551, ...,
        1.0, 0.0, 0.0],
       [-0.7383471567760544, -0.16150696357304542, 0.06262238985314132,
        ..., 1.0, 0.0, 0.0],
       ...,
       [-2.019543953181346, -0.5759650149344239, 0.021830763619716573,
        ..., 1.0, 0.0, 0.0],
       [-0.7383471567760544, 0.3005157192701277, 0.06262238985314132,
        ..., 1.0, 0.0, 0.0],
       [0.5428496396292372, -0.16150696357304542, 0.6753166617395787,
        ..., 1.0, 0.0, 0.0]], dtype=object)

### Save Dataframe

In [74]:
diabetes_raw.drop('readmitted', axis=1).columns

Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'service_utilization',
       ...
       'admission_source_id_20', 'admission_source_id_22',
       'admission_source_id_25', 'admission_source_id_3',
       'admission_source_id_4', 'admission_source_id_5',
       'admission_source_id_6', 'admission_source_id_7',
       'admission_source_id_8', 'admission_source_id_9'],
      dtype='object', length=165)

In [76]:
X = pd.DataFrame(X_scaled, columns=list(diabetes_raw.drop(['readmitted'], axis=1).columns))

In [77]:
X['readmitted'] = y

In [None]:
X.to_csv(path_or_buf = '../data/processed/X_data.csv')
y.to_csv(path_or_buf = '../data/processed/Y_data.csv')