In [1]:
import numpy as np
import pandas as pd

Read the data

In [2]:
# Load the diabetic encounters CSV into a DataFrame and display its
# (rows, columns) shape as the cell output.
csv_path = "./dataset_diabetes/diabetic_data.csv"
data = pd.read_csv(csv_path)
data.shape

(101766, 50)

In [3]:
# Show the dtype pandas inferred for every column of the loaded frame.
data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [4]:
# Preview the first 10 rows of the frame.
data.head(10)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),?,2,1,2,3,...,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),?,3,1,2,4,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),?,1,1,7,5,...,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),?,2,1,4,13,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),?,3,3,4,12,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


## Data Cleaning

In [5]:
# Turn the textual missing-value markers ('?', 'NULL', 'None') into NaN
# everywhere in the frame.
none_kinds = ['?', 'NULL', 'None']
data = data.replace(to_replace=none_kinds, value=np.nan)

# Id codes that are treated as "no value" in each id column (taken from the
# mappings description, per the original author's note).
# NOTE(review): verify these lists against the id-mapping file -- TODO confirm
# that no further "not available"-style code is missing for any column.
admission_type_id_null_values = [5, 6, 8]
discharge_disposition_id_null_values = [18, 25, 26]
admission_source_id_null_values = [9, 17, 20, 21]

null_ids_by_column = {
    'admission_type_id': admission_type_id_null_values,
    'discharge_disposition_id': discharge_disposition_id_null_values,
    'admission_source_id': admission_source_id_null_values,
}

# Replace the listed codes with NaN, one id column at a time.
for column, null_ids in null_ids_by_column.items():
    data[column] = data[column].replace(to_replace=null_ids, value=np.nan)

In [6]:
# Fraction of missing (NaN) entries in each column: null count divided by
# the number of rows.
data.isna().sum().div(len(data))

encounter_id                0.000000
patient_nbr                 0.000000
race                        0.022336
gender                      0.000000
age                         0.000000
weight                      0.968585
admission_type_id           0.102156
discharge_disposition_id    0.045988
admission_source_id         0.069444
time_in_hospital            0.000000
payer_code                  0.395574
medical_specialty           0.490822
num_lab_procedures          0.000000
num_procedures              0.000000
num_medications             0.000000
number_outpatient           0.000000
number_emergency            0.000000
number_inpatient            0.000000
diag_1                      0.000206
diag_2                      0.003518
diag_3                      0.013983
number_diagnoses            0.000000
max_glu_serum               0.947468
A1Cresult                   0.832773
metformin                   0.000000
repaglinide                 0.000000
nateglinide                 0.000000
c

We decided to drop every feature that has less than 80% of its values present (i.e. more than 20% missing).

In [10]:
# Per-column fraction of missing values.
missing_fraction = data.isnull().sum() / data.shape[0]

# Keep only the column names whose missing fraction is above 0.2
# and expose them as an array; the bare name displays the result.
features_to_remove = missing_fraction[missing_fraction > 0.2].index.values
features_to_remove


array(['weight', 'payer_code', 'medical_specialty', 'max_glu_serum',
       'A1Cresult'], dtype=object)

In [11]:
# Remove the sparse columns listed in `features_to_remove`.
# The result is rebound to `data` instead of using inplace=True, and labels
# that are already absent are ignored, so re-running this cell on its own no
# longer raises KeyError once the columns have been dropped.
data = data.drop(columns=features_to_remove, errors="ignore")

In [12]:
# Display the frame's (rows, columns) shape after the column drop.
data.shape

(101766, 45)

In [None]:
# TODO: fill the remaining NaN values
# TODO: handle the categorical features