# Overview

This notebook will accomplish the following:
- determine missing values
- reclassify missing values
- drop large missing data or columns
- convert data types to numeric

## Authors

[Yung Han Jeong](https://github.com/yunghanjeong) <br>
[Malcolm Katzenbach](https://github.com/malcolm206)

# Data Import

In [71]:
import os
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 200) #set to show all columns
pd.set_option('display.max_rows', 200) 
import numpy as np

%matplotlib inline

In [72]:
# Build the path with os.path.join so the notebook also runs on macOS/Linux;
# a hard-coded backslash path only resolves on Windows.
df = pd.read_csv(os.path.join("..", "data", "diabetic_data.csv"))

In [73]:
# Preview the first rows to check that the file parsed into the expected columns.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [74]:
# Summary statistics; by default describe() reports the numeric columns only.
df.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


# Predictand Inspection

In [75]:
# Class balance of the target column before it is recoded to integers.
df.readmitted.value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [76]:
# Encode the target as integers: "NO" -> 0, "<30" -> 1, any other value (">30") -> 2.
readmit_codes = {"NO": 0, "<30": 1}
df.readmitted = df.readmitted.apply(lambda label: readmit_codes.get(label, 2))

# Data Inspection

In [77]:
# Number of rows and columns in the raw data.
df.shape

(101766, 50)

In [78]:
# Column dtypes and non-null counts. Columns can report as fully non-null
# even though placeholder values such as "?" are present in the data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [79]:
# Print the frequency table of every column so that placeholder values
# (such as "?") stand out.
for column in df.columns:
    counts = df[column].value_counts()
    print(f"This column {column} contains following unique values: ")
    print(counts)
    print("-----------------------------------------------------------")

This column encounter_id contains following unique values: 
96210942     1
89943846     1
384306986    1
94650156     1
83156784     1
            ..
74454612     1
208073976    1
166229592    1
38340702     1
77856768     1
Name: encounter_id, Length: 101766, dtype: int64
-----------------------------------------------------------
This column patient_nbr contains following unique values: 
88785891     40
43140906     28
23199021     23
1660293      23
88227540     23
             ..
71081460      1
30060018      1
67443444      1
141344240     1
93251151      1
Name: patient_nbr, Length: 71518, dtype: int64
-----------------------------------------------------------
This column race contains following unique values: 
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: race, dtype: int64
-----------------------------------------------------------
This column gender contains following

Name: max_glu_serum, dtype: int64
-----------------------------------------------------------
This column A1Cresult contains following unique values: 
None    84748
>8       8216
Norm     4990
>7       3812
Name: A1Cresult, dtype: int64
-----------------------------------------------------------
This column metformin contains following unique values: 
No        81778
Steady    18346
Up         1067
Down        575
Name: metformin, dtype: int64
-----------------------------------------------------------
This column repaglinide contains following unique values: 
No        100227
Steady      1384
Up           110
Down          45
Name: repaglinide, dtype: int64
-----------------------------------------------------------
This column nateglinide contains following unique values: 
No        101063
Steady       668
Up            24
Down          11
Name: nateglinide, dtype: int64
-----------------------------------------------------------
This column chlorpropamide contains following unique v

- About 2,300 records have a missing race value (recorded as `?`)

- Medication columns: `No` (not prescribed); `Steady`, `Up`, `Down` (prescribed, with the dosage unchanged, increased, or decreased)


# Drop columns with large missing values

In [80]:
# Columns removed before modelling. The section heading gives "large missing
# values" as the reason.
# NOTE(review): the ID columns and the individual drug columns are presumably
# dropped for being identifiers / near-constant rather than missing - confirm.
bad_columns = [
    # record identifiers
    'encounter_id',
    'patient_nbr',
    # patient / admission attributes
    'payer_code',
    'weight',
    'medical_specialty',
    # single-drug columns
    'acetohexamide',
    'tolbutamide',
    'troglitazone',
    'examide',
    'citoglipton',
    # combination-drug columns
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

In [81]:
# Remove the columns listed in bad_columns; rebinding df avoids in-place mutation.
df = df.drop(columns=bad_columns)

In [82]:
# Re-inspect the frequency table of each remaining column after the drop.
for column in df.columns:
    counts = df[column].value_counts()
    print(f"This column {column} contains following unique values: ")
    print(counts)
    print("-----------------------------------------------------------")

This column race contains following unique values: 
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: race, dtype: int64
-----------------------------------------------------------
This column gender contains following unique values: 
Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64
-----------------------------------------------------------
This column age contains following unique values: 
[70-80)     26068
[60-70)     22483
[50-60)     17256
[80-90)     17197
[40-50)      9685
[30-40)      3775
[90-100)     2793
[20-30)      1657
[10-20)       691
[0-10)        161
Name: age, dtype: int64
-----------------------------------------------------------
This column admission_type_id contains following unique values: 
1    53990
3    18869
2    18480
6     5291
5     4785
8      320
7       21
4       10
Name: admission_type_id, dtyp

Name: pioglitazone, dtype: int64
-----------------------------------------------------------
This column rosiglitazone contains following unique values: 
No        95401
Steady     6100
Up          178
Down         87
Name: rosiglitazone, dtype: int64
-----------------------------------------------------------
This column acarbose contains following unique values: 
No        101458
Steady       295
Up            10
Down           3
Name: acarbose, dtype: int64
-----------------------------------------------------------
This column miglitol contains following unique values: 
No        101728
Steady        31
Down           5
Up             2
Name: miglitol, dtype: int64
-----------------------------------------------------------
This column tolazamide contains following unique values: 
No        101727
Steady        38
Up             1
Name: tolazamide, dtype: int64
-----------------------------------------------------------
This column insulin contains following unique values: 
No     

# Drop rows with missing gender information

Also update to numeric. 
- Male = 0
- Female = 1

In [83]:
# Remove the encounters whose gender is recorded as "Unknown/Invalid".
unknown_gender_rows = df.index[df.gender == "Unknown/Invalid"]
df.drop(index=unknown_gender_rows, inplace=True)

In [84]:
# Male -> 0, everything else (Female) -> 1; int64 matches the dtype produced by apply().
df.gender = (df.gender != "Male").astype("int64")

# Make Dummy Columns for categorical data

In [85]:
# The categorical columns are one-hot encoded in three groups that differ only
# in which level is used as the baseline (and therefore left out):
#   * dummy_columns      -> the first level in sorted order (drop_first=True)
#   * dummy_drug_columns -> "No"
#   * blood_test_columns -> "None"
dummy_columns = ["admission_type_id", "discharge_disposition_id", "admission_source_id",
                 "age", "race"]

dummy_drug_columns = ["metformin", "repaglinide", "nateglinide", "chlorpropamide",
                      "glimepiride", "glipizide", "glyburide", "pioglitazone", "rosiglitazone",
                      "acarbose", "miglitol", "tolazamide", "insulin", "glyburide-metformin"]

blood_test_columns = ["max_glu_serum", "A1Cresult"]


def add_dummies(frame, column, baseline=None):
    """Return ``frame`` with ``column`` replaced by one-hot indicator columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the categorical column to expand. The new columns are named
        ``<column>_<level>`` and appended at the end of the frame.
    baseline : str, optional
        Level whose indicator is left out. When omitted, the first level in
        sorted order is left out instead (``drop_first=True``).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.
    """
    print("working on: ", column)
    if baseline is None:
        dummies = pd.get_dummies(frame[column], drop_first=True, prefix=column)
    else:
        dummies = pd.get_dummies(frame[column], prefix=column)
        # errors="ignore": the baseline level may be absent from the data
        # (for example when the CSV reader parses the literal "None" as a
        # missing value), in which case there is no indicator to drop and
        # raising KeyError would only abort the cell.
        dummies = dummies.drop(columns="_".join([column, baseline]), errors="ignore")
    return pd.concat([frame, dummies], axis=1).drop(columns=column)


for column in dummy_columns:
    df = add_dummies(df, column)

for drug_column in dummy_drug_columns:
    df = add_dummies(df, drug_column, baseline="No")

for blood_test in blood_test_columns:
    df = add_dummies(df, blood_test, baseline="None")
    

working on:  admission_type_id
working on:  discharge_disposition_id
working on:  admission_source_id
working on:  age
working on:  race
working on:  metformin
working on:  repaglinide
working on:  nateglinide
working on:  chlorpropamide
working on:  glimepiride
working on:  glipizide
working on:  glyburide
working on:  pioglitazone
working on:  rosiglitazone
working on:  acarbose
working on:  miglitol
working on:  tolazamide
working on:  insulin
working on:  glyburide-metformin
working on:  max_glu_serum
working on:  A1Cresult


In [86]:
# "No" -> 0, any other value -> 1; int64 matches the dtype produced by apply().
df.change = (df.change != "No").astype("int64")

In [87]:
# "No" -> 0, any other value -> 1; int64 matches the dtype produced by apply().
df.diabetesMed = (df.diabetesMed != "No").astype("int64")

### Diagnosis Binning

Per Category defined in `description.pdf`

#### Diagnosis Definition

In [88]:
# Collect every distinct diagnosis code whose first three characters are "250"
# (the diabetes codes), separately for each diagnosis column and in order of
# first appearance.
diab_diag_1 = [code for code in df.diag_1.unique() if code[:3] == "250"]
diab_diag_2 = [code for code in df.diag_2.unique() if code[:3] == "250"]
diab_diag_3 = [code for code in df.diag_3.unique() if code[:3] == "250"]

In [68]:
# String form of the diagnosis codes that belong to each category.
# range() excludes its stop value, so each stop is one past the last code of
# the group (e.g. 390-459 -> range(390, 460)).
# NOTE: this cell needs diab_diag_1/2/3 from the previous cell, so it must be
# run after it.
circulatory_list = [str(code) for code in range(390, 460)] + ["785"]
respiratory_list = [str(code) for code in range(460, 520)] + ["786"]
digestive_list = [str(code) for code in range(520, 580)] + ["787"]
diabetes_list = set(diab_diag_1 + diab_diag_2 + diab_diag_3)
injury_list = [str(code) for code in range(800, 1000)]
muscle_list = [str(code) for code in range(710, 740)]
# Stop at 630 so that code 629 is included, consistent with the other ranges
# (the previous stop of 629 left that code in "other").
genit_list = [str(code) for code in range(580, 630)] + ["788"]
neo_list = [str(code) for code in range(140, 240)]

# code -> category lookup, built once so that classifying a value is a single
# dict access instead of a scan through up to eight lists.
# Groups are inserted in priority order and setdefault() keeps the first
# category seen for a code, mirroring an if/elif chain over the same groups.
_diagnosis_lookup = {}
for _category, _codes in (
    ("circulatory", circulatory_list),
    ("respiratory", respiratory_list),
    ("digestive", digestive_list),
    ("diabetes", diabetes_list),
    ("injury", injury_list),
    ("musculoskeletal", muscle_list),
    ("genitourinary", genit_list),
    ("neoplasms", neo_list),
):
    for _code in _codes:
        _diagnosis_lookup.setdefault(_code, _category)


def diagnosis_clean(value):
    """Map a raw diagnosis code to its category label.

    Parameters
    ----------
    value : str
        Diagnosis code exactly as stored in the diag_1/diag_2/diag_3 columns.

    Returns
    -------
    str
        One of "circulatory", "respiratory", "digestive", "diabetes",
        "injury", "musculoskeletal", "genitourinary", "neoplasms", or
        "other" when the code is in none of the groups above.

    Notes
    -----
    Matching is on the exact string, so any code that is not spelled exactly
    as in one of the lists falls through to "other".
    """
    return _diagnosis_lookup.get(value, "other")

In [89]:
# Replace the raw code in each diagnosis column with its category label.
for diag_field in ("diag_1", "diag_2", "diag_3"):
    df[diag_field] = df[diag_field].apply(diagnosis_clean)

#### Diagnosis Categorical

In [92]:
diagnosis_columns = ["diag_1", "diag_2", "diag_3"]

# One-hot encode each diagnosis category column; the "other" indicator is
# left out so that "other" acts as the baseline level.
for diag_col in diagnosis_columns:
    baseline_col = f"{diag_col}_other"
    indicators = pd.get_dummies(df[diag_col], prefix=diag_col).drop(columns=baseline_col)
    df = pd.concat([df, indicators], axis=1).drop(columns=diag_col)

# Export Clean Data

In [94]:
# Inspect the encoded frame before writing it to disk.
df.head(30)

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,change,diabetesMed,readmitted,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,discharge_disposition_id_2,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_8,discharge_disposition_id_9,discharge_disposition_id_10,discharge_disposition_id_11,discharge_disposition_id_12,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_15,discharge_disposition_id_16,discharge_disposition_id_17,discharge_disposition_id_18,discharge_disposition_id_19,discharge_disposition_id_20,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_24,discharge_disposition_id_25,discharge_disposition_id_27,discharge_disposition_id_28,admission_source_id_2,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,age_[10-20),age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100),race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,metformin_Down,metformin_Steady,metformin_Up,repaglinide_Down,repaglinide_Steady,repaglinide_Up,nateglinide_Down,nateglinide_Steady,nateglinide_Up,chlorpropamide_Down,chlorpropamide_Steady,chlorpropamide_Up,glimepiride_Down,glimepiride_Steady,glimepiride_Up,glipizide_Down,glipizide_Steady,glipizide_Up,glyburide_Down,glyburide_Steady,glyburide_Up,pioglitazone_Down,pioglitazone_Steady,pioglitazone_Up,rosiglitazone_Down,rosiglitazone_Steady,r
osiglitazone_Up,acarbose_Down,acarbose_Steady,acarbose_Up,miglitol_Down,miglitol_Steady,miglitol_Up,tolazamide_Steady,tolazamide_Up,insulin_Down,insulin_Steady,insulin_Up,glyburide-metformin_Down,glyburide-metformin_Steady,glyburide-metformin_Up,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_Norm,diag_1_circulatory,diag_1_diabetes,diag_1_digestive,diag_1_genitourinary,diag_1_injury,diag_1_musculoskeletal,diag_1_neoplasms,diag_1_respiratory,diag_2_circulatory,diag_2_diabetes,diag_2_digestive,diag_2_genitourinary,diag_2_injury,diag_2_musculoskeletal,diag_2_neoplasms,diag_2_respiratory,diag_3_circulatory,diag_3_diabetes,diag_3_digestive,diag_3_genitourinary,diag_3_injury,diag_3_musculoskeletal,diag_3_neoplasms,diag_3_respiratory
0,1,1,41,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,3,59,0,18,0,0,0,9,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,2,11,5,13,2,0,1,6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,2,44,1,16,0,0,0,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,1,51,0,8,0,0,0,5,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
5,0,3,31,6,16,0,0,0,9,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
6,0,4,70,1,21,0,0,0,7,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,5,73,0,12,0,0,0,8,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
8,1,13,68,2,28,0,0,0,8,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,12,33,3,18,0,0,0,8,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [95]:
# Shape after dropping columns/rows and expanding the categorical columns into dummies.
df.shape

(101763, 145)

In [99]:
# Check the dtypes of the remaining columns; all are expected to be numeric at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101763 entries, 0 to 101765
Columns: 145 entries, gender to diag_3_respiratory
dtypes: int64(12), uint8(133)
memory usage: 23.0 MB


In [100]:
# Build the path with os.path.join so the export also works on macOS/Linux;
# a hard-coded backslash path only resolves on Windows.
# The index is still written as the first column, as before.
df.to_csv(os.path.join("..", "data", "diabetic_data_dummy.csv"))