In [57]:
# Install the packages listed in requirements.txt using the %pip magic.
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [58]:
# Path of the raw input CSV that the loading cell reads.
dataset_path = "dataset_raw.csv"

In [59]:
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path

# Read the raw CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=Path(dataset_path))
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [60]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [61]:
# Count the missing (NaN/None) entries in each column.
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [62]:
# Start with the non-numeric columns, since they are the easiest ones to clean up.
df["gender"].unique()
# Besides 'Female' and 'Male' the column contains an 'Other' category, which is
# not a biological sex and may hurt model accuracy. It is kept and the column is
# remapped to integers: Female -> 0, Male -> 1, Other -> 2 (the numbering itself
# is an arbitrary choice).
# NOTE(review): integer codes impose an ordering on a nominal feature; consider
# one-hot encoding if the downstream model is sensitive to that.

array(['Female', 'Male', 'Other'], dtype=object)

In [63]:
df["smoking_history"].unique()
# Several categories are vague and overlap ('former', 'ever', 'not current').
# They will be given numeric codes, and the effect on the data checked afterwards.
# Smoking is reported in the medical literature to affect diabetes, so this
# column is worth keeping as a feature.

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [64]:
# Encode the two categorical columns as integers so they are machine-processable.
SMOKING_HISTORY_REMAPPING = {
    'never': 0,
    # Initially coded as 1; folded into 0 because many children have 'No Info'.
    # As a result code 1 is unused.
    'No Info': 0,
    'former': 2,
    'not current': 3,
    'current': 4,
    # NOTE(review): shares a code with 'current' -- confirm that this matches
    # what 'ever' means in the source dataset.
    'ever': 4,
}

GENDER_REMAPPING = {
    'Female': 0,
    'Male': 1,
    'Other': 2,
}


def _encode_categories(column, mapping):
    """Translate a categorical Series into integer codes.

    Args:
        column: Series holding the raw category labels.
        mapping: dict from every expected label to its integer code.

    Returns:
        A new integer Series with the same index as `column`.

    Raises:
        ValueError: if `column` contains a value that is not a key of
            `mapping` (this includes the case where the column was already
            encoded by an earlier run of this cell).
    """
    encoded = column.map(mapping)
    # Series.map yields NaN for labels missing from the mapping; report them
    # explicitly instead of letting astype(int) fail with an opaque cast error.
    unmapped = column[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column.name!r} contains values without an integer code: "
            f"{list(unmapped)}. If this cell has already been run, reload the "
            f"raw data before running it again."
        )
    return encoded.astype(int)


# Both columns are encoded and validated before `df` is rebound, so a failure
# leaves the frame exactly as it was instead of half-overwritten with NaN.
df = df.assign(
    smoking_history=_encode_categories(df['smoking_history'], SMOKING_HISTORY_REMAPPING),
    gender=_encode_categories(df['gender'], GENDER_REMAPPING),
)

In [65]:
# Count rows that exactly repeat an earlier row (first occurrences are not counted).
# NOTE(review): this runs on the already-encoded frame. The encoding gives several
# raw smoking categories the same code, so rows that differed only in those raw
# labels are counted as duplicates here.
df.duplicated(keep="first").sum()

5891

In [66]:
# Keep the first occurrence of every repeated row and discard the later copies.
df = df.drop_duplicates(subset=None, keep="first")

In [67]:
# The remaining columns are sanity-checked against common sense, starting with BMI.
df["bmi"].describe()
# The minimum is above zero, as it should be for BMI.
# NOTE(review): also look at the upper tail (max) for implausible values.

count    94109.000000
mean        27.321978
std          6.840092
min         10.010000
25%         23.280000
50%         27.320000
75%         30.000000
max         95.690000
Name: bmi, dtype: float64

In [68]:
df["age"].describe()
# A minimum of 0.08 could be legitimate, but the rows carrying it deserve a
# closer look before it is accepted.

count    94109.000000
mean        41.783242
std         22.509183
min          0.080000
25%         24.000000
50%         43.000000
75%         60.000000
max         80.000000
Name: age, dtype: float64

In [69]:
# Show the rows that hold the minimum age to judge whether they are plausible.
# The minimum is computed rather than hard-coded as 0.08, so the cell does not
# rely on an exact float comparison against a literal copied from the previous
# cell's output, and it keeps working if the data changes.
df[df["age"] == df["age"].min()]
# Many rows match, and overall they look acceptable.
# NOTE(review): check the BMI values of these rows -- a BMI in the high 20s or
# above is unusual at this age and may indicate imputed or erroneous entries.

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
155,0,0.08,0,0,0,14.43,6.5,160,0
2049,0,0.08,0,0,0,13.35,3.5,145,0
3742,1,0.08,0,0,0,12.8,6.6,80,0
9683,0,0.08,0,0,0,11.88,5.7,80,0
13162,1,0.08,0,0,0,27.32,6.1,126,0
16028,1,0.08,0,0,0,14.4,6.0,90,0
29551,1,0.08,0,0,0,13.55,6.1,145,0
30060,1,0.08,0,0,0,12.89,6.5,145,0
32402,1,0.08,0,0,0,13.68,5.0,85,0
33684,1,0.08,0,0,0,30.64,6.6,130,0


In [70]:
# Final check of the cleaned frame: column names, non-null counts and dtypes.
df.info()
# The dataset now looks good, so it is dumped to disk for further processing.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94109 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               94109 non-null  int64  
 1   age                  94109 non-null  float64
 2   hypertension         94109 non-null  int64  
 3   heart_disease        94109 non-null  int64  
 4   smoking_history      94109 non-null  int64  
 5   bmi                  94109 non-null  float64
 6   HbA1c_level          94109 non-null  float64
 7   blood_glucose_level  94109 non-null  int64  
 8   diabetes             94109 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 7.2 MB


In [71]:
# Persist the cleaned dataset for the next processing stage.
# index=False: after drop_duplicates the row index is only a gapped remnant of
# the raw file's row order. Writing it would add an unnamed first column that a
# plain pd.read_csv("dataset.csv") loads back as a spurious 'Unnamed: 0' feature.
# Consumers should therefore read the file without index_col.
df.to_csv("dataset.csv", index=False)