In [1]:
# Python, Numpy, OS libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = 'plotly_dark'
from plotly.subplots import make_subplots

In [2]:
train = pd.read_csv('dataset_37_diabetes.csv')

In [3]:
train.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


In [4]:
# Information about the columns and diabetes in general to understand about diabetes and the tests done to check for type1 and type2 diabetes
# Type 1 diabetes is often diagnosed more rapidly because it usually develops rapidly and involves high blood sugar levels and symptoms
# Type 2 diabetes develops often in over the course of years. Most common diabetes and usually develops during adulthood
# Diabetes can cause pregnancy complications and is usually recommended to test withtin 24~28 weeks of pregnancy.


# preg = Number of times pregnant
# plas = Plasma glucose concentration in a 2 hours in an oral glucose test
    # This measures how much the body is able to absorb the gluose(sugar). This is usually done to test diabetes during pregnency
# pres = Diastolic blood pressure(mm Hg) - this cannot be 0 because no one has 0 blood pressure when they are alive. 
# skin = Triceps skin folding thickness(mm)
# insu = 2 Hour serum insulin(mu U/ml) // Test is used to find out the levels of insulin in blood, Basically whether the patient is making enough insulin or not 
# mass = body mass index(weight in kgs) - BMI, it is considered overweight when BMI is greater than 25
# pedi = Diabetes pedigree function
# age = Age of patient
# class = Target(whether patient has diabetes or not)

In [5]:
train.describe()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
insu     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null object
dtypes: float64(2), int64(6), object(1)
memory usage: 54.1+ KB


In [7]:
train.isnull().sum(), train.isna().sum()

(preg     0
 plas     0
 pres     0
 skin     0
 insu     0
 mass     0
 pedi     0
 age      0
 class    0
 dtype: int64, preg     0
 plas     0
 pres     0
 skin     0
 insu     0
 mass     0
 pedi     0
 age      0
 class    0
 dtype: int64)

In [8]:
zeros = (train==0).astype(int).sum(axis=0)
zeros

preg     111
plas       5
pres      35
skin     227
insu     374
mass      11
pedi       0
age        0
class      0
dtype: int64

In [9]:
cols = ['mass', 'pres', 'plas']
train[cols] = train[cols].replace({0: np.nan})

In [10]:
zeros = (train==0).astype(int).sum(axis=0)
zeros

preg     111
plas       0
pres       0
skin     227
insu     374
mass       0
pedi       0
age        0
class      0
dtype: int64

In [11]:
train.isna().sum()

preg      0
plas      5
pres     35
skin      0
insu      0
mass     11
pedi      0
age       0
class     0
dtype: int64

In [12]:
train.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148.0,72.0,35,0,33.6,0.627,50,tested_positive
1,1,85.0,66.0,29,0,26.6,0.351,31,tested_negative
2,8,183.0,64.0,0,0,23.3,0.672,32,tested_positive
3,1,89.0,66.0,23,94,28.1,0.167,21,tested_negative
4,0,137.0,40.0,35,168,43.1,2.288,33,tested_positive


In [13]:
onezero = train[train['pres'].isna()]
onezero

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
7,10,115.0,,0,0,35.3,0.134,29,tested_negative
15,7,100.0,,0,0,30.0,0.484,32,tested_positive
49,7,105.0,,0,0,,0.305,24,tested_negative
60,2,84.0,,0,0,,0.304,21,tested_negative
78,0,131.0,,0,0,43.2,0.27,26,tested_positive
81,2,74.0,,0,0,,0.102,22,tested_negative
172,2,87.0,,23,0,28.9,0.773,25,tested_negative
193,11,135.0,,0,0,52.3,0.578,40,tested_positive
222,7,119.0,,0,0,25.2,0.209,37,tested_negative
261,3,141.0,,0,0,30.0,0.761,27,tested_positive


In [19]:
train['mass'].fillna(train.groupby('age')['mass'].transform('mean'), inplace=True)
train['pres'].fillna(train.groupby(['age'])['pres'].transform('mean'), inplace=True)
train['plas'].fillna(train.groupby(['age'])['plas'].transform('mean'), inplace=True)

In [20]:
train.isna().sum()

preg     0
plas     0
pres     1
skin     0
insu     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [21]:
train.isnull().sum()

preg     0
plas     0
pres     1
skin     0
insu     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [22]:
train.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148.0,72.0,35,0,33.6,0.627,50,tested_positive
1,1,85.0,66.0,29,0,26.6,0.351,31,tested_negative
2,8,183.0,64.0,0,0,23.3,0.672,32,tested_positive
3,1,89.0,66.0,23,94,28.1,0.167,21,tested_negative
4,0,137.0,40.0,35,168,43.1,2.288,33,tested_positive


In [23]:
onezero = train[train['pres'].isna()]
onezero

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
453,2,119.0,,0,0,19.6,0.832,72,tested_negative


In [24]:
train['pres'].fillna(train.groupby(['mass'])['pres'].transform('mean'), inplace=True)

In [25]:
train.isnull().sum()

preg     0
plas     0
pres     0
skin     0
insu     0
mass     0
pedi     0
age      0
class    0
dtype: int64