In [31]:
import pandas as pd 
import numpy as np # linear algebra
import seaborn as sns
import matplotlib.pyplot as plt # plotting
import os # accessing directory structure
from scipy.stats import skew, kurtosis
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler



In [32]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("diabetes.csv")

1. EXPLORATORY DATA ANALYSIS - EDA

In [33]:
# Preview the first five rows of the dataset
print(df.head(n=5))

   Pregnancies  Glucose  Blood Pressure  Skin Thickness  Insulin   BMI  \
0            6      148              72              35        0  33.6   
1            1       85              66              29        0  26.6   
2            8      183              64               0        0  23.3   
3            1       89              66              23       94  28.1   
4            0      137              40              35      168  43.1   

   Diabetes Pedigree Function  Age  Outcome  
0                       0.627   50        1  
1                       0.351   31        0  
2                       0.672   32        1  
3                       0.167   21        0  
4                       2.288   33        1  


In [34]:
# Show the DataFrame's dimensions as a (rows, columns) tuple
print(df.shape)

(768, 9)


In [35]:
# Show the data type (dtype) of each column
print(df.dtypes)

Pregnancies                     int64
Glucose                         int64
Blood Pressure                  int64
Skin Thickness                  int64
Insulin                         int64
BMI                           float64
Diabetes Pedigree Function    float64
Age                             int64
Outcome                         int64
dtype: object


In [36]:
# Summarise columns, non-null counts, dtypes and memory usage.
# DataFrame.info() writes the report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Pregnancies                 768 non-null    int64  
 1   Glucose                     768 non-null    int64  
 2   Blood Pressure              768 non-null    int64  
 3   Skin Thickness              768 non-null    int64  
 4   Insulin                     768 non-null    int64  
 5   BMI                         768 non-null    float64
 6   Diabetes Pedigree Function  768 non-null    float64
 7   Age                         768 non-null    int64  
 8   Outcome                     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None


In [37]:
# Show summary statistics (count, mean, std, min, quartiles, max) for each column
print(df.describe())

       Pregnancies     Glucose  Blood Pressure  Skin Thickness     Insulin  \
count   768.000000  768.000000      768.000000      768.000000  768.000000   
mean      3.845052  120.894531       69.105469       20.536458   79.799479   
std       3.369578   31.972618       19.355807       15.952218  115.244002   
min       0.000000    0.000000        0.000000        0.000000    0.000000   
25%       1.000000   99.000000       62.000000        0.000000    0.000000   
50%       3.000000  117.000000       72.000000       23.000000   30.500000   
75%       6.000000  140.250000       80.000000       32.000000  127.250000   
max      17.000000  199.000000      122.000000       99.000000  846.000000   

              BMI  Diabetes Pedigree Function         Age     Outcome  
count  768.000000                  768.000000  768.000000  768.000000  
mean    31.992578                    0.471876   33.240885    0.348958  
std      7.884160                    0.331329   11.760232    0.476951  
min      

In [38]:
# Count missing (NaN/None) values per column.
# NOTE(review): the describe() output shows minimum values of 0 for Glucose,
# Blood Pressure, Skin Thickness, Insulin and BMI; those zeros may be
# placeholders for missing measurements and are not counted here - confirm.
print(df.isna().sum())

Pregnancies                   0
Glucose                       0
Blood Pressure                0
Skin Thickness                0
Insulin                       0
BMI                           0
Diabetes Pedigree Function    0
Age                           0
Outcome                       0
dtype: int64


Descriptive Statistics

In [46]:
# Summary statistics with one row per attribute, rounded to two decimals
df.describe().transpose().round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.85,3.37,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.89,31.97,0.0,99.0,117.0,140.25,199.0
Blood Pressure,768.0,69.11,19.36,0.0,62.0,72.0,80.0,122.0
Skin Thickness,768.0,20.54,15.95,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.8,115.24,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.99,7.88,0.0,27.3,32.0,36.6,67.1
Diabetes Pedigree Function,768.0,0.47,0.33,0.08,0.24,0.37,0.63,2.42
Age,768.0,33.24,11.76,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.35,0.48,0.0,0.0,0.0,1.0,1.0


In [48]:
# Descriptive statistics function
def descriptive_stats(dataframe):
    """Build a table of descriptive statistics with one row per numeric column.

    Non-numeric columns are ignored, because none of the statistics below are
    defined for them. Missing values (NaN) are skipped by every statistic:
    the pandas reducers skip them by default, and they are dropped explicitly
    before the SciPy calls, which would otherwise return NaN for the whole
    column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; only its numeric columns are summarised.

    Returns
    -------
    pandas.DataFrame
        Indexed by the numeric column names, with the columns Mean, Median,
        Mode, Std, Variance, Min, 5%, 25%, 50%, 75%, 95%, Max, Range, IQR,
        Skewness and Kurtosis, all rounded to two decimals. Std and Variance
        use the pandas defaults (sample estimators, ddof=1); Skewness and
        Kurtosis use the SciPy defaults (biased estimators, kurtosis reported
        as excess kurtosis).
    """
    numeric = dataframe.select_dtypes(include='number')

    stats = pd.DataFrame(index=numeric.columns)
    stats['Mean'] = numeric.mean()
    stats['Median'] = numeric.median()

    # mode() returns one row per tied mode, sorted ascending; keep the first.
    # It has no rows at all when there is no data, so guard the lookup.
    modes = numeric.mode()
    stats['Mode'] = modes.iloc[0] if len(modes) else float('nan')

    stats['Std'] = numeric.std()
    stats['Variance'] = numeric.var()
    stats['Min'] = numeric.min()
    for label, q in (('5%', 0.05), ('25%', 0.25), ('50%', 0.50),
                     ('75%', 0.75), ('95%', 0.95)):
        stats[label] = numeric.quantile(q)
    stats['Max'] = numeric.max()
    stats['Range'] = stats['Max'] - stats['Min']
    stats['IQR'] = stats['75%'] - stats['25%']

    # SciPy propagates NaN by default, so drop missing values first to stay
    # consistent with the NaN-skipping pandas statistics above.
    stats['Skewness'] = numeric.apply(
        lambda col: skew(col.dropna().to_numpy(dtype=float)))
    stats['Kurtosis'] = numeric.apply(
        lambda col: kurtosis(col.dropna().to_numpy(dtype=float)))
    return stats.round(2)

# Compute the summary table for the dataset
stats_df = descriptive_stats(df)

# Present it as a styled table: a blue gradient computed per row (axis=1),
# two-decimal number formatting, and a bold 16px caption.
styled_stats = (
    stats_df.style
    .background_gradient(cmap='Blues', axis=1)
    .format("{:.2f}")
    .set_caption("Descriptive statistics of attributes in the Diabetes dataset")
    .set_table_styles([
        {
            'selector': 'caption',
            'props': [('font-size', '16px'), ('font-weight', 'bold')],
        },
    ])
)

styled_stats


Unnamed: 0,Mean,Median,Mode,Std,Variance,Min,5%,25%,50%,75%,95%,Max,Range,IQR,Skewness,Kurtosis
Pregnancies,3.85,3.0,1.0,3.37,11.35,0.0,0.0,1.0,3.0,6.0,10.0,17.0,17.0,5.0,0.9,0.15
Glucose,120.89,117.0,99.0,31.97,1022.25,0.0,79.0,99.0,117.0,140.25,181.0,199.0,199.0,41.25,0.17,0.63
Blood Pressure,69.11,72.0,70.0,19.36,374.65,0.0,38.7,62.0,72.0,80.0,90.0,122.0,122.0,18.0,-1.84,5.14
Skin Thickness,20.54,23.0,0.0,15.95,254.47,0.0,0.0,0.0,23.0,32.0,44.0,99.0,99.0,32.0,0.11,-0.52
Insulin,79.8,30.5,0.0,115.24,13281.18,0.0,0.0,0.0,30.5,127.25,293.0,846.0,846.0,127.25,2.27,7.16
BMI,31.99,32.0,32.0,7.88,62.16,0.0,21.8,27.3,32.0,36.6,44.4,67.1,67.1,9.3,-0.43,3.26
Diabetes Pedigree Function,0.47,0.37,0.25,0.33,0.11,0.08,0.14,0.24,0.37,0.63,1.13,2.42,2.34,0.38,1.92,5.55
Age,33.24,29.0,22.0,11.76,138.3,21.0,21.0,24.0,29.0,41.0,58.0,81.0,60.0,17.0,1.13,0.63
Outcome,0.35,0.0,0.0,0.48,0.23,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.63,-1.6
