In [20]:
import pandas as pd
from pathlib import Path
import numpy as np

In [3]:
#Create path to CSV file
diabetes= Path("Resources/diabetes.csv")

In [4]:
#Read in CSV file 
diabetes_df=pd.read_csv(diabetes)

In [5]:
#View diabetes dataframe
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [6]:
diabetes_drop_df = diabetes_df[diabetes_df["SkinThickness"] != 0]
diabtetes_drop_df=diabetes_drop_df.reset_index(drop=True)
diabetes_drop_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1
...,...,...,...,...,...,...,...,...,...
761,9,170,74,31,0,44.0,0.403,43,1
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0


In [7]:
data_types = diabetes_df.dtypes
print(data_types)

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


In [10]:
zero_skin_count = (diabetes_df["SkinThickness"]== 0).sum()
zero_skin_count

227

In [11]:
zero_insulin_count = (diabetes_df["Insulin"]== 0).sum()
zero_insulin_count

374

In [12]:
zero_glucose_count = (diabetes_df["Glucose"]== 0).sum()
zero_glucose_count

5

In [13]:
#From reading about the raw data these 0 values are incorrect input rather than true values, as there are too many
#rows to remove I am going to replace the 0s with the median value

#copy diabetes dataframe
median_diabetes_df = diabetes_df.copy()

#columns to impute
columns_to_median = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

for column in columns_to_median:
    non_zero_median = diabetes_df[diabetes_df[column] != 0][column].median()
    median_diabetes_df[column].replace(0, non_zero_median, inplace=True)
    
median_diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,125,33.6,0.627,50,1
1,1,85,66,29,125,26.6,0.351,31,0
2,8,183,64,29,125,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,125,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,29,125,30.1,0.349,47,1


In [14]:
# The pregnancy column will be removed so the model can be used for men and women 
median_diabetes_df.drop(columns=["Pregnancies"], inplace=True)
median_diabetes_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,72,35,125,33.6,0.627,50,1
1,85,66,29,125,26.6,0.351,31,0
2,183,64,29,125,23.3,0.672,32,1
3,89,66,23,94,28.1,0.167,21,0
4,137,40,35,168,43.1,2.288,33,1


In [19]:
#Export to csv file 
output_file_path= Path('Resources/median_diabetes_df.csv') 
median_diabetes_df.to_csv(output_file_path, index=False)

In [21]:
cleaned_diabetes_df = median_diabetes_df.copy()

In [25]:
# Binning the Blood Pressure, BMI and Glucose columns for graphs 

#Blood pressure ranges 
Blood_Pressure_Ranges = [0,59,69,79,89,100]
BP_Categories = ["Low", "Normal", "Elevated", "High Stage 1", "High Stage 2"]

#Create new feature
cleaned_diabetes_df["BloodPressureRange"] =pd.cut(cleaned_diabetes_df["BloodPressure"], 
                                                bins=Blood_Pressure_Ranges, labels=BP_Categories)
cleaned_diabetes_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BloodPressureRange
0,148,72,35,125,33.6,0.627,50,1,Elevated
1,85,66,29,125,26.6,0.351,31,0,Normal
2,183,64,29,125,23.3,0.672,32,1,Normal
3,89,66,23,94,28.1,0.167,21,0,Normal
4,137,40,35,168,43.1,2.288,33,1,Low


In [28]:
#Glucose ranges 
Glucose_Ranges = [0, 139,199, float("inf")]
Glucose_Categories = ["Normal", "Prediabetic", "Diabetic"]

#Create new feature
cleaned_diabetes_df["GlucoseRange"] =pd.cut(cleaned_diabetes_df["Glucose"], 
                                                bins=Glucose_Ranges, labels=Glucose_Categories)
cleaned_diabetes_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BloodPressureRange,GlucoseRange
0,148,72,35,125,33.6,0.627,50,1,Elevated,Prediabetic
1,85,66,29,125,26.6,0.351,31,0,Normal,Normal
2,183,64,29,125,23.3,0.672,32,1,Normal,Prediabetic
3,89,66,23,94,28.1,0.167,21,0,Normal,Normal
4,137,40,35,168,43.1,2.288,33,1,Low,Normal


In [29]:
#BMI ranges 
BMI_Ranges = [0,18.5,24.9,29.9,34.9,39.9, float("inf")]
BMI_Categories = ["Underweight", "Normal Weight", "Overweight", "Obesity Class I",
                  "Obesity Class II", "Obesity Class III"]

#Create new feature
cleaned_diabetes_df["BMIRange"] =pd.cut(cleaned_diabetes_df["BMI"], 
                                                bins=BMI_Ranges, labels=BMI_Categories)
cleaned_diabetes_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BloodPressureRange,GlucoseRange,BMIRange
0,148,72,35,125,33.6,0.627,50,1,Elevated,Prediabetic,Obesity Class I
1,85,66,29,125,26.6,0.351,31,0,Normal,Normal,Overweight
2,183,64,29,125,23.3,0.672,32,1,Normal,Prediabetic,Normal Weight
3,89,66,23,94,28.1,0.167,21,0,Normal,Normal,Overweight
4,137,40,35,168,43.1,2.288,33,1,Low,Normal,Obesity Class III


In [30]:
#Export to csv file 
output_file_path= Path('Resources/cleaned_diabetes_df.csv') 
cleaned_diabetes_df.to_csv(output_file_path, index=False)