In [11]:
import pandas as pd
# Load the raw diabetes survey data; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer="../data/diabetes_raw.csv")


In [3]:
# Quick structural overview of the freshly loaded frame.
print(df.head())
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())
print(df.shape)
print("Data loaded successfully.")

   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0   

In [4]:
# Show every column label of the raw frame so the feature subset can be chosen from it.
available_columns = df.columns
available_columns


Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

Select only the needed features

In [13]:
# Predictor columns kept for the cleaned dataset.
feature_cols = [
    "BMI",
    "HighBP",
    "Age",
    "Smoker",
    "PhysActivity",
    "HeartDiseaseorAttack",
    "HighChol",
    "GenHlth",
]
# Label column; it is placed last in the resulting frame.
target_col = "Diabetes_binary"

selected_cols = [*feature_cols, target_col]
# Restrict the raw frame to the chosen predictors plus the target.
df_selected = df[selected_cols]
df_selected.head()


Unnamed: 0,BMI,HighBP,Age,Smoker,PhysActivity,HeartDiseaseorAttack,HighChol,GenHlth,Diabetes_binary
0,40.0,1.0,9.0,1.0,0.0,0.0,1.0,5.0,0.0
1,25.0,0.0,7.0,1.0,1.0,0.0,0.0,3.0,0.0
2,28.0,1.0,9.0,0.0,0.0,0.0,1.0,5.0,0.0
3,27.0,1.0,11.0,0.0,1.0,0.0,0.0,2.0,0.0
4,24.0,1.0,11.0,0.0,1.0,0.0,1.0,2.0,0.0


Check missing values

In [5]:
# Count the NaN entries in each selected column.
null_counts = df_selected.isna().sum()
null_counts


BMI                     0
HighBP                  0
Age                     0
Smoker                  0
PhysActivity            0
HeartDiseaseorAttack    0
HighChol                0
GenHlth                 0
Diabetes_binary         0
dtype: int64

Fix Invalid BMI Values (Zero BMI)

In [14]:
# Work on an independent copy so the assignment below cannot write through to `df`.
df_selected = df_selected.copy()
# A BMI of exactly 0 is treated as an invalid placeholder (see the section heading).
invalid_bmi = df_selected["BMI"] == 0
# Take the median of the valid readings only: including the zero placeholders in the
# calculation would pull the fill value down.
median_bmi = df_selected.loc[~invalid_bmi, "BMI"].median()
# Swap only the invalid entries for the median; every other row is left unchanged.
df_selected["BMI"] = df_selected["BMI"].mask(invalid_bmi, median_bmi)


Ensure All Binary Variables Are 0 or 1

In [15]:
# Print the distinct values found in each column that is expected to be binary,
# so anything other than 0/1 is easy to spot.
binary_cols = (
    "HighBP",
    "Smoker",
    "PhysActivity",
    "Diabetes_binary",
    "HighChol",
    "HeartDiseaseorAttack",
)
for col in binary_cols:
    distinct_values = df_selected[col].unique()
    print(col, distinct_values)


HighBP [1. 0.]
Smoker [1. 0.]
PhysActivity [0. 1.]
Diabetes_binary [0. 1.]
HighChol [1. 0.]
HeartDiseaseorAttack [0. 1.]


Min-max scaling: each feature's smallest value is mapped to 0, its largest value to 1,
and every other value is placed proportionally in between.

In [16]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Min-max scale BMI and Age so each column's minimum becomes 0 and its maximum 1.
# NOTE(review): the scaler is fitted on the whole dataset; if a train/test split is
# made downstream, confirm that fitting before the split is intended.
scaler = MinMaxScaler()
df_selected[["BMI", "Age"]] = scaler.fit_transform(df_selected[["BMI", "Age"]])

# Normalize GenHlth to 0–1 scale (original typically 1–5)
genhlth = df_selected['GenHlth'].astype(float)
min_g, max_g = genhlth.min(), genhlth.max()
print(f"GenHlth range before: {min_g} to {max_g}")
# Divide by the exact range so the maximum maps to exactly 1.0, matching the
# min-max scaling applied to BMI and Age. An epsilon in the denominator would leave
# the maximum slightly below 1.
genhlth_range = max_g - min_g
if genhlth_range > 0:
    df_selected['GenHlth'] = (genhlth - min_g) / genhlth_range
else:
    # Constant column: no spread to scale, so every value maps to 0.
    df_selected['GenHlth'] = 0.0
print(f"GenHlth range after: {df_selected['GenHlth'].min():.3f} to {df_selected['GenHlth'].max():.3f}")

# Ensure binary columns are 0/1 exactly: round to the nearest integer, clamp into
# [0, 1], and store as int.
for col in ["HighBP", "Smoker", "PhysActivity", "Diabetes_binary", "HighChol", "HeartDiseaseorAttack"]:
    df_selected[col] = df_selected[col].round().clip(0, 1).astype(int)


GenHlth range before: 1.0 to 5.0
GenHlth range after: 0.000 to 1.000


Save the Clean Dataset

In [17]:
# Write the cleaned frame to disk; index=False keeps the row index out of the CSV.
clean_csv_path = "../data/diabetes_clean.csv"
df_selected.to_csv(clean_csv_path, index=False)
print("Cleaned dataset saved with added features: HeartDiseaseorAttack, HighChol, GenHlth!")


Cleaned dataset saved with added features: HeartDiseaseorAttack, HighChol, GenHlth!
