## 0. Imports + Loading dataset

In [22]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.preprocessing import StandardScaler

In [23]:
# Kaggle dataset handle and the CSV file inside it that holds the records.
dataset_handle = "iammustafatz/diabetes-prediction-dataset"
file_name = "diabetes_prediction_dataset.csv"

# Load the CSV through the kagglehub pandas adapter.
# The `kagglehub` module itself must be imported (not only
# `KaggleDatasetAdapter`), otherwise this cell raises NameError on a
# fresh kernel.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    dataset_handle,
    file_name,
)

For details on the exploratory data analysis, see [exploratory_data_analysis.ipynb](exploratory_data_analysis.ipynb); this notebook focuses on building a baseline model.

## 1. Cleaning the dataset

In [24]:
# Count missing values per column; every count is 0, so no imputation or row dropping is needed.
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [25]:
# Store "blood_glucose_level" as float64, like the other continuous columns (age, bmi, HbA1c_level).
df = df.astype({'blood_glucose_level': 'float64'})

In [26]:
# Encode gender as a binary feature (male == 1, anything else == 0)
def is_male(gender: "str | int") -> int:
    """Return 1 when ``gender`` denotes male, otherwise 0.

    Args:
        gender: Raw gender label (compared case-insensitively, ignoring
            surrounding whitespace) or an already-encoded 0/1 value.

    Returns:
        1 for 'male'; 0 for every other string label. An already-encoded
        0 or 1 is returned unchanged.

    Raises:
        TypeError: If ``gender`` is neither a string nor 0/1.

    Note:
        Any label other than 'male' (for example an 'Other' category, if the
        data contains one) is encoded as 0, exactly like 'female'.
    """
    if isinstance(gender, str):
        return int(gender.strip().lower() == 'male')
    # Pass encoded values through so that re-running the encoding cell on an
    # already-converted column does not crash on `.strip()`.
    if gender in (0, 1):
        return int(gender)
    raise TypeError(f"Cannot encode gender value {gender!r}")
# Replace the raw gender labels with their binary encoding, element by element.
df['gender'] = df['gender'].apply(is_male)

In [27]:
# Collapse the categorical smoking history into a binary smoker flag:
# 'current', 'former' and 'ever' count as smoker (1); 'never', 'not current'
# and 'No Info' count as non-smoker (0).
smoker_flag_by_history = {
    'never': 0,
    'No Info': 0,
    'current': 1,
    'former': 1,
    'ever': 1,
    'not current': 0,
}
is_smoker = df['smoking_history'].map(smoker_flag_by_history)

# Series.map yields NaN for any category missing from the dict; fail loudly
# instead of silently carrying NaNs (and a float column) into the model.
unmapped = df.loc[is_smoker.isna(), 'smoking_history'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped smoking_history categories: {list(unmapped)}")

df['is_smoker'] = is_smoker
# Remove the original "smoking_history" column now that it has been encoded
df = df.drop(columns=['smoking_history'])

In [28]:
# Preview the cleaned frame: "gender" is now 0/1 and "is_smoker" has replaced "smoking_history".
df

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,is_smoker
0,0,80.0,0,1,25.19,6.6,140.0,0,0
1,0,54.0,0,0,27.32,6.6,80.0,0,0
2,1,28.0,0,0,27.32,5.7,158.0,0,0
3,0,36.0,0,0,23.45,5.0,155.0,0,1
4,1,76.0,1,1,20.14,4.8,155.0,0,1
...,...,...,...,...,...,...,...,...,...
99995,0,80.0,0,0,27.32,6.2,90.0,0,0
99996,0,2.0,0,0,17.37,6.5,100.0,0,0
99997,1,66.0,0,0,27.83,5.7,155.0,0,1
99998,0,24.0,0,0,35.42,4.0,100.0,0,0


In [29]:
# Check the dtypes: every column is numeric (int64 or float64), so the frame can be passed to the model as-is.
df.dtypes

gender                   int64
age                    float64
hypertension             int64
heart_disease            int64
bmi                    float64
HbA1c_level            float64
blood_glucose_level    float64
diabetes                 int64
is_smoker                int64
dtype: object

## 2. Normalizing the features

In [32]:
# Standardize the continuous features (zero mean, unit variance) to help the model.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# is made later, fit it on the training rows only to avoid leakage — confirm.
continuous_features = ['age', 'blood_glucose_level', 'bmi', 'HbA1c_level']

scaler = StandardScaler()
scaler.fit(df[continuous_features])
df[continuous_features] = scaler.transform(df[continuous_features])

In [None]:
# Sanity check: each scaled column should have mean ~0 and std ~1
# (pandas reports the sample std, so it sits marginally above 1).
df.loc[:, continuous_features].describe()

Unnamed: 0,age,blood_glucose_level,bmi,HbA1c_level
count,100000.0,100000.0,100000.0,100000.0
mean,-1.7621460000000002e-17,-1.534772e-17,8.242296e-18,1.151079e-17
std,1.000005,1.000005,1.000005,1.000005
min,-1.856658,-1.42621,-2.60832,-1.893686
25%,-0.7943364,-0.9349053,-0.5561106,-0.6794897
50%,0.04948073,0.04770422,-0.0001155837,0.2545078
75%,0.804475,0.5144437,0.3404125,0.6281067
max,1.692704,3.978142,10.30161,3.2433
