In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

file_path = r'C:\Users\vivek\Data science projects\Data-Science-Project-Portfolio\DIABETES\data\diabetes.csv'
df = pd.read_csv(file_path)

#replace zeros with NaN, then impute with median
df_clean = df.copy()
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_clean[cols_with_zeros] = df_clean[cols_with_zeros].replace(0, np.nan)

# Impute with median
for col in cols_with_zeros:
    df_clean[col].fillna(df_clean[col].median(), inplace=True)

#checks if zeros are gone
print('Zero Values After Imputation:')
print(df_clean[cols_with_zeros].eq(0).sum())

#caps outliers using IQR method
def cap_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower_bound, upper_bound)

for col in ['Insulin', 'SkinThickness', 'BMI']:
    cap_outliers(df_clean, col)

#summary stats after outlier capping
print('\nSummary Statistics After Outlier Capping:')
print(df_clean.describe())

#Creates New Features
# BMI Category (Underweight, Normal, Overweight, Obese)
df_clean['BMI_Category'] = pd.cut(df_clean['BMI'], 
                                 bins=[0, 18.5, 25, 30, float('inf')],
                                 labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

# Glucose-Insulin Ratio (avoid division by zero)
df_clean['Glucose_Insulin_Ratio'] = df_clean['Glucose'] / (df_clean['Insulin'] + 1e-6)

scaler = StandardScaler()
numeric_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 
                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Glucose_Insulin_Ratio']
df_clean[numeric_cols] = scaler.fit_transform(df_clean[numeric_cols])

df_clean = pd.get_dummies(df_clean, columns=['BMI_Category'], drop_first=True)


print('\nFinal Dataset Preview:')
print(df_clean.head())
df_clean.to_csv('data\processed_diabetes.csv', index=False)
print('\nProcessed dataset saved as processed_diabetes.csv')

  from pandas.core import (


Zero Values After Imputation:
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
dtype: int64

Summary Statistics After Outlier Capping:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  121.656250      72.386719      28.866536  124.691081   
std       3.369578   30.438286      12.096642       7.442353    7.913595   
min       0.000000   44.000000      24.000000      14.500000  112.875000   
25%       1.000000   99.750000      64.000000      25.000000  121.500000   
50%       3.000000  117.000000      72.000000      29.000000  125.000000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      42.500000  135.875000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna(df_clean[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna(df_clean[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi