In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [3]:
# If extracted already
df = pd.read_csv("/content/Dataset of Diabetes .csv")

# View first few rows
df.head()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


In [4]:
df.isnull().sum()


Unnamed: 0,0
ID,0
No_Pation,0
Gender,0
AGE,0
Urea,0
Cr,0
HbA1c,0
Chol,0
TG,0
HDL,0


In [6]:
df.drop(['ID','No_Pation'],axis=1,inplace=True)

In [7]:
df['Gender']=df['Gender'].map({'M':1,'F':0})

In [8]:
df['CLASS'].unique()

array(['N', 'N ', 'P', 'Y', 'Y '], dtype=object)

In [9]:
df['CLASS'] = df['CLASS'].map({'N':0, 'Y':1, 'P':1})

In [11]:
def remove_outliers_iqr(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    return df[(df[col] >= lower) & (df[col] <= upper)]


num_cols = ['AGE','Urea','Cr','HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']

# keep original
df_clean = df.copy()

print("Initial shape:", df_clean.shape)

for col in num_cols:
    before = df_clean.shape[0]
    df_clean = remove_outliers_iqr(df_clean, col)
    after = df_clean.shape[0]
    print(f"{col}: removed {before - after} rows")

print("Final shape:", df_clean.shape)
df_clean.head()
df_clean.describe()

Initial shape: (678, 12)
AGE: removed 19 rows
Urea: removed 11 rows
Cr: removed 0 rows
HbA1c: removed 0 rows
Chol: removed 3 rows
TG: removed 5 rows
HDL: removed 0 rows
LDL: removed 0 rows
VLDL: removed 0 rows
BMI: removed 0 rows
Final shape: (640, 12)


Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
count,639.0,640.0,640.0,640.0,640.0,640.0,640.0,640.0,640.0,640.0,640.0,639.0
mean,0.543036,55.139063,4.495938,57.698437,8.459469,4.769687,2.100937,1.109969,2.569359,0.990625,29.6275,0.926448
std,0.498535,5.07512,1.379908,16.934967,2.492399,1.084874,0.958464,0.318731,0.991912,0.445502,4.503374,0.261246
min,0.0,42.0,1.1,22.0,2.0,2.0,0.3,0.4,0.6,0.1,19.0,0.0
25%,0.0,52.0,3.5,45.0,6.8,4.075,1.4,0.9,1.8,0.7,26.0,1.0
50%,1.0,55.0,4.4,56.0,8.2,4.8,2.0,1.1,2.5,0.9,30.0,1.0
75%,1.0,59.0,5.4,70.0,10.2,5.5,2.6,1.3,3.3,1.3,33.0,1.0
max,1.0,69.0,8.2,106.0,14.8,7.5,4.6,1.9,5.5,2.1,39.0,1.0


In [12]:
from sklearn.preprocessing import MinMaxScaler

# numeric columns to scale
num_cols = ['AGE','Urea','Cr','HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']

scaler = MinMaxScaler()

df_clean[num_cols] = scaler.fit_transform(df_clean[num_cols])

df_clean.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
5,0.0,0.111111,0.169014,0.02381,0.15625,0.163636,0.162791,0.4,0.183673,0.15,0.1,0.0
6,0.0,0.296296,0.126761,0.333333,0.15625,0.290909,0.232558,0.333333,0.306122,0.25,0.25,0.0
7,1.0,0.222222,0.507042,0.297619,0.15625,0.163636,0.116279,0.333333,0.204082,0.15,0.25,0.0
13,0.0,0.111111,0.492958,0.380952,0.242188,0.4,0.325581,0.533333,0.326531,0.35,0.2,0.0
14,0.0,0.296296,0.338028,0.202381,0.15625,0.363636,0.27907,0.533333,0.326531,0.3,0.25,0.0


In [13]:
from sklearn.preprocessing import StandardScaler

# numeric columns to scale
num_cols = ['AGE','Urea','Cr','HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']

scaler = StandardScaler()

df_clean[num_cols] = scaler.fit_transform(df_clean[num_cols])

df_clean.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
5,0.0,-1.99936,-1.59261,-1.99143,-1.790627,-1.724763,-1.149546,-0.34529,-1.078922,-1.326788,-1.917284,0.0
6,0.0,-1.013391,-1.810186,-0.454944,-1.790627,-1.079022,-0.836301,-0.659279,-0.473557,-0.877505,-1.250596,0.0
7,1.0,-1.407779,0.147997,-0.632231,-1.790627,-1.724763,-1.358377,-0.659279,-0.978028,-1.326788,-1.250596,0.0
13,0.0,-1.99936,0.075472,-0.218561,-1.34894,-0.525529,-0.41864,0.282688,-0.372662,-0.428223,-1.472825,0.0
14,0.0,-1.013391,-0.722306,-1.104996,-1.790627,-0.710027,-0.62747,0.282688,-0.372662,-0.652864,-1.250596,0.0
