# Importing all the necessary libraries and api

In [None]:
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder,MinMaxScaler,StandardScaler,RobustScaler,MaxAbsScaler,FunctionTransformer
oe = OrdinalEncoder()
le = LabelEncoder()
ohe = OneHotEncoder(handle_unknown="ignore",sparse_output=False).set_output(transform="pandas")
min_max = MinMaxScaler(feature_range=(0, 1))
std = StandardScaler()
rb = RobustScaler()
maxabs = MaxAbsScaler()
ft = FunctionTransformer(np.log1p)
# "Import libraries and initialize common preprocessing tools"

# Loading the Data

In [None]:
DF = pd.read_excel("synthetic_health_dataset.xlsx") # "Loading the data"
DF 

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
0,Female,No,Moderate,Never,Normal,Borderline,High,90.0,16.6,119.0,3.6,143.6,,No,Yes
1,Other,Yes,,Never,Normal,Normal,Low,20.0,29.9,69.0,9.9,121.8,,Yes,No
2,Male,Yes,Heavy,Daily,High,High,Low,52.0,33.5,54.0,8.5,107.0,,Yes,Yes
3,Male,Yes,Heavy,Daily,Normal,High,Low,15.0,20.3,72.0,9.5,92.1,,No,Yes
4,Male,No,Moderate,Often,High,High,Medium,60.0,36.0,58.0,4.4,113.6,,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Female,No,Heavy,Daily,Normal,Borderline,Medium,70.0,21.7,95.0,9.3,187.6,Regular,No,Yes
996,Female,No,Heavy,Rarely,Normal,Borderline,Low,61.0,29.8,80.0,3.8,114.3,Regular,Yes,Yes
997,Female,No,,Rarely,,High,Low,40.0,33.0,81.0,4.2,119.0,Regular,No,No
998,Male,Yes,Heavy,Never,High,Normal,High,94.0,38.5,81.0,6.0,187.8,Occasional,Yes,No


# Cleaning the Data

In [6]:
DF.isna().sum() # Checking how many null value present in the data set

Gender                  12
Smoking                  7
Alcohol_Consumption    351
Exercise_Frequency       7
Blood_Pressure          10
Cholesterol_Level        7
Stress_Level             5
Age                      8
BMI                      6
Heart_Rate               8
Sleep_Hours              6
Blood_Sugar_Level        3
Medication_Use         345
Family_History           5
Illness                  0
dtype: int64

In [None]:
DF.dropna(inplace=True) # "Removing null value and applying it at the same time"

In [9]:
DF.duplicated().sum() # Checking dupicated value

np.int64(0)

In [8]:
DF1 = DF.copy()
DF2 = DF.copy()
DF3 = DF.copy()
DF4 = DF.copy()
DF5 = DF.copy()
DF6 = DF.copy()
DF7 = DF.copy()
DF8 = DF.copy() # Making multiple copies of main data frame

In [None]:
# Applying LabelEncoder

In [10]:
for col in DF.columns:
    if not is_numeric_dtype(DF[col]):
        print(col) 

# Finding all the categorical columns

Gender
Smoking
Alcohol_Consumption
Exercise_Frequency
Blood_Pressure
Cholesterol_Level
Stress_Level
Medication_Use
Family_History
Illness


In [11]:
DF1["Smoking"].unique() # Checking unique values

<StringArray>
['No', 'Yes']
Length: 2, dtype: str

In [12]:
DF1["Smoking"] = le.fit_transform(DF1["Smoking"]) # Applying LabelEncoder 

In [13]:
DF1["Family_History"].unique() # Checking unique values

<StringArray>
['No', 'Yes']
Length: 2, dtype: str

In [14]:
DF1["Family_History"] = le.fit_transform(DF1["Family_History"]) # Applying LabelEncoder 

# Applying OrdinalEncoder

In [15]:
DF1["Alcohol_Consumption"].unique() # Checking unique values

<StringArray>
['Moderate', 'Heavy']
Length: 2, dtype: str

In [16]:
encoder = OrdinalEncoder(
    categories=[['Moderate', 'Heavy']])
DF1["Alcohol_Consumption"] = encoder.fit_transform(DF1[["Alcohol_Consumption"]])
# Encode the 'Alcohol_Consumption' column as ordinal values:
# 'Moderate' -> 0, 'Heavy' -> 1 (preserving the order of categories)

In [17]:
DF1["Exercise_Frequency"].unique() # Checking unique values

<StringArray>
['Never', 'Daily', 'Rarely', 'Often']
Length: 4, dtype: str

In [18]:
encoder = OrdinalEncoder(
    categories=[['Never', 'Rarely', 'Often', 'Daily']])
DF1["Exercise_Frequency"] = encoder.fit_transform(DF1[["Exercise_Frequency"]])
# Encode 'Exercise_Frequency' as ordinal numeric values based on frequency level:
# Never -> 0, Rarely -> 1, Often -> 2, Daily -> 3

In [19]:
DF1["Blood_Pressure"].unique() # Checking unique values

<StringArray>
['High', 'Low', 'Normal']
Length: 3, dtype: str

In [20]:
encoder1 = OrdinalEncoder(
    categories=[['Low', 'Normal', 'High']])
DF1["Blood_Pressure"] = encoder1.fit_transform(DF1[["Blood_Pressure"]])
# Encode 'Blood_Pressure' as ordinal numeric values by severity:
# Low -> 0, Normal -> 1, High -> 2

In [21]:
DF1["Cholesterol_Level"].unique() # Checking unique values

<StringArray>
['Borderline', 'Normal', 'High']
Length: 3, dtype: str

In [22]:
encoder2 = OrdinalEncoder(
    categories=[['Normal', 'Borderline', 'High']])
DF1["Cholesterol_Level"] = encoder2.fit_transform(DF1[["Cholesterol_Level"]])
# Encode 'Cholesterol_Level' as ordinal numeric values by increasing risk:
# Normal -> 0, Borderline -> 1, High -> 2

In [23]:
DF1["Stress_Level"].unique() # Checking unique values

<StringArray>
['Medium', 'Low', 'High']
Length: 3, dtype: str

In [24]:
encoder3 = OrdinalEncoder(
    categories=[['Low', 'Medium', 'High']])
DF1["Stress_Level"] = encoder3.fit_transform(DF1[["Stress_Level"]])
# Encode 'Stress_Level' as ordinal numeric values by intensity:
# Low -> 0, Medium -> 1, High -> 2

In [25]:
DF1["Medication_Use"].unique() # Checking unique values

<StringArray>
['Occasional', 'Regular']
Length: 2, dtype: str

In [26]:
encoder4 = OrdinalEncoder(
    categories=[['Occasional', 'Regular']])
DF1["Medication_Use"] = encoder4.fit_transform(DF1[["Medication_Use"]])
# Encode 'Medication_Use' as ordinal numeric values by frequency:
# Occasional -> 0, Regular -> 1

# Applying OneHotEncoder

In [27]:
encode= ohe.fit_transform(DF1[["Gender"]]) # Apply One-Hot Encoding to 'Gender'

In [28]:
df1= pd.DataFrame(encode, columns=ohe.get_feature_names_out(["Gender"]))
# Create a DataFrame from the one-hot encoded Gender array
# and assign meaningful column names (e.g., Gender_Male, Gender_Female)

In [29]:
DF1 = pd.concat([DF1,df1], axis=1).drop(columns=["Gender"])
# Concatenate one-hot encoded Gender columns to DF1
# and drop the original 'Gender' column to avoid redundancy

In [30]:
DF1.head()

Unnamed: 0,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness,Gender_Female,Gender_Male,Gender_Other
8,0,0.0,0.0,2.0,1.0,1.0,79.0,20.4,95.0,8.8,86.2,0.0,0,Yes,0.0,1.0,0.0
11,0,0.0,3.0,0.0,1.0,0.0,99.0,16.8,57.0,3.9,184.4,1.0,0,Yes,0.0,1.0,0.0
16,0,0.0,1.0,1.0,0.0,2.0,58.0,18.1,99.0,6.9,126.7,1.0,1,No,0.0,0.0,1.0
21,0,1.0,3.0,1.0,2.0,2.0,70.0,32.6,93.0,3.5,178.3,0.0,0,Yes,1.0,0.0,0.0
22,0,0.0,0.0,2.0,2.0,0.0,78.0,20.4,83.0,7.8,115.2,1.0,1,No,0.0,0.0,1.0


# Applying LabelEncoder with loop

In [31]:
binary_cols = ['Smoking', 'Family_History']

for col in binary_cols:
    DF2[col] = le.fit_transform(DF2[col])

# Apply Label Encoding to binary categorical columns  

# Applying OrdinalEncoder with loop

In [32]:
ordinal_cols = ['Alcohol_Consumption','Exercise_Frequency','Blood_Pressure','Cholesterol_Level','Stress_Level','Medication_Use']

for col in ordinal_cols:
    DF2[col] = oe.fit_transform(DF2[[col]])

# Apply Ordinal Encoding to columns with natural order

# Applying OneHotEncoder with loop using dummy

In [33]:
categorical_cols = ['Gender']

for col in categorical_cols:
    dummy = pd.get_dummies(DF2[col],drop_first=True,prefix = col).astype("int64")
    DF2 = pd.concat([DF2, dummy], axis=1)
    DF2.drop(col, axis=1, inplace=True)
    
# Create dummy variables for categorical column 'Gender'
# drop_first=True avoids the dummy variable trap (multicollinearity)
# Add encoded columns to DF2 and remove the original column

In [34]:
DF2.head()

Unnamed: 0,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness,Gender_Male,Gender_Other
8,0,1.0,1.0,0.0,0.0,2.0,79.0,20.4,95.0,8.8,86.2,0.0,0,Yes,1,0
11,0,1.0,0.0,1.0,0.0,1.0,99.0,16.8,57.0,3.9,184.4,1.0,0,Yes,1,0
16,0,1.0,3.0,2.0,2.0,0.0,58.0,18.1,99.0,6.9,126.7,1.0,1,No,0,1
21,0,0.0,0.0,2.0,1.0,0.0,70.0,32.6,93.0,3.5,178.3,0.0,0,Yes,0,0
22,0,1.0,1.0,0.0,1.0,1.0,78.0,20.4,83.0,7.8,115.2,1.0,1,No,0,1


# Min–Max Scaling

In [35]:
for col in DF.columns:
    if is_numeric_dtype(DF[col]):
        print(col)

# Loop through all columns and print names of numeric (int/float) columns        

Age
BMI
Heart_Rate
Sleep_Hours
Blood_Sugar_Level


In [36]:
DF3.head()

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
8,Male,No,Moderate,Never,High,Borderline,Medium,79.0,20.4,95.0,8.8,86.2,Occasional,No,Yes
11,Male,No,Moderate,Daily,Low,Borderline,Low,99.0,16.8,57.0,3.9,184.4,Regular,No,Yes
16,Other,No,Moderate,Rarely,Normal,Normal,High,58.0,18.1,99.0,6.9,126.7,Regular,Yes,No
21,Female,No,Heavy,Daily,Normal,High,High,70.0,32.6,93.0,3.5,178.3,Occasional,No,Yes
22,Other,No,Moderate,Never,High,High,Low,78.0,20.4,83.0,7.8,115.2,Regular,Yes,No


In [37]:
DF3["Age"]= min_max.fit_transform(DF3[["Age"]])
# Scale 'Age' to a normalized range (0–1) using Min-Max Scaling

In [38]:
DF3["BMI"]= min_max.fit_transform(DF3[["BMI"]])
# Scale 'BMI' to a normalized range (0–1) using Min-Max Scaling

In [39]:
DF3["Heart_Rate"]= min_max.fit_transform(DF3[["Heart_Rate"]])
# Scale 'BMI' to a normalized range (0–1) using Min-Max Scaling

In [40]:
DF3["Sleep_Hours"]= min_max.fit_transform(DF3[["Sleep_Hours"]])
# Scale 'Heart_Rate' to a normalized range (0–1) using Min-Max Scaling

In [41]:
DF3["Blood_Sugar_Level"]= min_max.fit_transform(DF3[["Blood_Sugar_Level"]])
# Scale 'Blood_Sugar_Level' to a normalized range (0–1) using Min-Max Scaling

In [42]:
DF3.head()

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
8,Male,No,Moderate,Never,High,Borderline,Medium,0.79798,0.210526,0.652174,0.828571,0.123648,Occasional,No,Yes
11,Male,No,Moderate,Daily,Low,Borderline,Low,1.0,0.064777,0.101449,0.128571,0.882535,Regular,No,Yes
16,Other,No,Moderate,Rarely,Normal,Normal,High,0.585859,0.117409,0.710145,0.557143,0.436631,Regular,Yes,No
21,Female,No,Heavy,Daily,Normal,High,High,0.707071,0.704453,0.623188,0.071429,0.835394,Occasional,No,Yes
22,Other,No,Moderate,Never,High,High,Low,0.787879,0.210526,0.478261,0.685714,0.347759,Regular,Yes,No


# Standardization Scaling

In [43]:
DF4.head()

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
8,Male,No,Moderate,Never,High,Borderline,Medium,79.0,20.4,95.0,8.8,86.2,Occasional,No,Yes
11,Male,No,Moderate,Daily,Low,Borderline,Low,99.0,16.8,57.0,3.9,184.4,Regular,No,Yes
16,Other,No,Moderate,Rarely,Normal,Normal,High,58.0,18.1,99.0,6.9,126.7,Regular,Yes,No
21,Female,No,Heavy,Daily,Normal,High,High,70.0,32.6,93.0,3.5,178.3,Occasional,No,Yes
22,Other,No,Moderate,Never,High,High,Low,78.0,20.4,83.0,7.8,115.2,Regular,Yes,No


In [44]:
for col in DF.columns:
    if is_numeric_dtype(DF[col]):
        print(col)

# Loop through all columns and print names of numeric (int/float) columns  

Age
BMI
Heart_Rate
Sleep_Hours
Blood_Sugar_Level


In [45]:
DF4["Age"]= std.fit_transform(DF4[["Age"]])
# Standardize using StandardScaler

In [46]:
DF4["BMI"]= std.fit_transform(DF4[["BMI"]])
# Standardize using StandardScaler

In [47]:
DF4["Heart_Rate"]= std.fit_transform(DF4[["Heart_Rate"]])
# Standardize using StandardScaler

In [48]:
DF4["Sleep_Hours"]= std.fit_transform(DF4[["Sleep_Hours"]])
# Standardize using StandardScaler

In [49]:
DF4["Blood_Sugar_Level"]= std.fit_transform(DF4[["Blood_Sugar_Level"]])
# Standardize using StandardScaler

In [50]:
DF4.head()

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
8,Male,No,Moderate,Never,High,Borderline,Medium,1.034589,-0.934544,0.494317,1.036195,-1.354719,Occasional,No,Yes
11,Male,No,Moderate,Daily,Low,Borderline,Low,1.716659,-1.429377,-1.363395,-1.333877,1.269922,Regular,No,Yes
16,Other,No,Moderate,Rarely,Normal,Normal,High,0.318415,-1.250687,0.689866,0.117187,-0.272255,Regular,Yes,No
21,Female,No,Heavy,Daily,Normal,High,High,0.727657,0.742388,0.396543,-1.527353,1.106884,Occasional,No,Yes
22,Other,No,Moderate,Never,High,High,Low,1.000485,-0.934544,-0.092329,0.552507,-0.579621,Regular,Yes,No


# Robust Scaling

In [51]:
DF5.head()

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
8,Male,No,Moderate,Never,High,Borderline,Medium,79.0,20.4,95.0,8.8,86.2,Occasional,No,Yes
11,Male,No,Moderate,Daily,Low,Borderline,Low,99.0,16.8,57.0,3.9,184.4,Regular,No,Yes
16,Other,No,Moderate,Rarely,Normal,Normal,High,58.0,18.1,99.0,6.9,126.7,Regular,Yes,No
21,Female,No,Heavy,Daily,Normal,High,High,70.0,32.6,93.0,3.5,178.3,Occasional,No,Yes
22,Other,No,Moderate,Never,High,High,Low,78.0,20.4,83.0,7.8,115.2,Regular,Yes,No


In [52]:
for col in DF.columns:
    if is_numeric_dtype(DF[col]):
        print(col)

# Loop through all columns and print names of numeric (int/float) columns

Age
BMI
Heart_Rate
Sleep_Hours
Blood_Sugar_Level


In [53]:
DF5["Age"]= rb.fit_transform(DF5[["Age"]])
# Scale using RobustScaler

In [54]:
DF5["BMI"]= rb.fit_transform(DF5[["BMI"]])
# Scale using RobustScaler

In [55]:
DF5["Heart_Rate"]= rb.fit_transform(DF5[["Heart_Rate"]])
# Scale using RobustScaler

In [56]:
DF5["Sleep_Hours"]= rb.fit_transform(DF5[["Sleep_Hours"]])
# Scale using RobustScaler

In [57]:
DF5["Blood_Sugar_Level"]= rb.fit_transform(DF5[["Blood_Sugar_Level"]])
# Scale using RobustScaler

In [58]:
DF5.head()

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
8,Male,No,Moderate,Never,High,Borderline,Medium,0.596154,-0.479087,0.314286,0.542857,-0.820472,Occasional,No,Yes
11,Male,No,Moderate,Daily,Low,Borderline,Low,0.980769,-0.752852,-0.771429,-0.857143,0.725984,Regular,No,Yes
16,Other,No,Moderate,Rarely,Normal,Normal,High,0.192308,-0.653992,0.428571,0.0,-0.182677,Regular,Yes,No
21,Female,No,Heavy,Daily,Normal,High,High,0.423077,0.448669,0.257143,-0.971429,0.629921,Occasional,No,Yes
22,Other,No,Moderate,Never,High,High,Low,0.576923,-0.479087,-0.028571,0.257143,-0.36378,Regular,Yes,No


# Max Absolute Scaling

In [59]:
DF6.head()

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
8,Male,No,Moderate,Never,High,Borderline,Medium,79.0,20.4,95.0,8.8,86.2,Occasional,No,Yes
11,Male,No,Moderate,Daily,Low,Borderline,Low,99.0,16.8,57.0,3.9,184.4,Regular,No,Yes
16,Other,No,Moderate,Rarely,Normal,Normal,High,58.0,18.1,99.0,6.9,126.7,Regular,Yes,No
21,Female,No,Heavy,Daily,Normal,High,High,70.0,32.6,93.0,3.5,178.3,Occasional,No,Yes
22,Other,No,Moderate,Never,High,High,Low,78.0,20.4,83.0,7.8,115.2,Regular,Yes,No


In [60]:
for col in DF.columns:
    if is_numeric_dtype(DF[col]):
        print(col)

# Loop through all columns and print names of numeric (int/float) columns

Age
BMI
Heart_Rate
Sleep_Hours
Blood_Sugar_Level


In [61]:
DF6["Age"]= maxabs.fit_transform(DF6[["Age"]])
# Scale using MaxAbsScaler

In [62]:
DF6["BMI"]= maxabs.fit_transform(DF6[["BMI"]])
# Scale using MaxAbsScaler

In [63]:
DF6["Heart_Rate"]= maxabs.fit_transform(DF6[["Heart_Rate"]])
# Scale using MaxAbsScaler

In [64]:
DF6["Sleep_Hours"]= maxabs.fit_transform(DF6[["Sleep_Hours"]])
# Scale using MaxAbsScaler

In [65]:
DF6["Blood_Sugar_Level"]= maxabs.fit_transform(DF6[["Blood_Sugar_Level"]])
# Scale using MaxAbsScaler

In [66]:
DF6.head()

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
8,Male,No,Moderate,Never,High,Borderline,Medium,0.79798,0.511278,0.798319,0.88,0.431864,Occasional,No,Yes
11,Male,No,Moderate,Daily,Low,Borderline,Low,1.0,0.421053,0.478992,0.39,0.923848,Regular,No,Yes
16,Other,No,Moderate,Rarely,Normal,Normal,High,0.585859,0.453634,0.831933,0.69,0.63477,Regular,Yes,No
21,Female,No,Heavy,Daily,Normal,High,High,0.707071,0.817043,0.781513,0.35,0.893287,Occasional,No,Yes
22,Other,No,Moderate,Never,High,High,Low,0.787879,0.511278,0.697479,0.78,0.577154,Regular,Yes,No
