# Importing all the necessary libraries and APIs

In [None]:
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder,MinMaxScaler,StandardScaler,RobustScaler,MaxAbsScaler,FunctionTransformer
# Shared preprocessing objects (encoders and scalers), instantiated once up front.

# Categorical encoders.
oe = OrdinalEncoder()
le = LabelEncoder()
# One-hot encoder configured for dense (non-sparse) output, handle_unknown="ignore",
# and pandas DataFrame results from transform.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe.set_output(transform="pandas")

# Numeric scalers / transformers.
min_max = MinMaxScaler(feature_range=(0, 1))
std = StandardScaler()
rb = RobustScaler()
maxabs = MaxAbsScaler()
ft = FunctionTransformer(func=np.log1p)  # applies log(1 + x)

# Loading the Data

In [None]:
# Read the Excel workbook into the main DataFrame, then display it.
DF = pd.read_excel(io="synthetic_health_dataset.xlsx")
DF

Unnamed: 0,Gender,Smoking,Alcohol_Consumption,Exercise_Frequency,Blood_Pressure,Cholesterol_Level,Stress_Level,Age,BMI,Heart_Rate,Sleep_Hours,Blood_Sugar_Level,Medication_Use,Family_History,Illness
0,Female,No,Moderate,Never,Normal,Borderline,High,90.0,16.6,119.0,3.6,143.6,,No,Yes
1,Other,Yes,,Never,Normal,Normal,Low,20.0,29.9,69.0,9.9,121.8,,Yes,No
2,Male,Yes,Heavy,Daily,High,High,Low,52.0,33.5,54.0,8.5,107.0,,Yes,Yes
3,Male,Yes,Heavy,Daily,Normal,High,Low,15.0,20.3,72.0,9.5,92.1,,No,Yes
4,Male,No,Moderate,Often,High,High,Medium,60.0,36.0,58.0,4.4,113.6,,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Female,No,Heavy,Daily,Normal,Borderline,Medium,70.0,21.7,95.0,9.3,187.6,Regular,No,Yes
996,Female,No,Heavy,Rarely,Normal,Borderline,Low,61.0,29.8,80.0,3.8,114.3,Regular,Yes,Yes
997,Female,No,,Rarely,,High,Low,40.0,33.0,81.0,4.2,119.0,Regular,No,No
998,Male,Yes,Heavy,Never,High,Normal,High,94.0,38.5,81.0,6.0,187.8,Occasional,Yes,No


# Cleaning the Data

In [6]:
# Count the missing (null) values in each column of the data set.
DF.isnull().sum()

Gender                  12
Smoking                  7
Alcohol_Consumption    351
Exercise_Frequency       7
Blood_Pressure          10
Cholesterol_Level        7
Stress_Level             5
Age                      8
BMI                      6
Heart_Rate               8
Sleep_Hours              6
Blood_Sugar_Level        3
Medication_Use         345
Family_History           5
Illness                  0
dtype: int64

In [None]:
# Keep only the rows that have no missing value in any column.
# NOTE(review): the null count above reports 351 missing Alcohol_Consumption and
# 345 missing Medication_Use values, so this discards a large share of the rows.
# If the workbook stores a literal "None" level in those columns, pandas' default
# NA parsing would have read it as NaN -- confirm against the source file.
DF = DF.dropna()

In [9]:
# Count fully duplicated rows (every occurrence after the first is flagged).
DF.duplicated(keep="first").sum()

np.int64(0)

In [8]:
# Make eight independent copies of the main data frame (DF1 .. DF8) so each
# can be modified without touching DF or the other copies.
DF1, DF2, DF3, DF4, DF5, DF6, DF7, DF8 = (DF.copy() for _ in range(8))

In [None]:
# Applying LabelEncoder

In [10]:
# Print the name of every non-numeric (categorical) column in DF.
for col in DF.columns:
    if is_numeric_dtype(DF[col]):
        continue  # numeric columns are not of interest here
    print(col)

Gender
Smoking
Alcohol_Consumption
Exercise_Frequency
Blood_Pressure
Cholesterol_Level
Stress_Level
Medication_Use
Family_History
Illness


In [11]:
# Inspect the distinct values present in 'Smoking'.
DF1.loc[:, "Smoking"].unique()

<StringArray>
['No', 'Yes']
Length: 2, dtype: str

In [12]:
# Label-encode the binary 'Smoking' column (LabelEncoder sorts its classes,
# so 'No' -> 0 and 'Yes' -> 1).
# The encoder is fit on the unencoded column of DF (the frame DF1 was copied from),
# so re-running this cell refits on the original 'No'/'Yes' labels instead of on
# the integers already written into DF1.
# NOTE(review): the shared `le` is fit again for 'Family_History' in a later cell,
# so its classes_ no longer describe this column afterwards.
DF1["Smoking"] = le.fit_transform(DF["Smoking"])

In [13]:
# Inspect the distinct values present in 'Family_History'.
DF1.loc[:, "Family_History"].unique()

<StringArray>
['No', 'Yes']
Length: 2, dtype: str

In [14]:
# Label-encode the binary 'Family_History' column (LabelEncoder sorts its classes,
# so 'No' -> 0 and 'Yes' -> 1).
# The encoder is fit on the unencoded column of DF (the frame DF1 was copied from),
# so re-running this cell refits on the original 'No'/'Yes' labels instead of on
# the integers already written into DF1.
DF1["Family_History"] = le.fit_transform(DF["Family_History"])

# Applying OrdinalEncoder

In [15]:
# Inspect the distinct values present in 'Alcohol_Consumption'.
DF1.loc[:, "Alcohol_Consumption"].unique()

<StringArray>
['Moderate', 'Heavy']
Length: 2, dtype: str

In [16]:
# Ordinal-encode 'Alcohol_Consumption' with an explicit category order:
# 'Moderate' -> 0.0, 'Heavy' -> 1.0 (OrdinalEncoder emits floats by default).
# The encoder is fit on the unencoded column of DF (the frame DF1 was copied from)
# so this cell can be re-run: refitting on the already-encoded DF1 column would
# raise, because 0.0/1.0 are not among the declared categories.
# NOTE(review): the name `encoder` is rebound by the 'Exercise_Frequency' cell, so
# this fitted encoder is not kept for later transform/inverse_transform use.
encoder = OrdinalEncoder(categories=[["Moderate", "Heavy"]])
DF1["Alcohol_Consumption"] = encoder.fit_transform(DF[["Alcohol_Consumption"]])

In [17]:
# Inspect the distinct values present in 'Exercise_Frequency'.
DF1.loc[:, "Exercise_Frequency"].unique()

<StringArray>
['Never', 'Daily', 'Rarely', 'Often']
Length: 4, dtype: str

In [18]:
# Ordinal-encode 'Exercise_Frequency' by increasing frequency:
# 'Never' -> 0.0, 'Rarely' -> 1.0, 'Often' -> 2.0, 'Daily' -> 3.0.
# The encoder is fit on the unencoded column of DF (the frame DF1 was copied from)
# so this cell can be re-run: refitting on the already-encoded DF1 column would
# raise, because the numeric codes are not among the declared categories.
encoder = OrdinalEncoder(categories=[["Never", "Rarely", "Often", "Daily"]])
DF1["Exercise_Frequency"] = encoder.fit_transform(DF[["Exercise_Frequency"]])

In [19]:
# Inspect the distinct values present in 'Blood_Pressure'.
DF1.loc[:, "Blood_Pressure"].unique()

<StringArray>
['High', 'Low', 'Normal']
Length: 3, dtype: str

In [20]:
# Ordinal-encode 'Blood_Pressure' by increasing magnitude (not clinical severity):
# 'Low' -> 0.0, 'Normal' -> 1.0, 'High' -> 2.0.
# The encoder is fit on the unencoded column of DF (the frame DF1 was copied from)
# so this cell can be re-run: refitting on the already-encoded DF1 column would
# raise, because the numeric codes are not among the declared categories.
encoder1 = OrdinalEncoder(categories=[["Low", "Normal", "High"]])
DF1["Blood_Pressure"] = encoder1.fit_transform(DF[["Blood_Pressure"]])

In [21]:
# Inspect the distinct values present in 'Cholesterol_Level'.
DF1.loc[:, "Cholesterol_Level"].unique()

<StringArray>
['Borderline', 'Normal', 'High']
Length: 3, dtype: str

In [22]:
# Ordinal-encode 'Cholesterol_Level' in the declared order:
# 'Normal' -> 0.0, 'Borderline' -> 1.0, 'High' -> 2.0.
# The encoder is fit on the unencoded column of DF (the frame DF1 was copied from)
# so this cell can be re-run: refitting on the already-encoded DF1 column would
# raise, because the numeric codes are not among the declared categories.
encoder2 = OrdinalEncoder(categories=[["Normal", "Borderline", "High"]])
DF1["Cholesterol_Level"] = encoder2.fit_transform(DF[["Cholesterol_Level"]])

In [23]:
# Inspect the distinct values present in 'Stress_Level'.
DF1.loc[:, "Stress_Level"].unique()

<StringArray>
['Medium', 'Low', 'High']
Length: 3, dtype: str

In [24]:
# Ordinal-encode 'Stress_Level' by increasing intensity:
# 'Low' -> 0.0, 'Medium' -> 1.0, 'High' -> 2.0.
# The encoder is fit on the unencoded column of DF (the frame DF1 was copied from)
# so this cell can be re-run: refitting on the already-encoded DF1 column would
# raise, because the numeric codes are not among the declared categories.
encoder3 = OrdinalEncoder(categories=[["Low", "Medium", "High"]])
DF1["Stress_Level"] = encoder3.fit_transform(DF[["Stress_Level"]])

In [25]:
# Inspect the distinct values present in 'Medication_Use'.
DF1.loc[:, "Medication_Use"].unique()

<StringArray>
['Occasional', 'Regular']
Length: 2, dtype: str

In [26]:
# Ordinal-encode 'Medication_Use' by increasing frequency:
# 'Occasional' -> 0.0, 'Regular' -> 1.0.
# The encoder is fit on the unencoded column of DF (the frame DF1 was copied from)
# so this cell can be re-run: refitting on the already-encoded DF1 column would
# raise, because the numeric codes are not among the declared categories.
encoder4 = OrdinalEncoder(categories=[["Occasional", "Regular"]])
DF1["Medication_Use"] = encoder4.fit_transform(DF[["Medication_Use"]])