## Importing libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif


### Load the dataset
### Drop unnecessary column
### Handle missing values (if any)

In [26]:
# Load the dataset
df = pd.read_csv("Health.csv")

# Drop unnecessary column
df = df.drop(columns=['Unnamed: 0'])

# Handle missing values (if any)
df = df.dropna()

# Handle categorical values

In [None]:
# Handle categorical values
categorical_columns = df.select_dtypes(include=['object']).columns
if len(categorical_columns) > 0:
    encoder = OneHotEncoder(drop='first', sparse=False)
    encoded_cats = encoder.fit_transform(df[categorical_columns])
    df = df.drop(columns=categorical_columns)
    df = pd.concat([df, pd.DataFrame(encoded_cats)], axis=1)


##  Scale the features

In [None]:
# Scale the numerical features
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1]).astype(float)  


  0.3132473 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1]).astype(float)  # Explicitly cast to float
  1.00184116]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1]).astype(float)  # Explicitly cast to float
 -1.27819966]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1]).astype(float)  # Explicitly cast to float
 -0.43468842]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1]).astype(float)  # Explicitly cast to float
 -0.63961498]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_scaled.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1

## remove outliers

In [30]:
# Detect and remove outliers using Z-score method (threshold = 3)
z_scores = np.abs(df_scaled.iloc[:, :-1])  # Compute Z-scores for all features except the target
df_cleaned = df_scaled[(z_scores < 3).all(axis=1)]



## Feature Selection

In [40]:
# Feature selection using SelectKBest
X = df_cleaned.iloc[:, :-1]
y = df_cleaned.iloc[:, -1]
selector = SelectKBest(score_func=f_classif, k=X.shape[1])
X_selected = selector.fit_transform(X, y)



## PCA

In [41]:
# Perform PCA to reduce dimensionality
pca = PCA(n_components=0.95)  # Retain 95% of variance
df_pca = pca.fit_transform(X_selected)

# Display the number of components selected
print(f"Original feature count: {df.shape[1] - 1}")  # Excluding target
print(f"Reduced feature count after PCA: {df_pca.shape[1]}")

# Show explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)

Original feature count: 8
Reduced feature count after PCA: 7
Explained variance ratio: [0.19795352 0.17600756 0.12633067 0.12561105 0.12476293 0.12452003
 0.12372267]
