## Use the Heart Disease [dataset](https://github.com/cksajil/DSAIRP25/blob/main/datasets/heart_disease.csv) to answer the following questions

## 1. Find the 5 most important features for predicting the target column

In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Read the heart-disease table from disk.
df = pd.read_csv("/heart_disease.csv")  # exact filename

# Split the frame into the label column and the remaining predictor columns.
y = df['target']
X = df.drop(columns='target')

# Fit a random forest on every predictor; the fixed seed keeps the ranking repeatable.
model = RandomForestClassifier(random_state=42).fit(X, y)

# Pair each importance score with its column name, then keep the five largest.
importances = pd.Series(model.feature_importances_, index=X.columns)
top5 = importances.sort_values(ascending=False).iloc[:5]
top5

Unnamed: 0,0
cp,0.134201
thalach,0.120473
ca,0.116755
oldpeak,0.116151
thal,0.097043


## 2. Apply Box-Cox transformations to the relevant features

In [12]:
from scipy import stats

# Candidate columns: every numeric feature except the label.
# `numeric_cols` is also read by a later cell, so its definition is left as it was.
numeric_cols = df.select_dtypes(include=np.number).columns.drop('target')

# Work on a copy so the untransformed frame stays available as `df`.
df_boxcox = df.copy()

# Fitted lambda for each transformed column, kept so the transform can be
# reproduced on new data or inverted later.
boxcox_lambdas = {}

for col in numeric_cols:
    values = df[col]
    # Box-Cox is only defined for strictly positive data, and scipy rejects
    # constant input (and returns no lambda for empty input), so those columns
    # are left untouched rather than aborting the whole loop.
    if (values > 0).all() and values.nunique() > 1:
        df_boxcox[col], boxcox_lambdas[col] = stats.boxcox(values)

df_boxcox.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,272.372422,1,0,1.313871,4.138423,0,1,31304.528038,0,1.0,2,2,3,0
1,280.42939,1,0,1.316927,4.113095,1,0,26281.499672,1,3.1,0,0,3,0
2,429.185698,1,0,1.317822,4.022191,0,1,16473.093059,1,2.6,0,0,3,0
3,347.72537,1,0,1.318334,4.113095,0,1,28540.973582,0,0.0,2,1,3,0
4,356.482692,0,0,1.316553,4.325815,1,1,11515.358896,0,1.9,1,3,2,0


## 3. Bin the age column and add the result to the dataset as a new column

In [13]:
# Age bands, keyed by label with the upper edge of each band as the value.
# pd.cut's default right-closed intervals apply: (0, 40], (40, 55], (55, 70], (70, 100].
age_bands = {'Young': 40, 'Middle': 55, 'Senior': 70, 'Elder': 100}
bins = [0, *age_bands.values()]
labels = list(age_bands)

# Attach the band label to every row as a new column on the dataset.
df['age_group'] = pd.cut(x=df['age'], bins=bins, labels=labels)

# Show the raw age next to its band for the first few rows.
df.loc[:, ['age', 'age_group']].head()

Unnamed: 0,age,age_group
0,52,Middle
1,53,Middle
2,70,Senior
3,61,Senior
4,62,Senior


## 4. Find the most orthogonal feature to the 'chol' feature

In [14]:
# Find most orthogonal feature to 'chol'
from numpy.linalg import norm

chol_vec = df['chol'].values
chol_norm = norm(chol_vec)


def _abs_cosine_with_chol(other):
    """Return |cos(angle)| between the raw 'chol' vector and `other`."""
    return abs(np.dot(chol_vec, other) / (chol_norm * norm(other)))


# Absolute cosine similarity of every other numeric feature against 'chol';
# values nearer to 0 mean the raw column vectors are closer to perpendicular.
orthogonality = {
    col: _abs_cosine_with_chol(df[col].values)
    for col in numeric_cols
    if col != 'chol'
}

# The feature with the smallest |cosine| is the most orthogonal one.
min(orthogonality, key=orthogonality.get)

'fbs'