6 Feature Engineering in the Heart Disease Dataset

In [1]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds the CSV. It defaults to the original location; set the
# HEART_DISEASE_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = Path(os.environ.get('HEART_DISEASE_DATA_DIR', r'E:\data set'))
file_path = DATA_DIR / 'heart_disease.csv'

# Load the raw data and preview the first rows.
heart_disease = pd.read_csv(file_path)
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Age Groups:

In [4]:
# Decade-wide age bands. Edges are left-closed / right-open (right=False below),
# so '20-29' means 20 <= age < 30, '30-39' means 30 <= age < 40, and so on.
# The previous edges started at (29, 39], which left an age of exactly 29
# outside every bin and turned it into NaN; the extra '20-29' band covers it.
# Integer ages from 30 to 79 receive the same label as before.
bins = [20, 30, 40, 50, 60, 70, 80]
labels = ['20-29', '30-39', '40-49', '50-59', '60-69', '70-79']

# Create age group feature (ages below 20 or from 80 upwards remain NaN)
heart_disease['age_group'] = pd.cut(heart_disease['age'], bins=bins, labels=labels, right=False)

# Display the DataFrame with the new 'age_group' feature
print("\nDataFrame after adding 'age_group':")
heart_disease[['age', 'age_group']].head()



DataFrame after adding 'age_group':


Unnamed: 0,age,age_group
0,63,60-69
1,37,30-39
2,41,40-49
3,56,50-59
4,57,50-59


Create Cholesterol Level Categories (first step: list the available columns)

In [13]:

# List the columns currently in the DataFrame. The label line matches the
# output recorded for this cell, which begins with "Column names in the dataset:".
print("Column names in the dataset:")
print(heart_disease.columns)

Column names in the dataset:
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target', 'age_group'],
      dtype='object')


Create BMI Categories (if weight and height columns exist)

In [14]:
# Derive BMI and a weight category, but only when both input columns exist.
if 'weight' in heart_disease.columns and 'height' in heart_disease.columns:
    # BMI = weight / height**2.
    # NOTE(review): the cut-offs below assume weight in kilograms and height in
    # metres - confirm the units before relying on these categories.
    heart_disease['bmi'] = heart_disease['weight'] / (heart_disease['height'] ** 2)

    # Left-closed bins (right=False): [0, 18.5), [18.5, 25), [25, 30), [30, inf).
    # The edges were 24.9 and 29.9, which pushed BMI values such as 24.95 into
    # 'Overweight' and 29.95 into 'Obesity'; 25 and 30 are the standard cut-offs.
    bmi_bins = [0, 18.5, 25, 30, float('inf')]
    bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obesity']

    # Create BMI category feature
    heart_disease['bmi_category'] = pd.cut(heart_disease['bmi'], bins=bmi_bins, labels=bmi_labels, right=False)

    # Display the DataFrame with the new 'bmi' and 'bmi_category' features
    print("\nDataFrame after adding 'bmi' and 'bmi_category':")
    print(heart_disease[['weight', 'height', 'bmi', 'bmi_category']].head())
else:
    # Nothing to derive: report the missing inputs instead of failing.
    print("\nColumns 'weight' and/or 'height' not found in the dataset.")



Columns 'weight' and/or 'height' not found in the dataset.


Create Binary Features (example: high blood pressure)

In [15]:
# Add a 'Yes'/'No' companion column for 'high_blood_pressure' when that column
# is present; otherwise just report that it is missing.
hbp_source = 'high_blood_pressure'
hbp_flag = 'has_high_blood_pressure'

if hbp_source not in heart_disease.columns:
    print("\nColumn 'high_blood_pressure' not found in the dataset.")
else:
    def _yes_no(value):
        """Return 'Yes' when the value equals 1, otherwise 'No'."""
        if value == 1:
            return 'Yes'
        return 'No'

    # Element-wise conversion of the source column into the new flag column.
    heart_disease[hbp_flag] = heart_disease[hbp_source].apply(_yes_no)

    # Show the source column next to the derived flag.
    print("\nDataFrame after adding 'has_high_blood_pressure':")
    print(heart_disease[[hbp_source, hbp_flag]].head())



Column 'high_blood_pressure' not found in the dataset.


In [18]:
# Final check: preview the first five rows, including every engineered column
# that was actually added above.
preview_heading = "\nUpdated DataFrame with all new features:"
print(preview_heading)
heart_disease.head(5)



Updated DataFrame with all new features:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_group
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,60-69
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,30-39
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,40-49
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,50-59
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,50-59
