# Feature Engineering for HeartDisease Prediction

## 1.Data Exploration

In [2]:
import pandas as pd
df = pd.read_csv('heart_disease_uci.csv')

In [3]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [4]:
# Display the first 5 rows of the dataset
print(df.head())


   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal    0  


In [5]:
# Identify numerical features
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
print("Numerical features:", numerical_features)

# Identify categorical features
categorical_features = df.select_dtypes(include=['object']).columns
print("Categorical features:", categorical_features)


Numerical features: Index(['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num'], dtype='object')
Categorical features: Index(['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal'], dtype='object')


In [9]:
df.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [11]:
df.dtypes

id            int64
age           int64
sex          object
dataset      object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [5]:
df.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

## 2. Handling Missing Data

In [18]:
df['thal'].fillna(df['thal'].mode()[0], inplace=True)
df['ca'].fillna(df['ca'].mode()[0], inplace=True)


In [19]:
df.dropna(inplace=True)

In [20]:
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904,905,57,Male,VA Long Beach,asymptomatic,130.0,207.0,False,st-t abnormality,96.0,True,1.0,flat,0.0,normal,0
908,909,74,Male,VA Long Beach,asymptomatic,155.0,310.0,False,normal,112.0,True,1.5,downsloping,0.0,normal,2
910,911,51,Female,VA Long Beach,asymptomatic,114.0,258.0,True,lv hypertrophy,96.0,False,1.0,upsloping,0.0,normal,0
911,912,62,Male,VA Long Beach,asymptomatic,160.0,254.0,True,st-t abnormality,108.0,True,3.0,flat,0.0,normal,4


In [21]:
df.isnull().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

## 3. Feature Creation

In [43]:
bins = [0, 40, 60, 100]
labels = ['<40', '40-60', '>60']
df['AgeGroup'] = pd.cut(df['age'], bins=bins, labels=labels)

In [44]:
def cholesterol_level(chol):
    if chol < 200:
        return 'Low'
    elif 200 <= chol <= 239:
        return 'Normal'
    else:
        return 'High'

df['CholesterolLevel'] = df['chol'].apply(cholesterol_level)

In [45]:
df['IsRisk'] = ((df['chol'] > 240) | (df['trestbps'] > 140) | (df['age'] > 60)).astype(int)

In [10]:
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


## 4. Feature Transformation

In [46]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in ['sex', 'cp', 'thal', 'AgeGroup']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [47]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numerical_cols = ['chol', 'trestbps', 'thalch']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [48]:
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,AgeGroup,CholesterolLevel,IsRisk
0,1,63,1,Cleveland,3,0.725,0.386401,True,lv hypertrophy,0.633803,False,2.3,downsloping,0.0,0,0,2,Normal,1
1,2,67,1,Cleveland,0,0.800,0.474295,False,lv hypertrophy,0.338028,True,1.5,flat,3.0,1,2,2,High,1
2,3,67,1,Cleveland,0,0.600,0.379768,False,lv hypertrophy,0.485915,True,2.6,flat,2.0,2,1,2,Normal,1
3,4,37,1,Cleveland,2,0.650,0.414594,False,normal,0.894366,False,3.5,downsloping,0.0,1,0,1,High,1
4,5,41,0,Cleveland,1,0.650,0.338308,False,lv hypertrophy,0.788732,False,1.4,upsloping,0.0,1,0,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904,905,57,1,VA Long Beach,0,0.650,0.343284,False,st-t abnormality,0.253521,True,1.0,flat,0.0,1,0,0,Normal,0
908,909,74,1,VA Long Beach,0,0.775,0.514096,False,normal,0.366197,True,1.5,downsloping,0.0,1,2,2,High,1
910,911,51,0,VA Long Beach,0,0.570,0.427861,True,lv hypertrophy,0.253521,False,1.0,upsloping,0.0,1,0,0,High,1
911,912,62,1,VA Long Beach,0,0.800,0.421227,True,st-t abnormality,0.338028,True,3.0,flat,0.0,1,4,2,High,1


## 5. Feature Interaction

In [49]:
df['BP_Chol_Interaction'] = df['trestbps'] * df['chol']

In [50]:
threshold = 100
df['ExerciseRisk'] = ((df['exang'] == 1) & (df['thalch'] < threshold)).astype(int)

In [51]:
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,...,oldpeak,slope,ca,thal,num,AgeGroup,CholesterolLevel,IsRisk,BP_Chol_Interaction,ExerciseRisk
0,1,63,1,Cleveland,3,0.725,0.386401,True,lv hypertrophy,0.633803,...,2.3,downsloping,0.0,0,0,2,Normal,1,0.280141,0
1,2,67,1,Cleveland,0,0.800,0.474295,False,lv hypertrophy,0.338028,...,1.5,flat,3.0,1,2,2,High,1,0.379436,1
2,3,67,1,Cleveland,0,0.600,0.379768,False,lv hypertrophy,0.485915,...,2.6,flat,2.0,2,1,2,Normal,1,0.227861,1
3,4,37,1,Cleveland,2,0.650,0.414594,False,normal,0.894366,...,3.5,downsloping,0.0,1,0,1,High,1,0.269486,0
4,5,41,0,Cleveland,1,0.650,0.338308,False,lv hypertrophy,0.788732,...,1.4,upsloping,0.0,1,0,0,Normal,0,0.219900,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904,905,57,1,VA Long Beach,0,0.650,0.343284,False,st-t abnormality,0.253521,...,1.0,flat,0.0,1,0,0,Normal,0,0.223134,1
908,909,74,1,VA Long Beach,0,0.775,0.514096,False,normal,0.366197,...,1.5,downsloping,0.0,1,2,2,High,1,0.398425,1
910,911,51,0,VA Long Beach,0,0.570,0.427861,True,lv hypertrophy,0.253521,...,1.0,upsloping,0.0,1,0,0,High,1,0.243881,0
911,912,62,1,VA Long Beach,0,0.800,0.421227,True,st-t abnormality,0.338028,...,3.0,flat,0.0,1,4,2,High,1,0.336982,1
