In [1]:
# Data Exploration

In [2]:
#Load the dataset using Pandas.

In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('heart_disease_uci.csv')

In [5]:
#Display the first 5 rows and check for missing values and data types

In [6]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [7]:
df.dtypes

id            int64
age           int64
sex          object
dataset      object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object

In [8]:
df.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [9]:
#Identify the types of features (categorical vs. numerical) and any possible missing 
#data.

In [10]:
Numerical_Features = df.select_dtypes(include = ['int64','float64']).columns

In [11]:
Numerical_Features

Index(['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num'], dtype='object')

In [12]:
Categorical_Features = df.select_dtypes(include = ['object','category']).columns

In [13]:
Categorical_Features

Index(['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal'], dtype='object')

# Handling Missing Data

In [14]:
#Handle missing values by imputing or removing rows/columns with missing values.

In [15]:
df.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [16]:
# Imputing numerical col
for column in ['trestbps', 'chol', 'thalch', 'oldpeak']:
    df[column].fillna(df[column].mean(), inplace=True)

# Imputing categorical columns
for column in ['fbs', 'restecg', 'exang']:
    df[column].fillna(df[column].mode()[0], inplace=True)

In [17]:
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.000000,233.0,True,lv hypertrophy,150.000000,False,2.300000,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.000000,286.0,False,lv hypertrophy,108.000000,True,1.500000,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.000000,229.0,False,lv hypertrophy,129.000000,True,2.600000,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.000000,250.0,False,normal,187.000000,False,3.500000,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.000000,204.0,False,lv hypertrophy,172.000000,False,1.400000,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.000000,333.0,True,st-t abnormality,154.000000,False,0.000000,,,,1
916,917,62,Male,VA Long Beach,typical angina,132.132404,139.0,False,st-t abnormality,137.545665,False,0.878788,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.000000,223.0,True,st-t abnormality,100.000000,False,0.000000,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,132.132404,385.0,True,lv hypertrophy,137.545665,False,0.878788,,,,0


In [19]:
df.drop(['ca'],axis = 1, inplace=True)

In [20]:
for column in ['thal', 'slope']:
    df[column].fillna(df[column].mode()[0], inplace=True)

In [21]:
df.isnull().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
thal        0
num         0
dtype: int64

# Feature Creation

In [22]:
#Create new features based on existing columns to add more information:

In [23]:
#Age Groups: Create age group categories (e.g., <40, 40-60, >60)

In [24]:
def age_group(age):
    if age<40:
        return ("<40")
    elif 40<= age <=60:
        return ("40-60")
    else:
        return (">60")

In [25]:
df["age_group"] = df["age"].apply(age_group)
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num,age_group
0,1,63,Male,Cleveland,typical angina,145.000000,233.0,True,lv hypertrophy,150.000000,False,2.300000,downsloping,fixed defect,0,>60
1,2,67,Male,Cleveland,asymptomatic,160.000000,286.0,False,lv hypertrophy,108.000000,True,1.500000,flat,normal,2,>60
2,3,67,Male,Cleveland,asymptomatic,120.000000,229.0,False,lv hypertrophy,129.000000,True,2.600000,flat,reversable defect,1,>60
3,4,37,Male,Cleveland,non-anginal,130.000000,250.0,False,normal,187.000000,False,3.500000,downsloping,normal,0,<40
4,5,41,Female,Cleveland,atypical angina,130.000000,204.0,False,lv hypertrophy,172.000000,False,1.400000,upsloping,normal,0,40-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.000000,333.0,True,st-t abnormality,154.000000,False,0.000000,flat,normal,1,40-60
916,917,62,Male,VA Long Beach,typical angina,132.132404,139.0,False,st-t abnormality,137.545665,False,0.878788,flat,normal,0,>60
917,918,55,Male,VA Long Beach,asymptomatic,122.000000,223.0,True,st-t abnormality,100.000000,False,0.000000,flat,fixed defect,2,40-60
918,919,58,Male,VA Long Beach,asymptomatic,132.132404,385.0,True,lv hypertrophy,137.545665,False,0.878788,flat,normal,0,40-60


In [26]:
def chol_types(chol):
    if chol < 200:
        return "Low"
    elif 200 <= chol <= 239:
        return "Medium"
    else:
        return "High"

In [27]:
df["chol_types"] = df["chol"].apply(chol_types)
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num,age_group,chol_types
0,1,63,Male,Cleveland,typical angina,145.000000,233.0,True,lv hypertrophy,150.000000,False,2.300000,downsloping,fixed defect,0,>60,Medium
1,2,67,Male,Cleveland,asymptomatic,160.000000,286.0,False,lv hypertrophy,108.000000,True,1.500000,flat,normal,2,>60,High
2,3,67,Male,Cleveland,asymptomatic,120.000000,229.0,False,lv hypertrophy,129.000000,True,2.600000,flat,reversable defect,1,>60,Medium
3,4,37,Male,Cleveland,non-anginal,130.000000,250.0,False,normal,187.000000,False,3.500000,downsloping,normal,0,<40,High
4,5,41,Female,Cleveland,atypical angina,130.000000,204.0,False,lv hypertrophy,172.000000,False,1.400000,upsloping,normal,0,40-60,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.000000,333.0,True,st-t abnormality,154.000000,False,0.000000,flat,normal,1,40-60,High
916,917,62,Male,VA Long Beach,typical angina,132.132404,139.0,False,st-t abnormality,137.545665,False,0.878788,flat,normal,0,>60,Low
917,918,55,Male,VA Long Beach,asymptomatic,122.000000,223.0,True,st-t abnormality,100.000000,False,0.000000,flat,fixed defect,2,40-60,Medium
918,919,58,Male,VA Long Beach,asymptomatic,132.132404,385.0,True,lv hypertrophy,137.545665,False,0.878788,flat,normal,0,40-60,High


In [28]:
df['IsRisk'] = ((df['chol'] > 240) & (df['trestbps'] > 140) & (df['age'] > 60)).astype(int)
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num,age_group,chol_types,IsRisk
0,1,63,Male,Cleveland,typical angina,145.000000,233.0,True,lv hypertrophy,150.000000,False,2.300000,downsloping,fixed defect,0,>60,Medium,0
1,2,67,Male,Cleveland,asymptomatic,160.000000,286.0,False,lv hypertrophy,108.000000,True,1.500000,flat,normal,2,>60,High,1
2,3,67,Male,Cleveland,asymptomatic,120.000000,229.0,False,lv hypertrophy,129.000000,True,2.600000,flat,reversable defect,1,>60,Medium,0
3,4,37,Male,Cleveland,non-anginal,130.000000,250.0,False,normal,187.000000,False,3.500000,downsloping,normal,0,<40,High,0
4,5,41,Female,Cleveland,atypical angina,130.000000,204.0,False,lv hypertrophy,172.000000,False,1.400000,upsloping,normal,0,40-60,Medium,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.000000,333.0,True,st-t abnormality,154.000000,False,0.000000,flat,normal,1,40-60,High,0
916,917,62,Male,VA Long Beach,typical angina,132.132404,139.0,False,st-t abnormality,137.545665,False,0.878788,flat,normal,0,>60,Low,0
917,918,55,Male,VA Long Beach,asymptomatic,122.000000,223.0,True,st-t abnormality,100.000000,False,0.000000,flat,fixed defect,2,40-60,Medium,0
918,919,58,Male,VA Long Beach,asymptomatic,132.132404,385.0,True,lv hypertrophy,137.545665,False,0.878788,flat,normal,0,40-60,High,0


# Feature Transformation

In [29]:
#Convert categorical features into numerical ones using Label Encoding or One-Hot Encoding

In [31]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for column in ['sex','cp','thal','age_group']:
    df[column] = label_encoder.fit_transform(df[column])

In [37]:
from sklearn.preprocessing import MinMaxScaler

In [38]:
scaler = MinMaxScaler()
df[['chol','trestbps','thalch']] = scaler.fit_transform(df[['chol','trestbps','thalch']])

# Feature Interaction

In [39]:
#Create interaction features by combining two or more features:
#BP-Chol Interaction: Multiply trestbps (resting blood pressure) and chol (cholesterol level) to create an interaction feature

In [40]:
df['BP-Chol_Interaction'] = df['trestbps'] * df['chol']

In [42]:
df['Exercise_Angina_Risk'] = ((df['exang'] == 1) & (df['thalch'] < 100)).astype(int)

In [43]:
df.dtypes

id                        int64
age                       int64
sex                       int32
dataset                  object
cp                        int32
trestbps                float64
chol                    float64
fbs                        bool
restecg                  object
thalch                  float64
exang                      bool
oldpeak                 float64
slope                    object
thal                      int32
num                       int64
age_group                 int32
chol_types               object
IsRisk                    int32
BP-Chol_Interaction     float64
Exercise_Angina_Risk      int32
dtype: object

# Feature Selection

In [44]:
#Perform feature selection by evaluating feature importance using a basic model (optional).

In [45]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [46]:
df_encoded = pd.get_dummies(df, drop_first=True)
X = df_encoded.drop('num', axis=1)  
y = df_encoded['num']               

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [48]:
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("Feature Importance Scores:")
print(feature_importances)


Feature Importance Scores:
                     Feature  Importance
0                         id    0.187858
7                     thalch    0.088808
9                    oldpeak    0.088502
1                        age    0.078909
13       BP-Chol_Interaction    0.069013
4                   trestbps    0.066099
5                       chol    0.061655
3                         cp    0.057544
10                      thal    0.035619
8                      exang    0.032589
15           dataset_Hungary    0.032103
14      Exercise_Angina_Risk    0.024138
2                        sex    0.021106
11                 age_group    0.020313
18            restecg_normal    0.018541
20                slope_flat    0.017590
17     dataset_VA Long Beach    0.016869
16       dataset_Switzerland    0.016362
21           slope_upsloping    0.014858
6                        fbs    0.014335
23         chol_types_Medium    0.012508
19  restecg_st-t abnormality    0.012444
22            chol_types_Low  