In [1]:
import pandas as pd

# Read the raw heart-disease records into a DataFrame and preview them.
raw_path = "heart_disease_dataset.csv"
df = pd.read_csv(raw_path)
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,67.0,1.0,0.0,176.0,148.0,1.0,,,1.0,5.6,3.0,3.0,fixed,0.0
1,57.0,1.0,1.0,155.0,551.0,0.0,1.0,98.0,1.0,,2.0,4.0,fixed,0.0
2,43.0,1.0,0.0,125.0,519.0,1.0,0.0,113.0,1.0,,1.0,4.0,reversable,1.0
3,71.0,,0.0,123.0,285.0,0.0,2.0,156.0,0.0,6.2,3.0,1.0,reversable,1.0
4,36.0,0.0,0.0,122.0,488.0,1.0,1.0,,1.0,3.3,3.0,1.0,normal,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,47.0,1.0,1.0,,239.0,1.0,2.0,105.0,1.0,2.9,2.0,1.0,,0.0
496,60.0,1.0,2.0,98.0,270.0,1.0,,100.0,1.0,5.4,1.0,0.0,reversable,0.0
497,29.0,1.0,2.0,138.0,,1.0,0.0,127.0,0.0,1.3,2.0,1.0,reversable,0.0
498,33.0,0.0,2.0,151.0,261.0,0.0,0.0,117.0,1.0,0.8,3.0,3.0,fixed,


In [2]:
# Summarise each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       448 non-null    float64
 1   sex       441 non-null    float64
 2   cp        456 non-null    float64
 3   trestbps  457 non-null    float64
 4   chol      448 non-null    float64
 5   fbs       456 non-null    float64
 6   restecg   446 non-null    float64
 7   thalach   456 non-null    float64
 8   exang     464 non-null    float64
 9   oldpeak   444 non-null    float64
 10  slope     468 non-null    float64
 11  ca        453 non-null    float64
 12  thal      450 non-null    object 
 13  target    453 non-null    float64
dtypes: float64(13), object(1)
memory usage: 54.8+ KB


In [3]:
# Count the missing values in each column.
missing_counts = df.isnull().sum()
print(missing_counts)


age         52
sex         59
cp          44
trestbps    43
chol        52
fbs         44
restecg     54
thalach     44
exang       36
oldpeak     56
slope       32
ca          47
thal        50
target      47
dtype: int64


In [4]:
# Numerical columns: fill missing values with each column's median.
num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
num_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(num_medians)


# Categorical columns: fill missing values with each column's mode
# (when several values tie, the first one returned by mode() is used).
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
cat_modes = {name: df[name].mode()[0] for name in cat_cols}
for col, most_frequent in cat_modes.items():
    df[col] = df[col].fillna(most_frequent)


In [5]:
# The label is not imputed: rows with a missing 'target' are removed in place.
df.dropna(axis='index', subset=['target'], inplace=True)

In [6]:
# These columns were loaded as float64 (see the df.info() report); with
# their missing values filled in, cast each of them to int.
int_cols = ['age', 'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca']
for col in int_cols:
    df[col] = df[col].astype(int)


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the string-valued 'thal' column as integer codes, store them in
# 'thal_encoded', and remove the original text column.
le_thal = LabelEncoder()
thal_codes = le_thal.fit_transform(df['thal'])
df['thal_encoded'] = thal_codes
df.drop(columns=['thal'], inplace=True)

# Show which integer code each original category was assigned.
thal_mapping = {
    label: code
    for label, code in zip(le_thal.classes_, le_thal.transform(le_thal.classes_))
}
print("Encoding Map:", thal_mapping)


Encoding Map: {'fixed': np.int64(0), 'normal': np.int64(1), 'reversable': np.int64(2)}


In [8]:
# Report how many rows are exact duplicates of an earlier row, then remove
# them in place (the first occurrence of each is kept).
n_duplicates = df.duplicated().sum()
print(n_duplicates)
df.drop_duplicates(inplace=True)

0


In [9]:
# Confirm no missing values and correct data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
Index: 453 entries, 0 to 499
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           453 non-null    int64  
 1   sex           453 non-null    int64  
 2   cp            453 non-null    int64  
 3   trestbps      453 non-null    float64
 4   chol          453 non-null    float64
 5   fbs           453 non-null    int64  
 6   restecg       453 non-null    int64  
 7   thalach       453 non-null    float64
 8   exang         453 non-null    int64  
 9   oldpeak       453 non-null    float64
 10  slope         453 non-null    int64  
 11  ca            453 non-null    int64  
 12  target        453 non-null    float64
 13  thal_encoded  453 non-null    int64  
dtypes: float64(5), int64(9)
memory usage: 53.1 KB
None
age             0
sex             0
cp              0
trestbps        0
chol            0
fbs             0
restecg         0
thalach         0
exang       

In [10]:
# Persist the cleaned data. index=False keeps the row index (which has gaps
# after rows were dropped) out of the file; otherwise it would be written as
# an extra unnamed first column and come back as "Unnamed: 0" on reload.
df.to_csv("heart_disease_cleaned.csv", index=False)


In [11]:
# Display the cleaned DataFrame.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,target,thal_encoded
0,67,1,0,176.0,148.0,1,0,137.5,1,5.6,3,3,0.0,0
1,57,1,1,155.0,551.0,0,1,98.0,1,3.2,2,4,0.0,0
2,43,1,0,125.0,519.0,1,0,113.0,1,3.2,1,4,1.0,2
3,71,0,0,123.0,285.0,0,2,156.0,0,6.2,3,1,1.0,2
4,36,0,0,122.0,488.0,1,1,137.5,1,3.3,3,1,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,38,0,0,153.0,539.0,1,1,91.0,0,4.7,3,4,0.0,2
495,47,1,1,147.0,239.0,1,2,105.0,1,2.9,2,1,0.0,2
496,60,1,2,98.0,270.0,1,0,100.0,1,5.4,1,0,0.0,2
497,29,1,2,138.0,353.5,1,0,127.0,0,1.3,2,1,0.0,2
