In [1]:
# Import libraries
import pandas as pd
import numpy as np

# Load the dataset
heart_disease_df = pd.read_csv("../data/heart_disease_df_1.csv")

## 1. Drop empty columns

In [2]:
heart_disease_column_dropped = heart_disease_df.drop(['oldpeak'], axis=1)

# 2. Drop duplicate rows

In [3]:
heart_disease_duplicates_dropped = heart_disease_column_dropped.drop_duplicates()

# 3. Handle missing values

In [4]:
# 3a. Impute 'restecg' with mean
restecg_mean = heart_disease_duplicates_dropped['restecg'].mean()
heart_disease_duplicates_dropped['restecg'].fillna(restecg_mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  heart_disease_duplicates_dropped['restecg'].fillna(restecg_mean, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_disease_duplicates_dropped['restecg'].fillna(restecg_mean, inplace=True)


In [5]:
# 3b. Impute 'cp' (chest pain type) with mode

# Define the possible categories
cp_categories = [1, 2, 3]

# Find missing indices
missing_cp_indices = heart_disease_duplicates_dropped[heart_disease_duplicates_dropped['cp'].isna()].index

# Randomly assign a category to each missing value
heart_disease_duplicates_dropped.loc[missing_cp_indices, 'cp'] = np.random.choice(cp_categories, size=len(missing_cp_indices))

In [6]:
# 3c. Impute 'chol' (cholesterol) with median
chol_median = heart_disease_duplicates_dropped['chol'].median()
heart_disease_duplicates_dropped['chol'].fillna(chol_median, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  heart_disease_duplicates_dropped['chol'].fillna(chol_median, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_disease_duplicates_dropped['chol'].fillna(chol_median, inplace=True)


# 4. Verify missing values

In [7]:
print("Missing values after imputation:")
print(heart_disease_duplicates_dropped.isna().sum())

Missing values after imputation:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
slope       0
ca          0
thal        0
target      0
dtype: int64


# 5. Verify data types and basic info

In [12]:
print("\nDataFrame info:")
print(heart_disease_df.info())
print("\n ------------------------------------------------ \n")
print(heart_disease_duplicates_dropped.info())


DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1076 entries, 0 to 1075
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1076 non-null   int64  
 1   sex       1076 non-null   int64  
 2   cp        1023 non-null   float64
 3   trestbps  1076 non-null   int64  
 4   chol      1021 non-null   float64
 5   fbs       1076 non-null   int64  
 6   restecg   1028 non-null   float64
 7   thalach   1076 non-null   int64  
 8   exang     1076 non-null   int64  
 9   oldpeak   0 non-null      float64
 10  slope     1076 non-null   int64  
 11  ca        1076 non-null   int64  
 12  thal      1076 non-null   int64  
 13  target    1076 non-null   int64  
dtypes: float64(4), int64(10)
memory usage: 117.8 KB
None

 ------------------------------------------------ 

<class 'pandas.core.frame.DataFrame'>
Index: 438 entries, 0 to 1015
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
-

# 6. Optional: Reset index

In [9]:
heart_disease_prepared = heart_disease_duplicates_dropped.reset_index(drop=True)

# 7. Save prepared dataset (optional)

In [10]:
heart_disease_prepared.to_csv("../data/heart_disease_prepared.csv", index=False)