In [1]:
# 1. Import libraries
import pandas as pd
from ydata_profiling import ProfileReport

# 2. Load data
df = pd.read_csv("./heart_cleveland_upload.csv")

# Quick look at the first rows, the schema and the summary statistics.
print("=== HEAD ===")
print(df.head())
print("=== INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()
print("=== DESCRIBE ===")
print(df.describe())



  from .autonotebook import tqdm as notebook_tqdm


=== HEAD ===
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   69    1   0       160   234    1        2      131      0      0.1      1   
1   69    0   0       140   239    0        0      151      0      1.8      0   
2   66    0   0       150   226    0        0      114      0      2.6      2   
3   65    1   0       138   282    1        2      174      0      1.4      1   
4   64    1   0       110   211    0        2      144      1      1.8      1   

   ca  thal  condition  
0   1     0          0  
1   2     0          0  
2   0     0          0  
3   1     0          1  
4   0     0          0  
=== INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    

In [3]:
# 4. Check for missing values and duplicates
# Per-column count of null entries.
missing_values = df.isnull().sum()
print("=== NULL VALUES ===")
print(missing_values)

# Number of rows flagged as duplicates by DataFrame.duplicated().
duplicates = df.duplicated().sum()
print("=== DUPLICATES ===")
# The count was previously computed but never shown, leaving the header
# above with no value underneath it.
print(duplicates)

=== NULL VALUES ===
age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
condition    0
dtype: int64
=== DUPLICATES ===


In [4]:
# 5. Handle duplicate rows and missing values
# Rebind `df` instead of mutating in place; later cells see the same name.
df = df.drop_duplicates()
# Fill any remaining nulls with the per-column median. numeric_only=True
# keeps median() from raising if a non-numeric column is ever present;
# such columns are simply left unfilled.
df = df.fillna(df.median(numeric_only=True))

In [5]:
# 6. Data type conversion for label columns
# This dataset names the label column 'condition' (not 'target').
# is_integer_dtype() is used rather than comparing against the string 'int',
# whose meaning (32- vs 64-bit) depends on the platform's NumPy default.
if not pd.api.types.is_integer_dtype(df['condition']):
    # Explicit width so the result is the same on every platform.
    df['condition'] = df['condition'].astype('int64')

In [6]:
# 7. Generate a profile report
# Build the report object from the current state of `df`.
profile = ProfileReport(
    df,
    title="Heart Disease Dataset Profiling Report",
    explorative=True,
)
# 8. Save the profile report
# Written next to the notebook as an HTML file.
profile.to_file("heart_disease_profile_report.html")

100%|██████████| 14/14 [00:00<00:00, 943.98it/s]00:00, 88.85it/s, Describe variable: condition]
Summarize dataset: 100%|██████████| 48/48 [00:01<00:00, 39.93it/s, Completed]                  
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.02it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 107.04it/s]


In [7]:
# 9. Export the cleaned data
# CSV copy, written without the DataFrame index column.
df.to_csv(
    "cleaned_heart_disease_data.csv",
    index=False,
)
# JSON copy: with orient="records" and lines=True each row is written as one
# JSON object per line (JSON Lines layout, despite the .json extension).
df.to_json(
    "cleaned_heart_disease_data.json",
    orient="records",
    lines=True,
)