In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Cleveland heart-disease CSV; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/Heart_disease_cleveland_new.csv",
)

In [None]:
# Preview the first few rows instead of rendering the entire frame;
# the full dimensions are reported separately via df.shape.
df.head()

1. **age**: Patient's age in years
2. **sex**: Sex of the patient (1 = male; 0 = female)
3. **cp**: Type of chest pain experienced by the patient, categorized into 4 types:
    - 0 typical angina,
    - 1 atypical angina,
    - 2 non-anginal pain,
    - 3 asymptomatic 
4. **trestbps**: Patient's resting blood pressure in mm Hg
5. **chol**: Serum cholesterol in mg/dl
6. **fbs**: Whether fasting blood sugar is > 120 mg/dl (1 = true; 0 = false)
7. **restecg**: Resting electrocardiogram result, represented by 3 distinct values:
   - 0 : Normal
   - 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
   - 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
8. **thalach**: Maximum heart rate achieved 
9. **exang**: Exercise-induced angina (1 = yes; 0 = no)
10. **oldpeak**: ST depression induced by exercise relative to rest
11. **slope**: Slope of the ST segment at peak exercise (0: upsloping; 1: flat; 2: downsloping)
12. **ca**: The number of major vessels (0–3)
13. **thal**: A blood disorder called thalassemia
    - 0: NULL
    - 1: normal blood flow
    - 2: fixed defect (no blood flow in some part of the heart)
    - 3: reversible defect (blood flow is observed, but it is not normal)
14. ***target***: The target variable to predict: 1 means the patient has heart disease and 0 means the patient is normal.

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Summary of the frame: per-column dtype and non-null count, plus memory usage.
df.info()

In [None]:
# Number of missing entries in each column.
missing_vals = df.isna().sum()
missing_vals

In [None]:
# Missing entries per column as a percentage of all rows, rounded to 2 decimals.
missing_pct = (missing_vals / len(df) * 100).round(2)
missing_pct

In [None]:
# Row-by-column map of null cells: any missing entry shows up as a
# contrasting mark against the background colour.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:
# Count rows that exactly repeat an earlier row; the first occurrence of
# each repeated row is not counted as a duplicate.
duplicate_count = df.duplicated(keep='first').sum()
print(f"\nNumber of duplicate records: {duplicate_count}")

In [None]:
# Continuous measurements, used by the distribution, violin, box-plot and
# skewness cells.
numerical_cols = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

# Integer-coded categorical features.
# NOTE(review): 'thal' is described as categorical in the data dictionary
# but is not listed here — confirm whether the omission is intentional.
categorical_cols = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
]

In [None]:
# Histogram with a KDE overlay for every continuous feature, laid out on a
# 2x3 grid (one panel per feature).
fig = plt.figure(figsize=(15, 10))
for idx, col in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(2, 3, idx)
    sns.histplot(df[col], kde=True, bins=30, color='mediumseagreen', ax=ax)
    ax.set_title(f'Distribution of {col}')
fig.tight_layout()
plt.show()

In [None]:
# Violin plot for every continuous feature, laid out on a 2x3 grid
# (one panel per feature).
fig = plt.figure(figsize=(15, 10))
for idx, col in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(2, 3, idx)
    sns.violinplot(x=df[col], color='orchid', ax=ax)
    ax.set_title(f'Violin Plot of {col}')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between all numeric columns of the frame, drawn as
# an annotated heatmap on a diverging colour map.
corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='RdBu_r', square=True, ax=ax)
ax.set_title("Correlation Matrix of Numerical Features")
plt.show()

In [None]:
# Box plot for every continuous feature, to expose outliers.
# The grid is sized from the number of features (3 panels per row) so the
# figure has no empty rows and the layout matches the histogram and
# violin-plot cells for the five features in numerical_cols.
n_grid_cols = 3
n_grid_rows = -(-len(numerical_cols) // n_grid_cols)  # ceiling division

fig = plt.figure(figsize=(15, 10))
for idx, col in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, idx)
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
fig.tight_layout()
plt.show()

In [None]:
# Skewness of each continuous feature (values near 0 indicate a roughly
# symmetric distribution).
df.loc[:, numerical_cols].skew()

In [None]:
# Consolidated missing-value report: per-column counts and percentages,
# restricted to the columns that actually contain missing entries.
# NOTE: this reassigns missing_vals / missing_pct; missing_pct is left
# unrounded here, unlike where it is first computed.
missing_vals = df.isnull().sum()
missing_pct = (missing_vals / len(df)) * 100
missing_df = pd.DataFrame({'Missing Values': missing_vals, 'Percentage': missing_pct})

columns_with_gaps = missing_df[missing_df['Missing Values'] > 0]
if columns_with_gaps.empty:
    # Printing an empty frame yields "Empty DataFrame / Columns: [...]",
    # which reads like an error; state explicitly that the data is complete.
    print("No missing values found in any column.")
else:
    print(columns_with_gaps)

# Row-by-column map of null cells across the whole frame.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()