In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Global plotting defaults: 10x6 default figure size and seaborn's white-grid style.
# The two settings touch different rcParams, so their order does not matter.
plt.rcParams['figure.figsize'] = (10, 6)
sns.set_style(style='whitegrid')

In [None]:
# Read the raw heart-disease CSV into `df`, report its dimensions,
# and preview the first rows as the cell's rich output.
df = pd.read_csv(filepath_or_buffer='../data/raw/heart.csv')
print('Dataset shape:', df.shape)
print('\nFirst 5 rows:')
df.head(n=5)

In [None]:
# Structural overview of `df`: column dtypes, per-column null counts,
# and descriptive statistics (shown as the cell's rich output).
print('Data types:', df.dtypes, sep='\n')
print('\nMissing values:', df.isna().sum(), sep='\n')
print('\nSummary statistics:')
df.describe()

In [None]:
# Class balance: bar chart of how many rows fall in each `target` class,
# followed by the same information as relative frequencies.
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df, x='target')
ax.set_title('Class Balance: Heart Disease Presence')
ax.set_xlabel('Target (0: No Disease, 1: Disease)')
ax.set_ylabel('Count')
plt.show()

print('Class distribution:', df['target'].value_counts(normalize=True), sep='\n')

In [None]:
# Histograms for the numerical features: one 20-bin panel per column in `num_cols`
num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
df.hist(column=num_cols, bins=20, figsize=(15, 10))
# The histogram grid is the current figure, so attach the overall title to it.
plt.gcf().suptitle('Histograms of Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap
# Correlation coefficients are only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises on
# them instead, so restrict the frame to numeric/boolean columns explicitly.
# For an all-numeric dataset the resulting matrix is unchanged.
plt.figure(figsize=(12, 10))
corr = df.select_dtypes(include=['number', 'bool']).corr()
# annot/fmt print each coefficient in its cell, rounded to two decimals.
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Categorical features analysis: for each categorical column, plot the
# per-category row counts split by `target`, laid out on a 2x4 grid.
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
# Flatten the 2-D grid of axes so panel i lines up with cat_cols[i].
axes = axes.ravel()

for i, col in enumerate(cat_cols):
    if col not in df.columns:
        # Column absent from this dataset: hide its panel instead of leaving
        # an empty, untitled axes frame in the grid.
        axes[i].set_visible(False)
        continue
    sns.countplot(x=col, hue='target', data=df, ax=axes[i])
    axes[i].set_title(f'{col} vs Target')

plt.tight_layout()
plt.show()

In [None]:
# Age distribution by target: stacked 20-bin histogram of `age`,
# with each bar split by `target` class.
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=df, x='age', hue='target', multiple='stack', bins=20)
ax.set_title('Age Distribution by Heart Disease Status')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()