# Exploratory Data Analysis (EDA) - Heart Disease Dataset

In [None]:
# Install required libraries
!pip install seaborn

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Load dataset
# NOTE: the URL below is a placeholder sample "people" CSV used for the demo,
# not the heart disease data; replace it with the heart dataset if available.
# NOTE(review): later cells read an 'age' column (after column labels are
# lower-cased) - confirm the file loaded here actually provides one.
url = 'https://raw.githubusercontent.com/datablist/sample-csv-files/main/files/people/people-100.csv'
df = pd.read_csv(url)
# Preview the first rows as a quick sanity check of the load.
df.head()

In [None]:
# 1. Impute missing values: numeric columns receive their column mean,
#    then any remaining gaps receive that column's most frequent value.
numeric_means = df.mean(numeric_only=True)
df.fillna(value=numeric_means, inplace=True)

# df.mode() may return several rows when values tie; row 0 carries the
# first modal value of each column.
most_frequent = df.mode().iloc[0]
df.fillna(value=most_frequent, inplace=True)

# Count of missing values still present in each column.
df.isna().sum()

In [None]:
# 2. Clean data inconsistencies: trim surrounding whitespace and lower-case
#    both the column labels and every string cell.
df.columns = df.columns.str.strip().str.lower()


def _normalize_text(value):
    """Return *value* stripped and lower-cased if it is a str, else unchanged."""
    return value.strip().lower() if isinstance(value, str) else value


# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map).
# Mapping each column with Series.map performs the same element-wise
# transformation without a deprecation warning on older and newer pandas.
df = df.apply(lambda column: column.map(_normalize_text))
df.head()

In [None]:
# 3. Boxplot of every numeric column, used to spot outliers visually
numeric_features = df.select_dtypes(include=np.number)
numeric_features.boxplot(figsize=(10, 5))
plt.xticks(rotation=45)  # rotate the per-column tick labels by 45 degrees
plt.title("Boxplot of Numeric Features")
plt.show()

In [None]:
# 4. Histogram of the 'age' column, split into 10 bins
df.hist(column='age', bins=10, figsize=(6, 4))
plt.title("Histogram of Age")
plt.show()

In [None]:
# 5. Identify data types
# Displays the dtype pandas holds for each column of df.
df.dtypes

In [None]:
# 6. Per-column count of cells whose value equals zero
df.eq(0).sum()

In [None]:
# 7. Compute mean of Age
# Arithmetic mean of the 'age' column; pandas skips NaN values by default.
df['age'].mean()

In [None]:
# 8. Find dimensions of dataset
# df.shape is a (number_of_rows, number_of_columns) tuple.
df.shape

In [None]:
# 9. Draw scatter plot (age vs salary if both columns are available)
# The plot reads both columns, so verify both up front; previously only
# 'salary' was checked and a missing 'age' column raised a KeyError.
missing_columns = [name for name in ('age', 'salary') if name not in df.columns]
if not missing_columns:
    plt.scatter(df['age'], df['salary'])
    plt.xlabel("Age")
    plt.ylabel("Salary")
    plt.title("Age vs Salary")
    plt.show()
else:
    # Report every absent column so the reader knows exactly what to add.
    for name in missing_columns:
        print(f"Column '{name}' not in dataset.")

In [None]:
# 10. Mean, Median, Mode of Age
age_values = df['age']
mean = age_values.mean()
median = age_values.median()
# Series.mode() returns an empty Series when the column has no non-null
# values, so indexing its first element would raise. Fall back to NaN,
# which is what mean() and median() return for numeric data in that case.
age_modes = age_values.mode()
mode = age_modes.iloc[0] if not age_modes.empty else np.nan
mean, median, mode

In [None]:
# 11. Normal probability (quantile) plot of Age
age_sample = df['age']
stats.probplot(age_sample, dist="norm", plot=plt)
# Set the title after probplot has drawn, so this one is what is displayed.
plt.title("Quantile Plot of Age")
plt.show()