In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Load the processed data
data = pd.read_csv('heart_disease_processed.csv')

In [3]:
# Basic statistical summary
print("Basic Statistical Summary:")
print(data.describe())

Basic Statistical Summary:
                age      trestbps          chol       thalach       oldpeak  \
count  1.025000e+03  1.025000e+03  1.025000e+03  1.025000e+03  1.025000e+03   
mean  -3.396741e-16 -7.070767e-16 -3.466062e-18 -4.297917e-16 -2.443574e-16   
std    1.000488e+00  1.000488e+00  1.000488e+00  1.000488e+00  1.000488e+00   
min   -2.804866e+00 -2.148237e+00 -2.327054e+00 -3.397080e+00 -9.123291e-01   
25%   -7.095548e-01 -6.632165e-01 -6.787242e-01 -7.442713e-01 -9.123291e-01   
50%    1.726817e-01 -9.205458e-02 -1.163527e-01  1.255019e-01 -2.311765e-01   
75%    7.240794e-01  4.791073e-01  5.623715e-01  7.343432e-01  6.202642e-01   
max    2.488552e+00  3.906079e+00  6.166694e+00  2.299935e+00  4.366603e+00   

            target  
count  1025.000000  
mean      0.513171  
std       0.500070  
min       0.000000  
25%       0.000000  
50%       1.000000  
75%       1.000000  
max       1.000000  


In [14]:
# Correlation analysis
plt.figure(figsize=(20, 16))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.close()

In [5]:
# Distribution of numerical features
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols, 1):
    plt.subplot(2, 3, i)
    sns.histplot(data=data, x=col, kde=True)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.savefig('numerical_distributions.png')
plt.close()

  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):


In [6]:
# Target variable distribution
plt.figure(figsize=(8, 6))
data['target'].value_counts().plot(kind='bar')
plt.title('Distribution of Target Variable')
plt.xlabel('Heart Disease')
plt.ylabel('Count')
plt.xticks([0, 1], ['No', 'Yes'])
plt.savefig('target_distribution.png')
plt.close()

In [7]:
# Relationship between age, cholesterol, and heart disease
plt.figure(figsize=(10, 8))
sns.scatterplot(data=data, x='age', y='chol', hue='target', style='target')
plt.title('Age vs Cholesterol, colored by Heart Disease')
plt.savefig('age_chol_scatter.png')
plt.close()

In [8]:
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(data=data, x='target', y=col)
    plt.title(f'{col} by Heart Disease')
plt.tight_layout()
plt.savefig('boxplots_by_target.png')
plt.close()

In [9]:
# Feature importance using mutual information
from sklearn.feature_selection import mutual_info_classif

X = data.drop('target', axis=1)
y = data['target']
mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)


In [10]:
plt.figure(figsize=(10, 8))
mi_scores.plot.bar()
plt.title('Feature Importance based on Mutual Information')
plt.xlabel('Features')
plt.ylabel('Mutual Information Score')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

In [11]:
print("\nEDA Completed. Visualization files have been saved.")
print("\nTop 5 Important Features based on Mutual Information:")
print(mi_scores.head())


EDA Completed. Visualization files have been saved.

Top 5 Important Features based on Mutual Information:
chol       0.259706
thalach    0.173722
cp_0.0     0.129815
oldpeak    0.125181
ca_0.0     0.121016
Name: MI Scores, dtype: float64


In [12]:
# Statistical tests
print("\nStatistical Tests:")
for col in numerical_cols:
    t_stat, p_value = stats.ttest_ind(data[data['target'] == 0][col], 
                                      data[data['target'] == 1][col])
    print(f"{col}: t-statistic = {t_stat:.4f}, p-value = {p_value:.4f}")


Statistical Tests:
age: t-statistic = 7.5356, p-value = 0.0000
trestbps: t-statistic = 4.4819, p-value = 0.0000
chol: t-statistic = 3.2134, p-value = 0.0014
thalach: t-statistic = -14.9265, p-value = 0.0000
oldpeak: t-statistic = 15.6029, p-value = 0.0000
