# Import data

In [2]:
import pandas as pd
import numpy as np

# Raw GitHub URL of the heart_2020_cleaned.csv dataset (downloaded on every run).
datalink = 'https://raw.githubusercontent.com/elabrodsky/tulanemba/main/heart_2020_cleaned.csv'

# The file is comma-separated with its column names in the first row, which is
# exactly what read_csv assumes by default -- no need for read_table + sep/header.
df = pd.read_csv(datalink)

# Numerical Exploratory Data Analysis:

## Column Data Types:

In [4]:
# Print the frame's size plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

## Summary of Numerical Columns:

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [7]:
# Presentable statistical summary: skip the leading "count" row, keep the four
# numeric columns, and transpose so that each column becomes one row.
(
    df.describe()
    .iloc[1:]
    .loc[:, ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']]
    .T
)

Unnamed: 0,mean,std,min,25%,50%,75%,max
BMI,28.325399,6.3561,12.02,24.03,27.34,31.42,94.85
PhysicalHealth,3.37171,7.95085,0.0,0.0,0.0,2.0,30.0
MentalHealth,3.898366,7.955235,0.0,0.0,0.0,3.0,30.0
SleepTime,7.097075,1.436007,1.0,6.0,7.0,8.0,24.0


In [8]:
# Same summary as a styled table: skip the "count" row, transpose, then shade
# the cells with the 'Reds' colormap.
(
    df.describe()
    .iloc[1:]
    .loc[:, ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']]
    .T
    .style
    .background_gradient(cmap='Reds')
)

Unnamed: 0,mean,std,min,25%,50%,75%,max
BMI,28.325399,6.3561,12.02,24.03,27.34,31.42,94.85
PhysicalHealth,3.37171,7.95085,0.0,0.0,0.0,2.0,30.0
MentalHealth,3.898366,7.955235,0.0,0.0,0.0,3.0,30.0
SleepTime,7.097075,1.436007,1.0,6.0,7.0,8.0,24.0


## Summary of Categorical Columns:

### Count the number of Yes and No values in each of the chosen columns

In [10]:
# Number of rows in each HeartDisease category ("No" / "Yes")
df['HeartDisease'].value_counts()

No     292422
Yes     27373
Name: HeartDisease, dtype: int64

In [11]:
# Percentage of Yes and No in the HeartDisease column.
# value_counts() is evaluated once and piped into the percentage conversion,
# instead of counting the whole column twice.
print("HeartDisease statistics in this data: \n")
print(df['HeartDisease'].value_counts().pipe(lambda counts: counts / counts.sum() * 100))

HeartDisease statistics in this data: 

No     91.440454
Yes     8.559546
Name: HeartDisease, dtype: float64


## Summary of different factors and their association with Heart Disease:

**Two-factor association**: `pd.crosstab` counts how often each combination of categories occurs, using one column's values as the row index and another column's values as the table columns. It scales from simple two-way counts to multi-level tables.

In [12]:
# Contingency table of raw counts: HeartDisease in the rows, Smoking in the
# columns, plus an "All" row and column holding the totals.
pd.crosstab(
    index=df['HeartDisease'],
    columns=df['Smoking'],
    margins=True,
)

Smoking,No,Yes,All
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,176551,115871,292422
Yes,11336,16037,27373
All,187887,131908,319795


In [13]:
# Count table with totals, rendered as a Styler shaded with the 'Greens' colormap
(
    pd.crosstab(index=df['HeartDisease'], columns=df['Smoking'], margins=True)
    .style
    .background_gradient(cmap='Greens')
)

Smoking,No,Yes,All
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,176551,115871,292422
Yes,11336,16037,27373
All,187887,131908,319795


### Normalize by rows:

In [15]:
# Row-wise shares: each HeartDisease row (and the "All" margin row) sums to 1.
pd.crosstab(
    index=df['HeartDisease'],
    columns=df['Smoking'],
    margins=True,
    normalize='index',
)

Smoking,No,Yes
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.603754,0.396246
Yes,0.414131,0.585869
All,0.587523,0.412477


In [16]:
# Row-normalized table, with the 'Greens' gradient applied along the index axis
(
    pd.crosstab(
        index=df['HeartDisease'],
        columns=df['Smoking'],
        margins=True,
        normalize='index',
    )
    .style
    .background_gradient(cmap='Greens', axis='index')
)

Smoking,No,Yes
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.603754,0.396246
Yes,0.414131,0.585869
All,0.587523,0.412477


In [17]:
# Row-normalized table shown as whole percentages rather than ratios,
# with the 'Greens' gradient applied along the index axis
(
    pd.crosstab(
        index=df['HeartDisease'],
        columns=df['Smoking'],
        margins=True,
        normalize='index',
    )
    .style
    .format("{:.0%}")
    .background_gradient(cmap='Greens', axis='index')
)

Smoking,No,Yes
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1
No,60%,40%
Yes,41%,59%
All,59%,41%


### Normalize by columns:

In [18]:
# What share of smokers (vs. non-smokers) have heart disease?
# normalize='columns' makes every Smoking column -- and the "All" margin column --
# sum to 100%, so each cell reads "percent of this smoking group with/without
# heart disease". (The share of heart-disease patients who smoke is the
# row-normalized table, normalize='index', not this one.)
(
    pd.crosstab(
        index=df['HeartDisease'],
        columns=df['Smoking'],
        margins=True,
        normalize='columns',
    )
    .style
    .format("{:.0%}")
    .background_gradient(cmap='Greens', axis='columns')
)

Smoking,No,Yes,All
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,94%,88%,91%
Yes,6%,12%,9%


### Two factors and multi-level associations:

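Among people with heart disease, are male smokers or female smokers the larger group? (Rows are normalized, so each row shows how that heart-disease group is split across sex and smoking status, not the prevalence of heart disease within each sex/smoking group.)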

In [20]:
# Row-normalized split of each HeartDisease group across Sex x Smoking,
# shown as whole percentages with a 'Purples' gradient along the columns axis.
(
    pd.crosstab(
        df['HeartDisease'],
        [df[factor] for factor in ('Sex', 'Smoking')],
        margins=True,
        normalize='index',
    )
    .style
    .format("{:.0%}")
    .background_gradient(cmap='Purples', axis='columns')
)

Sex,Female,Female,Male,Male
Smoking,No,Yes,No,Yes
HeartDisease,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
No,34%,19%,26%,20%
Yes,20%,21%,22%,37%
All,33%,20%,26%,22%


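Among people with heart disease, is a stroke more common in females or in males? (Rows are normalized, so each row shows how that heart-disease group is split across sex and stroke status.)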

In [21]:
# Row-normalized split of each HeartDisease group across Sex x Stroke,
# shown as whole percentages with an 'Oranges' gradient along the columns axis.
(
    pd.crosstab(
        df['HeartDisease'],
        [df[factor] for factor in ('Sex', 'Stroke')],
        margins=True,
        normalize='index',
    )
    .style
    .format("{:.0%}")
    .background_gradient(cmap='Oranges', axis='columns')
)

Sex,Female,Female,Male,Male
Stroke,No,Yes,No,Yes
HeartDisease,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
No,52%,2%,45%,1%
Yes,34%,7%,50%,9%
All,50%,2%,46%,2%


### Multi-association between Heart Disease, Stroke, Sex and Smoking?

In [23]:
# Row-normalized split of each HeartDisease group across Sex x Stroke x Smoking,
# shown as whole percentages with an 'Oranges' gradient along the columns axis.
(
    pd.crosstab(
        df['HeartDisease'],
        [df[factor] for factor in ('Sex', 'Stroke', 'Smoking')],
        margins=True,
        normalize='index',
    )
    .style
    .format("{:.0%}")
    .background_gradient(cmap='Oranges', axis='columns')
)

Sex,Female,Female,Female,Female,Male,Male,Male,Male
Stroke,No,No,Yes,Yes,No,No,Yes,Yes
Smoking,No,Yes,No,Yes,No,Yes,No,Yes
HeartDisease,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
No,33%,19%,1%,1%,26%,20%,0%,1%
Yes,16%,17%,3%,4%,19%,31%,3%,6%
All,32%,19%,1%,1%,25%,21%,1%,1%
