In [1]:
import pandas as pd
import matplotlib.pyplot as plt

**ST/HR slope** (ST/heart rate slope): the ST segment shift relative to exercise-induced increments in heart rate.
Source: The ST segment/heart rate slope as a predictor of coronary artery disease: comparison with quantitative thallium imaging and conventional ST segment criteria https://pubmed.ncbi.nlm.nih.gov/3739881/

Korzystając z biblioteki pandas, wczytaj plik `heart_disease_dataset.csv` do struktury DataFrame (jest to reprezentacja tabeli).

In [2]:
# Read the heart disease CSV file into a pandas DataFrame
heart_disease = pd.read_csv(filepath_or_buffer="heart_disease_dataset.csv")


### Zadanie 1. 
Oblicz podstawowe statystyki opisowe (średnia, mediana, odchylenie standardowe) dla kolumn 'Age', 'Resting blood pressure', 'Serum cholesterol in mg/dl', 'Maximum heart rate achieved'.

In [3]:
# Keep only the four columns that Task 1 asks about:
# Age, Resting blood pressure, Serum cholesterol in mg/dl, Maximum heart rate achieved
heart_disease_subset = heart_disease.loc[
    :,
    [
        'Age',
        'Resting blood pressure',
        'Serum cholesterol in mg/dl',
        'Maximum heart rate achieved',
    ],
]

In [4]:
# Summarise every column with describe(), keep only the mean, median
# (reported by describe() as the '50%' row) and standard deviation rows,
# then transpose so that each original column becomes one row of the result.
basic_stats = (
    heart_disease_subset
    .describe()
    .loc[['mean', '50%', 'std']]
    .T
)

# Show the table rounded to one decimal place for readability
basic_stats.round(1)


Unnamed: 0,mean,50%,std
Age,54.4,56.0,9.1
Resting blood pressure,131.6,130.0,17.5
Serum cholesterol in mg/dl,246.0,240.0,51.6
Maximum heart rate achieved,149.1,152.0,23.0


In [5]:
# Alternative to describe(): computes only the three statistics that are needed
# (mean, median, standard deviation) instead of the full describe() summary.

def calculate_basic_stats(x):
    """Return the mean, median and standard deviation of ``x``.

    Parameters:
        x: object exposing ``mean()``, ``median()`` and ``std()`` methods
           (this notebook passes one DataFrame column at a time via ``agg``).

    Returns:
        pd.Series indexed by 'Mean', 'Median' and 'Standard Deviation',
        holding the results of the corresponding method calls.
    """
    stat_labels = ['Mean', 'Median', 'Standard Deviation']
    stat_values = [x.mean(), x.median(), x.std()]
    return pd.Series(stat_values, index=stat_labels)

In [6]:
# Select the same four columns again so that this cell does not depend on the
# subset created for the describe()-based variant above.
heart_disease_subset2 = heart_disease[['Age','Resting blood pressure','Serum cholesterol in mg/dl','Maximum heart rate achieved']]

# Aggregate the frame built in this cell (previously the earlier
# `heart_disease_subset` was aggregated and `heart_disease_subset2` was unused).
# agg() applies calculate_basic_stats to each column; the result is rounded to
# one decimal place and transposed so that every column becomes one row.
heart_disease_subset2.agg(calculate_basic_stats).round(1).T

Unnamed: 0,Mean,Median,Standard Deviation
Age,54.4,56.0,9.1
Resting blood pressure,131.6,130.0,17.5
Serum cholesterol in mg/dl,246.0,240.0,51.6
Maximum heart rate achieved,149.1,152.0,23.0


#### Zadanie 2. 
Jaki jest rozkład płci w zestawie danych? Przedstaw go na wykresie kołowym.

In [7]:
# Count how many rows fall under each distinct value of the 'Sex' column
gender_distribution = heart_disease['Sex'].value_counts()
gender_distribution

Sex
male      713
female    312
Name: count, dtype: int64

In [9]:
# Look up the per-gender counts by their index label in the value_counts() result
number_male_patients = gender_distribution.loc['male']
number_female_patients = gender_distribution.loc['female']

# Report both counts (previously they were computed but never shown)
print(f"Male patients: {number_male_patients}")
print(f"Female patients: {number_female_patients}")

In [None]:
# Pie chart of the gender distribution; Series.plot returns the Axes it drew on
ax = gender_distribution.plot(
    kind="pie",
    autopct="%.1f%%",  # label every slice with its percentage share
    title="Gender distribution in the dataset",
)
# Remove the y-axis label that pandas adds to pie charts; the trailing
# semicolon keeps the notebook from echoing the return value.
ax.set_ylabel("");

#### Zadanie 3. 
Czy (w analizowanej próbie) na choroby serca choruje więcej kobiet, czy mężczyzn?
O ile procent więcej?

In [None]:
# Task 3 only needs the patient's gender and the disease flag
heart_disease_subset3 = heart_disease.loc[:, ['Sex', 'Disease']]
# Preview the first five rows (head() defaults to n=5)
heart_disease_subset3.head()

In [None]:
# Group the rows by gender and, within every group, count how often each
# distinct value of the remaining column ('Disease') occurs.
# NOTE: `counts_by_gender` is rebound with a different meaning in the next cell.
counts_by_gender = heart_disease_subset3.groupby('Sex').value_counts()
counts_by_gender

In [None]:
# Count the patients WITH heart disease per gender: the 'Disease' column is used
# directly as a row mask (assumes it holds booleans — TODO confirm), then the
# remaining rows are counted per 'Sex' group.
counts_by_gender = heart_disease_subset3[heart_disease_subset3['Disease']].groupby('Sex').size()

# Gender with the larger number of diseased patients (index label of the maximum)
more_frequently_suffering_gender = counts_by_gender.idxmax()

# Number of diseased patients in the larger group
number_patients_more_suffering = counts_by_gender.max()

# Number of diseased patients in the smaller group
number_patients_less_suffering = counts_by_gender.min()

# Total number of diseased patients across all genders
number_patients_with_heart_disease = counts_by_gender.sum()

# Difference between the two groups expressed as a percentage of ALL diseased patients.
# NOTE(review): this is the gap between the two groups' shares of the diseased
# patients. If "X% more" is meant relative to the smaller group, the denominator
# should be number_patients_less_suffering instead — confirm the intended reading.
difference_by_gender = ((number_patients_more_suffering - number_patients_less_suffering) / number_patients_with_heart_disease) * 100

# The percent sign must sit outside the replacement field: the former "{:.2f%}"
# is not a valid format specification and raised ValueError.
print(
    f"There is {difference_by_gender:.2f}% more {more_frequently_suffering_gender}"
    " patients suffering from the heart disease in the analysed dataset."
)


#### Zadanie 4
Porównaj średnią wartość cholesterolu we krwi osobno dla grupy kobiet i grupy
mężczyzn w zależności od obecności choroby serca.

#### Zadanie 5
Narysuj wykres pudełkowy maksymalnej osiągniętej wartości tętna podczas testu
wysiłkowego w zależności od obecności choroby serca. Co można zauważyć na
podstawie tego wykresu?

In [None]:
# Task 5 only needs the maximum heart rate and the disease flag
heart_disease_subset5 = heart_disease.loc[:, ['Maximum heart rate achieved', 'Disease']]
# Preview the first five rows (head() defaults to n=5)
heart_disease_subset5.head()

In [None]:
fig, ax = plt.subplots()

# Draw on the Axes created above instead of going through pyplot's implicit
# "current axes". One box per group: patients with the disease first, then without.
# `v` keeps the dictionary of artists returned by boxplot().
v = ax.boxplot(
    [
        heart_disease_subset5.query("Disease == True")["Maximum heart rate achieved"],
        heart_disease_subset5.query("Disease == False")["Maximum heart rate achieved"],
    ],
    showmeans=True,    # mark the mean in addition to the median line
    showfliers=False,  # outliers are hidden — keep that in mind when interpreting
    medianprops={"color": "orange", "linewidth": 1},
    whiskerprops={"color": "cyan", "linewidth": 1},
    capprops={"color": "lightblue", "linewidth": 1},
)

# Tick labels follow the order in which the two groups were passed to boxplot()
ax.set_xticklabels(['Heart Disease Present', 'Heart Disease Not Present'])
ax.yaxis.set_major_formatter(plt.FormatStrFormatter('%d bpm'))

# A figure should be readable on its own: name the measured quantity and add a title
ax.set_ylabel("Maximum heart rate achieved")
ax.set_title("Maximum heart rate achieved by heart disease status");

#### Zadanie 6
Narysuj wykres słupkowy częstości występowania choroby serca w zależności od
informacji, czy u pacjenta występuje ból dławicowy podczas testu wysiłkowego. Co można zauważyć na podstawie wykresu?