In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("cardio_train.csv", delimiter = ";")

In [3]:
df['age_in_years'] = round(df['age'] / 365.25)
df['gender'] = df['gender'].map({1: 'Female', 2: 'Male'})
df['cardio'] = df['cardio'].map({1: 'Yes', 0: 'No'})
df[['gender', 'cardio']] = df[['gender', 'cardio']].astype(str)

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_years
0,0,18393,Male,168,62.0,110,80,1,1,0,0,1,No,50.0
1,1,20228,Female,156,85.0,140,90,3,1,0,0,1,Yes,55.0
2,2,18857,Female,165,64.0,130,70,3,1,0,0,0,Yes,52.0
3,3,17623,Male,169,82.0,150,100,1,1,0,0,1,Yes,48.0
4,4,17474,Female,156,56.0,100,60,1,1,0,0,0,No,48.0


In [4]:
df['BMI'] = round(df['weight']/((df['height']/100)**2), 1)

# Define conditions and corresponding categories
conditions = [
    (df['BMI'] <= 18.4),
    (df['BMI'] <= 24.9),
    (df['BMI'] <= 39.9),
    (df['BMI'] >= 40.0),
]

categories = [
    "Underweight",
    "Normal",
    "Overweight",
    "Obese"
]

# Apply categorization using numpy.select
df['BMI'] = np.select(conditions, categories, default="Other")
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_years,BMI
0,0,18393,Male,168,62.0,110,80,1,1,0,0,1,No,50.0,Normal
1,1,20228,Female,156,85.0,140,90,3,1,0,0,1,Yes,55.0,Overweight
2,2,18857,Female,165,64.0,130,70,3,1,0,0,0,Yes,52.0,Normal
3,3,17623,Male,169,82.0,150,100,1,1,0,0,1,Yes,48.0,Overweight
4,4,17474,Female,156,56.0,100,60,1,1,0,0,0,No,48.0,Normal


In [5]:
import altair as alt

In [6]:
# Define the age groups
bins = [30, 40, 50, 60, 70]
labels = ['30-40', '41-50', '51-60', '61-70']
df['age_group'] = pd.cut(df['age_in_years'], bins=bins, labels=labels, right=False)

# Check the distribution in each age group
df['age_group'].value_counts()

age_group
51-60    35396
41-50    18389
61-70    15741
30-40      474
Name: count, dtype: int64

In [7]:
# Check the data types of the columns
print(df.dtypes)

id                 int64
age                int64
gender            object
height             int64
weight           float64
ap_hi              int64
ap_lo              int64
cholesterol        int64
gluc               int64
smoke              int64
alco               int64
active             int64
cardio            object
age_in_years     float64
BMI               object
age_group       category
dtype: object


In [None]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_years,BMI,age_group
0,0,18393,Male,168,62.0,110,80,1,1,0,0,1,No,50.0,Normal,51-60
1,1,20228,Female,156,85.0,140,90,3,1,0,0,1,Yes,55.0,Overweight,51-60
2,2,18857,Female,165,64.0,130,70,3,1,0,0,0,Yes,52.0,Normal,51-60
3,3,17623,Male,169,82.0,150,100,1,1,0,0,1,Yes,48.0,Overweight,41-50
4,4,17474,Female,156,56.0,100,60,1,1,0,0,0,No,48.0,Normal,41-50


In [8]:
# Aggregate the data
agg_data = df.groupby(['gender', 'cardio', 'age_group', 'BMI', 'active']).size().reset_index(name='counts')
agg_data['percentage'] = agg_data.groupby(['gender', 'age_group', 'BMI', 'active'])['counts'].transform(lambda x: x / x.sum())

# Filter 1: Gender (buttons)
gender_selection = alt.selection_single(
    fields=['gender'],
    name="Gender",
    bind=alt.binding_radio(options=agg_data['gender'].unique(), name="Gender: "),
    init={'gender': agg_data['gender'].unique()[0]}
)

# Filter 2: Age (dropdown)
age_selection = alt.selection_single(
    fields=['age_group'],
    name="Age Group",
    bind=alt.binding_select(options=agg_data['age_group'].cat.categories.tolist(), name="Age Group: "),
    init={'age_group': agg_data['age_group'].cat.categories[0]}
)

# Filter 3: BMI (dropdown)
bmi_selection = alt.selection_single(
    fields=['BMI'],
    name="BMI Category",
    bind=alt.binding_select(options=['Underweight', 'Normal', 'Overweight', 'Obese'], name="BMI Category: "),
    init={'BMI': 'Underweight'}
)

# Define the donut chart
donut_chart = alt.Chart(agg_data).mark_arc(innerRadius=50).encode(
    theta=alt.Theta(field="counts", type="quantitative", stack=True),
    color=alt.Color('cardio', type="nominal", legend=alt.Legend(title="Cardiovascular Disease"),
                   scale=alt.Scale(domain=['Yes', 'No'], range=['darkblue', 'skyblue'])),
    tooltip=[
        alt.Tooltip('counts', title='Count'),
        alt.Tooltip('percentage', title='Percentage', format='.1%'),
        alt.Tooltip('cardio', title='CVD Status')
    ]
).transform_filter(
    gender_selection
).transform_filter(
    age_selection
).transform_filter(
    bmi_selection
).add_selection(
    gender_selection,
    age_selection,
    bmi_selection
).properties(
    title="Distribution of Cardiovascular Disease by Gender, Age, and BMI"
)


In [9]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [10]:
active_data = df[df['active'] == 1]
inactive_data = df[df['active'] == 0]

In [11]:
click = alt.selection_multi(encodings=['color'])

sort_order = ['Underweight',  'Normal', 'Overweight', 'Obese']
count_data = df.groupby(['active', 'smoke', 'alco', 'cardio','gender','BMI','age_group']).size().reset_index(name='count')

# Calculate total count for each 'active' category
total_count = len(count_data)

# Calculate percentage for each category
count_data['percentage'] = count_data['count'] / total_count * 100


# Create the stacked bar chart
stacked_color_scheme = alt.Scale(domain=['No', 'Yes', 'Underweight', 'Normal','Overweight', 'Obese', 'Female', 'Male', '30-40', '41-50','51-60', '61-70'],
                                 range=['skyblue', '#125ca4', '#bab0ac', '#9d755d', '#F69CA6', '#756bb1', '#5DA247', '#E8C92E', '#9ecae1', '#f58518', '#e45756', '#72b7b2' ])
stacked_act = alt.Chart(count_data).mark_bar().encode(
    x=alt.X('sum(percentage):Q', stack='normalize', axis=alt.Axis(title='% Count')),
    y=alt.Y('active:N', axis=alt.Axis(title='Active')),
    color=alt.Color('cardio:N', legend=alt.Legend(title='Cardiovascular Disease'), scale = stacked_color_scheme),
    order=alt.Order('cardio:N', sort='ascending'),  # Sort the bars
    tooltip=['active', 'percentage']
).transform_filter(
    click
)

# Create the stacked bar chart
stacked_smoke = alt.Chart(count_data).mark_bar().encode(
    x=alt.X('sum(percentage):Q', stack='normalize', axis=alt.Axis(title='% Count')),
    y=alt.Y('smoke:N', axis=alt.Axis(title='Smoke')),
    color=alt.Color('cardio:N', legend=alt.Legend(title='Cardiovascular Disease')),
    order=alt.Order('cardio:N', sort='ascending'),  # Sort the bars
    tooltip=['smoke', 'percentage']
).transform_filter(
    click
)

# Create the stacked bar chart
stacked_alco = alt.Chart(count_data).mark_bar().encode(
    x=alt.X('sum(percentage):Q', stack='normalize', axis=alt.Axis(title='% Count')),
    y=alt.Y('alco:N', axis=alt.Axis(title='Alcohol')),
    color=alt.Color('cardio:N', legend=alt.Legend(title='Cardiovascular Disease')),
    order=alt.Order('cardio:N', sort='ascending'),  # Sort the bars
    tooltip=['alco', 'percentage']
).transform_filter(
    click
)

hist_active = alt.Chart(df).mark_bar().encode(
    x='count()',
    y='active:N',
    color='cardio:N'
).transform_filter(
    click
)

hist_BMI = alt.Chart(df).mark_bar().encode(
    x='count()',
    y=alt.Y('BMI:N', sort=sort_order),
    color=alt.condition(click, 'BMI:N', alt.value('lightgray'))
).add_selection(
    click
)

hist_gender = alt.Chart(df).mark_bar().encode(
    x='count()',
    y='gender:N',
    color=alt.condition(click, 'gender:N', alt.value('lightgray'))
).add_selection(
    click
)

hist_age = alt.Chart(df).mark_bar().encode(
    x='count()',
    y='age_group:O',
    color=alt.condition(click, 'age_group:O', alt.value('lightgray'))
).add_selection(
    click
)
stacked_act & stacked_smoke & stacked_alco & hist_BMI & hist_gender &hist_age

Output hidden; open in https://colab.research.google.com to view.

In [13]:
combined_chart = (stacked_act & stacked_smoke & stacked_alco & hist_BMI & hist_gender & hist_age)

combined_chart.save('combined_3plots.html')