# Import and Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import altair as alt

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime; the dataset
# is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the semicolon-separated dataset from the mounted shared drive.
cardio_csv_path = '/content/drive/Shareddrives/Data Visualisation with XLBs/Final Project/cardio_train.csv'
df = pd.read_csv(cardio_csv_path, sep=";")

In [None]:
# Basic data preprocessing
# `age` is divided by 365.25 and rounded to whole years
# (presumably the raw column is in days -- confirm against the dataset docs).
df['age_in_years'] = (df['age'] / 365.25).round()

# Replace numeric codes with readable labels. Codes missing from a mapping
# become NaN and are then turned into the string 'nan' by astype(str).
df['gender'] = df['gender'].map({1: 'Female', 2: 'Male'})
df['cardio'] = df['cardio'].map({1: 'Yes', 0: 'No'})
df[['gender', 'cardio']] = df[['gender', 'cardio']].astype(str)

# BMI = weight / (height / 100) ** 2, rounded to one decimal
# (assumes weight in kg and height in cm -- TODO confirm).
df['BMI'] = (df['weight'] / (df['height'] / 100) ** 2).round(1)

# np.select uses the FIRST condition that matches, so each upper bound only
# applies to rows not already claimed by an earlier category.
# NOTE(review): 'Overweight' covers everything up to 39.9 and 'Obese' starts
# at 40.0; the commonly cited obesity cut-off is 30 -- confirm this is intended.
conditions = [
    (df['BMI'] <= 18.4),
    (df['BMI'] <= 24.9),
    (df['BMI'] <= 39.9),
    (df['BMI'] >= 40.0),
]

categories = [
    "Underweight",
    "Normal",
    "Overweight",
    "Obese",
]

# The numeric BMI column is overwritten by its category label. Rows matching
# no condition (e.g. NaN BMI) fall back to "Other".
df['BMI'] = np.select(conditions, categories, default="Other")

# Age groups
# The labels describe right-closed ranges ('41-50' ends at 50), so the bins
# must be right-closed as well: [30, 40], (40, 50], (50, 60], (60, 70].
# With right=False an age of exactly 50 was filed under '51-60'.
# include_lowest=True keeps age 30 inside the first group; ages outside
# 30-70 become NaN.
bins = [30, 40, 50, 60, 70]
labels = ['30-40', '41-50', '51-60', '61-70']
df['age_group'] = pd.cut(
    df['age_in_years'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_years,BMI,age_group
0,0,18393,Male,168,62.0,110,80,1,1,0,0,1,No,50.0,Normal,51-60
1,1,20228,Female,156,85.0,140,90,3,1,0,0,1,Yes,55.0,Overweight,51-60
2,2,18857,Female,165,64.0,130,70,3,1,0,0,0,Yes,52.0,Normal,51-60
3,3,17623,Male,169,82.0,150,100,1,1,0,0,1,Yes,48.0,Overweight,41-50
4,4,17474,Female,156,56.0,100,60,1,1,0,0,0,No,48.0,Normal,41-50


In [None]:
# Shared colour scale for the 'cardio' field: 'Yes' -> darkblue, 'No' -> skyblue.
cvd_domain = ['Yes', 'No']
cvd_range = ['darkblue', 'skyblue']
color_scale = alt.Scale(domain=cvd_domain, range=cvd_range)

# Chart 1: Activity Level

## Step 1: Create bar chart depicting the relationship between activity and CVD

In [None]:
# Calculate the total number of active and inactive individuals.
# Summing the column counts the rows flagged 1; the rest are treated as inactive.
total_active = df['active'].sum()
total_inactive = df['active'].count() - total_active

# Number of individuals for each combination of activity status and cardiovascular disease
agg_data_active_bar = df.groupby(['active', 'cardio']).size().reset_index(name='counts')

# Normalize the counts within each activity group so every bar sums to 100.
# The percentage is computed directly as a float column: the previous in-place
# `/=` wrote floats into an integer column copied from `counts`, which relies
# on silent upcasting that pandas has deprecated.
activity_group_totals = agg_data_active_bar['active'].map({1: total_active, 0: total_inactive})
agg_data_active_bar['percentage'] = agg_data_active_bar['counts'] / activity_group_totals * 100

# Dataframe creation: add the human-readable row label used on the y-axis
text_data_active = agg_data_active_bar.copy()
text_data_active['summary'] = np.where(
    text_data_active['active'] == 1,
    'Active Individuals',
    'Inactive Individuals',
)

# Y-axis order
y_order_active = ['Inactive Individuals', 'Active Individuals']

# Normalized horizontal bar chart
bar_active = alt.Chart(text_data_active).mark_bar().encode(
    y=alt.Y('summary:N', title=None, sort=y_order_active),
    x=alt.X('percentage:Q', title='Percentage'),
    color=alt.Color('cardio:N', legend=alt.Legend(title="Cardiovascular Disease", orient='right'), scale=color_scale),
    tooltip=[
        alt.Tooltip('counts:Q', title='Number of Individuals', format=','),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=400,
    height=150,
    title=alt.TitleParams(
        text="Relationship between Activity Level and Cardiovascular Disease",
        anchor="start",
        offset=10,
        orient="top",
        fontSize=14
    )
)

bar_active


## Step 2: Create donut charts to show relationship between activity level and CVD under specified filters (age, gender, BMI)

In [None]:
# Aggregate data (exercise)
# `age_group` is categorical, so without observed=True pandas emits a row for
# every category combination, including ones absent from df; those rows have
# counts == 0 and a 0/0 = NaN percentage. observed=True keeps only real groups.
agg_data = (
    df.groupby(['gender', 'cardio', 'age_group', 'BMI', 'active'], observed=True)
    .size()
    .reset_index(name='counts')
)

# Share of each CVD status within its (gender, age_group, BMI, active) slice.
activity_slice_totals = agg_data.groupby(
    ['gender', 'age_group', 'BMI', 'active'], observed=True
)['counts'].transform('sum')
agg_data['percentage'] = agg_data['counts'] / activity_slice_totals

In [None]:
# Filter 1: Gender (radio buttons), initialised to the first gender value in agg_data
gender_options = agg_data['gender'].unique()
gender_selection = alt.selection_single(
    name="Gender",
    fields=['gender'],
    init={'gender': gender_options[0]},
    bind=alt.binding_radio(name="Gender: ", options=gender_options),
)

# Filter 2: Age (dropdown), initialised to the first age-group category
age_group_options = agg_data['age_group'].cat.categories
age_selection = alt.selection_single(
    name="Age Group",
    fields=['age_group'],
    init={'age_group': age_group_options[0]},
    bind=alt.binding_select(name="Age Group: ", options=age_group_options.tolist()),
)

# Filter 3: BMI (dropdown), initialised to 'Underweight'
bmi_options = ['Underweight', 'Normal', 'Overweight', 'Obese']
bmi_selection = alt.selection_single(
    name="BMI Category",
    fields=['BMI'],
    init={'BMI': 'Underweight'},
    bind=alt.binding_select(name="BMI Category: ", options=bmi_options),
)

In [None]:
# Data Preprocessing
# Share of the WHOLE dataset falling into each (cardio, active) combination.
total_counts = df.groupby(['cardio', 'active']).size().reset_index(name='counts')
total_counts['total_percentage'] = total_counts['counts'] / total_counts['counts'].sum()

# Row masks for picking out the two "has CVD" shares.
has_cvd = total_counts['cardio'] == 'Yes'
is_active = total_counts['active'] == 1
is_inactive = total_counts['active'] == 0

active_with_cvd_percentage = total_counts.loc[is_active & has_cvd, 'total_percentage'].iloc[0]
inactive_with_cvd_percentage = total_counts.loc[is_inactive & has_cvd, 'total_percentage'].iloc[0]

# Summary line formatted as percentages with one decimal place.
subtitle_text = (
    f"Active with CVD: {active_with_cvd_percentage:.1%}, "
    f"Inactive with CVD: {inactive_with_cvd_percentage:.1%}"
)

In [None]:
# Define donut chart
# Base donut shared by the active and inactive charts: one arc per CVD status,
# sized by its share, restricted to the current gender / age / BMI selection.
donut_chart = alt.Chart(agg_data).mark_arc(innerRadius=50).encode(
    theta=alt.Theta(field="percentage", type="quantitative", stack=True),
    color=alt.Color('cardio:N', legend=None, scale=color_scale),
    tooltip=[
        # Counts are numeric: typed as quantitative with a thousands separator,
        # matching the tooltip of the bar chart.
        alt.Tooltip('counts:Q', title='Number of Individuals', format=','),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1%'),
        alt.Tooltip('cardio:N', title='CVD Status')
    ]
).transform_filter(
    gender_selection
).transform_filter(
    age_selection
).transform_filter(
    bmi_selection
).add_selection(
    gender_selection,
    age_selection,
    bmi_selection
)

In [None]:
# Only the active-individuals donut carries a legend; the inactive donut
# keeps the base chart's legend=None.
activity_legend = alt.Legend(
    title="Cardiovascular Disease",
    orient='bottom',
    titleFontSize=12,
    labelFontSize=12,
)

# Donut restricted to rows with active == 1.
active_chart = (
    donut_chart
    .transform_filter(alt.datum.active == 1)
    .properties(title=alt.TitleParams("CVD amongst Active Individuals (filtered)", fontSize=14))
    .encode(color=alt.Color('cardio:N', scale=color_scale, legend=activity_legend))
)

# Donut restricted to rows with active == 0.
inactive_chart = (
    donut_chart
    .transform_filter(alt.datum.active == 0)
    .properties(title=alt.TitleParams("CVD amongst Inactive Individuals (filtered)", fontSize=14))
)

# Place both donuts side by side under a common title, without view borders.
exercise_title = alt.TitleParams(
    text="Distribution of Cardiovascular Disease by Activity Level under Specified Filters",
    fontSize=16,
)
combined_exercise = (
    alt.hconcat(active_chart, inactive_chart)
    .resolve_scale(color='independent')
    .properties(title=exercise_title)
    .configure_view(stroke=None)
)

combined_exercise

## Step 3: Create interactive view

For interactions:
- Click on legend symbol to change view, then click on empty space right below legend to reset view
- Click on bar graph/donut chart to change view, then click on empty space right beside donut chart to reset view

In [None]:
# Define chart selection: click a mark or a legend entry to pick CVD statuses
chart_selection_active = alt.selection_multi(
    name="chartSelection",
    fields=['cardio'],
    on='click',
    bind='legend',
)

# Selected marks stay fully opaque; everything else fades to 0.2.
opacity_active = alt.condition(chart_selection_active, alt.value(1), alt.value(0.2))
# The donuts drop their own legend; the bar chart's legend drives the selection.
donut_color_active = alt.Color('cardio:N', scale=color_scale, legend=None)
bar_legend_active = alt.Legend(
    symbolSize=200,
    title="Cardiovascular Disease",
    titleFontSize=12,
    labelFontSize=12,
)

# Update bar chart with selection
bar_active = bar_active.add_selection(chart_selection_active).encode(
    opacity=opacity_active,
    color=alt.Color('cardio:N', scale=color_scale, legend=bar_legend_active),
)

# Update donut charts with selection
active_chart = active_chart.add_selection(chart_selection_active).encode(
    opacity=opacity_active,
    color=donut_color_active,
)
inactive_chart = inactive_chart.add_selection(chart_selection_active).encode(
    opacity=opacity_active,
    color=donut_color_active,
)

# Combine the donut charts
combined_active = alt.hconcat(active_chart, inactive_chart).resolve_scale(color='independent')

# Combine charts: bar chart on top, donuts underneath
final_chart_active = (
    alt.vconcat(
        bar_active,
        combined_active,
        title="Analysis of Activity Level on Cardiovascular Health",
    )
    .resolve_scale(color='independent')
    .configure_title(fontSize=20, anchor='start', color='black')
    .configure_view(stroke=None)
)

final_chart_active

# Chart 2: Smoking Habits

## Step 1: Create bar chart depicting the relationship between smoking habit and CVD

In [None]:
# Calculate the total number of smokers and non-smokers.
# Summing the column counts the rows flagged 1; the rest are treated as non-smokers.
total_smokers = df['smoke'].sum()
total_nonsmokers = df['smoke'].count() - total_smokers

# Number of individuals for each combination of smoking status and cardiovascular disease
agg_data_smoke_bar = df.groupby(['smoke', 'cardio']).size().reset_index(name='counts')

# Normalize the counts within each smoking group so every bar sums to 100.
# The percentage is computed directly as a float column: the previous in-place
# `/=` wrote floats into an integer column copied from `counts`, which relies
# on silent upcasting that pandas has deprecated.
smoke_group_totals = agg_data_smoke_bar['smoke'].map({1: total_smokers, 0: total_nonsmokers})
agg_data_smoke_bar['percentage'] = agg_data_smoke_bar['counts'] / smoke_group_totals * 100

# Add the human-readable row label used on the y-axis
text_data = agg_data_smoke_bar.copy()
text_data['summary'] = np.where(text_data['smoke'] == 1, 'Smokers', 'Non-Smokers')

# Y-axis order
y_order = ['Non-Smokers', 'Smokers']

# Normalized bar chart
bar_smoke = alt.Chart(text_data).mark_bar().encode(
    y=alt.Y('summary:N', title=None, sort=y_order),
    x=alt.X('percentage:Q', title='Percentage'),
    color=alt.Color('cardio:N', legend=alt.Legend(title="Cardiovascular Disease", orient='right'), scale=color_scale),
    tooltip=[
        alt.Tooltip('counts:Q', title='Number of Individuals', format=','),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=400,
    height=150,
    title=alt.TitleParams(
        text="Relationship between Smoking Habits and Cardiovascular Disease",
        anchor="start",
        offset=10,
        orient="top",
        fontSize=14
    )
)

bar_smoke

## Step 2: Create donut charts to show relationship between smoking habits and CVD under specified filters (age, gender, BMI)

In [None]:
# Aggregate data (smoking)
# `age_group` is categorical, so without observed=True pandas emits a row for
# every category combination, including ones absent from df; those rows have
# counts == 0 and a 0/0 = NaN percentage. observed=True keeps only real groups.
agg_data_smoke = (
    df.groupby(['gender', 'cardio', 'age_group', 'BMI', 'smoke'], observed=True)
    .size()
    .reset_index(name='counts')
)

# Share of each CVD status within its (gender, age_group, BMI, smoke) slice.
smoke_slice_totals = agg_data_smoke.groupby(
    ['gender', 'age_group', 'BMI', 'smoke'], observed=True
)['counts'].transform('sum')
agg_data_smoke['percentage'] = agg_data_smoke['counts'] / smoke_slice_totals


In [None]:
# The filter options are read from agg_data_smoke, the frame these filters are
# applied to, rather than from the activity aggregate (agg_data). This keeps
# the smoking section runnable on its own and guarantees the widgets only
# offer values that exist in the smoking data.

# Filter 1: Gender (radio buttons), initialised to the first gender value
gender_selection_smoke = alt.selection_single(
    fields=['gender'],
    name="Gender",
    bind=alt.binding_radio(options=agg_data_smoke['gender'].unique(), name="Gender: "),
    init={'gender': agg_data_smoke['gender'].unique()[0]}
)

# Filter 2: Age (dropdown), initialised to the first age-group category
age_selection_smoke = alt.selection_single(
    fields=['age_group'],
    name="Age Group",
    bind=alt.binding_select(options=agg_data_smoke['age_group'].cat.categories.tolist(), name="Age Group: "),
    init={'age_group': agg_data_smoke['age_group'].cat.categories[0]}
)

# Filter 3: BMI (dropdown), initialised to 'Underweight'
bmi_selection_smoke = alt.selection_single(
    fields=['BMI'],
    name="BMI Category",
    bind=alt.binding_select(options=['Underweight', 'Normal', 'Overweight', 'Obese'], name="BMI Category: "),
    init={'BMI': 'Underweight'}
)

In [None]:
# Define donut chart
# Base donut shared by the smoker and non-smoker charts: one arc per CVD status,
# sized by its share, restricted to the current gender / age / BMI selection.
smoke_donut_chart = alt.Chart(agg_data_smoke).mark_arc(innerRadius=50).encode(
    theta=alt.Theta(field="percentage", type="quantitative", stack=True),
    color=alt.Color('cardio:N', legend=None, scale=color_scale),
    tooltip=[
        # Counts are numeric: typed as quantitative with a thousands separator,
        # matching the tooltip of the bar chart.
        alt.Tooltip('counts:Q', title='Number of Individuals', format=','),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1%'),
        alt.Tooltip('cardio:N', title='CVD Status')
    ]
).transform_filter(
    gender_selection_smoke
).transform_filter(
    age_selection_smoke
).transform_filter(
    bmi_selection_smoke
).add_selection(
    gender_selection_smoke,
    age_selection_smoke,
    bmi_selection_smoke
)

In [None]:
# Only the non-smoker donut carries a legend; the smoker donut keeps the
# base chart's legend=None.
smoke_legend = alt.Legend(
    title="Cardiovascular Disease",
    orient='bottom',
    titleFontSize=12,
    labelFontSize=12,
)

# Donut restricted to rows with smoke == 1.
smoker_chart = (
    smoke_donut_chart
    .transform_filter(alt.datum.smoke == 1)
    .properties(title=alt.TitleParams("CVD amongst Smokers (filtered)", fontSize=14))
)

# Donut restricted to rows with smoke == 0.
non_smoker_chart = (
    smoke_donut_chart
    .transform_filter(alt.datum.smoke == 0)
    .properties(title=alt.TitleParams("CVD amongst Non-Smokers (filtered)", fontSize=14))
    .encode(color=alt.Color('cardio:N', scale=color_scale, legend=smoke_legend))
)

# Combine the charts: non-smokers on the left, smokers on the right
smoke_title = alt.TitleParams(
    text="Distribution of Cardiovascular Disease by Smoking Status",
    fontSize=18,
)
combined_smoke = (
    alt.hconcat(non_smoker_chart, smoker_chart)
    .resolve_scale(color='independent')
    .properties(title=smoke_title)
    .configure_view(stroke=None)
)

combined_smoke

## Step 3: Create interactive view

For interactions:
- Click on legend symbol to change view, then click on empty space right below legend to reset view
- Click on bar graph/donut chart to change view, then click on empty space right beside donut chart to reset view

In [None]:
# Define selection: click a mark or a legend entry to pick CVD statuses
chart_selection_smoke = alt.selection_multi(
    name="chartSelection",
    fields=['cardio'],
    on='click',
    bind='legend',
)

# Selected marks stay fully opaque; everything else fades to 0.2.
opacity_smoke = alt.condition(chart_selection_smoke, alt.value(1), alt.value(0.2))
# The donuts drop their own legend; the bar chart's legend drives the selection.
donut_color_smoke = alt.Color('cardio:N', scale=color_scale, legend=None)
bar_legend_smoke = alt.Legend(
    symbolSize=200,
    title="Cardiovascular Disease",
    titleFontSize=12,
    labelFontSize=12,
)

# Update bar chart
bar_smoke = bar_smoke.add_selection(chart_selection_smoke).encode(
    opacity=opacity_smoke,
    color=alt.Color('cardio:N', scale=color_scale, legend=bar_legend_smoke),
)

# Update donut charts
smoker_chart = smoker_chart.add_selection(chart_selection_smoke).encode(
    opacity=opacity_smoke,
    color=donut_color_smoke,
)
non_smoker_chart = non_smoker_chart.add_selection(chart_selection_smoke).encode(
    opacity=opacity_smoke,
    color=donut_color_smoke,
)

# Combine donut charts
combined_smoke = alt.hconcat(smoker_chart, non_smoker_chart).resolve_scale(color='independent')

# Combine all charts: bar chart on top, donuts underneath
final_chart_smoke = (
    alt.vconcat(
        bar_smoke,
        combined_smoke,
        title="The Impact of Smoking on Cardiovascular Health",
    )
    .resolve_scale(color='independent')
    .configure_title(fontSize=20, anchor='start', color='black')
    .configure_view(stroke=None)
)

final_chart_smoke

# Chart 3: Alcohol Consumption

## Step 1: Create bar chart depicting the relationship between alcohol consumption and CVD

In [None]:
# Data preprocessing
# Summing the column counts the rows flagged 1; the rest are treated as
# non-consumers.
total_alcohol = df['alco'].sum()
total_nonalcohol = df['alco'].count() - total_alcohol

# Number of individuals for each combination of alcohol status and cardiovascular disease
agg_data_alcohol_bar = df.groupby(['alco', 'cardio']).size().reset_index(name='counts')

# Normalization: counts within each alcohol group are scaled so every bar sums to 100.
# The percentage is computed directly as a float column: the previous in-place
# `/=` wrote floats into an integer column copied from `counts`, which relies
# on silent upcasting that pandas has deprecated.
alcohol_group_totals = agg_data_alcohol_bar['alco'].map({1: total_alcohol, 0: total_nonalcohol})
agg_data_alcohol_bar['percentage'] = agg_data_alcohol_bar['counts'] / alcohol_group_totals * 100

# Add the human-readable row label used on the y-axis
text_data_alcohol = agg_data_alcohol_bar.copy()
text_data_alcohol['summary'] = np.where(
    text_data_alcohol['alco'] == 1,
    'Alcohol Consumers',
    'Non-Alcohol Consumers',
)

# Y-axis order
y_order_alcohol = ['Non-Alcohol Consumers', 'Alcohol Consumers']

# Bar chart
bar_alcohol = alt.Chart(text_data_alcohol).mark_bar().encode(
    y=alt.Y('summary:N',
            title=None,
            sort=y_order_alcohol,
            axis=alt.Axis(labelFontSize=12)
           ),
    x=alt.X('percentage:Q', title='Percentage'),
    color=alt.Color('cardio:N', legend=alt.Legend(title="Cardiovascular Disease", orient='right'), scale=color_scale),
    tooltip=[
        alt.Tooltip('counts:Q', title='Number of Individuals', format=','),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
    ]
).properties(
    width=400,
    height=150,
    title=alt.TitleParams(
        text="Relationship between Alcohol Consumption and Cardiovascular Disease",
        anchor="start",
        offset=10,
        orient="top",
        fontSize=14
    )
)

bar_alcohol


## Step 2: Create donut charts to show relationship between alcohol consumption and CVD under specified filters (age, gender, BMI)

In [None]:
# Aggregate data (alcohol)
# `age_group` is categorical, so without observed=True pandas emits a row for
# every category combination, including ones absent from df; those rows have
# counts == 0 and a 0/0 = NaN percentage. observed=True keeps only real groups.
agg_data_alcohol = (
    df.groupby(['gender', 'cardio', 'age_group', 'BMI', 'alco'], observed=True)
    .size()
    .reset_index(name='counts')
)

# Share of each CVD status within its (gender, age_group, BMI, alco) slice.
alcohol_slice_totals = agg_data_alcohol.groupby(
    ['gender', 'age_group', 'BMI', 'alco'], observed=True
)['counts'].transform('sum')
agg_data_alcohol['percentage'] = agg_data_alcohol['counts'] / alcohol_slice_totals

In [None]:
# Filter 1: Gender (radio buttons), initialised to the first gender value in agg_data_alcohol
gender_options_alco = agg_data_alcohol['gender'].unique()
gender_selection_alco = alt.selection_single(
    name="Gender",
    fields=['gender'],
    init={'gender': gender_options_alco[0]},
    bind=alt.binding_radio(name="Gender: ", options=gender_options_alco),
)

# Filter 2: Age (dropdown), initialised to the first age-group category
age_group_options_alco = agg_data_alcohol['age_group'].cat.categories
age_selection_alco = alt.selection_single(
    name="Age Group",
    fields=['age_group'],
    init={'age_group': age_group_options_alco[0]},
    bind=alt.binding_select(name="Age Group: ", options=age_group_options_alco.tolist()),
)

# Filter 3: BMI (dropdown), initialised to 'Underweight'
bmi_options_alco = ['Underweight', 'Normal', 'Overweight', 'Obese']
bmi_selection_alco = alt.selection_single(
    name="BMI Category",
    fields=['BMI'],
    init={'BMI': 'Underweight'},
    bind=alt.binding_select(name="BMI Category: ", options=bmi_options_alco),
)

In [None]:
# Define donut chart
# Base donut shared by the consumer and non-consumer charts: one arc per CVD
# status, sized by its share, restricted to the current gender / age / BMI selection.
alcohol_donut_chart = alt.Chart(agg_data_alcohol).mark_arc(innerRadius=50).encode(
    theta=alt.Theta(field="percentage", type="quantitative", stack=True),
    color=alt.Color('cardio:N', legend=None, scale=color_scale),
    tooltip=[
        # Counts are numeric: typed as quantitative with a thousands separator,
        # matching the tooltip of the bar chart.
        alt.Tooltip('counts:Q', title='Number of Individuals', format=','),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1%'),
        alt.Tooltip('cardio:N', title='CVD Status')
    ]
).transform_filter(
    gender_selection_alco
).transform_filter(
    age_selection_alco
).transform_filter(
    bmi_selection_alco
).add_selection(
    gender_selection_alco,
    age_selection_alco,
    bmi_selection_alco
)

In [None]:
# Only the non-consumer donut carries a legend; the consumer donut keeps the
# base chart's legend=None.
alcohol_legend = alt.Legend(
    title="Cardiovascular Disease",
    orient='bottom',
    titleFontSize=12,
    labelFontSize=12,
)

# Donut restricted to rows with alco == 1.
alcohol_chart = (
    alcohol_donut_chart
    .transform_filter(alt.datum.alco == 1)
    .properties(title=alt.TitleParams("CVD amongst Alcohol Consumers (filtered)", fontSize=14))
)

# Donut restricted to rows with alco == 0.
non_alcohol_chart = (
    alcohol_donut_chart
    .transform_filter(alt.datum.alco == 0)
    .properties(title=alt.TitleParams("CVD amongst Non-Alcohol Consumers (filtered)", fontSize=14))
    .encode(color=alt.Color('cardio:N', scale=color_scale, legend=alcohol_legend))
)

# Combine the charts: non-consumers on the left, consumers on the right
alcohol_title = alt.TitleParams(
    text="Distribution of Cardiovascular Disease by Alcohol Consumption under Specified Filters",
    fontSize=18,
)
combined_alcohol = (
    alt.hconcat(non_alcohol_chart, alcohol_chart)
    .resolve_scale(color='independent')
    .properties(title=alcohol_title)
    .configure_view(stroke=None)
)

combined_alcohol

## Step 3: Create interactive view

For interactions:
- Click on legend symbol to change view, then click on empty space right below legend to reset view
- Click on bar graph/donut chart to change view, then click on empty space right beside donut chart to reset view

In [None]:
# Define selection: click a mark or a legend entry to pick CVD statuses
chart_selection = alt.selection_multi(
    name="chartSelection",
    fields=['cardio'],
    on='click',
    bind='legend',
)

# Selected marks stay fully opaque; everything else fades to 0.2.
opacity_alco = alt.condition(chart_selection, alt.value(1), alt.value(0.2))
# The donuts drop their own legend; the bar chart's legend drives the selection.
donut_color_alco = alt.Color('cardio:N', scale=color_scale, legend=None)
bar_legend_alco = alt.Legend(
    symbolSize=200,
    title="Cardiovascular Disease",
    titleFontSize=12,
    labelFontSize=12,
)

# Update the bar chart
bar_alcohol = bar_alcohol.add_selection(chart_selection).encode(
    opacity=opacity_alco,
    color=alt.Color('cardio:N', scale=color_scale, legend=bar_legend_alco),
)

# Update the donut charts
alcohol_chart = alcohol_chart.add_selection(chart_selection).encode(
    opacity=opacity_alco,
    color=donut_color_alco,
)
non_alcohol_chart = non_alcohol_chart.add_selection(chart_selection).encode(
    opacity=opacity_alco,
    color=donut_color_alco,
)

# Combine the donut charts
combined_alcohol = alt.hconcat(alcohol_chart, non_alcohol_chart).resolve_scale(color='independent')

# Combine all charts: bar chart on top, donuts underneath
final_chart_alco = (
    alt.vconcat(
        bar_alcohol,
        combined_alcohol,
        title="Analysis of Alcohol Consumption on Cardiovascular Health",
    )
    .resolve_scale(color='independent')
    .configure_title(fontSize=20, anchor='start', color='black')
    .configure_view(stroke=None)
)

final_chart_alco

# Convert charts to HTML

In [None]:
# Write each final chart to its own HTML file (paths are relative, so the
# files land in the current working directory).
for chart_to_save, html_name in (
    (final_chart_active, "Activity Level.html"),
    (final_chart_smoke, "Smoking Habits.html"),
    (final_chart_alco, "Alcohol Consumption.html"),
):
    chart_to_save.save(html_name)