# PM3: Building Final Visualizations

In [1]:
import sys
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt

# from tabulate import tabulate
# !pip install altair vega_datasets ipywidgets
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager
# !jupyter nbextension enable --py --sys-prefix widgetsnbextension
# !jupyter nbextension enable --py altair

# alt.renderers.enable('notebook')
# !pip install --upgrade altair vega_datasets

# !pip install altair
# !pip install vega_datasets
# !pip install ipywidgets

# alt.renderers.enable('mimetype')
# import ipywidgets as widgets
# from IPython.display import display
# RendererRegistry.enable('jupyter')

alt.renderers.enable('default')

RendererRegistry.enable('default')

In [2]:
df = pd.read_csv("../../data/raw/mxmh_survey_results.csv")
df.head()

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Permissions
0,8/27/2022 19:29:02,18.0,Spotify,3.0,Yes,Yes,Yes,Latin,Yes,Yes,...,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,,I understand.
1,8/27/2022 19:57:31,63.0,Pandora,1.5,Yes,No,No,Rock,Yes,No,...,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,,I understand.
2,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect,I understand.
3,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve,I understand.
4,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve,I understand.


## Wrangling Data

In [3]:
def remove_BPM_outliers(data):
    droppedMax = data["BPM"].nlargest(2).index
    droppedMin = data["BPM"].nsmallest(5).index
    droppedOutliers = droppedMax.union(droppedMin)
    data = data.drop(index=droppedOutliers)
    return data

def drop_all_NAs(data):
    data = data.dropna()
    return data

def fill_BPM_NAs(data):
    bpm_values = data["BPM"].dropna()
    bpm_probs = bpm_values.value_counts(normalize=True) 
    music_filled_random_sampling = data.copy()
    music_filled_random_sampling["BPM"] = music_filled_random_sampling["BPM"].apply(
        lambda x: np.random.choice(bpm_probs.index, p=bpm_probs.values) if pd.isna(x) else x
        )
    return music_filled_random_sampling

def fill_nulls_unknown(data):
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Primary streaming service'] = data['Primary streaming service'].fillna("Unknown")
    data['While working'] = data['While working'].fillna("Unknown")
    data['Instrumentalist'] = data['Instrumentalist'].fillna("Unknown")
    data['Composer'] = data['Composer'].fillna("Unknown")
    data['Foreign languages'] = data['Foreign languages'].fillna("Unknown")
    data['Music effects'] = data['Music effects'].fillna("No response")
    return data

def make_frequency_cols_ordered(data):
    order = ["Never", "Rarely", "Sometimes", "Very frequently"]
    for col in data.columns:
        if "Frequency" in col:
            data[col] = pd.Categorical(data[col], categories=order, ordered=True)
    return data

def make_mental_health_levels(data):
    data['Anxiety Level'] = pd.cut(
        data['Anxiety'], 
        bins=[0, 3, 6, 10], 
        labels=['Low', 'Medium', 'High'],
        include_lowest=True)

    data['Depression Level'] = pd.cut(
        data['Depression'], 
        bins=[0, 3, 6, 10],  
        labels=['Low', 'Medium', 'High'],
        include_lowest=True)

    data['Insomnia Level'] = pd.cut(
        data['Insomnia'], 
        bins=[0, 3, 6, 10],  
        labels=['Low', 'Medium', 'High'],
        include_lowest=True)

    data['OCD Level'] = pd.cut(
        data['OCD'], 
        bins=[0, 3, 6, 10],  
        labels=['Low', 'Medium', 'High'],
        include_lowest=True)
    return data

def make_data_time(data):
    data["Timestamp"] = pd.to_datetime(data["Timestamp"])
    data["Date"] = pd.to_datetime(data["Timestamp"].dt.date)
    data["Time"] = data["Timestamp"].dt.strftime("%H:%M:%S")
    return data;

def make_time_of_day(data):
    time_order = ["Morning", "Afternoon", "Evening", "Night"]
    data["Time of Day"] = data["Timestamp"].apply(categorize_time_of_day)
    data["Time of Day"] = pd.Categorical(
        data["Time of Day"], 
        categories=time_order, 
        ordered=True)
    return data

def categorize_time_of_day(timestamp):
    hour = timestamp.hour
    if 6 <= hour < 12:
        return "Morning"
    elif 12 <= hour < 18:
        return "Afternoon"
    elif 18 <= hour < 24:
        return "Evening"
    else:
        return "Night"

def drop_permissions(data):
    data = data.drop(columns=["Permissions"])
    return data

def create_no_music_hobbies_column(data):
    data['Musical_hobbies'] = (
    (data["Composer"] == "No") & (data["Instrumentalist"] == "No")
        ).map({True: "No", False: "Yes"})
    return data

def group_ages(data):
    data["Age_Grouped"] = np.where(data["Age"] > 65, 66, data["Age"])
    return data

def make_genre_diversity_score(data):
    frequency_mapping = {
        "Never": 0,
        "Rarely": 1,
        "Sometimes": 2,
        "Very frequently": 3
    }
    genre_columns = [
        "Frequency [Classical]", "Frequency [Country]", "Frequency [EDM]", "Frequency [Folk]",
        "Frequency [Gospel]", "Frequency [Hip hop]", "Frequency [Jazz]", "Frequency [K pop]",
        "Frequency [Latin]", "Frequency [Lofi]", "Frequency [Metal]", "Frequency [Pop]",
        "Frequency [R&B]", "Frequency [Rap]", "Frequency [Rock]", "Frequency [Video game music]"
    ]
    for col in genre_columns:
        data[col] = data[col].map(frequency_mapping)
    data["Genre Diversity Score"] = (data[genre_columns] > 0).sum(axis=1)
    return data

def stratify_hours_per_day(data):
    bins = [0, 6, 12, 18, 24.0001]
    labels = ['0-6 hours', '7-12 hours', '13-18 hours', '19-24 hours']
    data['Hours per day stratified'] = pd.cut(data['Hours per day'], bins=bins, labels=labels, right=False)
    return data

# make long table with genre and frequency columns
def elongate_genre_frequency(data):
    # Identify the frequency columns
    frequency_cols = [col for col in data.columns if col.startswith("Frequency [")]
    # Melt the dataframe to convert to long format
    data_long = data.melt(id_vars=[col for col in data.columns if col not in frequency_cols], 
                  value_vars=frequency_cols, 
                  var_name="Genre", 
                  value_name="Frequency")
    # Clean the "Genre" column by removing "Frequency [" and the closing bracket "]"
    data_long["Genre"] = data_long["Genre"].str.replace("Frequency \\[|\\]", "", regex=True)
    return data_long

# make longer table with condition and severity score columns
def elongate_condition_severity(data):
    # Identify the columns related to conditions
    conditions = ["Anxiety", "Depression", "Insomnia", "OCD"]
    # Melt the dataframe again to make conditions long format
    data_longer = data.melt(id_vars=[col for col in data.columns if col not in conditions], 
                                  value_vars=conditions, 
                                  var_name="Condition", 
                                  value_name="Severity")
    return data_longer

def stratify_severity(data):
    data['Level'] = pd.cut(data['Severity'], bins=[0, 3, 6, 10], labels=['Low', 'Medium', 'High'], include_lowest=True)
    return data

def rename_columns(data):
    data = data.rename(columns={'While working' : 'Listen while working', 'Exploratory' : 'Explore new music', 'Foreign languages' : 'Listen to foreign languages'})
    return data

def normalize_severity(data):
    # calculate average severity in sample
    avg_severity = data.groupby("Condition")["Severity"].transform("mean")
    data["Difference From Average Severity"] = (data["Severity"] - avg_severity)
    return data

In [4]:
def clean_data_helena(path):
    np.random.seed(42)
    data = pd.read_csv(path)
    # Handle NAs
    data = remove_BPM_outliers(data)
    data = drop_all_NAs(data)
    # Make ordered columns
    data = make_frequency_cols_ordered(data)
    # make long table with genre and frequency columns
    data = elongate_genre_frequency(data)
    data = elongate_condition_severity(data)
    # Derive new columns
    data = stratify_hours_per_day(data)
    data = stratify_severity(data)
    data = normalize_severity(data)
    #drop permissions
    data = drop_permissions(data)
    return data

df_processed_helena = clean_data_helena("../../data/raw/mxmh_survey_results.csv")
df_processed_helena.head()

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,BPM,Music effects,Genre,Frequency,Condition,Severity,Hours per day stratified,Level,Difference From Average Severity
0,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,132.0,No effect,Classical,Never,Anxiety,7.0,0-6 hours,High,1.1289
1,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,84.0,Improve,Classical,Sometimes,Anxiety,9.0,0-6 hours,High,3.1289
2,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,107.0,Improve,Classical,Never,Anxiety,7.0,0-6 hours,High,1.1289
3,8/27/2022 21:56:50,18.0,Spotify,5.0,Yes,Yes,Yes,Jazz,Yes,Yes,86.0,Improve,Classical,Rarely,Anxiety,8.0,0-6 hours,High,2.1289
4,8/27/2022 22:00:29,18.0,YouTube Music,3.0,Yes,Yes,No,Video game music,Yes,Yes,66.0,Improve,Classical,Sometimes,Anxiety,4.0,0-6 hours,Medium,-1.8711


In [5]:
df_processed_helena.info

<bound method DataFrame.info of                  Timestamp   Age Primary streaming service  Hours per day  \
0       8/27/2022 21:28:18  18.0                   Spotify            4.0   
1       8/27/2022 21:40:40  61.0             YouTube Music            2.5   
2       8/27/2022 21:54:47  18.0                   Spotify            4.0   
3       8/27/2022 21:56:50  18.0                   Spotify            5.0   
4       8/27/2022 22:00:29  18.0             YouTube Music            3.0   
...                    ...   ...                       ...            ...   
38971  10/30/2022 14:37:28  17.0                   Spotify            2.0   
38972   11/1/2022 22:26:42  18.0                   Spotify            1.0   
38973   11/3/2022 23:24:38  19.0   Other streaming service            6.0   
38974   11/4/2022 17:31:47  19.0                   Spotify            5.0   
38975    11/9/2022 1:55:20  29.0             YouTube Music            2.0   

      While working Instrumentalist Compose

In [6]:
def clean_data_helena_2(path):
    np.random.seed(42)
    data = pd.read_csv(path)
    # Handle NAs
    data = remove_BPM_outliers(data)
    data = drop_all_NAs(data)
    # Make ordered columns
    data = make_frequency_cols_ordered(data)
    # make long table with genre and frequency columns
    data = elongate_condition_severity(data)
    data.columns = data.columns.str.replace(r"Frequency \[|\]", "", regex=True)
    # Derive new columns
    data = stratify_hours_per_day(data)
    data = stratify_severity(data)
    data = normalize_severity(data)
    #drop permissions
    data = drop_permissions(data)
    return data

df_processed_helena_2 = clean_data_helena_2("../../data/raw/mxmh_survey_results.csv")
df_processed_helena_2.head()

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,...,R&B,Rap,Rock,Video game music,Music effects,Condition,Severity,Hours per day stratified,Level,Difference From Average Severity
0,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,Never,Rarely,Rarely,Very frequently,No effect,Anxiety,7.0,0-6 hours,High,1.1289
1,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,Sometimes,Never,Never,Never,Improve,Anxiety,9.0,0-6 hours,High,3.1289
2,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,Very frequently,Very frequently,Never,Rarely,Improve,Anxiety,7.0,0-6 hours,High,1.1289
3,8/27/2022 21:56:50,18.0,Spotify,5.0,Yes,Yes,Yes,Jazz,Yes,Yes,...,Very frequently,Very frequently,Very frequently,Never,Improve,Anxiety,8.0,0-6 hours,High,2.1289
4,8/27/2022 22:00:29,18.0,YouTube Music,3.0,Yes,Yes,No,Video game music,Yes,Yes,...,Rarely,Never,Never,Sometimes,Improve,Anxiety,4.0,0-6 hours,Medium,-1.8711


In [7]:
def clean_data_helena_levels(path):
    np.random.seed(42)
    data = pd.read_csv(path)
    data = rename_columns(data)
    # Handle NAs
    data = remove_BPM_outliers(data)
    data = drop_all_NAs(data)
    # Make ordered columns
    data = make_frequency_cols_ordered(data)
    # Derive new columns
    data = stratify_hours_per_day(data)
    data = make_mental_health_levels(data)
    #drop permissions
    data = drop_permissions(data)
    return data

df_processed_helena_levels = clean_data_helena_levels("../../data/raw/mxmh_survey_results.csv")
df_processed_helena_levels.head()

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,Listen while working,Instrumentalist,Composer,Fav genre,Explore new music,Listen to foreign languages,...,Anxiety,Depression,Insomnia,OCD,Music effects,Hours per day stratified,Anxiety Level,Depression Level,Insomnia Level,OCD Level
2,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,7.0,7.0,10.0,2.0,No effect,0-6 hours,High,High,High,Low
3,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,9.0,7.0,3.0,3.0,Improve,0-6 hours,High,High,Low,Low
4,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,7.0,2.0,5.0,9.0,Improve,0-6 hours,High,Low,Medium,High
5,8/27/2022 21:56:50,18.0,Spotify,5.0,Yes,Yes,Yes,Jazz,Yes,Yes,...,8.0,8.0,7.0,7.0,Improve,0-6 hours,High,High,High,High
6,8/27/2022 22:00:29,18.0,YouTube Music,3.0,Yes,Yes,No,Video game music,Yes,Yes,...,4.0,8.0,6.0,0.0,Improve,0-6 hours,Medium,High,Medium,Low


In [8]:
df_processed_helena_2.dtypes

Timestamp                             object
Age                                  float64
Primary streaming service             object
Hours per day                        float64
While working                         object
Instrumentalist                       object
Composer                              object
Fav genre                             object
Exploratory                           object
Foreign languages                     object
BPM                                  float64
Classical                           category
Country                             category
EDM                                 category
Folk                                category
Gospel                              category
Hip hop                             category
Jazz                                category
K pop                               category
Latin                               category
Lofi                                category
Metal                               category
Pop       

## Visualizations

### Correlate – How is the severity of each mental illness associated with listening frequency to each genre?

Based on feedback, I added colour encodings to further distinguish conditions.

In [9]:
alt.data_transformers.disable_max_rows()

genres = df_processed_helena['Genre'].unique().tolist()
health_conditions = df_processed_helena['Condition'].unique().tolist()

genre_dropdown = alt.binding_select(options=genres, name="Select Music Genre: ")
genre_selection = alt.param(bind=genre_dropdown, value="Classical")

fig1 = alt.Chart(df_processed_helena).mark_area().encode(
    x=alt.X("Frequency:N", title="Listening Frequency"),
    y=alt.Y("average(Difference From Average Severity):Q", title="Difference From Average Severity", scale=alt.Scale(domain=[-2, 2])),
    tooltip=["Genre:N", "Frequency:N", "average(Difference From Average Severity):Q"],
    # added color encoding to further distinguish conditions
    color=alt.Color('Condition:N',
                    scale=alt.Scale(domain=['Anxiety', 'Depression', 'Insomnia', 'OCD'],
                                    range=['#E04E4E', '#4066A7', '#EA9900', '#3CB5AF']),
                   legend=None)
).properties(
    height=200,
    width=200
).facet(
    row=alt.Row("Condition:N", title="Mental Health Condition")
).add_params(
    genre_selection, alt.selection_interval(bind='scales', encodings=['y'])
).transform_filter(
    alt.datum.Genre == genre_selection
).properties(
    title={
        "text": ["How Mental Health Condition Severity", 
                 "Trends with Genre Listening Frequency"],
        "anchor": "middle"
    }
)

fig1.save('../../images/pm4/helena_viz1_2.0.html')

### Find Anomalies - Are there any disparities in the mental health of participants with a specific favourite genre?

In [10]:
fig2 = alt.Chart(df_processed_helena).mark_rect().encode(
    y=alt.Y("Condition:N", title="Mental Health Condition"),  
    x=alt.X("Fav genre:N", title="Favorite Genre", sort="color"),  
    color=alt.Color("average(Difference From Average Severity):Q", 
                    title=["Difference From", "Average Severity"],
                    scale=alt.Scale(scheme='blueorange', domainMid=0, domain=[-4, 4])),
    tooltip=[
        alt.Tooltip("Fav genre:N", title='Favourite Genre'),
        alt.Tooltip("Condition:N", title='Condition'),
        alt.Tooltip("average(Difference From Average Severity):Q", title='Difference from Average Severity')
    ]
).properties(
    width=600,
    height=400,
    title="Disparities in Mental Health Condition Severity Based on Favourite Genre",

)

fig2.save('../../images/pm4/helena_viz2_2.0.html')

### Filter/Retrieve Value – What are the values of non-genre listening habits (Hours per day, While working, Instrumentalist, Composer, Exploratory, Foreign languages) when you filter by low, medium, and high mental health severity?

Based on feedback, I used opacity to highlight the selected column and cleaned up the condition labels. I also removed primary streaming service from listening habits and added hours per day (deemed less relevant to this visualization based on feedback).

In [11]:
# removed primary streaming service from listening habits, added hours per day
non_genre_listening_habits = ["Hours per day stratified", "Listen while working", "Instrumentalist", 
                              "Composer", "Explore new music", "Listen to foreign languages"]
# removed underscores from labels
health_conditions = ["Anxiety Level", "Depression Level", "Insomnia Level", "OCD Level"]

condition_dropdown = alt.binding_select(options=health_conditions, name="Select Condition: ")
condition_select = alt.selection_single(fields=['condition'], bind=condition_dropdown, name="condition", value="Anxiety Level")

habit_dropdown = alt.binding_select(options=non_genre_listening_habits, name="Select Listening Habit: ")
habit_select = alt.selection_single(fields=['Habit'], bind=habit_dropdown, name="habit", value="Hours per day stratified")

condition_level_select = alt.selection_point(fields=['condition_level'], name="Select Level")

base = alt.Chart(df_processed_helena_levels).transform_fold(
    health_conditions,
    as_=['condition', 'condition_level']
)

bar_chart = base.mark_bar().encode(
    y=alt.Y('condition_level:O', title='Selected Condition Severity', sort=["High", "Medium", "Low"]),
    x=alt.X('count():Q', title='Participants'),
    color=alt.Color('condition_level:O', legend=None, sort=["High", "Medium", "Low"]).scale(scheme="oranges", reverse=True),
    tooltip=['count()'],
    opacity=alt.condition(condition_level_select, alt.value(1), alt.value(0.3))
).transform_filter(
    condition_select 
).add_selection(
    condition_select,
    condition_level_select 
).properties(
    width=400,
    height=300,
    title={
        "text": 'Participants by Selected Condition Severity',
        "subtitle": "Select a specific bar below to filter the plot on the right."
    }
)

habits_chart = base.transform_fold(
    non_genre_listening_habits,
    as_=['Habit', 'habit_value']
).mark_bar().encode(
    y=alt.Y('habit_value:N', title="Selected Listening Habit", sort='-x'),
    x=alt.X('count():Q', title='Participants'),
    tooltip=['Habit:N', 'count()']
).transform_filter(
    condition_select  
).transform_filter(
    condition_level_select  
).transform_filter(
    habit_select  
).add_selection(
    habit_select
).properties(
    width=400,
    height=300,
    title='Participants by Selected Listening Habit, Filtered by Selected Condition Severity'
)

fig3 = bar_chart | habits_chart
fig3.save('../../images/pm4/helena_viz3_2.0.html')

Deprecated since `altair=5.0.0`. Use selection_point instead.
  condition_select = alt.selection_single(fields=['condition'], bind=condition_dropdown, name="condition", value="Anxiety Level")
Deprecated since `altair=5.0.0`. Use selection_point instead.
  habit_select = alt.selection_single(fields=['Habit'], bind=habit_dropdown, name="habit", value="Hours per day stratified")
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_selection(
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_selection(


### Characterize Distributions - What are the distributions of music hours per day and favourite genre BPM, and how does it correlate with mental health?

Merged figure 4 and 5, changed the median labels to black, and changed the histogram colour to brown to avoid confusing it with the colour used for depression.

In [12]:
# combined fig. 4 and 5
hours_selection = alt.selection_interval(encodings=['x'], name="Hours Selection")
condition_selection_1 = alt.selection_single(fields=['Condition'], bind='legend', name="Condition Selection")

# used brown for histograms to avoid confusing them with the colour used for depression
histogram_1 = alt.Chart(df_processed_helena).transform_density(
    'Hours per day', as_=['Hours per day', 'density'], bandwidth=0.5
).mark_area(color='#82603D').encode(
    x=alt.X('Hours per day:Q', title='Music Listening Hours Per Day'),
    y=alt.Y('density:Q', title='Density'),
    tooltip=['density:Q']
).add_selection(hours_selection).transform_filter(condition_selection_1).properties(
    width=400,
    height=300,
    title={
        "text": "Distribution of Listening Hours Per Day",
        "subtitle": "Select a range below to filter the plot on the right."
    }
)

# changed median labels to black
median_hours = df_processed_helena['Hours per day'].median()
median_hours_rule = alt.Chart(df_processed_helena).transform_aggregate(
    median_hours='median(Hours per day)'
).mark_rule(color='black', strokeDash=[5,5]).encode(
    x='median_hours:Q',
    size=alt.value(2)
)
median_hours_label = alt.Chart(df_processed_helena).transform_aggregate(
    median_hours='median(Hours per day)'
).mark_text(align='left', dx=60, dy=10, color='black', fontSize=12, fontWeight='bold').encode(
    x=alt.value(median_hours),
    y=alt.value(280), 
    text=alt.value(f"Median: {median_hours:.1f}")
)

bar_chart_1 = alt.Chart(df_processed_helena).mark_bar().encode(
    x=alt.X('Condition:N', title='Mental Health Condition'),
    y=alt.Y('average(Difference From Average Severity):Q', title='Difference from Average Severity', scale=alt.Scale(domain=[-5, 5])),
    color=alt.Color('Condition:N', legend=None),
    tooltip=['average(Difference From Average Severity):Q']
).transform_filter(hours_selection).add_selection(condition_selection_1).properties(
    width=400,
    height=300,
    title="Mental Health Condition Severity, Filtered by Selected Listening Hours Range"
)

fig4 = (histogram_1 + median_hours_rule + median_hours_label) | bar_chart_1


bpm_selection = alt.selection_interval(encodings=['x'], name="BPM Selection")
condition_selection_2 = alt.selection_single(fields=['Condition'], bind='legend', name="Condition Selection")

histogram_2 = alt.Chart(df_processed_helena).transform_density(
    'BPM', as_=['BPM', 'density'], bandwidth=10
).mark_area(color='#82603D').encode(
    x=alt.X('BPM:Q', title='BPM'),
    y=alt.Y('density:Q', title='Density'),
    tooltip=['density:Q']
).add_selection(bpm_selection).transform_filter(condition_selection_2).properties(
    width=400,
    height=300,
    title={
        "text": "Distribution of Favourite Genre BPM",
        "subtitle": "Select a range below to filter the plot on the right."
    }
)

median_bpm = df_processed_helena['BPM'].median()
median_bpm_rule = alt.Chart(df_processed_helena).transform_aggregate(
    median_bpm='median(BPM)'
).mark_rule(color='black', strokeDash=[5,5]).encode(
    x='median_bpm:Q',
    size=alt.value(2)
)
median_bpm_label = alt.Chart(df_processed_helena).transform_aggregate(
    median_bpm='median(BPM)'
).mark_text(align='left', dx=90, dy=10, color='black', fontSize=12, fontWeight='bold').encode(
    x=alt.value(median_bpm),
    y=alt.value(280), 
    text=alt.value(f"Median: {median_bpm:.1f}")
)

bar_chart_2 = alt.Chart(df_processed_helena).mark_bar().encode(
    x=alt.X('Condition:N', title='Mental Health Condition'),
    y=alt.Y('average(Difference From Average Severity):Q', title='Difference from Average Severity', scale=alt.Scale(domain=[-5, 5])),
    color=alt.Color('Condition:N', legend=None),
    tooltip=['average(Difference From Average Severity):Q']
).transform_filter(bpm_selection).add_selection(condition_selection_2).properties(
    width=400,
    height=300,
    title="Mental Health Condition Severity, Filtered by Selected BPM Range"
)

fig5 = (histogram_2 + median_bpm_rule + median_bpm_label) | bar_chart_2


fig4_new = (fig4 & fig5)

fig4_new.save('../../images/pm4/helena_viz4-5_2.0.html')

Deprecated since `altair=5.0.0`. Use selection_point instead.
  condition_selection_1 = alt.selection_single(fields=['Condition'], bind='legend', name="Condition Selection")
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_selection(hours_selection).transform_filter(condition_selection_1).properties(
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).transform_filter(hours_selection).add_selection(condition_selection_1).properties(
Deprecated since `altair=5.0.0`. Use selection_point instead.
  condition_selection_2 = alt.selection_single(fields=['Condition'], bind='legend', name="Condition Selection")
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_selection(bpm_selection).transform_filter(condition_selection_2).properties(
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).transform_filter(bpm_selection).add_selection(condition_selection_2).properties(


### Filter/Characterize Distribution - How do the distributions of mental health severity, music hours per day, and favourite genre BPM appear when filtering by mental health condition?

I made the histogram brown to match the colouring above and updated the subtitle.

In [13]:
health_conditions = ["Anxiety", "Depression", "Insomnia", "OCD"]

condition_dropdown = alt.binding_select(options=health_conditions, name="Select Condition: ")
condition_select = alt.selection_point(fields=['Condition'], bind=condition_dropdown, name="condition", value="Anxiety")

brush = alt.selection_interval(encodings=['x'], resolve='intersect')

# made histogram brown to match colouring above
hist = alt.Chart(df_processed_helena).mark_bar(color='#82603D').encode(
    alt.X(alt.repeat('column'), type='quantitative', bin=alt.Bin(maxbins=50, minstep=1)),
    alt.Y('count():Q', title='Participants')
)

fig6 = alt.layer(
    hist.add_selection(brush).encode(color=alt.value('lightgrey')).transform_filter(condition_select),
    hist.transform_filter(brush).transform_filter(condition_select)
).properties(
    height=200
).repeat(
    column=['Severity', 'BPM', 'Hours per day'],
    data=df_processed_helena 
).add_selection(
    condition_select
).configure_view(
    stroke='transparent' 
).properties(
    title={
        "text": "Distributions by Selected Mental Health Condition",
        # updated subtitle
        "subtitle": "Select a severity or BPM range below to filter all the plots."
    }
)
fig6.save('../../images/pm4/helena_viz6_2.0.html')

Deprecated since `altair=5.0.0`. Use add_params instead.
  hist.add_selection(brush).encode(color=alt.value('lightgrey')).transform_filter(condition_select),
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_selection(


In [14]:
# health_conditions = ["Anxiety", "Depression", "Insomnia", "OCD"]
# condition_dropdown = alt.binding_select(options=health_conditions, name="Select Condition: ")
# condition_select = alt.selection_point(fields=['Condition'], bind=condition_dropdown, name="condition", value="Anxiety")

# severity_brush = alt.selection_interval(encodings=['x'], name='severity_brush')
# bpm_brush = alt.selection_interval(encodings=['x'], name='bpm_brush')
# hours_brush = alt.selection_interval(encodings=['x'], name='hours_brush')

# hist_severity = alt.Chart(df_processed_helena).mark_bar(color='#82603D').encode(
#     alt.X('Severity:Q', bin=alt.Bin(maxbins=20, minstep=1)),
#     alt.Y('count():Q', title='Participants'),
#     color=alt.condition(severity_brush, alt.value('#82603D'), alt.value('lightgrey'))
# ).add_selection(severity_brush)

# hist_bpm = alt.Chart(df_processed_helena).mark_bar(color='#82603D').encode(
#     alt.X('BPM:Q', bin=alt.Bin(maxbins=20, minstep=1)),
#     alt.Y('count():Q', title='Participants'),
#     color=alt.condition(bpm_brush, alt.value('#82603D'), alt.value('lightgrey'))
# ).add_selection(bpm_brush)

# hist_hours = alt.Chart(df_processed_helena).mark_bar(color='#82603D').encode(
#     alt.X('Hours per day:Q', bin=alt.Bin(maxbins=20, minstep=1)),
#     alt.Y('count():Q', title='Participants'), 
#     color=alt.condition(hours_brush, alt.value('#82603D'), alt.value('lightgrey'))
# ).add_selection(hours_brush)

# combined = alt.hconcat(
#     hist_severity.transform_filter(bpm_brush).transform_filter(hours_brush),
#     hist_bpm.transform_filter(severity_brush).transform_filter(hours_brush),
#     hist_hours.transform_filter(severity_brush).transform_filter(bpm_brush)
# ).transform_filter(
#     condition_select
# ).add_selection(
#     condition_select
# ).properties(
#     title={
#         "text": "Distributions by Selected Mental Health Condition",
#         "subtitle": "Select a range from any plot to filter all the charts."
#     }
# )