In [17]:
import pandas as pd
import numpy as np

# Load the datasets
merged = pd.read_csv('data/cleaned_data.csv')

In [18]:
# Prepare data for analysis
# Collapse the one-hot Gender_Female / Gender_Male / Gender_Non-Binary columns into
# a single 'Gender' label column.
# np.select evaluates the conditions in list order, so the first truthy indicator
# wins (Female, then Male, then Non-Binary); rows where no indicator is truthy are
# labelled 'Unknown'. This is vectorized, so no Python function is called per row.
merged['Gender'] = np.select(
    [
        merged['Gender_Female'].astype(bool),
        merged['Gender_Male'].astype(bool),
        merged['Gender_Non-Binary'].astype(bool),
    ],
    ['Female', 'Male', 'Non-Binary'],
    default='Unknown',
)

# Convert one-hot encoded generation columns to a single 'age_group' column
def get_age_group(row):
    """Return the age-range label for the first truthy ``Age_Gen_*`` flag in ``row``.

    Flags are tested in the order Boomer, Gen X, Millenial, Gen Z, and testing
    stops at the first truthy one. If none is truthy, ``'Unknown'`` is returned.
    """
    # (flag column, label) pairs; the order decides which label wins when
    # more than one flag is set.
    flag_labels = (
        ('Age_Gen_Boomer', '58+'),
        ('Age_Gen_Gen_X', '42-57'),
        ('Age_Gen_Millenial', '26-41'),
        ('Age_Gen_Gen_Z', '18-25'),
    )
    for column, label in flag_labels:
        if row[column]:
            return label
    return 'Unknown'
    
# Label every row with its age range; axis='columns' hands get_age_group one row at a time.
merged['age_group'] = merged.apply(get_age_group, axis='columns')

# Convert one-hot encoded tenure group columns to a single 'tenure_group' column
def get_tenure_group(row):
    """Return the tenure label for the first truthy ``TenureGroups_*`` flag in ``row``.

    Flags are tested from the longest tenure bucket to the shortest, and testing
    stops at the first truthy one. If none is truthy, ``'Unknown'`` is returned.
    """
    # (flag column, label) pairs; the order decides which label wins when
    # more than one flag is set.
    flag_labels = (
        ('TenureGroups_ >25_years', '>25'),
        ('TenureGroups_16-25_years', '16-25'),
        ('TenureGroups_5-15_years', '5-15'),
        ('TenureGroups_<5_years', '<5'),
    )
    for column, label in flag_labels:
        if row[column]:
            return label
    return 'Unknown'

# Label every row with its tenure bucket; axis='columns' hands get_tenure_group one row at a time.
merged['tenure_group'] = merged.apply(get_tenure_group, axis='columns')

# Convert one-hot encoded department columns to a single 'Department' column
def get_department(row):
    """Return the department name for the first truthy ``Department_*`` flag in ``row``.

    Flags are tested in the order Finance, HR, IT, Marketing, Sales, and testing
    stops at the first truthy one. If none is truthy, ``'Unknown'`` is returned.
    """
    # Each flag column is 'Department_' followed by the label it maps to.
    for department in ('Finance', 'HR', 'IT', 'Marketing', 'Sales'):
        if row['Department_' + department]:
            return department
    return 'Unknown'

# Label every row with its department; axis='columns' hands get_department one row at a time.
merged['Department'] = merged.apply(get_department, axis='columns')



## Demographic Insights Discovery
This section analyzes benefit usage across demographic groups (age, gender, department, tenure), visualizes patterns, and highlights actionable insights.

In [19]:
# Demographic Analysis: Usage by Age, Gender, Department, Tenure
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including ones raised by later cells. Consider narrowing it to the specific
# warning that was noisy.
warnings.filterwarnings("ignore")
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# 1. Analyze benefit usage by demographics (Age, Gender, Department, Tenure)
# One row per (age_group, Gender, Department, tenure_group) combination with the
# mean, median, standard deviation and row count of UsageFrequency.
demographics_usage = (
    merged
    .groupby(['age_group', 'Gender', 'Department', 'tenure_group'])['UsageFrequency']
    .agg(['mean', 'median', 'std', 'count'])
    .reset_index()
)
# A bare expression is rendered only when it is the last statement of a cell, so
# the summary is shown with an explicit display() call (IPython provides display
# in notebook sessions without an import).
display(demographics_usage)

# categories = ['age_group', 'Gender', 'Department', 'tenure_group']
# fig = make_subplots(rows=2, cols=2, subplot_titles=categories)

# for i, cat in enumerate(categories):
#     row = i // 2 + 1
#     col = i % 2 + 1
#     fig.add_trace(
#         go.Box(
#             y=merged['UsageFrequency'],
#             x=merged[cat],
#             name=cat,
#             hovertemplate='UsageFrequency: %{y:.2f}<br>' + ': %{x}<extra></extra>'
#         ),
#         row=row, col=col
#     )

# fig.update_layout(
#     height=800,
#     width=1000,
#     title_text='UsageFrequency Distribution by Demographic Categories',
#     yaxis=dict(tickformat=".2f"),
#     yaxis2=dict(tickformat=".2f"),
#     yaxis3=dict(tickformat=".2f"),
#     yaxis4=dict(tickformat=".2f")
# )
# fig.show()

# # Compare subtypes across segments (e.g., by age_group, departments)
# def construct_barplot(df, groupby_list, x, color):
#     df_group = df.groupby(groupby_list)['UsageFrequency'].mean().reset_index()
#     return px.bar(df_group, x=x, y='UsageFrequency', color=color, barmode='stack', title=f'Usage by {x} and {color}', hover_data=groupby_list), df_group

# def compare_x_y_heatmap(df, groupby_list, x, y, color='Avg UsageFrequency'):
#     subtype_df = df.groupby(groupby_list)['UsageFrequency'].mean().unstack().reset_index()
#     return px.imshow(
#                 subtype_df.set_index(y),
#                 labels=dict(x=x, y=y, color="Avg UsageFrequency"),
#                 title=f'{x} Usage Heatmap by {y}',
#                 aspect="auto",
#             ).show(), subtype_df.set_index(y)

# Visualize mean usage by department and gender, one panel per age group.
# The grouped frame has one row per (age_group, Gender, Department). Plotly Express
# draws one rectangle per row, so without faceting, the age-group rows that share a
# Department and Gender are drawn as segments of a single bar, and that bar's height
# is a sum of means rather than a mean. facet_col='age_group' gives every row its
# own bar.
fig1 = px.bar(
    merged.groupby(['age_group', 'Gender', 'Department'])['UsageFrequency'].agg(['mean']).reset_index(),
    x='Department',
    y='mean',
    color='Gender',
    barmode='group',
    facet_col='age_group',
    title='Mean Usage Frequency by Department and Gender, per Age Group',
    hover_data=['age_group'],
    color_discrete_sequence=px.colors.qualitative.Set2)
fig1.show()

# Visualize mean usage by department and tenure group, one panel per gender.
# The grouped frame has one row per (tenure_group, Gender, Department). Plotly
# Express draws one rectangle per row, so without faceting, the gender rows that
# share a Department and tenure_group are drawn as segments of a single bar, and
# that bar's height is a sum of means rather than a mean. facet_col='Gender' gives
# every row its own bar.
fig2 = px.bar(
    merged.groupby(['tenure_group', 'Gender', 'Department'])['UsageFrequency'].agg(['mean']).reset_index(),
    x='Department',
    y='mean',
    color='tenure_group',
    barmode='group',
    facet_col='Gender',
    title='Mean UsageFrequency by Department and Tenure Group, per Gender',
    hover_data=['Gender'],
    color_discrete_sequence=px.colors.qualitative.Set1)
fig2.show()

# Visualize mean usage by department and age group.
# The frame has exactly one row per (age_group, Department), so each stacked segment
# is the mean UsageFrequency of one age group in that department.
# NOTE(review): because the segments are means, the total bar height is a sum of
# means and should not be read as a department-level mean.
fig3 = px.bar(
    merged.groupby(['age_group', 'Department'])['UsageFrequency'].agg(['mean']).reset_index(),
    x='Department',
    y='mean',
    color='age_group',
    barmode='stack',
    title='Mean Usage Frequency by Department and Age Group',
    color_discrete_sequence=px.colors.qualitative.Set1[::-1])
fig3.show()


In [20]:
# Grouped bar chart: mean UsageFrequency for every Department/Gender pair
import plotly.express as px

# One row per (Department, Gender) holding the mean UsageFrequency of that pair.
dept_gender_df = (
    merged
    .groupby(['Department', 'Gender'])['UsageFrequency']
    .mean()
    .reset_index()
)

fig = px.bar(
    dept_gender_df,
    x='Department',
    y='UsageFrequency',
    color='Gender',
    barmode='group',
    title='Mean UsageFrequency by Department and Gender',
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data=['Gender'],
)
# Axis titles, two-decimal y ticks and the legend heading.
fig.update_layout(
    yaxis_title_text='Mean UsageFrequency',
    yaxis_tickformat='.2f',
    xaxis_title_text='Department',
    legend_title_text='Gender',
)
fig.show()

In [21]:
# Heatmap: UsageFrequency by Demographic Feature and Benefit Subtype
import plotly.express as px

# One heatmap per demographic feature: rows are the feature's categories, columns
# are benefit subtypes, and each cell is the mean UsageFrequency of that pair.
for feature in ('tenure_group', 'age_group', 'Gender', 'Department'):
    heatmap_data = (
        merged
        .groupby([feature, 'BenefitSubType'])['UsageFrequency']
        .mean()
        .unstack()
    )
    fig = px.imshow(
        heatmap_data,
        labels={'x': 'BenefitSubType', 'y': feature, 'color': 'Avg UsageFrequency'},
        title=f'UsageFrequency Heatmap: {feature} vs BenefitSubType',
    )
    fig.show()

In [22]:
# Per-subtype summary statistics of the Department x BenefitSubType mean-usage table.
# NOTE(review): fillna(0) makes combinations with no mean count as zero usage in
# the statistics below — confirm that is intended.
heatmap_data = (
    merged
    .groupby(['Department', 'BenefitSubType'])['UsageFrequency']
    .mean()
    .unstack(level='BenefitSubType')
    .fillna(0)
)
heatmap_data.describe()

BenefitSubType,401k Basic Matching,401k Catch-Up Contributions,401k High Contribution,401k Investment Fees,401k Maximum Matching,401k Standard Matching,After-School Care,Basic Coverage,Conference Attendance,Dependent Coverage,...,PPO Individual,Premium Discount Tier 1,Professional Certification,Supplemental High Amount,Supplemental Standard,Tier 1 Partners,Tier 2 Partners,Tier 3 Partners,Transit Subsidy,Undergraduate Degree
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,3.784433,3.233193,3.956077,3.62733,3.586952,3.06848,2.943048,3.481701,3.414705,3.402955,...,3.129574,3.0731,3.080226,3.44163,3.672371,3.177861,3.212949,3.326341,3.436664,3.267574
std,0.485013,0.181135,0.336872,0.473977,0.450474,0.207561,0.483491,0.492442,0.297328,0.46084,...,0.351282,0.390159,0.278401,0.56841,0.434029,0.418402,0.45027,0.595769,0.434567,0.305136
min,3.253731,3.0,3.595238,2.864865,3.183099,2.867647,2.576923,2.881579,3.140845,2.767442,...,2.568182,2.515152,2.805556,2.887324,3.275862,2.83871,2.529412,2.932203,2.863014,2.816092
25%,3.314286,3.144737,3.7,3.476923,3.307692,2.886076,2.651515,3.242424,3.205128,3.1125,...,3.020833,3.031746,2.916667,2.948276,3.292135,2.84375,2.984375,2.955556,3.2875,3.207792
50%,3.882353,3.230769,3.927711,3.851351,3.388889,3.014706,2.704545,3.478873,3.277778,3.566265,...,3.275362,3.092105,3.061224,3.372549,3.532258,3.031746,3.4375,3.168831,3.297297,3.275
75%,4.133333,3.304348,4.121951,3.911765,3.76,3.273973,3.032258,3.588235,3.616438,3.636364,...,3.328947,3.111111,3.081967,3.761905,4.054054,3.346154,3.508197,3.205882,3.787234,3.381579
max,4.338462,3.486111,4.435484,4.031746,4.295082,3.3,3.75,4.217391,3.833333,3.932203,...,3.454545,3.615385,3.535714,4.238095,4.207547,3.828947,3.605263,4.369231,3.948276,3.657407


In [23]:
# Per-subtype summary statistics of the Gender x BenefitSubType mean-usage table.
# NOTE(review): fillna(0) makes combinations with no mean count as zero usage in
# the statistics below — confirm that is intended.
heatmap_data = (
    merged
    .groupby(['Gender', 'BenefitSubType'])['UsageFrequency']
    .mean()
    .unstack(level='BenefitSubType')
    .fillna(0)
)
heatmap_data.describe()

BenefitSubType,401k Basic Matching,401k Catch-Up Contributions,401k High Contribution,401k Investment Fees,401k Maximum Matching,401k Standard Matching,After-School Care,Basic Coverage,Conference Attendance,Dependent Coverage,...,PPO Individual,Premium Discount Tier 1,Professional Certification,Supplemental High Amount,Supplemental Standard,Tier 1 Partners,Tier 2 Partners,Tier 3 Partners,Transit Subsidy,Undergraduate Degree
count,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
mean,4.152644,2.847638,4.478438,3.33794,3.983624,3.105939,2.839689,3.644351,2.884845,3.135994,...,3.312031,2.605427,2.899377,4.009398,3.380584,3.775785,2.602778,3.496406,2.678194,3.022236
std,0.75829,0.80163,1.044997,0.679841,0.694459,0.162781,0.300491,0.332663,1.059685,0.611548,...,0.253391,0.958949,0.46802,1.075042,0.49195,1.061983,1.187795,0.379412,1.28063,0.543906
min,3.537931,2.0,3.702381,2.571429,3.231214,2.91875,2.615385,3.265537,1.666667,2.5,...,3.123188,1.5,2.6,3.351724,2.833333,3.10219,1.25,3.131579,1.2,2.444444
25%,3.728966,2.474684,3.884323,3.072948,3.675436,3.051766,2.668983,3.522081,2.530392,2.844118,...,3.168046,2.301205,2.62971,3.389097,3.177813,3.163678,2.166667,3.300164,2.291176,2.771191
50%,3.92,2.949367,4.066265,3.574468,4.119658,3.184783,2.722581,3.778626,3.394118,3.188235,...,3.212903,3.10241,2.65942,3.426471,3.522293,3.225166,3.083333,3.46875,3.382353,3.097938
75%,4.46,3.271458,4.866466,3.721196,4.359829,3.199534,2.951842,3.833757,3.493934,3.45399,...,3.406452,3.158141,3.049065,4.338235,3.65421,4.112583,3.279167,3.678819,3.417291,3.311131
max,5.0,3.593548,5.666667,3.867925,4.6,3.214286,3.181102,3.888889,3.59375,3.719745,...,3.6,3.213873,3.43871,5.25,3.786127,5.0,3.475,3.888889,3.452229,3.524324


In [24]:
# Mean UsageFrequency for every Gender (rows) x Department (columns) combination.
(
    merged
    .groupby(['Gender', 'Department'])['UsageFrequency']
    .mean()
    .unstack(level='Department')
)

Department,Finance,HR,IT,Marketing,Sales
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,3.346918,3.544379,3.341355,3.105611,3.266735
Male,3.31952,3.288189,3.48642,3.434236,3.394904
Non-Binary,3.081633,3.478261,2.923077,3.533333,2.333333


#### Observations
- For employees under 25, the most frequently used subcategories are Retirement, Health Insurance and Gym Membership.
- For employees aged 30-45, the most frequently used subcategories after Retirement are Life Insurance and Cellphone Allowance.
- Across tenure groups, benefit usage is most frequent among employees with 5-15 years of tenure.
- The IT department has the highest benefit usage of all departments, and the Marketing department has the lowest. All age groups below 42 tend to use more benefits than those over 42.
- Non-Binary employees show widely varying usage across departments: in Marketing and HR their mean UsageFrequency is about 3.5, in IT it is slightly below 3, and in Sales it is only about 2.3. Apart from the HR department, there is a visible drop in usage for employees with tenure >25 years.
- HR departments have high benefit usage across genders, but female employees in Marketing and Sales show lower engagement.
- Certain benefit subtypes (e.g., Gym Membership, Retirement Plan) are favored by younger employees and HR, while Finance does not seem to be interested in Wellness, and Marketing and Sales do not seem to be interested in Childcare.
- Across tenure groups and departments there is a wide variety of preferences.

### Actionable Recommendations
- Promote Retirement Plan and Gym Membership benefits to younger employees (<25).
- Target Life Insurance and Cellphone Allowance benefits to employees aged 30-45.
- Encourage benefit usage among the 5-15 years tenure group, as they are the most active.
- HR department shows highest benefit usage across all groups; consider sharing best practices with other departments.
- Non-Binary employees' benefit usage differs greatly across departments. Try more inclusive benefit options and gather feedback to understand how to support them.
- Investigate the lower engagement of female employees in Sales and Marketing.
- Look at benefits that are barely used and promote those that are in high demand.