In [42]:
#load necessary libraries
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go



In [43]:
pip install plotly

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



## 2: Exploratory Data Analysis

### 2.1: Utilization Analysis



In [44]:
# Location of the cleaned dataset. Set the CLEANED_DATA_CSV environment
# variable to run the notebook against a copy stored elsewhere; when it is
# unset, the original hard-coded path is used.
DATA_PATH = os.environ.get(
    "CLEANED_DATA_CSV",
    "/Users/iexposit/Downloads/cleaned_data.csv",
)
merged = pd.read_csv(DATA_PATH)

In [45]:
# Convert LastUsedDate to datetime; values that cannot be parsed become NaT
# because of errors='coerce'.
merged['LastUsedDate'] = pd.to_datetime(merged['LastUsedDate'], errors='coerce')

# Month holds the timestamp at the start of each record's calendar month.
# (The earlier string-typed Month assignment was immediately overwritten by
# this one, so it has been dropped.)
merged['Month'] = merged['LastUsedDate'].dt.to_period('M').dt.to_timestamp()


#Recategorization of Engagement Level
def categorize_usage(freq, high_threshold=7, medium_threshold=3):
    """Map a usage frequency to an engagement level label.

    Parameters
    ----------
    freq : number
        Usage frequency of a single record.
    high_threshold : number, default 7
        Smallest frequency labelled 'High'.
    medium_threshold : number, default 3
        Smallest frequency labelled 'Medium'.

    Returns
    -------
    str
        'High' when ``freq >= high_threshold``, 'Medium' when
        ``freq >= medium_threshold``, otherwise 'Low'.

    Notes
    -----
    A value for which both comparisons are false (for example NaN) is
    labelled 'Low'.
    """
    if freq >= high_threshold:
        return 'High'
    if freq >= medium_threshold:
        return 'Medium'
    return 'Low'

# Label every record with the engagement bucket derived from its UsageFrequency.
merged['EngagementLevel'] = merged['UsageFrequency'].map(categorize_usage)

In [46]:
# Section 1: Engagement Level Distribution
# Tally the records in each engagement level and expose the tally as a
# two-column frame (EngagementLevel, Count).
eng_counts = (
    merged['EngagementLevel']
    .value_counts()
    .sort_index()
    .rename_axis('EngagementLevel')
    .reset_index(name='Count')
)

# category_orders pins the bar order to Low -> Medium -> High.
fig1 = px.bar(
    data_frame=eng_counts,
    x='EngagementLevel',
    y='Count',
    category_orders={'EngagementLevel': ['Low', 'Medium', 'High']},
    labels={'Count': 'Number of Records'},
    title='Employee Engagement Levels',
)
fig1.show()

In [47]:
# Section 2: Benefits by Usage
# Total UsageFrequency per BenefitID, ordered from highest to lowest total.
benefit_freq = (
    merged.groupby('BenefitID')['UsageFrequency']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

fig1 = px.bar(
    benefit_freq,
    x='BenefitID',
    y='UsageFrequency',
    title='UsageFrequency per BenefitID',
)
fig1.show()

In [48]:
# Section 3: Usage by Benefit SubType
# Sum UsageFrequency for each BenefitSubType and rank the subtypes by total.
sub_totals = (
    merged.groupby('BenefitSubType', as_index=False)['UsageFrequency']
    .sum()
    .sort_values('UsageFrequency', ascending=False)
)

fig4 = px.bar(
    sub_totals,
    x='BenefitSubType',
    y='UsageFrequency',
    title='Usage by Benefit SubType',
    labels={
        'UsageFrequency': 'Total Usage Frequency',
        'BenefitSubType': 'SubType',
    },
)
# Tilt the x tick labels so the subtype names do not overlap.
fig4.update_layout(xaxis_tickangle=-45)
fig4.show()


In [49]:
# Section 4: Monthly usage trend
# Sum UsageFrequency within each Month value and draw the totals as a line.
monthly_usage = merged.groupby('Month', as_index=False)['UsageFrequency'].sum()

fig2 = px.line(
    monthly_usage,
    x='Month',
    y='UsageFrequency',
    title='Monthly Usage Trend',
)
fig2.show()

In [50]:
# List every column name in the merged frame.
print(merged.columns.tolist())

['EmployeeID', 'BenefitID', 'SatisfactionScore', 'Comments', 'UsageFrequency', 'LastUsedDate', 'BenefitType', 'BenefitSubType', 'BenefitCost', 'Gender_Female', 'Gender_Male', 'Gender_Non-Binary', 'Department_Finance', 'Department_HR', 'Department_IT', 'Department_Marketing', 'Department_Sales', 'Age_Gen_Boomer', 'Age_Gen_Gen_X', 'Age_Gen_Gen_Z', 'Age_Gen_Millenial', 'TenureGroups_ >25_years', 'TenureGroups_16-25_years', 'TenureGroups_5-15_years', 'TenureGroups_<5_years', 'BenefitFlag_Cell_Phone_Allowance_Monthly_Communications', 'BenefitFlag_Childcare_After-School_Care', 'BenefitFlag_Childcare_On-Site_Infant_Care', 'BenefitFlag_Commuter_Benefits_Transit_Subsidy', 'BenefitFlag_Flexible_Spending_Account_Healthcare_FSA', 'BenefitFlag_Gym_Membership_Family_Membership', 'BenefitFlag_Gym_Membership_Tier_1_Partners', 'BenefitFlag_Gym_Membership_Tier_2_Partners', 'BenefitFlag_Gym_Membership_Tier_3_Partners', 'BenefitFlag_Health_Insurance_HDHP_Individual', 'BenefitFlag_Health_Insurance_HMO_Fami

In [51]:
# Section 5. Department-level heatmap for 2024 usage
# Keep only the rows whose LastUsedDate falls in calendar year 2024.
df2024 = merged[merged['LastUsedDate'].dt.year == 2024].copy()

# Department indicator columns to unpivot.
dept_flags = [
    'Department_Finance',
    'Department_HR',
    'Department_IT',
    'Department_Marketing',
    'Department_Sales'
]

# Wide -> long: one row per (record, department flag) pair.
df_long = df2024.melt(
    id_vars=['LastUsedDate', 'UsageFrequency'],
    value_vars=dept_flags,
    var_name='Department',
    value_name='IsMember'
)

# Keep only the rows whose flag is set. The .copy() makes df_long an
# independent frame, so adding a column below cannot trigger pandas'
# SettingWithCopyWarning.
df_long = df_long[df_long['IsMember'] == True].copy()

# Group on the numeric month instead of a locale-dependent '%b' string and a
# hard-coded Jan-Aug list: every month present in the 2024 data is kept, in
# calendar order, and labelled with a fixed English abbreviation.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
df_long['MonthNum'] = df_long['LastUsedDate'].dt.month

heatmap_df = df_long.pivot_table(
    index='MonthNum',
    columns='Department',
    values='UsageFrequency',
    aggfunc='sum'
).sort_index()

# Replace month numbers (1-12) with their labels for display.
heatmap_df.index = pd.Index(
    [month_labels[month_num - 1] for month_num in heatmap_df.index],
    name='Month'
)

# Plot the month x department totals; text_auto prints each cell's value.
fig = px.imshow(
    heatmap_df,
    labels=dict(x="Department", y="Month", color="Usage (sum)"),
    x=heatmap_df.columns,
    y=heatmap_df.index,
    text_auto=True
)
fig.update_layout(title="Department-level heatmap for 2024 usage")
fig.show()


In [52]:
# Section 6: Monthly Usage Heatmap for top 10 subTypes
# Overall usage per subtype, highest first; used to choose the top 10.
usage_by_sub = (
    merged.groupby('BenefitSubType', as_index=False)['UsageFrequency']
    .sum()
    .sort_values('UsageFrequency', ascending=False)
)

# Usage per (Month, BenefitSubType) pair.
month_sub = merged.groupby(['Month', 'BenefitSubType'], as_index=False)['UsageFrequency'].sum()

# Restrict the monthly totals to the ten subtypes with the highest overall usage.
top10 = usage_by_sub.head(10)['BenefitSubType'].tolist()
is_top10 = month_sub['BenefitSubType'].isin(top10)
hm_data = month_sub[is_top10]

fig5 = px.density_heatmap(
    hm_data,
    x='Month',
    y='BenefitSubType',
    z='UsageFrequency',
    title='Monthly Usage Heatmap (Top 10 SubTypes)',
)
# One tick per month, formatted as YYYY-MM.
fig5.update_xaxes(dtick='M1', tickformat='%Y-%m')

Observations:

- The largest share of records falls into the Low engagement bucket (about 4,500), followed by Medium (around 3,200) and High (about 1,900).

- Usage per BenefitID generally sits in the 900–1,400 range, with noticeable peaks at IDs 4, 20, and 26, and a few IDs that see under 1,000 uses.

- A few subtypes—especially 401k High Contribution and Undergraduate Degree—account for the bulk of activity, while family-care options lag behind.

- Tuition assistance, commuting support, and premium discounts are particularly popular, whereas some child-care programs and basic retirement matches may need stronger incentives.

- There’s a clear seasonal cycle: usage spikes in October–November and again in May–July (peaking around June), and dips in January.

- Departments behave differently: IT leads in usage, HR trails significantly, and Sales/Finance sit in the middle but dip during the summer months.

- The top subtypes display strong seasonal peaks (for example, 401k fees/contributions), while other popular subtypes maintain steadier, lower-level usage throughout the year.

Recommendations:

- Help Low Users

Send simple “How to use your benefits” reminders and offer small rewards (like points or badges) to encourage more use.

- Focus on Winners

Find out why BenefitIDs 4, 20, and 26 are so popular and apply those best practices elsewhere. Survey employees on the least-used benefits and remove any hurdles.

- Boost Care Options

Roll out family-friendly campaigns—emails, posters, or webinars—to promote child-care benefits alongside your most-used programs.

- Plan by Season

Launch major benefit announcements 4–6 weeks before the October and May usage peaks. Run a “New Year, New Benefits” push in December to smooth out the January dip.

- Leverage High-Use Teams

Invite IT team members to share tips with HR and Sales. Create short, department-specific guides and encourage managers to highlight benefits in team meetings.

- Match Promotions to Demand

Schedule subtype-specific workshops when usage usually peaks. Run small mid-year challenges to keep momentum and cross-promote subtypes that stay steady throughout the year.