In [1]:
import pandas as pd
import numpy as np

# Load data/cleaned_data.csv (path is relative to the working directory) into
# `merged`, the frame that the later cells read from and add columns to.
merged = pd.read_csv('data/cleaned_data.csv') 

## Cost Efficiency and Subcategory Analysis

In [2]:
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

# Cost-per-usage for each row: BenefitCost divided by UsageFrequency.
# A usage of 0 is swapped for NaN first so the division yields NaN instead of
# inf; every NaN result is then set to 0 ("no usage").
#
# The filled Series is assigned back to the column on purpose. Calling
# `merged['CostPerUsage'].fillna(0, inplace=True)` acts on a column selection:
# pandas 2.2+ warns about it (hidden here by warnings.filterwarnings("ignore"))
# and with Copy-on-Write enabled it leaves `merged` unchanged, so the NaNs
# would silently survive.
# NOTE(review): rows without usage count as cost 0 and therefore lower the
# per-subtype mean cost computed later - confirm this is intended.
merged['CostPerUsage'] = (
    merged['BenefitCost'] / merged['UsageFrequency'].replace(0, np.nan)
).fillna(0)

# Develop an ROI score from min-max normalized cost-per-usage and satisfaction.

# Min-max normalize cost-per-usage: the cheapest row maps to 0, the most expensive to 1.
# If every value were identical the range would be 0 and the column becomes NaN.
max_cost = merged['CostPerUsage'].max()
min_cost = merged['CostPerUsage'].min()
merged['NormalizedCost'] = (merged['CostPerUsage'] - min_cost) / (max_cost - min_cost)

# Min-max normalize satisfaction score the same way.
max_satisfaction = merged['SatisfactionScore'].max()
min_satisfaction = merged['SatisfactionScore'].min()
merged['NormalizedSatisfaction'] = (merged['SatisfactionScore'] - min_satisfaction) / (max_satisfaction - min_satisfaction)

# ROI score = normalized satisfaction per unit of normalized cost.
# A normalized cost of exactly 0 is replaced by NaN so the ratio is NaN rather
# than inf; rows with missing satisfaction are NaN as well. Aggregations such
# as the per-subtype mean skip those NaN rows.
merged['ROI_Score'] = merged['NormalizedSatisfaction'] / merged['NormalizedCost'].replace(0, np.nan)

# Distribution of cost per usage by BenefitSubType

def create_distribution_plot(df, x_col):
    """Show a 20-bin histogram of ``x_col`` with one colour per benefit subtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x_col`` and a ``BenefitSubType`` column.
    x_col : str
        Column to plot on the x-axis; also used in the title and axis label.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    # Sort the subtype names so the legend order is stable between plots.
    subtype_order = list(df['BenefitSubType'].unique())
    subtype_order.sort()

    chart_title = "Distribution of " + x_col + " by Benefit Subtype"

    histogram = px.histogram(
        df,
        x=x_col,
        color='BenefitSubType',
        nbins=20,
        category_orders={'BenefitSubType': subtype_order},
        labels={x_col: x_col},
        title=chart_title,
    )
    histogram.update_layout(xaxis_title=x_col, yaxis_title='Count')
    histogram.show()

# Plot the row-level distribution of each derived metric, split by subtype.
for metric_column in ('CostPerUsage', 'ROI_Score'):
    create_distribution_plot(merged, metric_column)

In [3]:
# Identify underutilized high-cost subcategories.
# One row per BenefitSubType holding the mean of each metric
# (cost-per-usage, usage frequency, satisfaction score, ROI score).
metric_columns = ['CostPerUsage', 'UsageFrequency', 'SatisfactionScore', 'ROI_Score']
subcategory_stats = (
    merged
    .groupby('BenefitSubType')
    .agg({column: 'mean' for column in metric_columns})
    .reset_index()
)

# "High cost" = above the 75th percentile of the per-subtype mean cost-per-usage.
high_cost_threshold = subcategory_stats['CostPerUsage'].quantile(0.75)

# "Low usage" = below the 25th percentile of the per-subtype mean usage frequency.
low_usage_threshold = subcategory_stats['UsageFrequency'].quantile(0.25)

# Visualise and highlight those above/below thresholds
# Highlight subcategories above the high cost threshold and below the low usage threshold with colored bars

def create_bar_plot_treshold(df, x_col, y_col, title, treshold, treshold_label = 'max'):
    """Bar chart of ``y_col`` per ``x_col`` with bars beyond a threshold drawn in red.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x_col`` and ``y_col``. It is not modified; the helper
        ``ThresholdFlag`` column is added to a copy.
    x_col : str
        Column for the x-axis (one bar per row).
    y_col : str
        Column for the bar heights; this is the column compared to the threshold.
    title : str
        Figure title.
    treshold : float
        Cutoff value, also drawn as a dashed dark-red horizontal line.
    treshold_label : str, default 'max'
        ``'max'`` treats the cutoff as an upper limit: bars with
        ``y_col > treshold`` are flagged 'Above Threshold'. Any other value
        treats it as a lower limit: bars with ``y_col < treshold`` are flagged
        'Below Threshold'.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.

    Notes
    -----
    The misspelled names (``treshold``) are kept so existing calls keep working.
    """
    # Compare the plotted column itself. The upper-limit case used to look at
    # 'CostPerUsage' regardless of y_col, which fails or mis-flags for any
    # other column.
    if treshold_label == 'max':
        beyond_threshold = df[y_col] > treshold
        flag_label = 'Above Threshold'
    else:
        beyond_threshold = df[y_col] < treshold
        flag_label = 'Below Threshold'

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    flagged = df.assign(
        ThresholdFlag=np.where(beyond_threshold, flag_label, 'Normal')
    )

    fig = px.bar(
        flagged,
        x=x_col,
        y=y_col,
        color='ThresholdFlag',  # flagged bars red, all others light gray
        color_discrete_map={flag_label: 'red', 'Normal': 'lightgray'},
        title=title,
    )
    fig.add_hline(y=treshold, line_dash="dash", line_color="darkred")
    fig.show()

# Mean cost-per-usage per subtype, with bars above the high-cost threshold highlighted.
mean_cost_by_subtype = merged.groupby('BenefitSubType').agg({'CostPerUsage': 'mean'}).reset_index()
create_bar_plot_treshold(
    mean_cost_by_subtype,
    x_col='BenefitSubType',
    y_col='CostPerUsage',
    title='Average Cost Per Usage by Benefit Subtype',
    treshold=high_cost_threshold,
    treshold_label='max',
)

# Mean usage frequency per subtype, with bars below the low-usage threshold highlighted.
mean_usage_by_subtype = merged.groupby('BenefitSubType').agg({'UsageFrequency': 'mean'}).reset_index()
create_bar_plot_treshold(
    mean_usage_by_subtype,
    x_col='BenefitSubType',
    y_col='UsageFrequency',
    title='Average Usage Frequency by Benefit Subtype',
    treshold=low_usage_threshold,
    treshold_label='min',
)

# Subcategories that are both expensive per use and rarely used.
exceeds_cost_cutoff = subcategory_stats['CostPerUsage'] > high_cost_threshold
under_usage_cutoff = subcategory_stats['UsageFrequency'] < low_usage_threshold
underutilized_high_cost = subcategory_stats[exceeds_cost_cutoff & under_usage_cutoff]

print("Underutilized High-Cost Subcategories:")
print(underutilized_high_cost)

# Quadrant plot: mean cost-per-usage (x) against mean usage frequency (y).
# The dashed lines mark the two thresholds, so the high-cost / low-usage
# subcategories sit right of the blue line and below the red line.
cost_usage_axis_labels = {
    'CostPerUsage': 'Average Cost per Usage',
    'UsageFrequency': 'Average Usage Frequency',
}
fig = px.scatter(
    subcategory_stats,
    x='CostPerUsage',
    y='UsageFrequency',
    color='BenefitSubType',
    title='Cost vs Usage Frequency by Benefit Subtype',
    labels=cost_usage_axis_labels,
    hover_data=['BenefitSubType'],
)
fig.add_hline(
    y=low_usage_threshold,
    line_dash="dash",
    line_color="red",
    annotation_text="Low Usage Threshold",
)
fig.add_vline(
    x=high_cost_threshold,
    line_dash="dash",
    line_color="blue",
    annotation_text="High Cost Threshold",
)
fig.show()

# Subcategories that are both expensive per use and poorly rated.
# "Low satisfaction" = below the 25th percentile of the per-subtype mean score;
# computed once here and reused for the filter and the plot.
low_satisfaction_threshold = subcategory_stats['SatisfactionScore'].quantile(0.25)
costly_subtypes = subcategory_stats['CostPerUsage'] > high_cost_threshold
poorly_rated_subtypes = subcategory_stats['SatisfactionScore'] < low_satisfaction_threshold
underutilized_high_cost_satisfaction = subcategory_stats[costly_subtypes & poorly_rated_subtypes]

print("Underutilized High-Cost Benefits with Low Satisfaction:")
print(underutilized_high_cost_satisfaction[['BenefitSubType', 'CostPerUsage', 'SatisfactionScore']])

# Quadrant plot: mean cost-per-usage (x) against mean satisfaction (y), with
# the two thresholds drawn as dashed lines.
cost_satisfaction_axis_labels = {
    'CostPerUsage': 'Average Cost per Usage',
    'SatisfactionScore': 'Average SatisfactionScore',
}
fig = px.scatter(
    subcategory_stats,
    x='CostPerUsage',
    y='SatisfactionScore',
    color='BenefitSubType',
    title='Cost vs SatisfactionScore by Benefit Subtype',
    labels=cost_satisfaction_axis_labels,
    hover_data=['BenefitSubType'],
)
fig.add_hline(
    y=low_satisfaction_threshold,
    line_dash="dash",
    line_color="red",
    annotation_text="Low Satisfaction Threshold",
)
fig.add_vline(
    x=high_cost_threshold,
    line_dash="dash",
    line_color="blue",
    annotation_text="High Cost Threshold",
)
fig.show()

Underutilized High-Cost Subcategories:
         BenefitSubType  CostPerUsage  UsageFrequency  SatisfactionScore  \
11      Graduate Degree    205.603185        3.103976           3.070336   
18  On-Site Infant Care    200.353243        2.777419           2.893548   

    ROI_Score  
11   2.668405  
18   2.263524  


Underutilized High-Cost Benefits with Low Satisfaction:
         BenefitSubType  CostPerUsage  SatisfactionScore
0   401k Basic Matching    226.512933           2.875839
15   Individual Courses    200.472968           2.861446
18  On-Site Infant Care    200.353243           2.893548


In [4]:
# Rank subcategories by cost efficiency: rank 1 = highest mean ROI_Score.
# method='min' gives tied subcategories the same whole-number rank and
# na_option='bottom' places a missing ROI last. With the defaults, ties yield
# fractional ranks (truncated by astype(int)) and a NaN ROI yields a NaN rank,
# on which astype(int) raises.
subcategory_stats['CostEfficiencyRank'] = (
    subcategory_stats['ROI_Score']
    .rank(ascending=False, method='min', na_option='bottom')
    .astype(int)
)

# Flag which benefits could be removed.
# Removal condition: high cost AND (low usage OR low satisfaction), where
# "low satisfaction" is below the 25th percentile of the per-subtype mean score.
low_satisfaction_threshold = subcategory_stats['SatisfactionScore'].quantile(0.25)
is_high_cost = subcategory_stats['CostPerUsage'] > high_cost_threshold
is_low_usage = subcategory_stats['UsageFrequency'] < low_usage_threshold
is_low_satisfaction = subcategory_stats['SatisfactionScore'] < low_satisfaction_threshold
subcategory_stats['RemoveFlag'] = np.where(
    is_high_cost & (is_low_usage | is_low_satisfaction),
    'Remove', 'Keep'
)

# Look up the parent BenefitType of every subtype.
# NOTE(review): assumes each BenefitSubType belongs to exactly one BenefitType;
# otherwise the merge below repeats that subtype's row once per type.
benefit_types = merged[['BenefitType','BenefitSubType']].drop_duplicates().reset_index(drop=True)

# Attach the BenefitType to the subcategory statistics.
category_stats = subcategory_stats.merge(
    benefit_types,
    on='BenefitSubType',
    how='left'
)

# Display categories that could be removed.
removable_categories = category_stats[category_stats['RemoveFlag'] == 'Remove']
print("Removable Categories:")
print(removable_categories[['BenefitType','BenefitSubType', 'CostPerUsage', 'UsageFrequency', 'SatisfactionScore', 'RemoveFlag']])


Removable Categories:
              BenefitType       BenefitSubType  CostPerUsage  UsageFrequency  \
0         Retirement Plan  401k Basic Matching    226.512933        3.744966   
11  Tuition Reimbursement      Graduate Degree    205.603185        3.103976   
15  Tuition Reimbursement   Individual Courses    200.472968        3.430723   
18              Childcare  On-Site Infant Care    200.353243        2.777419   

    SatisfactionScore RemoveFlag  
0            2.875839     Remove  
11           3.070336     Remove  
15           2.861446     Remove  
18           2.893548     Remove  


## Findings 

- For each subcategory we looked at its average cost per usage, how satisfied employees were on average, and how frequently it was used on average.
- Subcategories that were very costly (among the 25% most costly) and rarely used (among the 25% least used) are On-Site Infant Care and Graduate Degree. These subcategories appear to be underused and costly.
- Subcategories that were very costly (among the 25% most costly) and with which employees were, on average, least satisfied (among the 25% least satisfying) are 401k Basic Matching, Individual Courses, and On-Site Infant Care. These subcategories appear to be unsatisfactory and costly.
- Conditions for removal were chosen as subcategories that are both costly and underused or unsatisfactory.
- Altogether, 3 BenefitTypes and 4 subcategories had 'Remove' in the flag column and should be considered for review or discontinuation. These were:
    - Retirement Plan, i.e. 401k Basic Matching (frequently used, but with low average satisfaction),
    - Tuition Reimbursement, i.e. Graduate Degree and Individual Courses,
    - Childcare, i.e. On-Site Infant Care, which is neither frequently used nor rated as satisfactory in general.
